author     Frank Barchard <fbarchard@google.com>  2017-02-22 18:01:07 -0800
committer  Frank Barchard <fbarchard@google.com>  2017-03-06 09:54:15 -0800
commit     b83bb38f0a92bedeb52baa31e515220927ef53bb (patch)
tree       a31c9da19db3f909cad22293ad2964d1c41c953a
parent     04676c9f110180a5ae1fa259a38fab17101c6b5b (diff)
download   libyuv-b83bb38f0a92bedeb52baa31e515220927ef53bb.tar.gz
libyuv r1645 to fix android build warnings

r1602 built under Android.mk with unused-parameter warnings, which had been
disabled rather than fixed. This CL fixes the source and re-enables the
warning.

Bug: 35099807
Test: mm for libyuv builds cleanly.
Change-Id: If6b344ca39b2c321e277421cdeb817a5b1cc2514
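
For background, the usual ways to clear -Wunused-parameter without disabling
the warning are to leave the parameter unnamed or to reference it explicitly.
A minimal C++ sketch of both patterns (illustrative only; these functions are
hypothetical, not code from this CL):

    // Hypothetical examples of fixing -Wunused-parameter warnings.
    // 1) Leave the parameter unnamed when it is intentionally ignored.
    static void CopyRowStub(const unsigned char* src, unsigned char* dst,
                            int width, int /* source_y_fraction */) {
      for (int i = 0; i < width; ++i) dst[i] = src[i];
    }

    // 2) Cast the parameter to void to mark it as deliberately unused.
    static void FilterRowStub(const unsigned char* src, unsigned char* dst,
                              int width, int source_y_fraction) {
      (void)source_y_fraction;
      for (int i = 0; i < width; ++i) dst[i] = src[i];
    }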
-rw-r--r--  README.version  2
-rw-r--r--  files/Android.mk  35
-rw-r--r--  files/BUILD.gn  225
-rw-r--r--  files/CM_linux_packages.cmake  69
-rw-r--r--  files/CMakeLists.txt  157
-rw-r--r--  files/DEPS  445
-rw-r--r--  files/OWNERS  10
-rwxr-xr-x  files/PRESUBMIT.py  113
-rw-r--r--  files/README.chromium  2
-rw-r--r--  files/build_overrides/build.gni  31
-rw-r--r--  files/build_overrides/gtest.gni  19
-rwxr-xr-x  files/cleanup_links.py  107
-rw-r--r--  files/codereview.settings  12
-rw-r--r--  files/docs/deprecated_builds.md  440
-rw-r--r--  files/docs/formats.md  15
-rw-r--r--  files/docs/getting_started.md  352
-rw-r--r--  files/gyp_libyuv.py  2
-rw-r--r--  files/include/libyuv.h  4
-rw-r--r--  files/include/libyuv/basic_types.h  49
-rw-r--r--  files/include/libyuv/compare.h  76
-rw-r--r--  files/include/libyuv/compare_row.h  12
-rw-r--r--  files/include/libyuv/convert.h  373
-rw-r--r--  files/include/libyuv/convert_argb.h  437
-rw-r--r--  files/include/libyuv/convert_from.h  298
-rw-r--r--  files/include/libyuv/convert_from_argb.h  259
-rw-r--r--  files/include/libyuv/cpu_id.h  11
-rw-r--r--  files/include/libyuv/macros_msa.h  233
-rw-r--r--  files/include/libyuv/mjpeg_decoder.h  15
-rw-r--r--  files/include/libyuv/planar_functions.h  743
-rw-r--r--  files/include/libyuv/rotate.h  143
-rw-r--r--  files/include/libyuv/rotate_argb.h  14
-rw-r--r--  files/include/libyuv/rotate_row.h  223
-rw-r--r--  files/include/libyuv/row.h  2278
-rw-r--r--  files/include/libyuv/scale.h  113
-rw-r--r--  files/include/libyuv/scale_argb.h  56
-rw-r--r--  files/include/libyuv/scale_row.h  998
-rw-r--r--  files/include/libyuv/version.h  6
-rw-r--r--  files/include/libyuv/video_common.h  28
-rw-r--r--  files/infra/config/OWNERS  3
-rw-r--r--  files/infra/config/README.md  1
-rw-r--r--  files/infra/config/cq.cfg  61
-rw-r--r--  files/libyuv.gni  20
-rw-r--r--  files/libyuv.gyp  20
-rw-r--r--  files/libyuv.gypi  10
-rw-r--r--  files/libyuv_test.gyp  57
-rw-r--r--  files/linux.mk  8
-rw-r--r--  files/pylintrc  17
-rw-r--r--  files/source/compare.cc  140
-rw-r--r--  files/source/compare_gcc.cc  35
-rw-r--r--  files/source/compare_win.cc  98
-rw-r--r--  files/source/convert.cc  1349
-rw-r--r--  files/source/convert_argb.cc  1132
-rw-r--r--  files/source/convert_from.cc  919
-rw-r--r--  files/source/convert_from_argb.cc  686
-rw-r--r--  files/source/convert_jpeg.cc  181
-rw-r--r--  files/source/convert_to_argb.cc  180
-rw-r--r--  files/source/convert_to_i420.cc  208
-rw-r--r--  files/source/cpu_id.cc  111
-rw-r--r--  files/source/mjpeg_decoder.cc  121
-rw-r--r--  files/source/mjpeg_validate.cc  3
-rw-r--r--  files/source/planar_functions.cc  1506
-rw-r--r--  files/source/rotate.cc  359
-rw-r--r--  files/source/rotate_any.cc  54
-rw-r--r--  files/source/rotate_argb.cc  101
-rw-r--r--  files/source/rotate_common.cc  38
-rw-r--r--  files/source/rotate_dspr2.cc (renamed from files/source/rotate_mips.cc)  307
-rw-r--r--  files/source/rotate_gcc.cc  658
-rw-r--r--  files/source/rotate_msa.cc  250
-rw-r--r--  files/source/rotate_neon.cc  23
-rw-r--r--  files/source/rotate_neon64.cc  32
-rw-r--r--  files/source/rotate_win.cc  42
-rw-r--r--  files/source/row_any.cc  755
-rw-r--r--  files/source/row_common.cc  1108
-rw-r--r--  files/source/row_dspr2.cc  1721
-rw-r--r--  files/source/row_gcc.cc  1833
-rw-r--r--  files/source/row_mips.cc  782
-rw-r--r--  files/source/row_msa.cc  2977
-rw-r--r--  files/source/row_neon.cc  668
-rw-r--r--  files/source/row_neon64.cc  707
-rw-r--r--  files/source/row_win.cc  3708
-rw-r--r--  files/source/scale.cc  780
-rw-r--r--  files/source/scale_any.cc  463
-rw-r--r--  files/source/scale_argb.cc  483
-rw-r--r--  files/source/scale_common.cc  613
-rw-r--r--  files/source/scale_dspr2.cc (renamed from files/source/scale_mips.cc)  520
-rw-r--r--  files/source/scale_gcc.cc  435
-rw-r--r--  files/source/scale_msa.cc  553
-rw-r--r--  files/source/scale_neon.cc  178
-rw-r--r--  files/source/scale_neon64.cc  228
-rw-r--r--  files/source/scale_win.cc  848
-rw-r--r--  files/source/video_common.cc  38
-rw-r--r--  files/tools_libyuv/OWNERS  1
-rwxr-xr-x  files/tools_libyuv/autoroller/roll_deps.py  482
-rw-r--r--  files/tools_libyuv/autoroller/unittests/.DS_Store  bin 0 -> 6148 bytes
-rwxr-xr-x  files/tools_libyuv/autoroller/unittests/roll_deps_test.py  143
-rw-r--r--  files/tools_libyuv/autoroller/unittests/testdata/DEPS  20
-rw-r--r--  files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new  13
-rw-r--r--  files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old  13
-rwxr-xr-x  files/tools_libyuv/get_landmines.py  50
-rw-r--r--  files/tools_libyuv/msan/OWNERS  3
-rw-r--r--  files/tools_libyuv/msan/blacklist.txt  9
-rw-r--r--  files/tools_libyuv/ubsan/OWNERS  4
-rw-r--r--  files/tools_libyuv/ubsan/blacklist.txt  15
-rw-r--r--  files/tools_libyuv/ubsan/vptr_blacklist.txt  21
-rw-r--r--  files/tools_libyuv/valgrind/libyuv_tests.bat  79
-rwxr-xr-x  files/tools_libyuv/valgrind/libyuv_tests.py  139
-rwxr-xr-x  files/tools_libyuv/valgrind/libyuv_tests.sh  101
-rw-r--r--  files/tools_libyuv/valgrind/memcheck/OWNERS  1
-rw-r--r--  files/tools_libyuv/valgrind/memcheck/PRESUBMIT.py  99
-rw-r--r--  files/tools_libyuv/valgrind/memcheck/suppressions.txt  5
-rw-r--r--  files/tools_libyuv/valgrind/memcheck/suppressions_mac.txt  5
-rw-r--r--  files/tools_libyuv/valgrind/memcheck/suppressions_win32.txt  5
-rw-r--r--  files/unit_test/color_test.cc  331
-rw-r--r--  files/unit_test/compare_test.cc  111
-rw-r--r--  files/unit_test/convert_test.cc  2682
-rw-r--r--  files/unit_test/cpu_test.cc  25
-rw-r--r--  files/unit_test/math_test.cc  2
-rw-r--r--  files/unit_test/planar_test.cc  1072
-rw-r--r--  files/unit_test/rotate_argb_test.cc  116
-rw-r--r--  files/unit_test/rotate_test.cc  173
-rw-r--r--  files/unit_test/scale_argb_test.cc  301
-rw-r--r--  files/unit_test/scale_test.cc  275
-rw-r--r--  files/unit_test/unit_test.cc  180
-rw-r--r--  files/unit_test/unit_test.h  83
-rw-r--r--  files/unit_test/video_common_test.cc  50
-rw-r--r--  files/util/compare.cc  10
-rw-r--r--  files/util/convert.cc  141
-rw-r--r--  files/util/cpuid.c  2
-rw-r--r--  files/util/psnr.cc  232
-rw-r--r--  files/util/psnr_main.cc  238
-rw-r--r--  files/util/ssim.cc  163
-rw-r--r--  files/util/ssim.h  10
132 files changed, 28557 insertions, 15381 deletions
diff --git a/README.version b/README.version
index af9d7c6e..6d6e0d4b 100644
--- a/README.version
+++ b/README.version
@@ -1,3 +1,3 @@
-Version: r1602
+Version: r1645
BugComponent: 42195
Owner: lajos
diff --git a/files/Android.mk b/files/Android.mk
index 217114ec..cc17bde2 100644
--- a/files/Android.mk
+++ b/files/Android.mk
@@ -1,4 +1,4 @@
-# This is the Android makefile for libyuv for both platform and NDK.
+# This is the Android makefile for libyuv for NDK.
LOCAL_PATH:= $(call my-dir)
include $(CLEAR_VARS)
@@ -8,45 +8,48 @@ LOCAL_CPP_EXTENSION := .cc
LOCAL_SRC_FILES := \
source/compare.cc \
source/compare_common.cc \
- source/compare_neon64.cc \
source/compare_gcc.cc \
+ source/compare_neon.cc \
+ source/compare_neon64.cc \
source/convert.cc \
source/convert_argb.cc \
source/convert_from.cc \
source/convert_from_argb.cc \
+ source/convert_jpeg.cc \
source/convert_to_argb.cc \
source/convert_to_i420.cc \
source/cpu_id.cc \
+ source/mjpeg_decoder.cc \
+ source/mjpeg_validate.cc \
source/planar_functions.cc \
source/rotate.cc \
source/rotate_any.cc \
source/rotate_argb.cc \
source/rotate_common.cc \
- source/rotate_mips.cc \
- source/rotate_neon64.cc \
+ source/rotate_dspr2.cc \
source/rotate_gcc.cc \
+ source/rotate_msa.cc \
+ source/rotate_neon.cc \
+ source/rotate_neon64.cc \
source/row_any.cc \
source/row_common.cc \
- source/row_mips.cc \
+ source/row_dspr2.cc \
+ source/row_gcc.cc \
+ source/row_msa.cc \
+ source/row_neon.cc \
source/row_neon64.cc \
- source/row_gcc.cc \
source/scale.cc \
source/scale_any.cc \
source/scale_argb.cc \
source/scale_common.cc \
- source/scale_mips.cc \
- source/scale_neon64.cc \
+ source/scale_dspr2.cc \
source/scale_gcc.cc \
- source/video_common.cc \
- source/compare_neon.cc \
- source/rotate_neon.cc \
- source/row_neon.cc \
+ source/scale_msa.cc \
source/scale_neon.cc \
- source/mjpeg_decoder.cc \
- source/convert_jpeg.cc \
- source/mjpeg_validate.cc
+ source/scale_neon64.cc \
+ source/video_common.cc
-common_CFLAGS := -Wall -fexceptions -DHAVE_JPEG -Wno-unused-parameter
+common_CFLAGS := -Wall -fexceptions -DHAVE_JPEG
LOCAL_CFLAGS += $(common_CFLAGS)
LOCAL_SHARED_LIBRARIES := libjpeg
LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/include
diff --git a/files/BUILD.gn b/files/BUILD.gn
index b091cbc2..57771b72 100644
--- a/files/BUILD.gn
+++ b/files/BUILD.gn
@@ -6,19 +6,37 @@
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
-import("//build/config/arm.gni")
-import("//build/config/sanitizers/sanitizers.gni")
+import("libyuv.gni")
+import("//testing/test.gni")
config("libyuv_config") {
- include_dirs = [
- ".",
- "include",
- ]
+ include_dirs = [ "include" ]
+ if (is_android && current_cpu=="arm64") {
+ ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker64" ]
+ }
+ if (is_android && current_cpu != "arm64") {
+ ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker" ]
+ }
}
-use_neon = current_cpu == "arm64" || (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon))
+# This target is built when no specific target is specified on the command line.
+group("default") {
+ testonly = true
+ deps = [
+ ":libyuv",
+ ]
+ if (libyuv_include_tests) {
+ deps += [
+ ":compare",
+ ":convert",
+ ":cpuid",
+ ":libyuv_unittest",
+ ":psnr",
+ ]
+ }
+}
-source_set("libyuv") {
+static_library("libyuv") {
sources = [
# Headers
"include/libyuv.h",
@@ -61,57 +79,56 @@ source_set("libyuv") {
"source/rotate_any.cc",
"source/rotate_argb.cc",
"source/rotate_common.cc",
- "source/rotate_mips.cc",
+ "source/rotate_dspr2.cc",
"source/rotate_gcc.cc",
"source/rotate_win.cc",
"source/row_any.cc",
"source/row_common.cc",
- "source/row_mips.cc",
+ "source/row_dspr2.cc",
"source/row_gcc.cc",
"source/row_win.cc",
"source/scale.cc",
"source/scale_any.cc",
"source/scale_argb.cc",
"source/scale_common.cc",
- "source/scale_mips.cc",
+ "source/scale_dspr2.cc",
"source/scale_gcc.cc",
"source/scale_win.cc",
"source/video_common.cc",
]
- configs -= [ "//build/config/compiler:chromium_code" ]
- configs += [ "//build/config/compiler:no_chromium_code" ]
-
public_configs = [ ":libyuv_config" ]
defines = []
+ deps = []
if (!is_ios) {
defines += [ "HAVE_JPEG" ]
+ deps += [ "//third_party:jpeg" ]
}
- if (is_msan) {
- # MemorySanitizer does not support assembly code yet.
- # http://crbug.com/344505
- defines += [ "LIBYUV_DISABLE_X86" ]
+ if (libyuv_use_neon) {
+ deps += [ ":libyuv_neon" ]
}
- deps = [
- "//third_party:jpeg",
- ]
-
- if (use_neon) {
- deps += [ ":libyuv_neon" ]
+ if (libyuv_use_msa) {
+ deps += [ ":libyuv_msa" ]
}
- if (is_nacl) {
- # Always enable optimization under NaCl to workaround crbug.com/538243 .
+ # Always enable optimization for Release and NaCl builds (to workaround
+ # crbug.com/538243).
+ if (!is_debug || is_nacl) {
configs -= [ "//build/config/compiler:default_optimization" ]
+ # Enable optimize for speed (-O2) over size (-Os).
configs += [ "//build/config/compiler:optimize_max" ]
}
+
+ # To enable AVX2 or other cpu optimization, pass flag here
+ # cflags = [ "-mavx2" ]
+
}
-if (use_neon) {
+if (libyuv_use_neon) {
static_library("libyuv_neon") {
sources = [
# ARM Source Files
@@ -127,9 +144,163 @@ if (use_neon) {
public_configs = [ ":libyuv_config" ]
+ # Always enable optimization for Release and NaCl builds (to workaround
+ # crbug.com/538243).
+ if (!is_debug) {
+ configs -= [ "//build/config/compiler:default_optimization" ]
+ # Enable optimize for speed (-O2) over size (-Os).
+ configs += [ "//build/config/compiler:optimize_max" ]
+ }
+
if (current_cpu != "arm64") {
configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
cflags = [ "-mfpu=neon" ]
}
}
}
+
+if (libyuv_use_msa) {
+ static_library("libyuv_msa") {
+ sources = [
+ # MSA Source Files
+ "source/row_msa.cc",
+ "source/scale_msa.cc",
+ "source/rotate_msa.cc",
+ ]
+
+ public_configs = [ ":libyuv_config" ]
+ }
+}
+
+if (libyuv_include_tests) {
+ config("libyuv_unittest_warnings_config") {
+ if (!is_win) {
+ cflags = [
+ # TODO(fbarchard): Fix sign and unused variable warnings.
+ "-Wno-sign-compare",
+ "-Wno-unused-variable"
+ ]
+ }
+ if (is_win) {
+ cflags = [
+ "/wd4245", # signed/unsigned mismatch
+ "/wd4189", # local variable is initialized but not referenced
+ ]
+ }
+ }
+ config("libyuv_unittest_config") {
+ defines = [ "GTEST_RELATIVE_PATH" ]
+ }
+
+ test("libyuv_unittest") {
+ testonly = true
+
+ sources = [
+ # headers
+ "unit_test/unit_test.h",
+ # sources
+ "unit_test/basictypes_test.cc",
+ "unit_test/compare_test.cc",
+ "unit_test/color_test.cc",
+ "unit_test/convert_test.cc",
+ "unit_test/cpu_test.cc",
+ "unit_test/math_test.cc",
+ "unit_test/planar_test.cc",
+ "unit_test/rotate_argb_test.cc",
+ "unit_test/rotate_test.cc",
+ "unit_test/scale_argb_test.cc",
+ "unit_test/scale_test.cc",
+ "unit_test/unit_test.cc",
+ "unit_test/video_common_test.cc",
+ ]
+
+ deps = [
+ ":libyuv",
+ "//testing/gtest",
+ "//third_party/gflags",
+ ]
+
+ configs += [ ":libyuv_unittest_warnings_config" ]
+
+ public_deps = [ "//testing/gtest" ]
+ public_configs = [ ":libyuv_unittest_config" ]
+
+ defines = []
+
+ if (is_linux) {
+ cflags = [ "-fexceptions" ]
+ }
+ if (is_ios) {
+ configs -= [ "//build/config/compiler:default_symbols" ]
+ configs += [ "//build/config/compiler:symbols" ]
+ cflags = [ "-Wno-sometimes-uninitialized" ]
+ }
+ if (!is_ios && !libyuv_disable_jpeg) {
+ defines += [ "HAVE_JPEG" ]
+ }
+ if (is_android) {
+ deps += [ "//testing/android/native_test:native_test_native_code" ]
+ }
+
+ # TODO(YangZhang): These lines can be removed when high accuracy
+ # YUV to RGB to Neon is ported.
+ if ((target_cpu=="armv7" || target_cpu=="armv7s" ||
+ (target_cpu=="arm" && arm_version >= 7) || target_cpu=="arm64") &&
+ (arm_use_neon || arm_optionally_use_neon)) {
+ defines += [ "LIBYUV_NEON" ]
+ }
+
+ defines += [
+ # Enable the following 3 macros to turn off assembly for specified CPU.
+ # "LIBYUV_DISABLE_X86",
+ # "LIBYUV_DISABLE_NEON",
+ # "LIBYUV_DISABLE_DSPR2",
+ # Enable the following macro to build libyuv as a shared library (dll).
+ # "LIBYUV_USING_SHARED_LIBRARY"
+ ]
+ }
+
+ executable("compare") {
+ sources = [
+ # sources
+ "util/compare.cc"
+ ]
+ deps = [ ":libyuv" ]
+ if (is_linux) {
+ cflags = [ "-fexceptions" ]
+ }
+ }
+
+ executable("convert") {
+ sources = [
+ # sources
+ "util/convert.cc"
+ ]
+ deps = [ ":libyuv" ]
+ if (is_linux) {
+ cflags = [ "-fexceptions" ]
+ }
+ }
+
+ executable("psnr") {
+ sources = [
+ # sources
+ "util/psnr_main.cc",
+ "util/psnr.cc",
+ "util/ssim.cc"
+ ]
+ deps = [ ":libyuv" ]
+
+ if (!is_ios && !libyuv_disable_jpeg) {
+ defines = [ "HAVE_JPEG" ]
+ }
+ }
+
+ executable("cpuid") {
+ sources = [
+ # sources
+ "util/cpuid.c"
+ ]
+ deps = [ ":libyuv" ]
+ }
+}
diff --git a/files/CM_linux_packages.cmake b/files/CM_linux_packages.cmake
new file mode 100644
index 00000000..5f676f89
--- /dev/null
+++ b/files/CM_linux_packages.cmake
@@ -0,0 +1,69 @@
+# determine the version number from the #define in libyuv/version.h
+EXECUTE_PROCESS (
+ COMMAND grep --perl-regex --only-matching "(?<=LIBYUV_VERSION )[0-9]+" include/libyuv/version.h
+ WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
+ OUTPUT_VARIABLE YUV_VERSION_NUMBER
+ OUTPUT_STRIP_TRAILING_WHITESPACE )
+SET ( YUV_VER_MAJOR 0 )
+SET ( YUV_VER_MINOR 0 )
+SET ( YUV_VER_PATCH ${YUV_VERSION_NUMBER} )
+SET ( YUV_VERSION ${YUV_VER_MAJOR}.${YUV_VER_MINOR}.${YUV_VER_PATCH} )
+MESSAGE ( "Building ver.: ${YUV_VERSION}" )
+
+# is this a 32-bit or 64-bit build?
+IF ( CMAKE_SIZEOF_VOID_P EQUAL 8 )
+ SET ( YUV_BIT_SIZE 64 )
+ELSEIF ( CMAKE_SIZEOF_VOID_P EQUAL 4 )
+ SET ( YUV_BIT_SIZE 32 )
+ELSE ()
+ MESSAGE ( FATAL_ERROR "CMAKE_SIZEOF_VOID_P=${CMAKE_SIZEOF_VOID_P}" )
+ENDIF ()
+
+# detect if this is a ARM build
+STRING (FIND "${CMAKE_CXX_COMPILER}" "arm-linux-gnueabihf-g++" pos)
+IF ( ${pos} EQUAL -1 )
+ SET ( YUV_CROSS_COMPILE_FOR_ARM7 FALSE )
+ELSE ()
+ MESSAGE ( "Cross compiling for ARM7" )
+ SET ( YUV_CROSS_COMPILE_FOR_ARM7 TRUE )
+ENDIF ()
+STRING (FIND "${CMAKE_SYSTEM_PROCESSOR}" "arm" pos)
+IF ( ${pos} EQUAL -1 )
+ SET ( YUV_COMPILE_FOR_ARM7 FALSE )
+ELSE ()
+ MESSAGE ( "Compiling for ARM" )
+ SET ( YUV_COMPILE_FOR_ARM7 TRUE )
+ENDIF ()
+
+# set up the system name, such as "x86-32", "amd-64", and "arm-32"
+IF ( ${YUV_CROSS_COMPILE_FOR_ARM7} OR ${YUV_COMPILE_FOR_ARM7} )
+ SET ( YUV_SYSTEM_NAME "armhf-${YUV_BIT_SIZE}" )
+ELSE ()
+ IF ( YUV_BIT_SIZE EQUAL 32 )
+ SET ( YUV_SYSTEM_NAME "x86-${YUV_BIT_SIZE}" )
+ ELSE ()
+ SET ( YUV_SYSTEM_NAME "amd-${YUV_BIT_SIZE}" )
+ ENDIF ()
+ENDIF ()
+MESSAGE ( "Packaging for: ${YUV_SYSTEM_NAME}" )
+
+# define all the variables needed by CPack to create .deb and .rpm packages
+SET ( CPACK_PACKAGE_VENDOR "Frank Barchard" )
+SET ( CPACK_PACKAGE_CONTACT "fbarchard@chromium.org" )
+SET ( CPACK_PACKAGE_VERSION ${YUV_VERSION} )
+SET ( CPACK_PACKAGE_VERSION_MAJOR ${YUV_VER_MAJOR} )
+SET ( CPACK_PACKAGE_VERSION_MINOR ${YUV_VER_MINOR} )
+SET ( CPACK_PACKAGE_VERSION_PATCH ${YUV_VER_PATCH} )
+SET ( CPACK_RESOURCE_FILE_LICENSE ${PROJECT_SOURCE_DIR}/LICENSE )
+SET ( CPACK_SYSTEM_NAME "linux-${YUV_SYSTEM_NAME}" )
+SET ( CPACK_PACKAGE_NAME "libyuv" )
+SET ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "YUV library" )
+SET ( CPACK_PACKAGE_DESCRIPTION "YUV library and YUV conversion tool" )
+SET ( CPACK_DEBIAN_PACKAGE_SECTION "other" )
+SET ( CPACK_DEBIAN_PACKAGE_PRIORITY "optional" )
+SET ( CPACK_DEBIAN_PACKAGE_MAINTAINER "Frank Barchard <fbarchard@chromium.org>" )
+SET ( CPACK_GENERATOR "DEB;RPM" )
+
+# create the .deb and .rpm files (you'll need build-essential and rpm tools)
+INCLUDE( CPack )
+
diff --git a/files/CMakeLists.txt b/files/CMakeLists.txt
index 718b47ad..7c95487f 100644
--- a/files/CMakeLists.txt
+++ b/files/CMakeLists.txt
@@ -1,110 +1,45 @@
-cmake_minimum_required(VERSION 2.8)
-
# CMakeLists for libyuv
# Originally created for "roxlu build system" to compile libyuv on windows
# Run with -DTEST=ON to build unit tests
-option(TEST "Built unit tests" OFF)
-
-set(ly_base_dir ${CMAKE_CURRENT_LIST_DIR})
-set(ly_src_dir ${ly_base_dir}/source/)
-set(ly_inc_dir ${ly_base_dir}/include)
-set(ly_lib_name "yuv")
-
-set(ly_source_files
- ${ly_src_dir}/compare.cc
- ${ly_src_dir}/compare_common.cc
- ${ly_src_dir}/compare_neon.cc
- ${ly_src_dir}/compare_neon64.cc
- ${ly_src_dir}/compare_gcc.cc
- ${ly_src_dir}/compare_win.cc
- ${ly_src_dir}/convert.cc
- ${ly_src_dir}/convert_argb.cc
- ${ly_src_dir}/convert_from.cc
- ${ly_src_dir}/convert_from_argb.cc
- ${ly_src_dir}/convert_jpeg.cc
- ${ly_src_dir}/convert_to_argb.cc
- ${ly_src_dir}/convert_to_i420.cc
- ${ly_src_dir}/cpu_id.cc
- ${ly_src_dir}/mjpeg_decoder.cc
- ${ly_src_dir}/mjpeg_validate.cc
- ${ly_src_dir}/planar_functions.cc
- ${ly_src_dir}/rotate.cc
- ${ly_src_dir}/rotate_any.cc
- ${ly_src_dir}/rotate_argb.cc
- ${ly_src_dir}/rotate_common.cc
- ${ly_src_dir}/rotate_mips.cc
- ${ly_src_dir}/rotate_neon.cc
- ${ly_src_dir}/rotate_neon64.cc
- ${ly_src_dir}/rotate_gcc.cc
- ${ly_src_dir}/rotate_win.cc
- ${ly_src_dir}/row_any.cc
- ${ly_src_dir}/row_common.cc
- ${ly_src_dir}/row_mips.cc
- ${ly_src_dir}/row_neon.cc
- ${ly_src_dir}/row_neon64.cc
- ${ly_src_dir}/row_gcc.cc
- ${ly_src_dir}/row_win.cc
- ${ly_src_dir}/scale.cc
- ${ly_src_dir}/scale_any.cc
- ${ly_src_dir}/scale_argb.cc
- ${ly_src_dir}/scale_common.cc
- ${ly_src_dir}/scale_mips.cc
- ${ly_src_dir}/scale_neon.cc
- ${ly_src_dir}/scale_neon64.cc
- ${ly_src_dir}/scale_gcc.cc
- ${ly_src_dir}/scale_win.cc
- ${ly_src_dir}/video_common.cc
-)
-
-set(ly_unittest_sources
- ${ly_base_dir}/unit_test/basictypes_test.cc
- ${ly_base_dir}/unit_test/color_test.cc
- ${ly_base_dir}/unit_test/compare_test.cc
- ${ly_base_dir}/unit_test/convert_test.cc
- ${ly_base_dir}/unit_test/cpu_test.cc
- ${ly_base_dir}/unit_test/math_test.cc
- ${ly_base_dir}/unit_test/planar_test.cc
- ${ly_base_dir}/unit_test/rotate_argb_test.cc
- ${ly_base_dir}/unit_test/rotate_test.cc
- ${ly_base_dir}/unit_test/scale_argb_test.cc
- ${ly_base_dir}/unit_test/scale_test.cc
- ${ly_base_dir}/unit_test/unit_test.cc
- ${ly_base_dir}/unit_test/video_common_test.cc
-)
-
-set(ly_header_files
- ${ly_inc_dir}/libyuv/basic_types.h
- ${ly_inc_dir}/libyuv/compare.h
- ${ly_inc_dir}/libyuv/convert.h
- ${ly_inc_dir}/libyuv/convert_argb.h
- ${ly_inc_dir}/libyuv/convert_from.h
- ${ly_inc_dir}/libyuv/convert_from_argb.h
- ${ly_inc_dir}/libyuv/cpu_id.h
- ${ly_inc_dir}/libyuv/planar_functions.h
- ${ly_inc_dir}/libyuv/rotate.h
- ${ly_inc_dir}/libyuv/rotate_argb.h
- ${ly_inc_dir}/libyuv/rotate_row.h
- ${ly_inc_dir}/libyuv/row.h
- ${ly_inc_dir}/libyuv/scale.h
- ${ly_inc_dir}/libyuv/scale_argb.h
- ${ly_inc_dir}/libyuv/scale_row.h
- ${ly_inc_dir}/libyuv/version.h
- ${ly_inc_dir}/libyuv/video_common.h
- ${ly_inc_dir}/libyuv/mjpeg_decoder.h
-)
-
-include_directories(${ly_inc_dir})
-
-add_library(${ly_lib_name} STATIC ${ly_source_files})
-
-add_executable(convert ${ly_base_dir}/util/convert.cc)
-target_link_libraries(convert ${ly_lib_name})
-
-include(FindJPEG)
+
+PROJECT ( YUV C CXX ) # "C" is required even for C++ projects
+CMAKE_MINIMUM_REQUIRED( VERSION 2.8 )
+OPTION( TEST "Built unit tests" OFF )
+
+SET ( ly_base_dir ${PROJECT_SOURCE_DIR} )
+SET ( ly_src_dir ${ly_base_dir}/source )
+SET ( ly_inc_dir ${ly_base_dir}/include )
+SET ( ly_tst_dir ${ly_base_dir}/unit_test )
+SET ( ly_lib_name yuv )
+SET ( ly_lib_static ${ly_lib_name} )
+SET ( ly_lib_shared ${ly_lib_name}_shared )
+
+FILE ( GLOB_RECURSE ly_source_files ${ly_src_dir}/*.cc )
+LIST ( SORT ly_source_files )
+
+FILE ( GLOB_RECURSE ly_unittest_sources ${ly_tst_dir}/*.cc )
+LIST ( SORT ly_unittest_sources )
+
+INCLUDE_DIRECTORIES( BEFORE ${ly_inc_dir} )
+
+# this creates the static library (.a)
+ADD_LIBRARY ( ${ly_lib_static} STATIC ${ly_source_files} )
+
+# this creates the shared library (.so)
+ADD_LIBRARY ( ${ly_lib_shared} SHARED ${ly_source_files} )
+SET_TARGET_PROPERTIES ( ${ly_lib_shared} PROPERTIES OUTPUT_NAME "${ly_lib_name}" )
+SET_TARGET_PROPERTIES ( ${ly_lib_shared} PROPERTIES PREFIX "lib" )
+
+# this creates the conversion tool
+ADD_EXECUTABLE ( convert ${ly_base_dir}/util/convert.cc )
+TARGET_LINK_LIBRARIES ( convert ${ly_lib_static} )
+
+
+INCLUDE ( FindJPEG )
if (JPEG_FOUND)
- include_directories(${JPEG_INCLUDE_DIR})
- target_link_libraries(convert ${JPEG_LIBRARY})
- add_definitions(-DHAVE_JPEG)
+ include_directories( ${JPEG_INCLUDE_DIR} )
+ target_link_libraries( convert ${JPEG_LIBRARY} )
+ add_definitions( -DHAVE_JPEG )
endif()
if(TEST)
@@ -128,15 +63,21 @@ if(TEST)
if (JPEG_FOUND)
target_link_libraries(libyuv_unittest ${JPEG_LIBRARY})
endif()
-
+
if(NACL AND NACL_LIBC STREQUAL "newlib")
target_link_libraries(libyuv_unittest glibc-compat)
endif()
target_link_libraries(libyuv_unittest gflags)
-
endif()
-install(TARGETS ${ly_lib_name} DESTINATION lib)
-install(FILES ${ly_header_files} DESTINATION include/libyuv)
-install(FILES ${ly_inc_dir}/libyuv.h DESTINATION include/)
+
+# install the conversion tool, .so, .a, and all the header files
+INSTALL ( PROGRAMS ${CMAKE_BINARY_DIR}/convert DESTINATION bin RENAME yuvconvert )
+INSTALL ( TARGETS ${ly_lib_static} DESTINATION lib )
+INSTALL ( TARGETS ${ly_lib_shared} LIBRARY DESTINATION lib )
+INSTALL ( DIRECTORY ${PROJECT_SOURCE_DIR}/include/ DESTINATION include )
+
+# create the .deb and .rpm packages using cpack
+INCLUDE ( CM_linux_packages.cmake )
+
diff --git a/files/DEPS b/files/DEPS
index 0a450050..10e529c9 100644
--- a/files/DEPS
+++ b/files/DEPS
@@ -2,41 +2,452 @@ vars = {
# Override root_dir in your .gclient's custom_vars to specify a custom root
# folder name.
'root_dir': 'libyuv',
- 'extra_gyp_flag': '-Dextra_gyp_flag=0',
'chromium_git': 'https://chromium.googlesource.com',
-
- # Roll the Chromium Git hash to pick up newer versions of all the
- # dependencies and tools linked to in setup_links.py.
- 'chromium_revision': '2a818f54130d8c93f81490adce5a1e87307bf5f0',
+ 'chromium_revision': '222a3fe7a738486a887bb53cffb8e3b52376f609',
+ 'swarming_revision': 'ebc8dab6f8b8d79ec221c94de39a921145abd404',
+ # Three lines of non-changing comments so that
+ # the commit queue can handle CLs rolling lss
+ # and whatever else without interference from each other.
+ 'lss_revision': '3f6478ac95edf86cd3da300c2c0d34a438f5dbeb',
+ # Three lines of non-changing comments so that
+ # the commit queue can handle CLs rolling catapult
+ # and whatever else without interference from each other.
+ 'catapult_revision': '4ee31ea3b497ffe08391e88a5434e0a340e48342',
}
-# NOTE: Prefer revision numbers to tags for svn deps. Use http rather than
-# https; the latter can cause problems for users behind proxies.
deps = {
+ Var('root_dir') + '/build':
+ Var('chromium_git') + '/chromium/src/build' + '@' + '47e07d6798693fd71c02e25097c97865b5271c40',
+ Var('root_dir') + '/buildtools':
+ Var('chromium_git') + '/chromium/buildtools.git' + '@' + 'a7cc7a3e21a061975b33dcdcd81a9716ba614c3c',
+ Var('root_dir') + '/testing':
+ Var('chromium_git') + '/chromium/src/testing' + '@' + '178a302b13e943c679f3bbeb0a7e511f7c318404',
+ Var('root_dir') + '/testing/gtest':
+ Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + '6f8a66431cb592dad629028a50b3dd418a408c87',
+ Var('root_dir') + '/testing/gmock':
+ Var('chromium_git') + '/external/googlemock.git' + '@' + '0421b6f358139f02e102c9c332ce19a33faf75be', # from svn revision 566
+ Var('root_dir') + '/third_party':
+ Var('chromium_git') + '/chromium/src/third_party' + '@' + '4f196478f68c139a5deec388fd1f426a9251b4b0',
+ Var('root_dir') + '/third_party/catapult':
+ Var('chromium_git') + '/external/github.com/catapult-project/catapult.git' + '@' + Var('catapult_revision'),
+ Var('root_dir') + '/third_party/colorama/src':
+ Var('chromium_git') + '/external/colorama.git' + '@' + '799604a1041e9b3bc5d2789ecbd7e8db2e18e6b8',
+ Var('root_dir') + '/third_party/libjpeg_turbo':
+ Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + '7260e4d8b8e1e40b17f03fafdf1cd83296900f76',
+ Var('root_dir') + '/third_party/yasm/source/patched-yasm':
+ Var('chromium_git') + '/chromium/deps/yasm/patched-yasm.git' + '@' + '7da28c6c7c6a1387217352ce02b31754deb54d2a',
+ Var('root_dir') + '/tools':
+ Var('chromium_git') + '/chromium/src/tools' + '@' + '54fd165044db88eca930ab9d20a6340b76136d91',
+ Var('root_dir') + '/tools/gyp':
+ Var('chromium_git') + '/external/gyp.git' + '@' + 'e7079f0e0e14108ab0dba58728ff219637458563',
+ Var('root_dir') + '/tools/swarming_client':
+ Var('chromium_git') + '/external/swarming.client.git' + '@' + Var('swarming_revision'),
+
+ # libyuv-only dependencies (not present in Chromium).
+ Var('root_dir') + '/third_party/gflags':
+ Var('chromium_git') + '/external/webrtc/deps/third_party/gflags' + '@' + '892576179b45861b53e04a112996a738309cf364',
Var('root_dir') + '/third_party/gflags/src':
- Var('chromium_git') + '/external/github.com/gflags/gflags@03bebcb065c83beff83d50ae025a55a4bf94dfca',
+ Var('chromium_git') + '/external/github.com/gflags/gflags' + '@' + '03bebcb065c83beff83d50ae025a55a4bf94dfca',
+ Var('root_dir') + '/third_party/gtest-parallel':
+ Var('chromium_git') + '/external/webrtc/deps/third_party/gtest-parallel' + '@' + '8768563f5c580f8fc416a13c35c8f23b8a602821',
+}
+
+deps_os = {
+ 'android': {
+ Var('root_dir') + '/base':
+ Var('chromium_git') + '/chromium/src/base' + '@' + 'b9d4d9b0e5373bbdb5403c68d51e7385d78a09d0',
+ Var('root_dir') + '/third_party/android_tools':
+ Var('chromium_git') + '/android_tools.git' + '@' + 'b43a6a289a7588b1769814f04dd6c7d7176974cc',
+ Var('root_dir') + '/third_party/ced/src':
+ Var('chromium_git') + '/external/github.com/google/compact_enc_det.git' + '@' + '368a9cc09ad868a3d28f0b5ad4a733f263c46409',
+ Var('root_dir') + '/third_party/icu':
+ Var('chromium_git') + '/chromium/deps/icu.git' + '@' + '9cd2828740572ba6f694b9365236a8356fd06147',
+ Var('root_dir') + '/third_party/jsr-305/src':
+ Var('chromium_git') + '/external/jsr-305.git' + '@' + '642c508235471f7220af6d5df2d3210e3bfc0919',
+ Var('root_dir') + '/third_party/junit/src':
+ Var('chromium_git') + '/external/junit.git' + '@' + '64155f8a9babcfcf4263cf4d08253a1556e75481',
+ Var('root_dir') + '/third_party/lss':
+ Var('chromium_git') + '/linux-syscall-support.git' + '@' + Var('lss_revision'),
+ Var('root_dir') + '/third_party/mockito/src':
+ Var('chromium_git') + '/external/mockito/mockito.git' + '@' + 'de83ad4598ad4cf5ea53c69a8a8053780b04b850',
+ Var('root_dir') + '/third_party/requests/src':
+ Var('chromium_git') + '/external/github.com/kennethreitz/requests.git' + '@' + 'f172b30356d821d180fa4ecfa3e71c7274a32de4',
+ Var('root_dir') + '/third_party/robolectric/robolectric':
+ Var('chromium_git') + '/external/robolectric.git' + '@' + 'e38b49a12fdfa17a94f0382cc8ffaf69132fd09b',
+ },
+ 'ios': {
+ Var('root_dir') + '/ios':
+ Var('chromium_git') + '/chromium/src/ios' + '@' + '291daef6af7764f8475089c65808d52ee50b496e',
+ },
+ 'unix': {
+ Var('root_dir') + '/third_party/lss':
+ Var('chromium_git') + '/linux-syscall-support.git' + '@' + Var('lss_revision'),
+ },
+ 'win': {
+ # Dependencies used by libjpeg-turbo
+ Var('root_dir') + '/third_party/yasm/binaries':
+ Var('chromium_git') + '/chromium/deps/yasm/binaries.git' + '@' + '52f9b3f4b0aa06da24ef8b123058bb61ee468881',
+ },
}
# Define rules for which include paths are allowed in our source.
include_rules = [ '+gflags' ]
+pre_deps_hooks = [
+ {
+ # Remove any symlinks from before 177567c518b121731e507e9b9c4049c4dc96e4c8.
+ # TODO(kjellander): Remove this in March 2017.
+ 'name': 'cleanup_links',
+ 'pattern': '.',
+ 'action': ['python', Var('root_dir') + '/cleanup_links.py'],
+ },
+]
+
hooks = [
{
- # Clone chromium and its deps.
- 'name': 'sync chromium',
+ # This clobbers when necessary (based on get_landmines.py). It should be
+ # an early hook but it will need to be run after syncing Chromium and
+ # setting up the links, so the script actually exists.
+ 'name': 'landmines',
+ 'pattern': '.',
+ 'action': [
+ 'python',
+ Var('root_dir') + '/build/landmines.py',
+ '--landmine-scripts',
+ Var('root_dir') + '/tools_libyuv/get_landmines.py',
+ '--src-dir',
+ Var('root_dir') + '',
+ ],
+ },
+ # Android dependencies. Many are downloaded using Google Storage these days.
+ # They're copied from https://cs.chromium.org/chromium/src/DEPS for all
+ # such dependencies we share with Chromium.
+ {
+ # This downloads SDK extras and puts them in the
+ # third_party/android_tools/sdk/extras directory.
+ 'name': 'sdkextras',
+ 'pattern': '.',
+ # When adding a new sdk extras package to download, add the package
+ # directory and zip file to .gitignore in third_party/android_tools.
+ 'action': ['python',
+ Var('root_dir') + '/build/android/play_services/update.py',
+ 'download'
+ ],
+ },
+ {
+ 'name': 'intellij',
+ 'pattern': '.',
+ 'action': ['python',
+ Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-intellij',
+ '-l', 'third_party/intellij'
+ ],
+ },
+ {
+ 'name': 'javax_inject',
+ 'pattern': '.',
+ 'action': ['python',
+ Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-javax-inject',
+ '-l', 'third_party/javax_inject'
+ ],
+ },
+ {
+ 'name': 'hamcrest',
+ 'pattern': '.',
+ 'action': ['python',
+ Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-hamcrest',
+ '-l', 'third_party/hamcrest'
+ ],
+ },
+ {
+ 'name': 'guava',
+ 'pattern': '.',
+ 'action': ['python',
+ Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-guava',
+ '-l', 'third_party/guava'
+ ],
+ },
+ {
+ 'name': 'android_support_test_runner',
+ 'pattern': '.',
+ 'action': ['python',
+ Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-android-support-test-runner',
+ '-l', 'third_party/android_support_test_runner'
+ ],
+ },
+ {
+ 'name': 'byte_buddy',
+ 'pattern': '.',
+ 'action': ['python',
+ Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-byte-buddy',
+ '-l', 'third_party/byte_buddy'
+ ],
+ },
+ {
+ 'name': 'espresso',
+ 'pattern': '.',
+ 'action': ['python',
+ Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-espresso',
+ '-l', 'third_party/espresso'
+ ],
+ },
+ {
+ 'name': 'robolectric_libs',
+ 'pattern': '.',
+ 'action': ['python',
+ Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-robolectric',
+ '-l', 'third_party/robolectric'
+ ],
+ },
+ {
+ 'name': 'apache_velocity',
+ 'pattern': '.',
+ 'action': ['python',
+ Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-apache-velocity',
+ '-l', 'third_party/apache_velocity'
+ ],
+ },
+ {
+ 'name': 'ow2_asm',
+ 'pattern': '.',
+ 'action': ['python',
+ Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-ow2-asm',
+ '-l', 'third_party/ow2_asm'
+ ],
+ },
+ {
+ 'name': 'icu4j',
+ 'pattern': '.',
+ 'action': ['python',
+ Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-icu4j',
+ '-l', 'third_party/icu4j'
+ ],
+ },
+ {
+ 'name': 'accessibility_test_framework',
'pattern': '.',
- 'action': ['python', '-u', Var('root_dir') + '/sync_chromium.py',
- '--target-revision', Var('chromium_revision')],
+ 'action': ['python',
+ Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-accessibility-test-framework',
+ '-l', 'third_party/accessibility_test_framework'
+ ],
},
{
- # Create links to shared dependencies in Chromium.
- 'name': 'setup_links',
+ 'name': 'bouncycastle',
'pattern': '.',
- 'action': ['python', Var('root_dir') + '/setup_links.py'],
+ 'action': ['python',
+ Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-bouncycastle',
+ '-l', 'third_party/bouncycastle'
+ ],
},
{
- # A change to a .gyp, .gypi, or to GYP itself should run the generator.
+ 'name': 'sqlite4java',
'pattern': '.',
- 'action': ['python', Var('root_dir') + '/gyp_libyuv'],
+ 'action': ['python',
+ Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-sqlite4java',
+ '-l', 'third_party/sqlite4java'
+ ],
},
+ {
+ 'name': 'objenesis',
+ 'pattern': '.',
+ 'action': ['python',
+ Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py',
+ 'download',
+ '-b', 'chromium-objenesis',
+ '-l', 'third_party/objenesis'
+ ],
+ },
+ {
+ # Downloads the current stable linux sysroot to build/linux/ if needed.
+ # This sysroot updates at about the same rate that the chrome build deps
+ # change. This script is a no-op except for linux users who are doing
+ # official chrome builds or cross compiling.
+ 'name': 'sysroot',
+ 'pattern': '.',
+ 'action': ['python', Var('root_dir') + '/build/linux/sysroot_scripts/install-sysroot.py',
+ '--running-as-hook'],
+ },
+ {
+ # Update the Windows toolchain if necessary.
+ 'name': 'win_toolchain',
+ 'pattern': '.',
+ 'action': ['python', Var('root_dir') + '/build/vs_toolchain.py', 'update'],
+ },
+ # Pull binutils for linux, enabled debug fission for faster linking /
+ # debugging when used with clang on Ubuntu Precise.
+ # https://code.google.com/p/chromium/issues/detail?id=352046
+ {
+ 'name': 'binutils',
+ 'pattern': Var('root_dir') + '/third_party/binutils',
+ 'action': [
+ 'python',
+ Var('root_dir') + '/third_party/binutils/download.py',
+ ],
+ },
+ {
+ # Pull clang if needed or requested via GYP_DEFINES.
+ # Note: On Win, this should run after win_toolchain, as it may use it.
+ 'name': 'clang',
+ 'pattern': '.',
+ 'action': ['python', Var('root_dir') + '/tools/clang/scripts/update.py', '--if-needed'],
+ },
+ {
+ # Update LASTCHANGE.
+ 'name': 'lastchange',
+ 'pattern': '.',
+ 'action': ['python', Var('root_dir') + '/build/util/lastchange.py',
+ '-o', Var('root_dir') + '/build/util/LASTCHANGE'],
+ },
+ # Pull GN binaries.
+ {
+ 'name': 'gn_win',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=win32',
+ '--no_auth',
+ '--bucket', 'chromium-gn',
+ '-s', Var('root_dir') + '/buildtools/win/gn.exe.sha1',
+ ],
+ },
+ {
+ 'name': 'gn_mac',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=darwin',
+ '--no_auth',
+ '--bucket', 'chromium-gn',
+ '-s', Var('root_dir') + '/buildtools/mac/gn.sha1',
+ ],
+ },
+ {
+ 'name': 'gn_linux64',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=linux*',
+ '--no_auth',
+ '--bucket', 'chromium-gn',
+ '-s', Var('root_dir') + '/buildtools/linux64/gn.sha1',
+ ],
+ },
+ # Pull clang-format binaries using checked-in hashes.
+ {
+ 'name': 'clang_format_win',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=win32',
+ '--no_auth',
+ '--bucket', 'chromium-clang-format',
+ '-s', Var('root_dir') + '/buildtools/win/clang-format.exe.sha1',
+ ],
+ },
+ {
+ 'name': 'clang_format_mac',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=darwin',
+ '--no_auth',
+ '--bucket', 'chromium-clang-format',
+ '-s', Var('root_dir') + '/buildtools/mac/clang-format.sha1',
+ ],
+ },
+ {
+ 'name': 'clang_format_linux',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=linux*',
+ '--no_auth',
+ '--bucket', 'chromium-clang-format',
+ '-s', Var('root_dir') + '/buildtools/linux64/clang-format.sha1',
+ ],
+ },
+ # Pull luci-go binaries (isolate, swarming) using checked-in hashes.
+ {
+ 'name': 'luci-go_win',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=win32',
+ '--no_auth',
+ '--bucket', 'chromium-luci',
+ '-d', Var('root_dir') + '/tools/luci-go/win64',
+ ],
+ },
+ {
+ 'name': 'luci-go_mac',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=darwin',
+ '--no_auth',
+ '--bucket', 'chromium-luci',
+ '-d', Var('root_dir') + '/tools/luci-go/mac64',
+ ],
+ },
+ {
+ 'name': 'luci-go_linux',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=linux*',
+ '--no_auth',
+ '--bucket', 'chromium-luci',
+ '-d', Var('root_dir') + '/tools/luci-go/linux64',
+ ],
+ },
+ {
+ # Pull sanitizer-instrumented third-party libraries if requested via
+ # GYP_DEFINES.
+ # See src/third_party/instrumented_libraries/scripts/download_binaries.py.
+ # TODO(kjellander): Update comment when GYP is completely cleaned up.
+ 'name': 'instrumented_libraries',
+ 'pattern': '\\.sha1',
+ 'action': ['python', Var('root_dir') + '/third_party/instrumented_libraries/scripts/download_binaries.py'],
+ },
+ {
+ 'name': 'clang_format_merge_driver',
+ 'pattern': '.',
+ 'action': [ 'python',
+ Var('root_dir') + '/tools/clang_format_merge_driver/install_git_hook.py',
+ ],
+ },
+]
+
+recursedeps = [
+ # buildtools provides clang_format, libc++, and libc++abi.
+ Var('root_dir') + '/buildtools',
+ # android_tools manages the NDK.
+ Var('root_dir') + '/third_party/android_tools',
]
diff --git a/files/OWNERS b/files/OWNERS
index c1f7308f..e231f7b0 100644
--- a/files/OWNERS
+++ b/files/OWNERS
@@ -3,12 +3,12 @@ kjellander@google.com
# magjed@chromium.org
# torbjorng@chromium.org
-per-file *.gyp=kjellander@google.com
-per-file *.gn=kjellander@google.com
+per-file *.gyp=kjellander@chromium.org
+per-file *.gn=kjellander@chromium.org
per-file .gitignore=*
per-file AUTHORS=*
per-file DEPS=*
-per-file PRESUBMIT.py=kjellander@google.com
-per-file gyp_libyuv.py=kjellander@google.com
+per-file PRESUBMIT.py=kjellander@chromium.org
+per-file gyp_libyuv.py=kjellander@chromium.org
per-file setup_links.py=*
-per-file sync_chromium.py=kjellander@google.com
+per-file sync_chromium.py=kjellander@chromium.org
diff --git a/files/PRESUBMIT.py b/files/PRESUBMIT.py
index 58242bd9..2cf1542f 100755
--- a/files/PRESUBMIT.py
+++ b/files/PRESUBMIT.py
@@ -1,4 +1,4 @@
-# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+# Copyright 2017 The LibYuv Project Authors. All rights reserved.
#
# Use of this source code is governed by a BSD-style license
# that can be found in the LICENSE file in the root of the source
@@ -6,60 +6,67 @@
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
-import re
-import sys
+import os
-def GetDefaultTryConfigs(bots=None):
- """Returns a list of ('bot', set(['tests']), optionally filtered by [bots].
+def _RunPythonTests(input_api, output_api):
+ def join(*args):
+ return input_api.os_path.join(input_api.PresubmitLocalPath(), *args)
- For WebRTC purposes, we always return an empty list of tests, since we want
- to run all tests by default on all our trybots.
- """
- return { 'tryserver.libyuv': dict((bot, []) for bot in bots)}
+ test_directories = [
+ root for root, _, files in os.walk(join('tools_libyuv'))
+ if any(f.endswith('_test.py') for f in files)
+ ]
+ tests = []
+ for directory in test_directories:
+ tests.extend(
+ input_api.canned_checks.GetUnitTestsInDirectory(
+ input_api,
+ output_api,
+ directory,
+ whitelist=[r'.+_test\.py$']))
+ return input_api.RunTests(tests, parallel=True)
-# pylint: disable=W0613
-def GetPreferredTryMasters(project, change):
- files = change.LocalPaths()
- bots = [
- 'win',
- 'win_rel',
- 'win_x64_rel',
- 'win_x64_gn',
- 'win_x64_gn_rel',
- 'win_clang',
- 'win_clang_rel',
- 'win_x64_clang_rel',
- 'mac',
- 'mac_rel',
- 'mac_gn',
- 'mac_gn_rel',
- 'mac_asan',
- 'ios',
- 'ios_rel',
- 'ios_arm64',
- 'ios_arm64_rel',
- 'linux',
- 'linux_rel',
- 'linux_gn',
- 'linux_gn_rel',
- 'linux_memcheck',
- 'linux_tsan2',
- 'linux_asan',
- 'linux_msan',
- 'linux_ubsan',
- 'linux_ubsan_vptr',
- 'android',
- 'android_rel',
- 'android_clang',
- 'android_arm64',
- 'android_mips',
- 'android_x64',
- 'android_x86',
- 'android_gn',
- 'android_gn_rel',
- ]
- if not files or all(re.search(r'[\\/]OWNERS$', f) for f in files):
- return {}
- return GetDefaultTryConfigs(bots)
+
+def _CommonChecks(input_api, output_api):
+ """Checks common to both upload and commit."""
+ results = []
+ results.extend(input_api.canned_checks.RunPylint(input_api, output_api,
+ black_list=(r'^base[\\\/].*\.py$',
+ r'^build[\\\/].*\.py$',
+ r'^buildtools[\\\/].*\.py$',
+ r'^ios[\\\/].*\.py$',
+ r'^out.*[\\\/].*\.py$',
+ r'^testing[\\\/].*\.py$',
+ r'^third_party[\\\/].*\.py$',
+ r'^tools[\\\/].*\.py$',
+ # TODO(kjellander): should arguably be checked.
+ r'^tools_libyuv[\\\/]valgrind[\\\/].*\.py$',
+ r'^xcodebuild.*[\\\/].*\.py$',),
+ disabled_warnings=['F0401', # Failed to import x
+ 'E0611', # No package y in x
+ 'W0232', # Class has no __init__ method
+ ],
+ pylintrc='pylintrc'))
+ results.extend(_RunPythonTests(input_api, output_api))
+ return results
+
+
+def CheckChangeOnUpload(input_api, output_api):
+ results = []
+ results.extend(_CommonChecks(input_api, output_api))
+ results.extend(
+ input_api.canned_checks.CheckGNFormatted(input_api, output_api))
+ return results
+
+
+def CheckChangeOnCommit(input_api, output_api):
+ results = []
+ results.extend(_CommonChecks(input_api, output_api))
+ results.extend(input_api.canned_checks.CheckOwners(input_api, output_api))
+ results.extend(input_api.canned_checks.CheckChangeWasUploaded(
+ input_api, output_api))
+ results.extend(input_api.canned_checks.CheckChangeHasDescription(
+ input_api, output_api))
+ return results
diff --git a/files/README.chromium b/files/README.chromium
index 251f8676..b502436f 100644
--- a/files/README.chromium
+++ b/files/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1602
+Version: 1645
License: BSD
License File: LICENSE
diff --git a/files/build_overrides/build.gni b/files/build_overrides/build.gni
index 6d3aa1eb..0a6affbf 100644
--- a/files/build_overrides/build.gni
+++ b/files/build_overrides/build.gni
@@ -13,3 +13,34 @@
# remove this when Chromium drops 10.6 support and also requires 10.7.
mac_sdk_min_build_override = "10.11"
mac_deployment_target_build_override = "10.7"
+
+# Some non-Chromium builds don't use Chromium's third_party/binutils.
+linux_use_bundled_binutils_override = true
+
+# Variable that can be used to support multiple build scenarios, like having
+# Chromium specific targets in a client project's GN file etc.
+build_with_chromium = false
+
+# Some non-Chromium builds don't support building java targets.
+enable_java_templates = true
+
+# Allow using custom suppressions files (currently not used by libyuv).
+asan_suppressions_file = "//build/sanitizers/asan_suppressions.cc"
+lsan_suppressions_file = "//build/sanitizers/lsan_suppressions.cc"
+tsan_suppressions_file = "//build/sanitizers/tsan_suppressions.cc"
+
+msan_blacklist_path =
+ rebase_path("//tools_libyuv/msan/blacklist.txt", root_build_dir)
+ubsan_blacklist_path =
+ rebase_path("//tools_libyuv/ubsan/blacklist.txt", root_build_dir)
+ubsan_vptr_blacklist_path =
+ rebase_path("//tools_libyuv/ubsan/vptr_blacklist.txt", root_build_dir)
+
+# For Chromium, Android 32-bit non-component, non-clang builds hit a 4GiB size
+# limit, making them require symbol_level=2. WebRTC doesn't hit that problem
+# so we just ignore that assert. See https://crbug.com/648948 for more info.
+ignore_elf32_limitations = true
+
+# Use system Xcode installation instead of the Chromium bundled Mac toolchain,
+# since it contains only SDK 10.11, not 10.12 which WebRTC needs.
+use_system_xcode = true
diff --git a/files/build_overrides/gtest.gni b/files/build_overrides/gtest.gni
new file mode 100644
index 00000000..d3c3f68c
--- /dev/null
+++ b/files/build_overrides/gtest.gni
@@ -0,0 +1,19 @@
+# Copyright (c) 2016 The LibYuv project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# Include support for registering main function in multi-process tests.
+gtest_include_multiprocess = true
+
+# Include support for platform-specific operations across unit tests.
+gtest_include_platform_test = true
+
+# Exclude support for testing Objective C code on OS X and iOS.
+gtest_include_objc_support = true
+
+# Exclude support for flushing coverage files on iOS.
+gtest_include_ios_coverage = true
diff --git a/files/cleanup_links.py b/files/cleanup_links.py
new file mode 100755
index 00000000..ba290789
--- /dev/null
+++ b/files/cleanup_links.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+# Copyright 2017 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# This is a copy of the file from WebRTC in:
+# https://chromium.googlesource.com/external/webrtc/+/master/cleanup_links.py
+
+"""Script to cleanup symlinks created from setup_links.py.
+
+Before 177567c518b121731e507e9b9c4049c4dc96e4c8 (#15754) we had a Chromium
+checkout which we created symlinks into. In order to do clean syncs after
+landing that change, this script cleans up any old symlinks, avoiding annoying
+manual cleanup needed in order to complete gclient sync.
+"""
+
+import logging
+import optparse
+import os
+import shelve
+import subprocess
+import sys
+
+
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+LINKS_DB = 'links'
+
+# Version management to make future upgrades/downgrades easier to support.
+SCHEMA_VERSION = 1
+
+class WebRTCLinkSetup(object):
+ def __init__(self, links_db, dry_run=False):
+ self._dry_run = dry_run
+ self._links_db = links_db
+
+ def CleanupLinks(self):
+ logging.debug('CleanupLinks')
+ for source, link_path in self._links_db.iteritems():
+ if source == 'SCHEMA_VERSION':
+ continue
+ if os.path.islink(link_path) or sys.platform.startswith('win'):
+ # os.path.islink() always returns false on Windows
+ # See http://bugs.python.org/issue13143.
+ logging.debug('Removing link to %s at %s', source, link_path)
+ if not self._dry_run:
+ if os.path.exists(link_path):
+ if sys.platform.startswith('win') and os.path.isdir(link_path):
+ subprocess.check_call(['rmdir', '/q', '/s', link_path],
+ shell=True)
+ else:
+ os.remove(link_path)
+ del self._links_db[source]
+
+
+def _initialize_database(filename):
+ links_database = shelve.open(filename)
+ # Wipe the database if this version of the script ends up looking at a
+ # newer (future) version of the links db, just to be sure.
+ version = links_database.get('SCHEMA_VERSION')
+ if version and version != SCHEMA_VERSION:
+ logging.info('Found database with schema version %s while this script only '
+ 'supports %s. Wiping previous database contents.', version,
+ SCHEMA_VERSION)
+ links_database.clear()
+ links_database['SCHEMA_VERSION'] = SCHEMA_VERSION
+ return links_database
+
+
+def main():
+ parser = optparse.OptionParser()
+ parser.add_option('-d', '--dry-run', action='store_true', default=False,
+ help='Print what would be done, but don\'t perform any '
+ 'operations. This will automatically set logging to '
+ 'verbose.')
+ parser.add_option('-v', '--verbose', action='store_const',
+ const=logging.DEBUG, default=logging.INFO,
+ help='Print verbose output for debugging.')
+ options, _ = parser.parse_args()
+
+ if options.dry_run:
+ options.verbose = logging.DEBUG
+ logging.basicConfig(format='%(message)s', level=options.verbose)
+
+ # Work from the root directory of the checkout.
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ os.chdir(script_dir)
+
+ # The database file gets .db appended on some platforms.
+ db_filenames = [LINKS_DB, LINKS_DB + '.db']
+ if any(os.path.isfile(f) for f in db_filenames):
+ links_database = _initialize_database(LINKS_DB)
+ try:
+ symlink_creator = WebRTCLinkSetup(links_database, options.dry_run)
+ symlink_creator.CleanupLinks()
+ finally:
+ for f in db_filenames:
+ if os.path.isfile(f):
+ os.remove(f)
+ return 0
+
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/files/codereview.settings b/files/codereview.settings
index 9b538069..00ba1d37 100644
--- a/files/codereview.settings
+++ b/files/codereview.settings
@@ -1,12 +1,6 @@
-# This file is used by gcl to get repository specific information.
+# This file is used by git cl to get repository specific information.
CODE_REVIEW_SERVER: codereview.chromium.org
-#CC_LIST:
-VIEW_VC: https://chromium.googlesource.com/libyuv/libyuv/+/
-#STATUS:
-FORCE_HTTPS_COMMIT_URL: True
+GERRIT_HOST: True
PROJECT: libyuv
TRY_ON_UPLOAD: False
-TRYSERVER_ROOT: src
-TRYSERVER_SVN_URL: svn://svn.chromium.org/chrome-try/try-libyuv
-#GITCL_PREUPLOAD:
-#GITCL_PREDCOMMIT:
+VIEW_VC: https://chromium.googlesource.com/libyuv/libyuv/+/
diff --git a/files/docs/deprecated_builds.md b/files/docs/deprecated_builds.md
new file mode 100644
index 00000000..f623e50c
--- /dev/null
+++ b/files/docs/deprecated_builds.md
@@ -0,0 +1,440 @@
+# Deprecated Builds
+
+Older documentation on build configs which are no longer supported.
+
+## Pre-requisites
+
+You'll need to have depot tools installed: https://www.chromium.org/developers/how-tos/install-depot-tools
+Refer to chromium instructions for each platform for other prerequisites.
+
+## Getting the Code
+
+Create a working directory, enter it, and run:
+
+ gclient config https://chromium.googlesource.com/libyuv/libyuv
+ gclient sync
+
+
+Then you'll get a .gclient file like:
+
+ solutions = [
+ { "name" : "libyuv",
+ "url" : "https://chromium.googlesource.com/libyuv/libyuv",
+ "deps_file" : "DEPS",
+ "managed" : True,
+ "custom_deps" : {
+ },
+ "safesync_url": "",
+ },
+ ];
+
+
+For iOS add `;target_os=['ios'];` to your OSX .gclient and run `GYP_DEFINES="OS=ios" gclient sync.`
+
+Browse the Git repository: https://chromium.googlesource.com/libyuv/libyuv/+/master
+
+### Android
+For Android add `;target_os=['android'];` to your Linux .gclient
+
+
+ solutions = [
+ { "name" : "libyuv",
+ "url" : "https://chromium.googlesource.com/libyuv/libyuv",
+ "deps_file" : "DEPS",
+ "managed" : True,
+ "custom_deps" : {
+ },
+ "safesync_url": "",
+ },
+ ];
+ target_os = ["android", "unix"];
+
+Then run:
+
+ export GYP_DEFINES="OS=android"
+ gclient sync
+
+Caveat: There's an error with Google Play services updates. If you get the error "Your version of the Google Play services library is not up to date", run the following:
+
+ cd chromium/src
+ ./build/android/play_services/update.py download
+ cd ../..
+
+For Windows the gclient sync must be done from an Administrator command prompt.
+
+The sync will generate native build files for your environment using gyp (Windows: Visual Studio, OSX: XCode, Linux: make). This generation can also be forced manually: `gclient runhooks`
+
+To get just the source (not buildable):
+
+ git clone https://chromium.googlesource.com/libyuv/libyuv
+
+
+## Building the Library and Unittests
+
+### Windows
+
+ set GYP_DEFINES=target_arch=ia32
+ call python gyp_libyuv -fninja -G msvs_version=2013
+ ninja -j7 -C out\Release
+ ninja -j7 -C out\Debug
+
+ set GYP_DEFINES=target_arch=x64
+ call python gyp_libyuv -fninja -G msvs_version=2013
+ ninja -C out\Debug_x64
+ ninja -C out\Release_x64
+
+#### Building with clangcl
+ set GYP_DEFINES=clang=1 target_arch=ia32
+ call python tools\clang\scripts\update.py
+ call python gyp_libyuv -fninja libyuv_test.gyp
+ ninja -C out\Debug
+ ninja -C out\Release
+
+### OSX
+
+Clang 64 bit shown. Remove `clang=1` for GCC and change x64 to ia32 for 32 bit.
+
+ GYP_DEFINES="clang=1 target_arch=x64" ./gyp_libyuv
+ ninja -j7 -C out/Debug
+ ninja -j7 -C out/Release
+
+ GYP_DEFINES="clang=1 target_arch=ia32" ./gyp_libyuv
+ ninja -j7 -C out/Debug
+ ninja -j7 -C out/Release
+
+### iOS
+http://www.chromium.org/developers/how-tos/build-instructions-ios
+
+Add to .gclient last line: `target_os=['ios'];`
+
+armv7
+
+ GYP_DEFINES="OS=ios target_arch=armv7 target_subarch=arm32" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv
+ ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest
+ ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest
+
+arm64
+
+ GYP_DEFINES="OS=ios target_arch=arm64 target_subarch=arm64" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv
+ ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest
+ ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest
+
+both armv7 and arm64 (fat)
+
+ GYP_DEFINES="OS=ios target_arch=armv7 target_subarch=both" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv
+ ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest
+ ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest
+
+simulator
+
+ GYP_DEFINES="OS=ios target_arch=ia32 target_subarch=arm32" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_sim" ./gyp_libyuv
+ ninja -j7 -C out_sim/Debug-iphonesimulator libyuv_unittest
+ ninja -j7 -C out_sim/Release-iphonesimulator libyuv_unittest
+
+### Android
+https://code.google.com/p/chromium/wiki/AndroidBuildInstructions
+
+Add to .gclient last line: `target_os=['android'];`
+
+armv7
+
+ GYP_DEFINES="OS=android" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+ ninja -j7 -C out/Debug yuv_unittest_apk
+ ninja -j7 -C out/Release yuv_unittest_apk
+
+arm64
+
+ GYP_DEFINES="OS=android target_arch=arm64 target_subarch=arm64" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+ ninja -j7 -C out/Debug yuv_unittest_apk
+ ninja -j7 -C out/Release yuv_unittest_apk
+
+ia32
+
+ GYP_DEFINES="OS=android target_arch=ia32" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+ ninja -j7 -C out/Debug yuv_unittest_apk
+ ninja -j7 -C out/Release yuv_unittest_apk
+
+ GYP_DEFINES="OS=android target_arch=ia32 android_full_debug=1" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+ ninja -j7 -C out/Debug yuv_unittest_apk
+
+mipsel
+
+ GYP_DEFINES="OS=android target_arch=mipsel" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+ ninja -j7 -C out/Debug yuv_unittest_apk
+ ninja -j7 -C out/Release yuv_unittest_apk
+
+arm32 disassembly:
+
+ third_party/android_tools/ndk/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-objdump -d out/Release/obj/source/libyuv.row_neon.o
+
+arm64 disassembly:
+
+ third_party/android_tools/ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d out/Release/obj/source/libyuv.row_neon64.o
+
+Running tests:
+
+ build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=*
+
+Running test as benchmark:
+
+ build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1"
+
+Running test with C code:
+
+ build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=1 --libyuv_cpu_info=1"
+
+#### Building with GN
+
+ gn gen out/Release "--args=is_debug=false target_cpu=\"x86\""
+ gn gen out/Debug "--args=is_debug=true target_cpu=\"x86\""
+ ninja -C out/Release
+ ninja -C out/Debug
+
+### Building Official with GN
+
+ gn gen out/Official "--args=is_debug=false is_official_build=true is_chrome_branded=true"
+ ninja -C out/Official
+
+#### Building mips with GN
+
+mipsel
+
+ gn gen out/Default "--args=is_debug=false target_cpu=\"mipsel\" target_os = \"android\" mips_arch_variant = \"r6\" mips_use_msa = true is_component_build = true is_clang = false"
+ ninja -C out/Default
+
+mips64el
+
+ gn gen out/Default "--args=is_debug=false target_cpu=\"mips64el\" target_os = \"android\" mips_arch_variant = \"r6\" mips_use_msa = true is_component_build = true is_clang = false"
+ ninja -C out/Default
+
+### Linux
+
+ GYP_DEFINES="target_arch=x64" ./gyp_libyuv
+ ninja -j7 -C out/Debug
+ ninja -j7 -C out/Release
+
+ GYP_DEFINES="target_arch=ia32" ./gyp_libyuv
+ ninja -j7 -C out/Debug
+ ninja -j7 -C out/Release
+
+#### CentOS
+
+On 32 bit CentOS, the following workaround allows the sync to succeed:
+
+ export GYP_DEFINES="host_arch=ia32"
+ gclient sync
+
+### Windows Shared Library
+
+Modify libyuv.gyp from 'static_library' to 'shared_library', and add 'LIBYUV_BUILDING_SHARED_LIBRARY' to 'defines'.
+
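+A sketch of the libyuv.gyp edit, with other fields elided (layout assumed, not copied verbatim from the file):
+
+    'targets': [
+      {
+        'target_name': 'libyuv',
+        # changed from 'static_library'
+        'type': 'shared_library',
+        'defines': [
+          'LIBYUV_BUILDING_SHARED_LIBRARY',
+        ],
+      },
+    ]
+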
+ gclient runhooks
+
+After this command follow the building the library instructions above.
+
+If you get a compile error for atlthunk.lib on Windows, read http://www.chromium.org/developers/how-tos/build-instructions-windows
+
+
+### Build targets
+
+ ninja -C out/Debug libyuv
+ ninja -C out/Debug libyuv_unittest
+ ninja -C out/Debug compare
+ ninja -C out/Debug convert
+ ninja -C out/Debug psnr
+ ninja -C out/Debug cpuid
+
+
+## Building the Library with make
+
+### Linux
+
+ make -j7 V=1 -f linux.mk
+ make -j7 V=1 -f linux.mk clean
+ make -j7 V=1 -f linux.mk CXX=clang++
+
+## Building the Library with cmake
+
+Install cmake: http://www.cmake.org/
+
+Default debug build:
+
+ mkdir out
+ cd out
+ cmake ..
+ cmake --build .
+
+Release build/install
+
+ mkdir out
+ cd out
+ cmake -DCMAKE_INSTALL_PREFIX="/usr/lib" -DCMAKE_BUILD_TYPE="Release" ..
+ cmake --build . --config Release
+ sudo cmake --build . --target install --config Release
+
+### Windows 8 Phone
+
+Pre-requisite:
+
+* Install Visual Studio 2012 with the ARM tools in your environment.
+
+Then:
+
+ call "c:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin\x86_arm\vcvarsx86_arm.bat"
+
+or with Visual Studio 2013:
+
+ call "c:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\x86_arm\vcvarsx86_arm.bat"
+ nmake /f winarm.mk clean
+ nmake /f winarm.mk
+
+### Windows Shared Library
+
+Modify libyuv.gyp from 'static_library' to 'shared_library', and add 'LIBYUV_BUILDING_SHARED_LIBRARY' to 'defines' (see the sketch in the earlier Windows Shared Library section). Then run:
+
+ gclient runhooks
+
+After this command follow the building the library instructions above.
+
+If you get a compile error for atlthunk.lib on Windows, read http://www.chromium.org/developers/how-tos/build-instructions-windows
+
+### 64 bit Windows
+
+ set GYP_DEFINES=target_arch=x64
+ gclient runhooks V=1
+
+### ARM Linux
+
+ export GYP_DEFINES="target_arch=arm"
+ export CROSSTOOL=`<path>`/arm-none-linux-gnueabi
+ export CXX=$CROSSTOOL-g++
+ export CC=$CROSSTOOL-gcc
+ export AR=$CROSSTOOL-ar
+ export AS=$CROSSTOOL-as
+ export RANLIB=$CROSSTOOL-ranlib
+ gclient runhooks
+
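+After the hooks regenerate the build files, build as in the Linux section:
+
+    ninja -j7 -C out/Debug
+    ninja -j7 -C out/Release
+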
+## Running Unittests
+
+### Windows
+
+ out\Release\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter="*"
+
+### OSX
+
+ out/Release/libyuv_unittest --gtest_filter="*"
+
+### Linux
+
+ out/Release/libyuv_unittest --gtest_filter="*"
+
+Replace --gtest_filter="*" with a specific unittest to run; the filter may include wildcards. e.g.
+
+ out/Release/libyuv_unittest --gtest_filter=libyuvTest.I420ToARGB_Opt
+
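+A wildcard pattern selects every matching test, for example (pattern illustrative):
+
+    out/Release/libyuv_unittest --gtest_filter=*I420ToARGB*
+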
+## CPU Emulator tools
+
+### Intel SDE (Software Development Emulator)
+
+Pre-requisite: Install IntelSDE for Windows: http://software.intel.com/en-us/articles/intel-software-development-emulator
+
+Then run:
+
+    c:\intelsde\sde -hsw -- out\Release\libyuv_unittest.exe --gtest_filter=*
+
+
+## Memory tools
+
+### Running Dr Memory memcheck for Windows
+
+Pre-requisite: Install Dr Memory for Windows and add it to your path: http://www.drmemory.org/docs/page_install_windows.html
+
+ set GYP_DEFINES=build_for_tool=drmemory target_arch=ia32
+ call python gyp_libyuv -fninja -G msvs_version=2013
+ ninja -C out\Debug
+ drmemory out\Debug\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter=*
+
+### Running UBSan
+
+See Chromium instructions for sanitizers: https://www.chromium.org/developers/testing/undefinedbehaviorsanitizer
+
+Sanitizers available: TSan, MSan, ASan, UBSan, LSan
+
+ GYP_DEFINES='ubsan=1' gclient runhooks
+ ninja -C out/Release
+
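+Then run the unittests as usual:
+
+    out/Release/libyuv_unittest
+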
+### Running Valgrind memcheck
+
+Memory errors and race conditions can be found by running tests under special memory tools. [Valgrind][1] is an instrumentation framework for building dynamic analysis tools. Various tests and profilers are built upon it to find memory handling errors and memory leaks, for instance.
+
+[1]: http://valgrind.org
+
+To fetch the valgrind binaries, use a .gclient with the following custom_deps:
+
+ solutions = [
+ { "name" : "libyuv",
+ "url" : "https://chromium.googlesource.com/libyuv/libyuv",
+ "deps_file" : "DEPS",
+ "managed" : True,
+ "custom_deps" : {
+ "libyuv/chromium/src/third_party/valgrind": "https://chromium.googlesource.com/chromium/deps/valgrind/binaries",
+ },
+ "safesync_url": "",
+ },
+ ]
+
+Then run:
+
+ GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=memcheck" python gyp_libyuv
+ ninja -C out/Debug
+ valgrind out/Debug/libyuv_unittest
+
+
+For more information, see http://www.chromium.org/developers/how-tos/using-valgrind
+
+### Running Thread Sanitizer (TSan)
+
+ GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=tsan" python gyp_libyuv
+ ninja -C out/Debug
+ valgrind out/Debug/libyuv_unittest
+
+For more info, see http://www.chromium.org/developers/how-tos/using-valgrind/threadsanitizer
+
+### Running Address Sanitizer (ASan)
+
+    GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=asan" python gyp_libyuv
+    ninja -C out/Debug
+    out/Debug/libyuv_unittest
+
+For more info, see http://dev.chromium.org/developers/testing/addresssanitizer
+
+## Benchmarking
+
+The unittests can be used to benchmark.
+
+### Windows
+
+ set LIBYUV_WIDTH=1280
+ set LIBYUV_HEIGHT=720
+ set LIBYUV_REPEAT=999
+ set LIBYUV_FLAGS=-1
+ out\Release\libyuv_unittest.exe --gtest_filter=*I420ToARGB_Opt
+
+
+### Linux and Mac
+
+ LIBYUV_WIDTH=1280 LIBYUV_HEIGHT=720 LIBYUV_REPEAT=1000 out/Release/libyuv_unittest --gtest_filter=*I420ToARGB_Opt
+
+ libyuvTest.I420ToARGB_Opt (547 ms)
+
+Indicates 0.547 ms/frame (547 ms / 1000 repeats) for 1280 x 720.
+
+## Making a change
+
+ gclient sync
+ git checkout -b mycl -t origin/master
+ git pull
+ <edit files>
+ git add -u
+ git commit -m "my change"
+ git cl lint
+ git cl try
+    git cl upload -r a-reviewer@chromium.org -s
+ <once approved..>
+ git cl land
diff --git a/files/docs/formats.md b/files/docs/formats.md
index a7cfed82..cddfe027 100644
--- a/files/docs/formats.md
+++ b/files/docs/formats.md
@@ -37,20 +37,18 @@ This is how OSX formats map to libyuv
The following is extracted from video_common.h as a complete list of formats supported by libyuv.
enum FourCC {
- // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+ // 8 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
FOURCC_I420 = FOURCC('I', '4', '2', '0'),
FOURCC_I422 = FOURCC('I', '4', '2', '2'),
FOURCC_I444 = FOURCC('I', '4', '4', '4'),
- FOURCC_I411 = FOURCC('I', '4', '1', '1'),
FOURCC_I400 = FOURCC('I', '4', '0', '0'),
FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
- // 2 Secondary YUV formats: row biplanar.
+  // 1 Secondary YUV format: row biplanar.
FOURCC_M420 = FOURCC('M', '4', '2', '0'),
- FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
// 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
@@ -102,6 +100,15 @@ The following is extracted from video_common.h as a complete list of formats sup
// 1 Auxiliary compressed YUV format set aside for capturer.
FOURCC_H264 = FOURCC('H', '2', '6', '4'),
+# Planar YUV
+ The following formats contain a full size Y plane followed by 1 or 2
+ planes for UV: I420, I422, I444, I400, NV21, NV12.
+ The size (subsampling) of the UV varies.
+ I420, NV12 and NV21 are half width, half height
+ I422, NV16 and NV61 are half width, full height
+ I444, NV24 and NV42 are full width, full height
+ I400 and J400 have no chroma channel.
+
# The ARGB FOURCC
There are 4 ARGB layouts - ARGB, BGRA, ABGR and RGBA. ARGB is most common by far, used for screen formats, and windows webcam drivers.
diff --git a/files/docs/getting_started.md b/files/docs/getting_started.md
index 7cd56167..46c591b6 100644
--- a/files/docs/getting_started.md
+++ b/files/docs/getting_started.md
@@ -14,7 +14,6 @@ Create a working directory, enter it, and run:
gclient config https://chromium.googlesource.com/libyuv/libyuv
gclient sync
-
Then you'll get a .gclient file like:
solutions = [
@@ -28,7 +27,6 @@ Then you'll get a .gclient file like:
},
];
-
For iOS add `;target_os=['ios'];` to your OSX .gclient and run `GYP_DEFINES="OS=ios" gclient sync.`
Browse the Git repository: https://chromium.googlesource.com/libyuv/libyuv/+/master
@@ -36,7 +34,6 @@ Browse the Git reprository: https://chromium.googlesource.com/libyuv/libyuv/+/ma
### Android
For Android add `;target_os=['android'];` to your Linux .gclient
-
solutions = [
{ "name" : "libyuv",
"url" : "https://chromium.googlesource.com/libyuv/libyuv",
@@ -47,7 +44,7 @@ For Android add `;target_os=['android'];` to your Linux .gclient
"safesync_url": "",
},
];
- target_os = ["android", "unix"];
+ target_os = ["android", "linux"];
Then run:
@@ -55,6 +52,7 @@ Then run:
gclient sync
Caveat: There's an error with Google Play services updates. If you get the error "Your version of the Google Play services library is not up to date", run the following:
+
cd chromium/src
./build/android/play_services/update.py download
cd ../..
@@ -64,6 +62,7 @@ For Windows the gclient sync must be done from an Administrator command prompt.
The sync will generate native build files for your environment using gyp (Windows: Visual Studio, OSX: XCode, Linux: make). This generation can also be forced manually: `gclient runhooks`
To get just the source (not buildable):
+
git clone https://chromium.googlesource.com/libyuv/libyuv
@@ -71,64 +70,61 @@ To get just the source (not buildable):
### Windows
- set GYP_DEFINES=target_arch=ia32
- call python gyp_libyuv -fninja -G msvs_version=2013
- ninja -j7 -C out\Release
- ninja -j7 -C out\Debug
+ call gn gen out/Release "--args=is_debug=false target_cpu=\"x86\""
+ call gn gen out/Debug "--args=is_debug=true target_cpu=\"x86\""
+ ninja -v -C out/Release
+ ninja -v -C out/Debug
+
+ call gn gen out/Release "--args=is_debug=false target_cpu=\"x64\""
+ call gn gen out/Debug "--args=is_debug=true target_cpu=\"x64\""
+ ninja -v -C out/Release
+ ninja -v -C out/Debug
- set GYP_DEFINES=target_arch=x64
- call python gyp_libyuv -fninja -G msvs_version=2013
- ninja -C out\Debug_x64
- ninja -C out\Release_x64
+#### Building with clang-cl
-#### Building with clangcl
- set GYP_DEFINES=clang=1 target_arch=ia32 libyuv_enable_svn=1
- set LLVM_REPO_URL=svn://svn.chromium.org/llvm-project
+ set GYP_DEFINES=clang=1 target_arch=ia32
call python tools\clang\scripts\update.py
- call python gyp_libyuv -fninja libyuv_test.gyp
- ninja -C out\Debug
- ninja -C out\Release
-### OSX
+ call gn gen out/Release "--args=is_debug=false is_official_build=false is_clang=true target_cpu=\"x86\""
+ call gn gen out/Debug "--args=is_debug=true is_official_build=false is_clang=true target_cpu=\"x86\""
+ ninja -v -C out/Release
+ ninja -v -C out/Debug
-Clang 64 bit shown. Remove `clang=1` for GCC and change x64 to ia32 for 32 bit.
+ call gn gen out/Release "--args=is_debug=false is_official_build=false is_clang=true target_cpu=\"x64\""
+ call gn gen out/Debug "--args=is_debug=true is_official_build=false is_clang=true target_cpu=\"x64\""
+ ninja -v -C out/Release
+ ninja -v -C out/Debug
- GYP_DEFINES="clang=1 target_arch=x64" ./gyp_libyuv
- ninja -j7 -C out/Debug
- ninja -j7 -C out/Release
+### macOS and Linux
- GYP_DEFINES="clang=1 target_arch=ia32" ./gyp_libyuv
- ninja -j7 -C out/Debug
- ninja -j7 -C out/Release
+ gn gen out/Release "--args=is_debug=false"
+ gn gen out/Debug "--args=is_debug=true"
+ ninja -v -C out/Release
+ ninja -v -C out/Debug
+
+### Building Official with GN
+
+ gn gen out/Official "--args=is_debug=false is_official_build=true is_chrome_branded=true"
+ ninja -C out/Official
### iOS
http://www.chromium.org/developers/how-tos/build-instructions-ios
Add to .gclient last line: `target_os=['ios'];`
-armv7
-
- GYP_DEFINES="OS=ios target_arch=armv7 target_subarch=arm32" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv
- ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest
- ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest
-
arm64
- GYP_DEFINES="OS=ios target_arch=arm64 target_subarch=arm64" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv
- ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest
- ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest
-
-both armv7 and arm64 (fat)
+ gn gen out/Release "--args=is_debug=false target_os=\"ios\" ios_enable_code_signing=false target_cpu=\"arm64\""
+ gn gen out/Debug "--args=is_debug=true target_os=\"ios\" ios_enable_code_signing=false target_cpu=\"arm64\""
+ ninja -v -C out/Debug libyuv_unittest
+ ninja -v -C out/Release libyuv_unittest
- GYP_DEFINES="OS=ios target_arch=armv7 target_subarch=both" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv
- ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest
- ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest
+iOS simulator
-simulator
-
- GYP_DEFINES="OS=ios target_arch=ia32 target_subarch=arm32" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_sim" ./gyp_libyuv
- ninja -j7 -C out_sim/Debug-iphonesimulator libyuv_unittest
- ninja -j7 -C out_sim/Release-iphonesimulator libyuv_unittest
+ gn gen out/Release "--args=is_debug=false target_os=\"ios\" ios_enable_code_signing=false target_cpu=\"x86\""
+ gn gen out/Debug "--args=is_debug=true target_os=\"ios\" ios_enable_code_signing=false target_cpu=\"x86\""
+ ninja -v -C out/Debug libyuv_unittest
+ ninja -v -C out/Release libyuv_unittest
### Android
https://code.google.com/p/chromium/wiki/AndroidBuildInstructions
@@ -137,90 +133,56 @@ Add to .gclient last line: `target_os=['android'];`
armv7
- GYP_DEFINES="OS=android" GYP_CROSSCOMPILE=1 ./gyp_libyuv
- ninja -j7 -C out/Debug libyuv_unittest_apk
- ninja -j7 -C out/Release libyuv_unittest_apk
+ gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"arm\""
+ gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"arm\""
+ ninja -v -C out/Debug libyuv_unittest
+ ninja -v -C out/Release libyuv_unittest
arm64
- GYP_DEFINES="OS=android target_arch=arm64 target_subarch=arm64" GYP_CROSSCOMPILE=1 ./gyp_libyuv
- ninja -j7 -C out/Debug libyuv_unittest_apk
- ninja -j7 -C out/Release libyuv_unittest_apk
+ gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"arm64\""
+ gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"arm64\""
+ ninja -v -C out/Debug libyuv_unittest
+ ninja -v -C out/Release libyuv_unittest
ia32
- GYP_DEFINES="OS=android target_arch=ia32" GYP_CROSSCOMPILE=1 ./gyp_libyuv
- ninja -j7 -C out/Debug libyuv_unittest_apk
- ninja -j7 -C out/Release libyuv_unittest_apk
-
- GYP_DEFINES="OS=android target_arch=ia32 android_full_debug=1" GYP_CROSSCOMPILE=1 ./gyp_libyuv
- ninja -j7 -C out/Debug libyuv_unittest_apk
+ gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"x86\""
+ gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"x86\""
+ ninja -v -C out/Debug libyuv_unittest
+ ninja -v -C out/Release libyuv_unittest
mipsel
- GYP_DEFINES="OS=android target_arch=mipsel" GYP_CROSSCOMPILE=1 ./gyp_libyuv
- ninja -j7 -C out/Debug libyuv_unittest_apk
- ninja -j7 -C out/Release libyuv_unittest_apk
+ gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mipsel\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
+ gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mipsel\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
+ ninja -v -C out/Debug libyuv_unittest
+ ninja -v -C out/Release libyuv_unittest
+
+ gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
+ gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
+ ninja -v -C out/Debug libyuv_unittest
+ ninja -v -C out/Release libyuv_unittest
-arm32 disassembly:
+arm disassembly:
- third_party/android_tools/ndk/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-objdump -d out/Release/obj/source/libyuv.row_neon.o
+ third_party/android_tools/ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv/row_common.o >row_common.txt
-arm64 disassembly:
+ third_party/android_tools/ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon.o >row_neon.txt
- third_party/android_tools/ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d out/Release/obj/source/libyuv.row_neon64.o
+ third_party/android_tools/ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt
Running tests:
- util/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=*
+ build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=*
Running test as benchmark:
- util/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1"
+ build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1 --libyuv_cpu_info=-1"
Running test with C code:
- util/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=1 --libyuv_cpu_info=1"
-
-#### Building with GN
-
- gn gen out/Release "--args=is_debug=false target_cpu=\"x86\""
- gn gen out/Debug "--args=is_debug=true target_cpu=\"x86\""
- ninja -C out/Release
- ninja -C out/Debug
-
-### Building Offical with GN
-
- gn gen out/Official "--args=is_debug=false is_official_build=true is_chrome_branded=true"
- ninja -C out/Official
-
-### Linux
-
- GYP_DEFINES="target_arch=x64" ./gyp_libyuv
- ninja -j7 -C out/Debug
- ninja -j7 -C out/Release
-
- GYP_DEFINES="target_arch=ia32" ./gyp_libyuv
- ninja -j7 -C out/Debug
- ninja -j7 -C out/Release
-
-#### CentOS
-
-On CentOS 32 bit the following work around allows a sync:
-
- export GYP_DEFINES="host_arch=ia32"
- gclient sync
-
-### Windows Shared Library
-
-Modify libyuv.gyp from 'static_library' to 'shared_library', and add 'LIBYUV_BUILDING_SHARED_LIBRARY' to 'defines'.
-
- gclient runhooks
-
-After this command follow the building the library instructions above.
-
-If you get a compile error for atlthunk.lib on Windows, read http://www.chromium.org/developers/how-tos/build-instructions-windows
-
+ build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=1 --libyuv_cpu_info=1"
### Build targets
@@ -231,27 +193,33 @@ If you get a compile error for atlthunk.lib on Windows, read http://www.chromium
ninja -C out/Debug psnr
ninja -C out/Debug cpuid
+### ARM Linux
+
+ gn gen out/Release "--args=is_debug=false target_cpu=\"arm64\""
+ gn gen out/Debug "--args=is_debug=true target_cpu=\"arm64\""
+ ninja -v -C out/Debug libyuv_unittest
+ ninja -v -C out/Release libyuv_unittest
## Building the Library with make
### Linux
- make -j7 V=1 -f linux.mk
- make -j7 V=1 -f linux.mk clean
- make -j7 V=1 -f linux.mk CXX=clang++
+ make V=1 -f linux.mk
+ make V=1 -f linux.mk clean
+ make V=1 -f linux.mk CXX=clang++
-## Building the Library with cmake
+## Building the library with cmake
Install cmake: http://www.cmake.org/
-Default debug build:
+### Default debug build:
mkdir out
cd out
cmake ..
cmake --build .
-Release build/install
+### Release build/install
mkdir out
cd out
@@ -259,47 +227,31 @@ Release build/install
cmake --build . --config Release
sudo cmake --build . --target install --config Release
-### Windows 8 Phone
-
-Pre-requisite:
-
-* Install Visual Studio 2012 and Arm to your environment.<br>
-
-Then:
-
- call "c:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin\x86_arm\vcvarsx86_arm.bat"
-
-or with Visual Studio 2013:
-
- call "c:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\x86_arm\vcvarsx86_arm.bat"
- nmake /f winarm.mk clean
- nmake /f winarm.mk
-
-### Windows Shared Library
+### Build RPM/DEB packages
-Modify libyuv.gyp from 'static_library' to 'shared_library', and add 'LIBYUV_BUILDING_SHARED_LIBRARY' to 'defines'. Then run this.
-
- gclient runhooks
-
-After this command follow the building the library instructions above.
+ mkdir out
+ cd out
+ cmake -DCMAKE_BUILD_TYPE=Release ..
+ make -j4
+ make package
-If you get a compile error for atlthunk.lib on Windows, read http://www.chromium.org/developers/how-tos/build-instructions-windows
+## Setup for ARM cross compile
-### 64 bit Windows
+See also https://www.ccoderun.ca/programming/2015-12-20_CrossCompiling/index.html
- set GYP_DEFINES=target_arch=x64
- gclient runhooks V=1
+ sudo apt-get install ssh dkms build-essential linux-headers-generic
+ sudo apt-get install kdevelop cmake git subversion
+ sudo apt-get install graphviz doxygen doxygen-gui
+ sudo apt-get install manpages manpages-dev manpages-posix manpages-posix-dev
+ sudo apt-get install libboost-all-dev libboost-dev libssl-dev
+ sudo apt-get install rpm terminator fish
+ sudo apt-get install g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf
-### ARM Linux
+### Build psnr tool
- export GYP_DEFINES="target_arch=arm"
- export CROSSTOOL=`<path>`/arm-none-linux-gnueabi
- export CXX=$CROSSTOOL-g++
- export CC=$CROSSTOOL-gcc
- export AR=$CROSSTOOL-ar
- export AS=$CROSSTOOL-as
- export RANLIB=$CROSSTOOL-ranlib
- gclient runhooks
+ cd util
+ arm-linux-gnueabihf-g++ psnr_main.cc psnr.cc ssim.cc -o psnr
+ arm-linux-gnueabihf-objdump -d psnr
## Running Unittests
@@ -317,113 +269,29 @@ If you get a compile error for atlthunk.lib on Windows, read http://www.chromium
Replace --gtest_filter="*" with specific unittest to run. May include wildcards. e.g.
- out/Release/libyuv_unittest --gtest_filter=libyuvTest.I420ToARGB_Opt
+ out/Release/libyuv_unittest --gtest_filter=*I420ToARGB_Opt
## CPU Emulator tools
### Intel SDE (Software Development Emulator)
-Pre-requisite: Install IntelSDE for Windows: http://software.intel.com/en-us/articles/intel-software-development-emulator
+Pre-requisite: Install IntelSDE: http://software.intel.com/en-us/articles/intel-software-development-emulator
Then run:
- c:\intelsde\sde -hsw -- out\release\libyuv_unittest.exe --gtest_filter=*
+ c:\intelsde\sde -hsw -- out\Release\libyuv_unittest.exe --gtest_filter=*
+
+    ~/intelsde/sde -skx -- out/Release/libyuv_unittest --gtest_filter=*I420ToARGB_Opt
+## Sanitizers
-## Memory tools
+ gn gen out/Debug "--args=is_debug=true is_asan=true"
+ ninja -v -C out/Debug
+
+Sanitizers available: tsan, msan, asan, ubsan, lsan
### Running Dr Memory memcheck for Windows
Pre-requisite: Install Dr Memory for Windows and add it to your path: http://www.drmemory.org/docs/page_install_windows.html
- set GYP_DEFINES=build_for_tool=drmemory target_arch=ia32
- call python gyp_libyuv -fninja -G msvs_version=2013
- ninja -C out\Debug
drmemory out\Debug\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter=*
-
-### Running UBSan
-
-See Chromium instructions for sanitizers: https://www.chromium.org/developers/testing/undefinedbehaviorsanitizer
-
-Sanitizers available: TSan, MSan, ASan, UBSan, LSan
-
- GYP_DEFINES='ubsan=1' gclient runhooks
- ninja -C out/Release
-
-### Running Valgrind memcheck
-
-Memory errors and race conditions can be found by running tests under special memory tools. [Valgrind] [1] is an instrumentation framework for building dynamic analysis tools. Various tests and profilers are built upon it to find memory handling errors and memory leaks, for instance.
-
-[1]: http://valgrind.org
-
- solutions = [
- { "name" : "libyuv",
- "url" : "https://chromium.googlesource.com/libyuv/libyuv",
- "deps_file" : "DEPS",
- "managed" : True,
- "custom_deps" : {
- "libyuv/chromium/src/third_party/valgrind": "https://chromium.googlesource.com/chromium/deps/valgrind/binaries",
- },
- "safesync_url": "",
- },
- ]
-
-Then run:
-
- GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=memcheck" python gyp_libyuv
- ninja -C out/Debug
- valgrind out/Debug/libyuv_unittest
-
-
-For more information, see http://www.chromium.org/developers/how-tos/using-valgrind
-
-### Running Thread Sanitizer (TSan)
-
- GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=tsan" python gyp_libyuv
- ninja -C out/Debug
- valgrind out/Debug/libyuv_unittest
-
-For more info, see http://www.chromium.org/developers/how-tos/using-valgrind/threadsanitizer
-
-### Running Address Sanitizer (ASan)
-
- GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=asan" python gyp_libyuv
- ninja -C out/Debug
- valgrind out/Debug/libyuv_unittest
-
-For more info, see http://dev.chromium.org/developers/testing/addresssanitizer
-
-## Benchmarking
-
-The unittests can be used to benchmark.
-
-### Windows
-
- set LIBYUV_WIDTH=1280
- set LIBYUV_HEIGHT=720
- set LIBYUV_REPEAT=999
- set LIBYUV_FLAGS=-1
- out\Release\libyuv_unittest.exe --gtest_filter=*I420ToARGB_Opt
-
-
-### Linux and Mac
-
- LIBYUV_WIDTH=1280 LIBYUV_HEIGHT=720 LIBYUV_REPEAT=1000 out/Release/libyuv_unittest --gtest_filter=*I420ToARGB_Opt
-
- libyuvTest.I420ToARGB_Opt (547 ms)
-
-Indicates 0.547 ms/frame for 1280 x 720.
-
-## Making a change
-
- gclient sync
- git checkout -b mycl -t origin/master
- git pull
- <edit files>
- git add -u
- git commit -m "my change"
- git cl lint
- git cl try
- git cl upload -r a-reviewer@chomium.org -s
- <once approved..>
- git cl land
diff --git a/files/gyp_libyuv.py b/files/gyp_libyuv.py
index ac42038d..bb32ec39 100644
--- a/files/gyp_libyuv.py
+++ b/files/gyp_libyuv.py
@@ -9,7 +9,7 @@
# be found in the AUTHORS file in the root of the source tree.
-# This script is a modified copy of the src/build/gyp_chromium.py file.
+# This script is a modified copy of the src/build/gyp_chromium.py file.
# It is needed for parallel processing.
# This file is (possibly, depending on python version) imported by
diff --git a/files/include/libyuv.h b/files/include/libyuv.h
index de652836..aeffd5ef 100644
--- a/files/include/libyuv.h
+++ b/files/include/libyuv.h
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_H_ // NOLINT
+#ifndef INCLUDE_LIBYUV_H_
#define INCLUDE_LIBYUV_H_
#include "libyuv/basic_types.h"
@@ -29,4 +29,4 @@
#include "libyuv/version.h"
#include "libyuv/video_common.h"
-#endif // INCLUDE_LIBYUV_H_ NOLINT
+#endif // INCLUDE_LIBYUV_H_
diff --git a/files/include/libyuv/basic_types.h b/files/include/libyuv/basic_types.h
index beb750ba..7d98bb93 100644
--- a/files/include/libyuv/basic_types.h
+++ b/files/include/libyuv/basic_types.h
@@ -8,12 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ // NOLINT
+#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_
#define INCLUDE_LIBYUV_BASIC_TYPES_H_
#include <stddef.h> // for NULL, size_t
-#if defined(__ANDROID__) || (defined(_MSC_VER) && (_MSC_VER < 1600))
+#if defined(_MSC_VER) && (_MSC_VER < 1600)
#include <sys/types.h> // for uintptr_t on x86
#else
#include <stdint.h> // for uintptr_t
@@ -26,31 +26,31 @@
typedef unsigned __int64 uint64;
typedef __int64 int64;
#ifndef INT64_C
-#define INT64_C(x) x ## I64
+#define INT64_C(x) x##I64
#endif
#ifndef UINT64_C
-#define UINT64_C(x) x ## UI64
+#define UINT64_C(x) x##UI64
#endif
#define INT64_F "I64"
#else // COMPILER_MSVC
#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long uint64; // NOLINT
-typedef long int64; // NOLINT
+typedef long int64; // NOLINT
#ifndef INT64_C
-#define INT64_C(x) x ## L
+#define INT64_C(x) x##L
#endif
#ifndef UINT64_C
-#define UINT64_C(x) x ## UL
+#define UINT64_C(x) x##UL
#endif
#define INT64_F "l"
#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long long uint64; // NOLINT
-typedef long long int64; // NOLINT
+typedef long long int64; // NOLINT
#ifndef INT64_C
-#define INT64_C(x) x ## LL
+#define INT64_C(x) x##LL
#endif
#ifndef UINT64_C
-#define UINT64_C(x) x ## ULL
+#define UINT64_C(x) x##ULL
#endif
#define INT64_F "ll"
#endif // __LP64__
@@ -58,15 +58,15 @@ typedef long long int64; // NOLINT
typedef unsigned int uint32;
typedef int int32;
typedef unsigned short uint16; // NOLINT
-typedef short int16; // NOLINT
+typedef short int16; // NOLINT
typedef unsigned char uint8;
typedef signed char int8;
#endif // INT_TYPES_DEFINED
#endif // GG_LONGLONG
// Detect compiler is for x86 or x64.
-#if defined(__x86_64__) || defined(_M_X64) || \
- defined(__i386__) || defined(_M_IX86)
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+ defined(_M_IX86)
#define CPU_X86 1
#endif
// Detect compiler is for ARM.
@@ -76,12 +76,12 @@ typedef signed char int8;
#ifndef ALIGNP
#ifdef __cplusplus
-#define ALIGNP(p, t) \
- (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
- ((t) - 1)) & ~((t) - 1))))
+#define ALIGNP(p, t) \
+ reinterpret_cast<uint8*>( \
+ ((reinterpret_cast<uintptr_t>(p) + ((t)-1)) & ~((t)-1)))
#else
#define ALIGNP(p, t) \
- ((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1)))) /* NOLINT */
+ (uint8*)((((uintptr_t)(p) + ((t)-1)) & ~((t)-1))) /* NOLINT */
#endif
#endif
@@ -95,9 +95,9 @@ typedef signed char int8;
#define LIBYUV_API
#endif // LIBYUV_BUILDING_SHARED_LIBRARY
#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \
- (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \
- defined(LIBYUV_USING_SHARED_LIBRARY))
-#define LIBYUV_API __attribute__ ((visibility ("default")))
+ (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \
+ defined(LIBYUV_USING_SHARED_LIBRARY))
+#define LIBYUV_API __attribute__((visibility("default")))
#else
#define LIBYUV_API
#endif // __GNUC__
@@ -108,11 +108,10 @@ typedef signed char int8;
#define LIBYUV_TRUE 1
// Visual C x86 or GCC little endian.
-#if defined(__x86_64__) || defined(_M_X64) || \
- defined(__i386__) || defined(_M_IX86) || \
- defined(__arm__) || defined(_M_ARM) || \
- (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+ defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) || \
+ (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#define LIBYUV_LITTLE_ENDIAN
#endif
-#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ NOLINT
+#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_
diff --git a/files/include/libyuv/compare.h b/files/include/libyuv/compare.h
index 08b2bb2e..4deca97f 100644
--- a/files/include/libyuv/compare.h
+++ b/files/include/libyuv/compare.h
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_COMPARE_H_ // NOLINT
+#ifndef INCLUDE_LIBYUV_COMPARE_H_
#define INCLUDE_LIBYUV_COMPARE_H_
#include "libyuv/basic_types.h"
@@ -29,13 +29,15 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height);
// Sum Square Error - used to compute Mean Square Error or PSNR.
LIBYUV_API
-uint64 ComputeSumSquareError(const uint8* src_a,
- const uint8* src_b, int count);
+uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, int count);
LIBYUV_API
-uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
- const uint8* src_b, int stride_b,
- int width, int height);
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a,
+ int stride_a,
+ const uint8* src_b,
+ int stride_b,
+ int width,
+ int height);
static const int kMaxPsnr = 128;
@@ -43,36 +45,56 @@ LIBYUV_API
double SumSquareErrorToPsnr(uint64 sse, uint64 count);
LIBYUV_API
-double CalcFramePsnr(const uint8* src_a, int stride_a,
- const uint8* src_b, int stride_b,
- int width, int height);
+double CalcFramePsnr(const uint8* src_a,
+ int stride_a,
+ const uint8* src_b,
+ int stride_b,
+ int width,
+ int height);
LIBYUV_API
-double I420Psnr(const uint8* src_y_a, int stride_y_a,
- const uint8* src_u_a, int stride_u_a,
- const uint8* src_v_a, int stride_v_a,
- const uint8* src_y_b, int stride_y_b,
- const uint8* src_u_b, int stride_u_b,
- const uint8* src_v_b, int stride_v_b,
- int width, int height);
+double I420Psnr(const uint8* src_y_a,
+ int stride_y_a,
+ const uint8* src_u_a,
+ int stride_u_a,
+ const uint8* src_v_a,
+ int stride_v_a,
+ const uint8* src_y_b,
+ int stride_y_b,
+ const uint8* src_u_b,
+ int stride_u_b,
+ const uint8* src_v_b,
+ int stride_v_b,
+ int width,
+ int height);
LIBYUV_API
-double CalcFrameSsim(const uint8* src_a, int stride_a,
- const uint8* src_b, int stride_b,
- int width, int height);
+double CalcFrameSsim(const uint8* src_a,
+ int stride_a,
+ const uint8* src_b,
+ int stride_b,
+ int width,
+ int height);
LIBYUV_API
-double I420Ssim(const uint8* src_y_a, int stride_y_a,
- const uint8* src_u_a, int stride_u_a,
- const uint8* src_v_a, int stride_v_a,
- const uint8* src_y_b, int stride_y_b,
- const uint8* src_u_b, int stride_u_b,
- const uint8* src_v_b, int stride_v_b,
- int width, int height);
+double I420Ssim(const uint8* src_y_a,
+ int stride_y_a,
+ const uint8* src_u_a,
+ int stride_u_a,
+ const uint8* src_v_a,
+ int stride_v_a,
+ const uint8* src_y_b,
+ int stride_y_b,
+ const uint8* src_u_b,
+ int stride_u_b,
+ const uint8* src_v_b,
+ int stride_v_b,
+ int width,
+ int height);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
-#endif // INCLUDE_LIBYUV_COMPARE_H_ NOLINT
+#endif // INCLUDE_LIBYUV_COMPARE_H_
diff --git a/files/include/libyuv/compare_row.h b/files/include/libyuv/compare_row.h
index 38a957b2..7abc2d4a 100644
--- a/files/include/libyuv/compare_row.h
+++ b/files/include/libyuv/compare_row.h
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_COMPARE_ROW_H_ // NOLINT
+#ifndef INCLUDE_LIBYUV_COMPARE_ROW_H_
#define INCLUDE_LIBYUV_COMPARE_ROW_H_
#include "libyuv/basic_types.h"
@@ -30,8 +30,8 @@ extern "C" {
#endif
// Visual C 2012 required for AVX2.
-#if defined(_M_IX86) && !defined(__clang__) && \
- defined(_MSC_VER) && _MSC_VER >= 1700
+#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
+ _MSC_VER >= 1700
#define VISUALC_HAS_AVX2 1
#endif // VisualStudio >= 2012
@@ -42,8 +42,8 @@ extern "C" {
#endif // clang >= 3.4
#endif // __clang__
-#if !defined(LIBYUV_DISABLE_X86) && \
- defined(_M_IX86) && (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
#define HAS_HASHDJB2_AVX2
#endif
@@ -81,4 +81,4 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
} // namespace libyuv
#endif
-#endif // INCLUDE_LIBYUV_COMPARE_ROW_H_ NOLINT
+#endif // INCLUDE_LIBYUV_COMPARE_ROW_H_
diff --git a/files/include/libyuv/convert.h b/files/include/libyuv/convert.h
index a2cdc571..f096d193 100644
--- a/files/include/libyuv/convert.h
+++ b/files/include/libyuv/convert.h
@@ -8,13 +8,18 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_CONVERT_H_ // NOLINT
+#ifndef INCLUDE_LIBYUV_CONVERT_H_
#define INCLUDE_LIBYUV_CONVERT_H_
#include "libyuv/basic_types.h"
#include "libyuv/rotate.h" // For enum RotationMode.
+// TODO(fbarchard): fix WebRTC source to include following libyuv headers:
+#include "libyuv/convert_argb.h" // For WebRTC I420ToARGB. b/620
+#include "libyuv/convert_from.h" // For WebRTC ConvertFromI420. b/620
+#include "libyuv/planar_functions.h" // For WebRTC I420Rect, CopyPlane. b/618
+
#ifdef __cplusplus
namespace libyuv {
extern "C" {
@@ -22,184 +27,295 @@ extern "C" {
// Convert I444 to I420.
LIBYUV_API
-int I444ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int I444ToI420(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// Convert I422 to I420.
LIBYUV_API
-int I422ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-// Convert I411 to I420.
-LIBYUV_API
-int I411ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int I422ToI420(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// Copy I420 to I420.
#define I420ToI420 I420Copy
LIBYUV_API
-int I420Copy(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int I420Copy(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// Convert I400 (grey) to I420.
LIBYUV_API
-int I400ToI420(const uint8* src_y, int src_stride_y,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int I400ToI420(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
#define J400ToJ420 I400ToI420
// Convert NV12 to I420.
LIBYUV_API
-int NV12ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int NV12ToI420(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// Convert NV21 to I420.
LIBYUV_API
-int NV21ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_vu, int src_stride_vu,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int NV21ToI420(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_vu,
+ int src_stride_vu,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// Convert YUY2 to I420.
LIBYUV_API
-int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int YUY2ToI420(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// Convert UYVY to I420.
LIBYUV_API
-int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int UYVYToI420(const uint8* src_uyvy,
+ int src_stride_uyvy,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// Convert M420 to I420.
LIBYUV_API
-int M420ToI420(const uint8* src_m420, int src_stride_m420,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int M420ToI420(const uint8* src_m420,
+ int src_stride_m420,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert Android420 to I420.
+LIBYUV_API
+int Android420ToI420(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ int pixel_stride_uv,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// ARGB little endian (bgra in memory) to I420.
LIBYUV_API
-int ARGBToI420(const uint8* src_frame, int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int ARGBToI420(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// BGRA little endian (argb in memory) to I420.
LIBYUV_API
-int BGRAToI420(const uint8* src_frame, int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int BGRAToI420(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// ABGR little endian (rgba in memory) to I420.
LIBYUV_API
-int ABGRToI420(const uint8* src_frame, int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int ABGRToI420(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// RGBA little endian (abgr in memory) to I420.
LIBYUV_API
-int RGBAToI420(const uint8* src_frame, int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int RGBAToI420(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// RGB little endian (bgr in memory) to I420.
LIBYUV_API
-int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int RGB24ToI420(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// RGB big endian (rgb in memory) to I420.
LIBYUV_API
-int RAWToI420(const uint8* src_frame, int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int RAWToI420(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// RGB16 (RGBP fourcc) little endian to I420.
LIBYUV_API
-int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int RGB565ToI420(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// RGB15 (RGBO fourcc) little endian to I420.
LIBYUV_API
-int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int ARGB1555ToI420(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// RGB12 (R444 fourcc) little endian to I420.
LIBYUV_API
-int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int ARGB4444ToI420(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
#ifdef HAVE_JPEG
// src_width/height provided by capture.
// dst_width/height for clipping determine final size.
LIBYUV_API
-int MJPGToI420(const uint8* sample, size_t sample_size,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int src_width, int src_height,
- int dst_width, int dst_height);
+int MJPGToI420(const uint8* sample,
+ size_t sample_size,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height);
// Query size of MJPG in pixels.
LIBYUV_API
-int MJPGSize(const uint8* sample, size_t sample_size,
- int* width, int* height);
+int MJPGSize(const uint8* sample, size_t sample_size, int* width, int* height);
#endif
// Convert camera sample to I420 with cropping, rotation and vertical flip.
@@ -225,13 +341,20 @@ int MJPGSize(const uint8* sample, size_t sample_size,
// "format" is a fourcc. ie 'I420', 'YUY2'
// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
LIBYUV_API
-int ConvertToI420(const uint8* src_frame, size_t src_size,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int crop_x, int crop_y,
- int src_width, int src_height,
- int crop_width, int crop_height,
+int ConvertToI420(const uint8* src_frame,
+ size_t src_size,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int crop_x,
+ int crop_y,
+ int src_width,
+ int src_height,
+ int crop_width,
+ int crop_height,
enum RotationMode rotation,
uint32 format);
@@ -240,4 +363,4 @@ int ConvertToI420(const uint8* src_frame, size_t src_size,
} // namespace libyuv
#endif
-#endif // INCLUDE_LIBYUV_CONVERT_H_ NOLINT
+#endif // INCLUDE_LIBYUV_CONVERT_H_
diff --git a/files/include/libyuv/convert_argb.h b/files/include/libyuv/convert_argb.h
index 996f4768..f43a5060 100644
--- a/files/include/libyuv/convert_argb.h
+++ b/files/include/libyuv/convert_argb.h
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ // NOLINT
+#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_
#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
#include "libyuv/basic_types.h"
@@ -30,245 +30,385 @@ extern "C" {
// Copy ARGB to ARGB.
LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int ARGBCopy(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Convert I420 to ARGB.
LIBYUV_API
-int I420ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int I420ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Duplicate prototype for function in convert_from.h for remoting.
+LIBYUV_API
+int I420ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Convert I422 to ARGB.
LIBYUV_API
-int I422ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int I422ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Convert I444 to ARGB.
LIBYUV_API
-int I444ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int I444ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Convert J444 to ARGB.
LIBYUV_API
-int J444ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int J444ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Convert I444 to ABGR.
LIBYUV_API
-int I444ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height);
-
-// Convert I411 to ARGB.
-LIBYUV_API
-int I411ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int I444ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
// Convert I420 with Alpha to preattenuated ARGB.
LIBYUV_API
-int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- const uint8* src_a, int src_stride_a,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height, int attenuate);
+int I420AlphaToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ const uint8* src_a,
+ int src_stride_a,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int attenuate);
// Convert I420 with Alpha to preattenuated ABGR.
LIBYUV_API
-int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- const uint8* src_a, int src_stride_a,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height, int attenuate);
+int I420AlphaToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ const uint8* src_a,
+ int src_stride_a,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height,
+ int attenuate);
// Convert I400 (grey) to ARGB. Reverse of ARGBToI400.
LIBYUV_API
-int I400ToARGB(const uint8* src_y, int src_stride_y,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int I400ToARGB(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Convert J400 (jpeg grey) to ARGB.
LIBYUV_API
-int J400ToARGB(const uint8* src_y, int src_stride_y,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int J400ToARGB(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Alias.
#define YToARGB I400ToARGB
// Convert NV12 to ARGB.
LIBYUV_API
-int NV12ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int NV12ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Convert NV21 to ARGB.
LIBYUV_API
-int NV21ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_vu, int src_stride_vu,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int NV21ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_vu,
+ int src_stride_vu,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Convert M420 to ARGB.
LIBYUV_API
-int M420ToARGB(const uint8* src_m420, int src_stride_m420,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int M420ToARGB(const uint8* src_m420,
+ int src_stride_m420,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Convert YUY2 to ARGB.
LIBYUV_API
-int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int YUY2ToARGB(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Convert UYVY to ARGB.
LIBYUV_API
-int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int UYVYToARGB(const uint8* src_uyvy,
+ int src_stride_uyvy,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Convert J420 to ARGB.
LIBYUV_API
-int J420ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int J420ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Convert J422 to ARGB.
LIBYUV_API
-int J422ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int J422ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Convert J420 to ABGR.
LIBYUV_API
-int J420ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height);
+int J420ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
// Convert J422 to ABGR.
LIBYUV_API
-int J422ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height);
+int J422ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
// Convert H420 to ARGB.
LIBYUV_API
-int H420ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int H420ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Convert H422 to ARGB.
LIBYUV_API
-int H422ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int H422ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Convert H420 to ABGR.
LIBYUV_API
-int H420ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height);
+int H420ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
// Convert H422 to ABGR.
LIBYUV_API
-int H422ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height);
+int H422ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
// BGRA little endian (argb in memory) to ARGB.
LIBYUV_API
-int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int BGRAToARGB(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// ABGR little endian (rgba in memory) to ARGB.
LIBYUV_API
-int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int ABGRToARGB(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// RGBA little endian (abgr in memory) to ARGB.
LIBYUV_API
-int RGBAToARGB(const uint8* src_frame, int src_stride_frame,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int RGBAToARGB(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Deprecated function name.
#define BG24ToARGB RGB24ToARGB
// RGB little endian (bgr in memory) to ARGB.
LIBYUV_API
-int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int RGB24ToARGB(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// RGB big endian (rgb in memory) to ARGB.
LIBYUV_API
-int RAWToARGB(const uint8* src_frame, int src_stride_frame,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int RAWToARGB(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// RGB16 (RGBP fourcc) little endian to ARGB.
LIBYUV_API
-int RGB565ToARGB(const uint8* src_frame, int src_stride_frame,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int RGB565ToARGB(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// RGB15 (RGBO fourcc) little endian to ARGB.
LIBYUV_API
-int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int ARGB1555ToARGB(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// RGB12 (R444 fourcc) little endian to ARGB.
LIBYUV_API
-int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int ARGB4444ToARGB(const uint8* src_frame,
+ int src_stride_frame,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+#ifdef HAVE_JPEG
// src_width/height provided by capture
// dst_width/height for clipping determine final size.
LIBYUV_API
-int MJPGToARGB(const uint8* sample, size_t sample_size,
- uint8* dst_argb, int dst_stride_argb,
- int src_width, int src_height,
- int dst_width, int dst_height);
+int MJPGToARGB(const uint8* sample,
+ size_t sample_size,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height);
+#endif
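A hedged usage sketch of the clipping contract described above (the buffer names and sizes are illustrative, not part of the header): decoding a captured MJPG frame at its reported size, with no clipping.

// Sketch, assuming jpeg_data/jpeg_size came from a capture callback.
const int kWidth = 640;
const int kHeight = 480;
uint8* argb = (uint8*)malloc(kWidth * 4 * kHeight);
int ret = MJPGToARGB(jpeg_data, jpeg_size,   // compressed sample
                     argb, kWidth * 4,       // dst plane and byte stride
                     kWidth, kHeight,        // src size from capture
                     kWidth, kHeight);       // dst size: same, so no clip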
// Convert camera sample to ARGB with cropping, rotation and vertical flip.
// "src_size" is needed to parse MJPG.
@@ -293,11 +433,16 @@ int MJPGToARGB(const uint8* sample, size_t sample_size,
// "format" is a fourcc. ie 'I420', 'YUY2'
// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
LIBYUV_API
-int ConvertToARGB(const uint8* src_frame, size_t src_size,
- uint8* dst_argb, int dst_stride_argb,
- int crop_x, int crop_y,
- int src_width, int src_height,
- int crop_width, int crop_height,
+int ConvertToARGB(const uint8* src_frame,
+ size_t src_size,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int crop_x,
+ int crop_y,
+ int src_width,
+ int src_height,
+ int crop_width,
+ int crop_height,
enum RotationMode rotation,
uint32 format);
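For illustration, a hedged sketch of center-cropping a 640x480 YUY2 camera frame to 320x240 ARGB. Buffer names are hypothetical; FOURCC comes from video_common.h and kRotate0 from rotate.h.

int r = ConvertToARGB(frame, frame_size,
                      argb, 320 * 4,     // dst stride in bytes
                      160, 120,          // crop_x, crop_y (centered)
                      640, 480,          // full captured size
                      320, 240,          // crop_width, crop_height
                      kRotate0,
                      FOURCC('Y', 'U', 'Y', '2'));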
@@ -306,4 +451,4 @@ int ConvertToARGB(const uint8* src_frame, size_t src_size,
} // namespace libyuv
#endif
-#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ NOLINT
+#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_
diff --git a/files/include/libyuv/convert_from.h b/files/include/libyuv/convert_from.h
index 7522ea5c..7ddebd4f 100644
--- a/files/include/libyuv/convert_from.h
+++ b/files/include/libyuv/convert_from.h
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ // NOLINT
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_
#define INCLUDE_LIBYUV_CONVERT_FROM_H_
#include "libyuv/basic_types.h"
@@ -24,151 +24,237 @@ extern "C" {
// I420Copy in convert to I420ToI420.
LIBYUV_API
-int I420ToI422(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int I420ToI422(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
LIBYUV_API
-int I420ToI444(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-LIBYUV_API
-int I420ToI411(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int I420ToI444(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
LIBYUV_API
-int I400Copy(const uint8* src_y, int src_stride_y,
- uint8* dst_y, int dst_stride_y,
- int width, int height);
+int I400Copy(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
LIBYUV_API
-int I420ToNV12(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_uv, int dst_stride_uv,
- int width, int height);
+int I420ToNV12(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
LIBYUV_API
-int I420ToNV21(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_vu, int dst_stride_vu,
- int width, int height);
+int I420ToNV21(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
LIBYUV_API
-int I420ToYUY2(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
+int I420ToYUY2(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_frame,
+ int dst_stride_frame,
+ int width,
+ int height);
LIBYUV_API
-int I420ToUYVY(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
+int I420ToUYVY(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_frame,
+ int dst_stride_frame,
+ int width,
+ int height);
LIBYUV_API
-int I420ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int I420ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
LIBYUV_API
-int I420ToBGRA(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int I420ToBGRA(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
LIBYUV_API
-int I420ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int I420ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
LIBYUV_API
-int I420ToRGBA(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_rgba, int dst_stride_rgba,
- int width, int height);
+int I420ToRGBA(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
LIBYUV_API
-int I420ToRGB24(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
+int I420ToRGB24(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_frame,
+ int dst_stride_frame,
+ int width,
+ int height);
LIBYUV_API
-int I420ToRAW(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
+int I420ToRAW(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_frame,
+ int dst_stride_frame,
+ int width,
+ int height);
-//LIBYUV_API
-int I420ToRGB565(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_frame,
+ int dst_stride_frame,
+ int width,
+ int height);
// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
// Values in the dither matrix from 0 to 7 are recommended.
// The dither matrix is row-major; the first byte is the upper left.
LIBYUV_API
-int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- const uint8* dither4x4, int width, int height);
+int I420ToRGB565Dither(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_frame,
+ int dst_stride_frame,
+ const uint8* dither4x4,
+ int width,
+ int height);
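As a sketch, a typical 4x4 ordered-dither matrix with values in the recommended 0..7 range, stored row-major with the first byte upper left (plane pointers and strides are hypothetical):

static const uint8 kDither4x4[16] = {
    0, 4, 1, 5,
    6, 2, 7, 3,
    1, 5, 0, 4,
    7, 3, 6, 2,
};
I420ToRGB565Dither(y, y_stride, u, u_stride, v, v_stride,
                   rgb565, width * 2, kDither4x4, width, height);

ARGBToRGB565Dither in convert_from_argb.h takes the same 16-byte matrix.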
LIBYUV_API
-int I420ToARGB1555(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
+int I420ToARGB1555(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_frame,
+ int dst_stride_frame,
+ int width,
+ int height);
LIBYUV_API
-int I420ToARGB4444(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
+int I420ToARGB4444(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_frame,
+ int dst_stride_frame,
+ int width,
+ int height);
// Convert I420 to specified format.
// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
LIBYUV_API
-int ConvertFromI420(const uint8* y, int y_stride,
- const uint8* u, int u_stride,
- const uint8* v, int v_stride,
- uint8* dst_sample, int dst_sample_stride,
- int width, int height,
+int ConvertFromI420(const uint8* y,
+ int y_stride,
+ const uint8* u,
+ int u_stride,
+ const uint8* v,
+ int v_stride,
+ uint8* dst_sample,
+ int dst_sample_stride,
+ int width,
+ int height,
uint32 format);
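A hedged sketch of the packed-destination case described above, emitting YUY2 with contiguous rows (buffers are hypothetical):

ConvertFromI420(y, y_stride, u, u_stride, v, v_stride,
                yuy2, 0,    // 0 stride: rows are packed at width * 2 bytes
                width, height,
                FOURCC('Y', 'U', 'Y', '2'));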
#ifdef __cplusplus
@@ -176,4 +262,4 @@ int ConvertFromI420(const uint8* y, int y_stride,
} // namespace libyuv
#endif
-#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ NOLINT
+#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_
diff --git a/files/include/libyuv/convert_from_argb.h b/files/include/libyuv/convert_from_argb.h
index 1df53200..50722d76 100644
--- a/files/include/libyuv/convert_from_argb.h
+++ b/files/include/libyuv/convert_from_argb.h
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ // NOLINT
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
#include "libyuv/basic_types.h"
@@ -21,45 +21,66 @@ extern "C" {
// Copy ARGB to ARGB.
#define ARGBToARGB ARGBCopy
LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int ARGBCopy(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Convert ARGB To BGRA.
LIBYUV_API
-int ARGBToBGRA(const uint8* src_argb, int src_stride_argb,
- uint8* dst_bgra, int dst_stride_bgra,
- int width, int height);
+int ARGBToBGRA(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height);
// Convert ARGB To ABGR.
LIBYUV_API
-int ARGBToABGR(const uint8* src_argb, int src_stride_argb,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height);
+int ARGBToABGR(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
// Convert ARGB To RGBA.
LIBYUV_API
-int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgba, int dst_stride_rgba,
- int width, int height);
+int ARGBToRGBA(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
// Convert ARGB To RGB24.
LIBYUV_API
-int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgb24, int dst_stride_rgb24,
- int width, int height);
+int ARGBToRGB24(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
// Convert ARGB To RAW.
LIBYUV_API
-int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgb, int dst_stride_rgb,
- int width, int height);
+int ARGBToRAW(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_rgb,
+ int dst_stride_rgb,
+ int width,
+ int height);
// Convert ARGB To RGB565.
LIBYUV_API
-int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgb565, int dst_stride_rgb565,
- int width, int height);
+int ARGBToRGB565(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
// Values in the dither matrix from 0 to 7 are recommended.
@@ -67,124 +88,178 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
// TODO(fbarchard): Consider pointer to 2d array for dither4x4.
// const uint8(*dither)[4][4];
LIBYUV_API
-int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgb565, int dst_stride_rgb565,
- const uint8* dither4x4, int width, int height);
+int ARGBToRGB565Dither(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_rgb565,
+ int dst_stride_rgb565,
+ const uint8* dither4x4,
+ int width,
+ int height);
// Convert ARGB To ARGB1555.
LIBYUV_API
-int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb1555, int dst_stride_argb1555,
- int width, int height);
+int ARGBToARGB1555(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb1555,
+ int dst_stride_argb1555,
+ int width,
+ int height);
// Convert ARGB To ARGB4444.
LIBYUV_API
-int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb4444, int dst_stride_argb4444,
- int width, int height);
+int ARGBToARGB4444(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb4444,
+ int dst_stride_argb4444,
+ int width,
+ int height);
// Convert ARGB To I444.
LIBYUV_API
-int ARGBToI444(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int ARGBToI444(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// Convert ARGB To I422.
LIBYUV_API
-int ARGBToI422(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int ARGBToI422(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// Convert ARGB To I420. (also in convert.h)
LIBYUV_API
-int ARGBToI420(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int ARGBToI420(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// Convert ARGB to J420. (JPeg full range I420).
LIBYUV_API
-int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
- uint8* dst_yj, int dst_stride_yj,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int ARGBToJ420(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_yj,
+ int dst_stride_yj,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// Convert ARGB to J422.
LIBYUV_API
-int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
- uint8* dst_yj, int dst_stride_yj,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-// Convert ARGB To I411.
-LIBYUV_API
-int ARGBToI411(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int ARGBToJ422(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_yj,
+ int dst_stride_yj,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// Convert ARGB to J400. (JPeg full range).
LIBYUV_API
-int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
- uint8* dst_yj, int dst_stride_yj,
- int width, int height);
+int ARGBToJ400(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
// Convert ARGB to I400.
LIBYUV_API
-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- int width, int height);
+int ARGBToI400(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
LIBYUV_API
-int ARGBToG(const uint8* src_argb, int src_stride_argb,
- uint8* dst_g, int dst_stride_g,
- int width, int height);
+int ARGBToG(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_g,
+ int dst_stride_g,
+ int width,
+ int height);
// Convert ARGB To NV12.
LIBYUV_API
-int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_uv, int dst_stride_uv,
- int width, int height);
+int ARGBToNV12(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
// Convert ARGB To NV21.
LIBYUV_API
-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_vu, int dst_stride_vu,
- int width, int height);
+int ARGBToNV21(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
// Convert ARGB To YUY2.
LIBYUV_API
-int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
- uint8* dst_yuy2, int dst_stride_yuy2,
- int width, int height);
+int ARGBToYUY2(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height);
// Convert ARGB To UYVY.
LIBYUV_API
-int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
- uint8* dst_uyvy, int dst_stride_uyvy,
- int width, int height);
+int ARGBToUYVY(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_uyvy,
+ int dst_stride_uyvy,
+ int width,
+ int height);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
-#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ NOLINT
+#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
diff --git a/files/include/libyuv/cpu_id.h b/files/include/libyuv/cpu_id.h
index dfb7445e..bcddb32e 100644
--- a/files/include/libyuv/cpu_id.h
+++ b/files/include/libyuv/cpu_id.h
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_CPU_ID_H_ // NOLINT
+#ifndef INCLUDE_LIBYUV_CPU_ID_H_
#define INCLUDE_LIBYUV_CPU_ID_H_
#include "libyuv/basic_types.h"
@@ -31,17 +31,20 @@ static const int kCpuHasX86 = 0x10;
static const int kCpuHasSSE2 = 0x20;
static const int kCpuHasSSSE3 = 0x40;
static const int kCpuHasSSE41 = 0x80;
-static const int kCpuHasSSE42 = 0x100;
+static const int kCpuHasSSE42 = 0x100; // unused at this time.
static const int kCpuHasAVX = 0x200;
static const int kCpuHasAVX2 = 0x400;
static const int kCpuHasERMS = 0x800;
static const int kCpuHasFMA3 = 0x1000;
static const int kCpuHasAVX3 = 0x2000;
-// 0x2000, 0x4000, 0x8000 reserved for future X86 flags.
+static const int kCpuHasF16C = 0x4000;
+
+// 0x8000 reserved for future X86 flags.
// These flags are only valid on MIPS processors.
static const int kCpuHasMIPS = 0x10000;
static const int kCpuHasDSPR2 = 0x20000;
+static const int kCpuHasMSA = 0x40000;
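These bits are normally queried rather than read directly; a sketch of gating an MSA code path on the new flag, using the TestCpuFlag() helper declared elsewhere in this header:

if (TestCpuFlag(kCpuHasMSA)) {
  // Safe to call the MSA-optimized row functions.
}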
// Internal function used to auto-init.
LIBYUV_API
@@ -77,4 +80,4 @@ void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info);
} // namespace libyuv
#endif
-#endif // INCLUDE_LIBYUV_CPU_ID_H_ NOLINT
+#endif // INCLUDE_LIBYUV_CPU_ID_H_
diff --git a/files/include/libyuv/macros_msa.h b/files/include/libyuv/macros_msa.h
new file mode 100644
index 00000000..61be352e
--- /dev/null
+++ b/files/include/libyuv/macros_msa.h
@@ -0,0 +1,233 @@
+/*
+ * Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_MACROS_MSA_H_
+#define INCLUDE_LIBYUV_MACROS_MSA_H_
+
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include <msa.h>
+#include <stdint.h>
+
+#if (__mips_isa_rev >= 6)
+#define LW(psrc) \
+ ({ \
+ uint8* psrc_lw_m = (uint8*)(psrc); /* NOLINT */ \
+ uint32 val_m; \
+ asm volatile("lw %[val_m], %[psrc_lw_m] \n" \
+ : [val_m] "=r"(val_m) \
+ : [psrc_lw_m] "m"(*psrc_lw_m)); \
+ val_m; \
+ })
+
+#if (__mips == 64)
+#define LD(psrc) \
+ ({ \
+ uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \
+ uint64 val_m = 0; \
+ asm volatile("ld %[val_m], %[psrc_ld_m] \n" \
+ : [val_m] "=r"(val_m) \
+ : [psrc_ld_m] "m"(*psrc_ld_m)); \
+ val_m; \
+ })
+#else // !(__mips == 64)
+#define LD(psrc) \
+ ({ \
+ uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \
+ uint32 val0_m, val1_m; \
+ uint64 val_m = 0; \
+ val0_m = LW(psrc_ld_m); \
+ val1_m = LW(psrc_ld_m + 4); \
+ val_m = (uint64)(val1_m); /* NOLINT */ \
+ val_m = (uint64)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
+ val_m = (uint64)(val_m | (uint64)val0_m); /* NOLINT */ \
+ val_m; \
+ })
+#endif // (__mips == 64)
+
+#define SW(val, pdst) \
+ ({ \
+ uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
+ uint32_t val_m = (val); \
+ asm volatile("sw %[val_m], %[pdst_sw_m] \n" \
+ : [pdst_sw_m] "=m"(*pdst_sw_m) \
+ : [val_m] "r"(val_m)); \
+ })
+
+#if (__mips == 64)
+#define SD(val, pdst) \
+ ({ \
+ uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
+ uint64_t val_m = (val); \
+ asm volatile("sd %[val_m], %[pdst_sd_m] \n" \
+ : [pdst_sd_m] "=m"(*pdst_sd_m) \
+ : [val_m] "r"(val_m)); \
+ })
+#else // !(__mips == 64)
+#define SD(val, pdst) \
+ ({ \
+ uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
+ uint32_t val0_m, val1_m; \
+ val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
+ val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+ SW(val0_m, pdst_sd_m); \
+ SW(val1_m, pdst_sd_m + 4); \
+ })
+#endif // !(__mips == 64)
+#else // !(__mips_isa_rev >= 6)
+#define LW(psrc) \
+ ({ \
+ uint8* psrc_lw_m = (uint8*)(psrc); /* NOLINT */ \
+ uint32 val_m; \
+ asm volatile("ulw %[val_m], %[psrc_lw_m] \n" \
+ : [val_m] "=r"(val_m) \
+ : [psrc_lw_m] "m"(*psrc_lw_m)); \
+ val_m; \
+ })
+
+#if (__mips == 64)
+#define LD(psrc) \
+ ({ \
+ uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \
+ uint64 val_m = 0; \
+ asm volatile("uld %[val_m], %[psrc_ld_m] \n" \
+ : [val_m] "=r"(val_m) \
+ : [psrc_ld_m] "m"(*psrc_ld_m)); \
+ val_m; \
+ })
+#else // !(__mips == 64)
+#define LD(psrc) \
+ ({ \
+ uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \
+ uint32 val0_m, val1_m; \
+ uint64 val_m = 0; \
+ val0_m = LW(psrc_ld_m); \
+ val1_m = LW(psrc_ld_m + 4); \
+ val_m = (uint64)(val1_m); /* NOLINT */ \
+ val_m = (uint64)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
+ val_m = (uint64)(val_m | (uint64)val0_m); /* NOLINT */ \
+ val_m; \
+ })
+#endif // (__mips == 64)
+
+#define SW(val, pdst) \
+ ({ \
+ uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
+ uint32_t val_m = (val); \
+ asm volatile("usw %[val_m], %[pdst_sw_m] \n" \
+ : [pdst_sw_m] "=m"(*pdst_sw_m) \
+ : [val_m] "r"(val_m)); \
+ })
+
+#define SD(val, pdst) \
+ ({ \
+ uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
+ uint32_t val0_m, val1_m; \
+ val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
+ val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+ SW(val0_m, pdst_sd_m); \
+ SW(val1_m, pdst_sd_m + 4); \
+ })
+#endif // (__mips_isa_rev >= 6)
+
+// TODO(fbarchard): Consider removing __VAR_ARGS versions.
+#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
+#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
+
+#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
+#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
+
+#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
+#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
+
+/* Description : Load two vectors with 16 'byte' sized elements
+ Arguments : Inputs - psrc, stride
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Load 16 byte elements in 'out0' from (psrc)
+ Load 16 byte elements in 'out1' from (psrc + stride)
+*/
+#define LD_B2(RTYPE, psrc, stride, out0, out1) \
+ { \
+ out0 = LD_B(RTYPE, (psrc)); \
+ out1 = LD_B(RTYPE, (psrc) + stride); \
+ }
+#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
+
+#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
+ { \
+ LD_B2(RTYPE, (psrc), stride, out0, out1); \
+ LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
+ }
+#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
+
+/* Description : Store two vectors with stride each having 16 'byte' sized
+ elements
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Store 16 byte elements from 'in0' to (pdst)
+ Store 16 byte elements from 'in1' to (pdst + stride)
+*/
+#define ST_B2(RTYPE, in0, in1, pdst, stride) \
+ { \
+ ST_B(RTYPE, in0, (pdst)); \
+ ST_B(RTYPE, in1, (pdst) + stride); \
+ }
+#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
+
+#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \
+ { \
+ ST_B2(RTYPE, in0, in1, (pdst), stride); \
+ ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
+ }
+#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
+
+/* Description : Store vectors of 8 halfword elements with stride
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Store 8 halfword elements from 'in0' to (pdst)
+ Store 8 halfword elements from 'in1' to (pdst + stride)
+*/
+#define ST_H2(RTYPE, in0, in1, pdst, stride) \
+ { \
+ ST_H(RTYPE, in0, (pdst)); \
+ ST_H(RTYPE, in1, (pdst) + stride); \
+ }
+#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
+
+// TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly.
+/* Description : Shuffle byte vector elements as per mask vector
+ Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Byte elements from 'in0' & 'in1' are copied selectively to
+ 'out0' as per control vector 'mask0'
+*/
+#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
+ out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
+ }
+#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
+
+/* Description : Interleave both left and right half of input vectors
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of byte elements from 'in0' and 'in1' are
+ interleaved and written to 'out0'
+*/
+#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
+ }
+#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+
+#endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */
+
+#endif // INCLUDE_LIBYUV_MACROS_MSA_H_
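A hedged sketch of the paired load/store macros above: copying 32 bytes through two MSA registers. The stride arguments are byte offsets because psrc/pdst are byte pointers; src/dst are hypothetical and assumed suitably aligned for vector access.

v16u8 r0, r1;
LD_UB2(src, 16, r0, r1);   // r0 = src[0..15], r1 = src[16..31]
ST_UB2(r0, r1, dst, 16);   // store to dst[0..15] and dst[16..31]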
diff --git a/files/include/libyuv/mjpeg_decoder.h b/files/include/libyuv/mjpeg_decoder.h
index 8423121d..8a4f2822 100644
--- a/files/include/libyuv/mjpeg_decoder.h
+++ b/files/include/libyuv/mjpeg_decoder.h
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ // NOLINT
+#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_
#define INCLUDE_LIBYUV_MJPEG_DECODER_H_
#include "libyuv/basic_types.h"
@@ -37,7 +37,6 @@ static const uint32 kUnknownDataSize = 0xFFFFFFFF;
enum JpegSubsamplingType {
kJpegYuv420,
kJpegYuv422,
- kJpegYuv411,
kJpegYuv444,
kJpegYuv400,
kJpegUnknown
@@ -145,12 +144,16 @@ class LIBYUV_API MJpegDecoder {
// callback function. Each call will get the data for a whole number of
// image scanlines.
// TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
- LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque,
- int dst_width, int dst_height);
+ LIBYUV_BOOL DecodeToCallback(CallbackFunction fn,
+ void* opaque,
+ int dst_width,
+ int dst_height);
// The helper function which recognizes the jpeg sub-sampling type.
static JpegSubsamplingType JpegSubsamplingTypeHelper(
- int* subsample_x, int* subsample_y, int number_of_components);
+ int* subsample_x,
+ int* subsample_y,
+ int number_of_components);
private:
void AllocOutputBuffers(int num_outbufs);
@@ -189,4 +192,4 @@ class LIBYUV_API MJpegDecoder {
} // namespace libyuv
#endif // __cplusplus
-#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ NOLINT
+#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_
diff --git a/files/include/libyuv/planar_functions.h b/files/include/libyuv/planar_functions.h
index 881b0c5c..040839c2 100644
--- a/files/include/libyuv/planar_functions.h
+++ b/files/include/libyuv/planar_functions.h
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ // NOLINT
+#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
#include "libyuv/basic_types.h"
@@ -24,86 +24,164 @@ extern "C" {
// Copy a plane of data.
LIBYUV_API
-void CopyPlane(const uint8* src_y, int src_stride_y,
- uint8* dst_y, int dst_stride_y,
- int width, int height);
+void CopyPlane(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
LIBYUV_API
-void CopyPlane_16(const uint16* src_y, int src_stride_y,
- uint16* dst_y, int dst_stride_y,
- int width, int height);
+void CopyPlane_16(const uint16* src_y,
+ int src_stride_y,
+ uint16* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
// Set a plane of data to a 32 bit value.
LIBYUV_API
-void SetPlane(uint8* dst_y, int dst_stride_y,
- int width, int height,
+void SetPlane(uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
uint32 value);
+// Split interleaved UV plane into separate U and V planes.
+LIBYUV_API
+void SplitUVPlane(const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Merge separate U and V planes into one interleaved UV plane.
+LIBYUV_API
+void MergeUVPlane(const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
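A hedged sketch of the new pair: unpacking an NV12 chroma plane into I420 U/V planes. Buffer names are hypothetical; the chroma planes are (width + 1) / 2 by (height + 1) / 2.

int halfwidth = (width + 1) / 2;
int halfheight = (height + 1) / 2;
SplitUVPlane(nv12_uv, nv12_uv_stride,
             u, halfwidth,
             v, halfwidth,
             halfwidth, halfheight);
// MergeUVPlane performs the inverse interleave.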
// Copy I400. Supports inverting.
LIBYUV_API
-int I400ToI400(const uint8* src_y, int src_stride_y,
- uint8* dst_y, int dst_stride_y,
- int width, int height);
+int I400ToI400(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
#define J400ToJ400 I400ToI400
// Copy I422 to I422.
#define I422ToI422 I422Copy
LIBYUV_API
-int I422Copy(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int I422Copy(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// Copy I444 to I444.
#define I444ToI444 I444Copy
LIBYUV_API
-int I444Copy(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int I444Copy(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// Convert YUY2 to I422.
LIBYUV_API
-int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int YUY2ToI422(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// Convert UYVY to I422.
LIBYUV_API
-int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-LIBYUV_API
-int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_uv, int dst_stride_uv,
- int width, int height);
-
-LIBYUV_API
-int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_uv, int dst_stride_uv,
- int width, int height);
+int UYVYToI422(const uint8* src_uyvy,
+ int src_stride_uyvy,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+LIBYUV_API
+int YUY2ToNV12(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+LIBYUV_API
+int UYVYToNV12(const uint8* src_uyvy,
+ int src_stride_uyvy,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+LIBYUV_API
+int YUY2ToY(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
// Convert I420 to I400. (calls CopyPlane ignoring u/v).
LIBYUV_API
-int I420ToI400(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- int width, int height);
+int I420ToI400(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
// Alias
#define J420ToJ400 I420ToI400
@@ -111,13 +189,20 @@ int I420ToI400(const uint8* src_y, int src_stride_y,
// I420 mirror.
LIBYUV_API
-int I420Mirror(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int I420Mirror(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// Alias
#define I400ToI400Mirror I400Mirror
@@ -125,87 +210,139 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
// I400 mirror. A single plane is mirrored horizontally.
// Pass negative height to achieve 180 degree rotation.
LIBYUV_API
-int I400Mirror(const uint8* src_y, int src_stride_y,
- uint8* dst_y, int dst_stride_y,
- int width, int height);
+int I400Mirror(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
// Alias
#define ARGBToARGBMirror ARGBMirror
// ARGB mirror.
LIBYUV_API
-int ARGBMirror(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int ARGBMirror(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Convert NV12 to RGB565.
LIBYUV_API
-int NV12ToRGB565(const uint8* src_y, int src_stride_y,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_rgb565, int dst_stride_rgb565,
- int width, int height);
+int NV12ToRGB565(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
// I422ToARGB is in convert_argb.h
// Convert I422 to BGRA.
LIBYUV_API
-int I422ToBGRA(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_bgra, int dst_stride_bgra,
- int width, int height);
+int I422ToBGRA(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height);
// Convert I422 to ABGR.
LIBYUV_API
-int I422ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height);
+int I422ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
// Convert I422 to RGBA.
LIBYUV_API
-int I422ToRGBA(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_rgba, int dst_stride_rgba,
- int width, int height);
+int I422ToRGBA(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
// Alias
#define RGB24ToRAW RAWToRGB24
LIBYUV_API
-int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
- uint8* dst_rgb24, int dst_stride_rgb24,
- int width, int height);
+int RAWToRGB24(const uint8* src_raw,
+ int src_stride_raw,
+ uint8* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
// Draw a rectangle into I420.
LIBYUV_API
-int I420Rect(uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int x, int y, int width, int height,
- int value_y, int value_u, int value_v);
+int I420Rect(uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int x,
+ int y,
+ int width,
+ int height,
+ int value_y,
+ int value_u,
+ int value_v);
// Draw a rectangle into ARGB.
LIBYUV_API
-int ARGBRect(uint8* dst_argb, int dst_stride_argb,
- int x, int y, int width, int height, uint32 value);
+int ARGBRect(uint8* dst_argb,
+ int dst_stride_argb,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32 value);
// Convert ARGB to gray scale ARGB.
LIBYUV_API
-int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int ARGBGrayTo(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Make a rectangle of ARGB gray scale.
LIBYUV_API
-int ARGBGray(uint8* dst_argb, int dst_stride_argb,
- int x, int y, int width, int height);
+int ARGBGray(uint8* dst_argb,
+ int dst_stride_argb,
+ int x,
+ int y,
+ int width,
+ int height);
// Make a rectangle of ARGB Sepia tone.
LIBYUV_API
-int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
- int x, int y, int width, int height);
+int ARGBSepia(uint8* dst_argb,
+ int dst_stride_argb,
+ int x,
+ int y,
+ int width,
+ int height);
// Apply a matrix rotation to each ARGB pixel.
// matrix_argb is a 4x4 matrix of signed bytes: -128 to 127 representing -2 to 2.
@@ -214,10 +351,13 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
// The next 4 coefficients apply to B, G, R, A and produce R of the output.
// The last 4 coefficients apply to B, G, R, A and produce A of the output.
LIBYUV_API
-int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
+int ARGBColorMatrix(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
const int8* matrix_argb,
- int width, int height);
+ int width,
+ int height);
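As a sketch of the coefficient layout described above (64 represents 1.0, since -128..127 maps to -2..2): a matrix that swaps the R and B channels. Each row reads B, G, R, A of the input; the rows produce B, G, R, A of the output. Buffers are hypothetical.

static const int8 kSwapRB[16] = {
    0,  0, 64,  0,   // B_out = R_in
    0, 64,  0,  0,   // G_out = G_in
   64,  0,  0,  0,   // R_out = B_in
    0,  0,  0, 64,   // A_out = A_in
};
ARGBColorMatrix(src, width * 4, dst, width * 4, kSwapRB, width, height);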
// Deprecated. Use ARGBColorMatrix instead.
// Apply a matrix rotation to each ARGB pixel.
@@ -226,32 +366,47 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
// The next 4 coefficients apply to B, G, R, A and produce G of the output.
// The last 4 coefficients apply to B, G, R, A and produce R of the output.
LIBYUV_API
-int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+int RGBColorMatrix(uint8* dst_argb,
+ int dst_stride_argb,
const int8* matrix_rgb,
- int x, int y, int width, int height);
+ int x,
+ int y,
+ int width,
+ int height);
// Apply a color table to each ARGB pixel.
// Table contains 256 ARGB values.
LIBYUV_API
-int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+int ARGBColorTable(uint8* dst_argb,
+ int dst_stride_argb,
const uint8* table_argb,
- int x, int y, int width, int height);
+ int x,
+ int y,
+ int width,
+ int height);
// Apply a color table to each ARGB pixel but preserve destination alpha.
// Table contains 256 ARGB values.
LIBYUV_API
-int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
+int RGBColorTable(uint8* dst_argb,
+ int dst_stride_argb,
const uint8* table_argb,
- int x, int y, int width, int height);
+ int x,
+ int y,
+ int width,
+ int height);
// Apply a luma/color table to each ARGB pixel but preserve destination alpha.
// Table contains 32768 values indexed by [Y][C] where Y is 7 bit luma from
// RGB (YJ style) and C is an 8 bit color component (R, G or B).
LIBYUV_API
-int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
+int ARGBLumaColorTable(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
const uint8* luma_rgb_table,
- int width, int height);
+ int width,
+ int height);
// Apply a 3 term polynomial to ARGB values.
// poly points to a 4x4 matrix. The first row is constants. The 2nd row is
@@ -262,46 +417,80 @@ int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
// A polynomial approximation can be derived using software such as 'R'.
LIBYUV_API
-int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
+int ARGBPolynomial(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
const float* poly,
- int width, int height);
+ int width,
+ int height);
+
+// Convert plane of 16 bit shorts to half floats.
+// Source values are multiplied by scale before storing as half float.
+LIBYUV_API
+int HalfFloatPlane(const uint16* src_y,
+ int src_stride_y,
+ uint16* dst_y,
+ int dst_stride_y,
+ float scale,
+ int width,
+ int height);
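A hedged sketch, assuming strides are passed in bytes as for the 8-bit planes: normalizing 10-bit samples (0..1023) to half floats in [0, 1].

HalfFloatPlane(src_y, width * 2,       // 16-bit source, byte stride
               dst_half, width * 2,    // half-float output, byte stride
               1.0f / 1023.0f,         // scale applied before conversion
               width, height);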
// Quantize a rectangle of ARGB. Alpha unaffected.
// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
// interval_size should be a value between 1 and 255.
// interval_offset should be a value between 0 and 255.
LIBYUV_API
-int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
- int scale, int interval_size, int interval_offset,
- int x, int y, int width, int height);
+int ARGBQuantize(uint8* dst_argb,
+ int dst_stride_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int x,
+ int y,
+ int width,
+ int height);
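One plausible parameterization, assuming the usual pairing of scale = 65536 / interval_size: posterizing each channel to eight bands of 32, offset to the band centers.

ARGBQuantize(argb, width * 4,
             65536 / 32,   // scale (16 bit fixed point)
             32,           // interval_size
             16,           // interval_offset: center of each band
             0, 0, width, height);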
// Copy ARGB to ARGB.
LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int ARGBCopy(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Copy Alpha channel of ARGB to alpha of ARGB.
LIBYUV_API
-int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int ARGBCopyAlpha(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Extract the alpha channel from ARGB.
LIBYUV_API
-int ARGBExtractAlpha(const uint8* src_argb, int src_stride_argb,
- uint8* dst_a, int dst_stride_a,
- int width, int height);
+int ARGBExtractAlpha(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_a,
+ int dst_stride_a,
+ int width,
+ int height);
// Copy Y channel to Alpha of ARGB.
LIBYUV_API
-int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int ARGBCopyYToAlpha(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
-typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width);
+typedef void (*ARGBBlendRow)(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
// Get function to Alpha Blend ARGB pixels and store to destination.
LIBYUV_API
@@ -311,92 +500,143 @@ ARGBBlendRow GetARGBBlend();
// Source is pre-multiplied by alpha using ARGBAttenuate.
// Alpha of destination is set to 255.
LIBYUV_API
-int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
- const uint8* src_argb1, int src_stride_argb1,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int ARGBBlend(const uint8* src_argb0,
+ int src_stride_argb0,
+ const uint8* src_argb1,
+ int src_stride_argb1,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
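A hedged sketch of the premultiplication requirement stated above (fg/bg and the *_pre buffers are hypothetical; ARGBAttenuate is declared later in this header):

ARGBAttenuate(fg, width * 4, fg_pre, width * 4, width, height);
ARGBAttenuate(bg, width * 4, bg_pre, width * 4, width, height);
ARGBBlend(fg_pre, width * 4, bg_pre, width * 4,
          dst, width * 4, width, height);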
// Alpha Blend plane and store to destination.
// Source is not pre-multiplied by alpha.
LIBYUV_API
-int BlendPlane(const uint8* src_y0, int src_stride_y0,
- const uint8* src_y1, int src_stride_y1,
- const uint8* alpha, int alpha_stride,
- uint8* dst_y, int dst_stride_y,
- int width, int height);
+int BlendPlane(const uint8* src_y0,
+ int src_stride_y0,
+ const uint8* src_y1,
+ int src_stride_y1,
+ const uint8* alpha,
+ int alpha_stride,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
// Alpha Blend YUV images and store to destination.
// Source is not pre-multiplied by alpha.
// Alpha is full width x height and subsampled to half size to apply to UV.
LIBYUV_API
-int I420Blend(const uint8* src_y0, int src_stride_y0,
- const uint8* src_u0, int src_stride_u0,
- const uint8* src_v0, int src_stride_v0,
- const uint8* src_y1, int src_stride_y1,
- const uint8* src_u1, int src_stride_u1,
- const uint8* src_v1, int src_stride_v1,
- const uint8* alpha, int alpha_stride,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+int I420Blend(const uint8* src_y0,
+ int src_stride_y0,
+ const uint8* src_u0,
+ int src_stride_u0,
+ const uint8* src_v0,
+ int src_stride_v0,
+ const uint8* src_y1,
+ int src_stride_y1,
+ const uint8* src_u1,
+ int src_stride_u1,
+ const uint8* src_v1,
+ int src_stride_v1,
+ const uint8* alpha,
+ int alpha_stride,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
LIBYUV_API
-int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
- const uint8* src_argb1, int src_stride_argb1,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int ARGBMultiply(const uint8* src_argb0,
+ int src_stride_argb0,
+ const uint8* src_argb1,
+ int src_stride_argb1,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Add ARGB image with ARGB image. Saturates to 255.
LIBYUV_API
-int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
- const uint8* src_argb1, int src_stride_argb1,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int ARGBAdd(const uint8* src_argb0,
+ int src_stride_argb0,
+ const uint8* src_argb1,
+ int src_stride_argb1,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0.
LIBYUV_API
-int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
- const uint8* src_argb1, int src_stride_argb1,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int ARGBSubtract(const uint8* src_argb0,
+ int src_stride_argb0,
+ const uint8* src_argb1,
+ int src_stride_argb1,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Convert I422 to YUY2.
LIBYUV_API
-int I422ToYUY2(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
+int I422ToYUY2(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_frame,
+ int dst_stride_frame,
+ int width,
+ int height);
// Convert I422 to UYVY.
LIBYUV_API
-int I422ToUYVY(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
+int I422ToUYVY(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_frame,
+ int dst_stride_frame,
+ int width,
+ int height);
// Convert unattenuated ARGB to preattenuated ARGB.
LIBYUV_API
-int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int ARGBAttenuate(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Convert preattenuated ARGB to unattenuated ARGB.
LIBYUV_API
-int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int ARGBUnattenuate(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Internal function - do not call directly.
// Computes table of cumulative sum for image where the value is the sum
// of all values above and to the left of the entry. Used by ARGBBlur.
LIBYUV_API
-int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
- int32* dst_cumsum, int dst_stride32_cumsum,
- int width, int height);
+int ARGBComputeCumulativeSum(const uint8* src_argb,
+ int src_stride_argb,
+ int32* dst_cumsum,
+ int dst_stride32_cumsum,
+ int width,
+ int height);
// Blur ARGB image.
// dst_cumsum table of width * (height + 1) * 16 bytes aligned to
@@ -405,49 +645,79 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
// radius is the number of pixels around the center, e.g. 1 = 3x3, 2 = 5x5.
// Blur is optimized for radius of 5 (11x11) or less.
LIBYUV_API
-int ARGBBlur(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int32* dst_cumsum, int dst_stride32_cumsum,
- int width, int height, int radius);
+int ARGBBlur(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int32* dst_cumsum,
+ int dst_stride32_cumsum,
+ int width,
+ int height,
+ int radius);
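A hedged usage example, sizing the cumulative-sum scratch table per the comment above (16 bytes, i.e. four int32 sums, per pixel); the stride value passed here assumes a tightly packed table and is not taken from the library's documentation.

/* Hypothetical call; src_argb/dst_argb are packed width*4-byte rows. */
align_buffer_64(cumsum, width * (height + 1) * 16);
ARGBBlur(src_argb, width * 4, dst_argb, width * 4,
         (int32*)cumsum, width * 4, /* assumed stride: 4 int32 per pixel */
         width, height, 2 /* radius 2 = 5x5 */);
free_aligned_buffer_64(cumsum);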
// Multiply ARGB image by ARGB value.
LIBYUV_API
-int ARGBShade(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height, uint32 value);
+int ARGBShade(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ uint32 value);
// Interpolate between two images using specified amount of interpolation
// (0 to 255) and store to destination.
// 'interpolation' is specified as an 8-bit fraction where 0 means 100% src0
// and 255 means roughly 1% src0 and 99% src1 (i.e. 1/256 and 255/256).
LIBYUV_API
-int InterpolatePlane(const uint8* src0, int src_stride0,
- const uint8* src1, int src_stride1,
- uint8* dst, int dst_stride,
- int width, int height, int interpolation);
+int InterpolatePlane(const uint8* src0,
+ int src_stride0,
+ const uint8* src1,
+ int src_stride1,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height,
+ int interpolation);
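A per-pixel sketch of the blend described above, assuming the usual fixed-point weights (256 - f for src0, f for src1); the shipped code uses SIMD row functions rather than this scalar form.

#include <stdint.h>

static uint8_t InterpolatePixelSketch(uint8_t s0, uint8_t s1,
                                      int f /* 0..255 */) {
  /* f == 0 -> all src0; f == 255 -> 1/256 src0 + 255/256 src1 */
  return (uint8_t)((s0 * (256 - f) + s1 * f) >> 8);
}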
// Interpolate between two ARGB images using specified amount of interpolation
// Internally calls InterpolatePlane with width * 4 (bpp).
LIBYUV_API
-int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
- const uint8* src_argb1, int src_stride_argb1,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height, int interpolation);
+int ARGBInterpolate(const uint8* src_argb0,
+ int src_stride_argb0,
+ const uint8* src_argb1,
+ int src_stride_argb1,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int interpolation);
// Interpolate between two YUV images using specified amount of interpolation
// Internally calls InterpolatePlane on each plane where the U and V planes
// are half width and half height.
LIBYUV_API
-int I420Interpolate(const uint8* src0_y, int src0_stride_y,
- const uint8* src0_u, int src0_stride_u,
- const uint8* src0_v, int src0_stride_v,
- const uint8* src1_y, int src1_stride_y,
- const uint8* src1_u, int src1_stride_u,
- const uint8* src1_v, int src1_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height, int interpolation);
+int I420Interpolate(const uint8* src0_y,
+ int src0_stride_y,
+ const uint8* src0_u,
+ int src0_stride_u,
+ const uint8* src0_v,
+ int src0_stride_v,
+ const uint8* src1_y,
+ int src1_stride_y,
+ const uint8* src1_u,
+ int src1_stride_u,
+ const uint8* src1_v,
+ int src1_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int interpolation);
#if defined(__pnacl__) || defined(__CLR_VER) || \
(defined(__i386__) && !defined(__SSE2__))
@@ -468,40 +738,59 @@ int I420Interpolate(const uint8* src0_y, int src0_stride_y,
// Row function for copying pixels from a source with a slope to a row
// of destination. Useful for scaling, rotation, mirror, texture mapping.
LIBYUV_API
-void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
- uint8* dst_argb, const float* uv_dudv, int width);
+void ARGBAffineRow_C(const uint8* src_argb,
+ int src_argb_stride,
+ uint8* dst_argb,
+ const float* uv_dudv,
+ int width);
LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
- uint8* dst_argb, const float* uv_dudv, int width);
+void ARGBAffineRow_SSE2(const uint8* src_argb,
+ int src_argb_stride,
+ uint8* dst_argb,
+ const float* uv_dudv,
+ int width);
// Shuffle ARGB channel order. e.g. BGRA to ARGB.
// shuffler is 16 bytes and must be aligned.
LIBYUV_API
-int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_argb, int dst_stride_argb,
- const uint8* shuffler, int width, int height);
+int ARGBShuffle(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ const uint8* shuffler,
+ int width,
+ int height);
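A hedged example of a 16-byte shuffler: pshufb-style byte indices covering four 4-byte pixels, here reversing each pixel so BGRA becomes ARGB (this table mirrors the kShuffleMaskBGRAToARGB constant in the implementation; the name used here is illustrative).

/* Byte i of each 16-byte group is taken from source byte table[i]. */
static const uint8 kShuffleBGRAToARGB[16] = {3,  2, 1, 0, 7,  6,  5,  4,
                                             11, 10, 9, 8, 15, 14, 13, 12};
/* ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
               kShuffleBGRAToARGB, width, height); */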
// Sobel ARGB effect with planar output.
LIBYUV_API
-int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- int width, int height);
+int ARGBSobelToPlane(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
// Sobel ARGB effect.
LIBYUV_API
-int ARGBSobel(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int ARGBSobel(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB.
LIBYUV_API
-int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+int ARGBSobelXY(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
-#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ NOLINT
+#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
diff --git a/files/include/libyuv/rotate.h b/files/include/libyuv/rotate.h
index 8af60b89..b9f7154a 100644
--- a/files/include/libyuv/rotate.h
+++ b/files/include/libyuv/rotate.h
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_ROTATE_H_ // NOLINT
+#ifndef INCLUDE_LIBYUV_ROTATE_H_
#define INCLUDE_LIBYUV_ROTATE_H_
#include "libyuv/basic_types.h"
@@ -20,8 +20,8 @@ extern "C" {
// Supported rotation.
typedef enum RotationMode {
- kRotate0 = 0, // No rotation.
- kRotate90 = 90, // Rotate 90 degrees clockwise.
+ kRotate0 = 0, // No rotation.
+ kRotate90 = 90, // Rotate 90 degrees clockwise.
kRotate180 = 180, // Rotate 180 degrees.
kRotate270 = 270, // Rotate 270 degrees clockwise.
@@ -33,85 +33,132 @@ typedef enum RotationMode {
// Rotate I420 frame.
LIBYUV_API
-int I420Rotate(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int src_width, int src_height, enum RotationMode mode);
+int I420Rotate(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int src_width,
+ int src_height,
+ enum RotationMode mode);
// Rotate NV12 input and store in I420.
LIBYUV_API
-int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int src_width, int src_height, enum RotationMode mode);
+int NV12ToI420Rotate(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int src_width,
+ int src_height,
+ enum RotationMode mode);
// Rotate a plane by 0, 90, 180, or 270.
LIBYUV_API
-int RotatePlane(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int src_width, int src_height, enum RotationMode mode);
+int RotatePlane(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int src_width,
+ int src_height,
+ enum RotationMode mode);
// Rotate planes by 90, 180, 270. Deprecated.
LIBYUV_API
-void RotatePlane90(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height);
+void RotatePlane90(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height);
LIBYUV_API
-void RotatePlane180(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height);
+void RotatePlane180(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height);
LIBYUV_API
-void RotatePlane270(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height);
+void RotatePlane270(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height);
LIBYUV_API
-void RotateUV90(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height);
+void RotateUV90(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
// Rotations for when U and V are interleaved.
// These functions take one input pointer and
// split the data into two buffers while
// rotating them. Deprecated.
LIBYUV_API
-void RotateUV180(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height);
+void RotateUV180(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
LIBYUV_API
-void RotateUV270(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height);
+void RotateUV270(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
// The 90 and 270 functions are based on transposes.
// Doing a transpose while reversing the read/write
// order results in a rotation by +/- 90 degrees.
// Deprecated.
LIBYUV_API
-void TransposePlane(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height);
+void TransposePlane(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height);
LIBYUV_API
-void TransposeUV(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height);
+void TransposeUV(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
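A scalar sketch of that transpose relationship: a clockwise 90-degree rotation is a transpose whose source columns are read bottom-to-top. Illustrative only; the shipped paths are the SIMD TransposeWx8 variants, and uint8 is libyuv's typedef from basic_types.h.

/* After rotation the destination has 'width' rows of 'height' pixels. */
static void RotatePlane90Sketch(const uint8* src, int src_stride,
                                uint8* dst, int dst_stride,
                                int width, int height) {
  for (int x = 0; x < width; ++x) { /* source column -> dest row */
    for (int y = 0; y < height; ++y) {
      dst[x * dst_stride + y] = src[(height - 1 - y) * src_stride + x];
    }
  }
}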
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
-#endif // INCLUDE_LIBYUV_ROTATE_H_ NOLINT
+#endif // INCLUDE_LIBYUV_ROTATE_H_
diff --git a/files/include/libyuv/rotate_argb.h b/files/include/libyuv/rotate_argb.h
index 660ff557..be0190c1 100644
--- a/files/include/libyuv/rotate_argb.h
+++ b/files/include/libyuv/rotate_argb.h
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ // NOLINT
+#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_
#define INCLUDE_LIBYUV_ROTATE_ARGB_H_
#include "libyuv/basic_types.h"
@@ -21,13 +21,17 @@ extern "C" {
// Rotate ARGB frame
LIBYUV_API
-int ARGBRotate(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int src_width, int src_height, enum RotationMode mode);
+int ARGBRotate(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int src_width,
+ int src_height,
+ enum RotationMode mode);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
-#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ NOLINT
+#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_
diff --git a/files/include/libyuv/rotate_row.h b/files/include/libyuv/rotate_row.h
index ebc487f9..2c51584e 100644
--- a/files/include/libyuv/rotate_row.h
+++ b/files/include/libyuv/rotate_row.h
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ // NOLINT
+#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_
#define INCLUDE_LIBYUV_ROTATE_ROW_H_
#include "libyuv/basic_types.h"
@@ -36,7 +36,8 @@ extern "C" {
// The following are available for GCC 32 or 64 bit but not NaCL for 64 bit:
#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
+ (defined(__i386__) || \
+ (defined(__x86_64__) && !defined(__native_client__)))
#define HAS_TRANSPOSEWX8_SSSE3
#endif
@@ -53,69 +54,175 @@ extern "C" {
#define HAS_TRANSPOSEUVWX8_NEON
#endif
-#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
- defined(__mips__) && \
- defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#if !defined(LIBYUV_DISABLE_DSPR2) && !defined(__native_client__) && \
+ defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_TRANSPOSEWX8_DSPR2
#define HAS_TRANSPOSEUVWX8_DSPR2
#endif // defined(__mips__)
-void TransposeWxH_C(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width, int height);
-
-void TransposeWx8_C(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width);
-void TransposeWx8_NEON(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width);
-void TransposeWx8_SSSE3(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width);
-void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width);
-void TransposeWx8_DSPR2(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width);
-void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width);
-
-void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width);
-void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width);
-void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width);
-void TransposeWx8_Any_DSPR2(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width);
-
-void TransposeUVWxH_C(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height);
-
-void TransposeUVWx8_C(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b, int width);
-
-void TransposeUVWx8_Any_SSE2(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_Any_NEON(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_Any_DSPR2(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b, int width);
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_TRANSPOSEWX16_MSA
+#define HAS_TRANSPOSEUVWX16_MSA
+#endif
+
+void TransposeWxH_C(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height);
+
+void TransposeWx8_C(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+void TransposeWx16_C(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_NEON(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_SSSE3(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_Fast_SSSE3(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_DSPR2(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_Fast_DSPR2(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+void TransposeWx16_MSA(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+
+void TransposeWx8_Any_NEON(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_Any_SSSE3(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_Fast_Any_SSSE3(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_Any_DSPR2(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+void TransposeWx16_Any_MSA(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width);
+
+void TransposeUVWxH_C(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
+
+void TransposeUVWx8_C(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx16_C(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx8_SSE2(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx8_NEON(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx8_DSPR2(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx16_MSA(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width);
+
+void TransposeUVWx8_Any_SSE2(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx8_Any_NEON(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx8_Any_DSPR2(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx16_Any_MSA(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
-#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ NOLINT
+#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_
diff --git a/files/include/libyuv/row.h b/files/include/libyuv/row.h
index 055880ba..3e5dd200 100644
--- a/files/include/libyuv/row.h
+++ b/files/include/libyuv/row.h
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_ROW_H_ // NOLINT
+#ifndef INCLUDE_LIBYUV_ROW_H_
#define INCLUDE_LIBYUV_ROW_H_
#include <stdlib.h> // For malloc.
@@ -20,21 +20,14 @@ namespace libyuv {
extern "C" {
#endif
-#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
+#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
-#ifdef __cplusplus
-#define align_buffer_64(var, size) \
- uint8* var##_mem = reinterpret_cast<uint8*>(malloc((size) + 63)); \
- uint8* var = reinterpret_cast<uint8*> \
- ((reinterpret_cast<intptr_t>(var##_mem) + 63) & ~63)
-#else
-#define align_buffer_64(var, size) \
- uint8* var##_mem = (uint8*)(malloc((size) + 63)); /* NOLINT */ \
- uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */
-#endif
+#define align_buffer_64(var, size) \
+ uint8* var##_mem = (uint8*)(malloc((size) + 63)); /* NOLINT */ \
+ uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */
#define free_aligned_buffer_64(var) \
- free(var##_mem); \
+ free(var##_mem); \
var = 0
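In short: the macro over-allocates by 63 bytes and rounds the pointer up to a 64-byte boundary, keeping the raw malloc pointer in var##_mem so it can be freed later. A hedged usage sketch:

align_buffer_64(row, 1024); /* row is 64-byte aligned; row_mem holds malloc ptr */
/* ... IS_ALIGNED(row, 64) is true; use row[0..1023] ... */
free_aligned_buffer_64(row); /* frees row_mem and zeroes row */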
#if defined(__pnacl__) || defined(__CLR_VER) || \
@@ -77,8 +70,8 @@ extern "C" {
#endif // __clang__
// Visual C 2012 required for AVX2.
-#if defined(_M_IX86) && !defined(__clang__) && \
- defined(_MSC_VER) && _MSC_VER >= 1700
+#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
+ _MSC_VER >= 1700
#define VISUALC_HAS_AVX2 1
#endif // VisualStudio >= 2012
@@ -90,6 +83,7 @@ extern "C" {
#define HAS_ABGRTOYROW_SSSE3
#define HAS_ARGB1555TOARGBROW_SSE2
#define HAS_ARGB4444TOARGBROW_SSE2
+#define HAS_ARGBEXTRACTALPHAROW_SSE2
#define HAS_ARGBSETROW_X86
#define HAS_ARGBSHUFFLEROW_SSE2
#define HAS_ARGBSHUFFLEROW_SSSE3
@@ -104,12 +98,12 @@ extern "C" {
#define HAS_ARGBTOUVROW_SSSE3
#define HAS_ARGBTOYJROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3
-#define HAS_ARGBEXTRACTALPHAROW_SSE2
#define HAS_BGRATOUVROW_SSSE3
#define HAS_BGRATOYROW_SSSE3
#define HAS_COPYROW_ERMS
#define HAS_COPYROW_SSE2
#define HAS_H422TOARGBROW_SSSE3
+#define HAS_HALFFLOATROW_SSE2
#define HAS_I400TOARGBROW_SSE2
#define HAS_I422TOARGB1555ROW_SSSE3
#define HAS_I422TOARGB4444ROW_SSSE3
@@ -180,11 +174,8 @@ extern "C" {
// The following functions fail on gcc/clang 32 bit with fpic and framepointer.
// caveat: clangcl uses row_win.cc which works.
-#if defined(NDEBUG) || !(defined(_DEBUG) && defined(__i386__)) || \
- !defined(__i386__) || defined(_MSC_VER)
-// TODO(fbarchard): fix build error on x86 debug
-// https://code.google.com/p/libyuv/issues/detail?id=524
-#define HAS_I411TOARGBROW_SSSE3
+#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
+ defined(_MSC_VER)
// TODO(fbarchard): fix build error on android_full_debug=1
// https://code.google.com/p/libyuv/issues/detail?id=517
#define HAS_I422ALPHATOARGBROW_SSSE3
@@ -194,10 +185,12 @@ extern "C" {
// The following are available on all x86 platforms, but
// require VS2012, clang 3.4 or gcc 4.7.
// The code supports NaCL but requires a new compiler and validator.
-#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
- defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
+ defined(GCC_HAS_AVX2))
#define HAS_ARGBCOPYALPHAROW_AVX2
#define HAS_ARGBCOPYYTOALPHAROW_AVX2
+#define HAS_ARGBEXTRACTALPHAROW_AVX2
#define HAS_ARGBMIRRORROW_AVX2
#define HAS_ARGBPOLYNOMIALROW_AVX2
#define HAS_ARGBSHUFFLEROW_AVX2
@@ -208,13 +201,9 @@ extern "C" {
#define HAS_ARGBTOYROW_AVX2
#define HAS_COPYROW_AVX
#define HAS_H422TOARGBROW_AVX2
+#define HAS_HALFFLOATROW_AVX2
+// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast
#define HAS_I400TOARGBROW_AVX2
-#if !(defined(_DEBUG) && defined(__i386__))
-// TODO(fbarchard): fix build error on android_full_debug=1
-// https://code.google.com/p/libyuv/issues/detail?id=517
-#define HAS_I422ALPHATOARGBROW_AVX2
-#endif
-#define HAS_I411TOARGBROW_AVX2
#define HAS_I422TOARGB1555ROW_AVX2
#define HAS_I422TOARGB4444ROW_AVX2
#define HAS_I422TOARGBROW_AVX2
@@ -246,6 +235,13 @@ extern "C" {
#define HAS_ARGBSUBTRACTROW_AVX2
#define HAS_ARGBUNATTENUATEROW_AVX2
#define HAS_BLENDPLANEROW_AVX2
+
+#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
+ defined(_MSC_VER)
+// TODO(fbarchard): fix build error on android_full_debug=1
+// https://code.google.com/p/libyuv/issues/detail?id=517
+#define HAS_I422ALPHATOARGBROW_AVX2
+#endif
#endif
// The following are available for AVX2 Visual C and clangcl 32 bit:
@@ -279,6 +275,7 @@ extern "C" {
#define HAS_ARGB4444TOARGBROW_NEON
#define HAS_ARGB4444TOUVROW_NEON
#define HAS_ARGB4444TOYROW_NEON
+#define HAS_ARGBEXTRACTALPHAROW_NEON
#define HAS_ARGBSETROW_NEON
#define HAS_ARGBTOARGB1555ROW_NEON
#define HAS_ARGBTOARGB4444ROW_NEON
@@ -286,18 +283,16 @@ extern "C" {
#define HAS_ARGBTORGB24ROW_NEON
#define HAS_ARGBTORGB565DITHERROW_NEON
#define HAS_ARGBTORGB565ROW_NEON
-#define HAS_ARGBTOUV411ROW_NEON
#define HAS_ARGBTOUV444ROW_NEON
#define HAS_ARGBTOUVJROW_NEON
#define HAS_ARGBTOUVROW_NEON
#define HAS_ARGBTOYJROW_NEON
#define HAS_ARGBTOYROW_NEON
-#define HAS_ARGBEXTRACTALPHAROW_NEON
#define HAS_BGRATOUVROW_NEON
#define HAS_BGRATOYROW_NEON
#define HAS_COPYROW_NEON
+#define HAS_HALFFLOATROW_NEON
#define HAS_I400TOARGBROW_NEON
-#define HAS_I411TOARGBROW_NEON
#define HAS_I422ALPHATOARGBROW_NEON
#define HAS_I422TOARGB1555ROW_NEON
#define HAS_I422TOARGB4444ROW_NEON
@@ -360,7 +355,7 @@ extern "C" {
#endif
// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips__) && \
(_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
#define HAS_COPYROW_MIPS
#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
@@ -369,12 +364,101 @@ extern "C" {
#define HAS_MIRRORROW_DSPR2
#define HAS_MIRRORUVROW_DSPR2
#define HAS_SPLITUVROW_DSPR2
+#define HAS_RGB24TOARGBROW_DSPR2
+#define HAS_RAWTOARGBROW_DSPR2
+#define HAS_RGB565TOARGBROW_DSPR2
+#define HAS_ARGB1555TOARGBROW_DSPR2
+#define HAS_ARGB4444TOARGBROW_DSPR2
+#define HAS_I444TOARGBROW_DSPR2
+#define HAS_I422TOARGB4444ROW_DSPR2
+#define HAS_I422TOARGB1555ROW_DSPR2
+#define HAS_NV12TOARGBROW_DSPR2
+#define HAS_BGRATOUVROW_DSPR2
+#define HAS_BGRATOYROW_DSPR2
+#define HAS_ABGRTOUVROW_DSPR2
+#define HAS_ARGBTOYROW_DSPR2
+#define HAS_ABGRTOYROW_DSPR2
+#define HAS_RGBATOUVROW_DSPR2
+#define HAS_RGBATOYROW_DSPR2
+#define HAS_ARGBTOUVROW_DSPR2
#endif
#endif
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_ARGBMIRRORROW_MSA
+#define HAS_I422TOUYVYROW_MSA
+#define HAS_I422TOYUY2ROW_MSA
+#define HAS_MIRRORROW_MSA
+#define HAS_UYVYTOUVROW_MSA
+#define HAS_UYVYTOYROW_MSA
+#define HAS_YUY2TOUV422ROW_MSA
+#define HAS_YUY2TOUVROW_MSA
+#define HAS_YUY2TOYROW_MSA
+#define HAS_ARGB4444TOARGBROW_MSA
+#define HAS_ARGBTOYROW_MSA
+#define HAS_ARGBTOUVROW_MSA
+#define HAS_I422TOARGBROW_MSA
+#define HAS_I422TORGBAROW_MSA
+#define HAS_I422ALPHATOARGBROW_MSA
+#define HAS_I422TORGB24ROW_MSA
+#define HAS_ARGBTORGB24ROW_MSA
+#define HAS_ARGBTORAWROW_MSA
+#define HAS_ARGBTORGB565ROW_MSA
+#define HAS_ARGBTOARGB1555ROW_MSA
+#define HAS_ARGBTOARGB4444ROW_MSA
+#define HAS_ARGBTOUV444ROW_MSA
+#define HAS_ARGBMULTIPLYROW_MSA
+#define HAS_ARGBADDROW_MSA
+#define HAS_ARGBSUBTRACTROW_MSA
+#define HAS_ARGBATTENUATEROW_MSA
+#define HAS_ARGBTORGB565DITHERROW_MSA
+#define HAS_ARGBSHUFFLEROW_MSA
+#define HAS_ARGBSHADEROW_MSA
+#define HAS_ARGBGRAYROW_MSA
+#define HAS_ARGBSEPIAROW_MSA
+#define HAS_ARGB1555TOARGBROW_MSA
+#define HAS_RGB565TOARGBROW_MSA
+#define HAS_RGB24TOARGBROW_MSA
+#define HAS_RAWTOARGBROW_MSA
+#define HAS_ARGB1555TOYROW_MSA
+#define HAS_RGB565TOYROW_MSA
+#define HAS_RGB24TOYROW_MSA
+#define HAS_RAWTOYROW_MSA
+#define HAS_ARGB1555TOUVROW_MSA
+#define HAS_RGB565TOUVROW_MSA
+#define HAS_RGB24TOUVROW_MSA
+#define HAS_RAWTOUVROW_MSA
+#define HAS_NV12TOARGBROW_MSA
+#define HAS_NV12TORGB565ROW_MSA
+#define HAS_NV21TOARGBROW_MSA
+#define HAS_SOBELROW_MSA
+#define HAS_SOBELTOPLANEROW_MSA
+#define HAS_SOBELXYROW_MSA
+#define HAS_ARGBTOYJROW_MSA
+#define HAS_BGRATOYROW_MSA
+#define HAS_ABGRTOYROW_MSA
+#define HAS_RGBATOYROW_MSA
+#define HAS_ARGBTOUVJROW_MSA
+#define HAS_BGRATOUVROW_MSA
+#define HAS_ABGRTOUVROW_MSA
+#define HAS_RGBATOUVROW_MSA
+#define HAS_I444TOARGBROW_MSA
+#define HAS_I400TOARGBROW_MSA
+#define HAS_J400TOARGBROW_MSA
+#define HAS_YUY2TOARGBROW_MSA
+#define HAS_UYVYTOARGBROW_MSA
+#define HAS_INTERPOLATEROW_MSA
+#define HAS_ARGBSETROW_MSA
+#define HAS_RAWTORGB24ROW_MSA
+#define HAS_MERGEUVROW_MSA
+#endif
+
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
+#if defined(VISUALC_HAS_AVX2)
+#define SIMD_ALIGNED(var) __declspec(align(32)) var
+#else
#define SIMD_ALIGNED(var) __declspec(align(16)) var
-#define SIMD_ALIGNED32(var) __declspec(align(64)) var
+#endif
typedef __declspec(align(16)) int16 vec16[8];
typedef __declspec(align(16)) int32 vec32[4];
typedef __declspec(align(16)) int8 vec8[16];
@@ -389,8 +473,11 @@ typedef __declspec(align(32)) uint32 ulvec32[8];
typedef __declspec(align(32)) uint8 ulvec8[32];
#elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__))
// Caveat: GCC 4.2 to 4.7 have a known issue using vectors with const.
+#if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)
+#define SIMD_ALIGNED(var) var __attribute__((aligned(32)))
+#else
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
-#define SIMD_ALIGNED32(var) var __attribute__((aligned(64)))
+#endif
typedef int16 __attribute__((vector_size(16))) vec16;
typedef int32 __attribute__((vector_size(16))) vec32;
typedef int8 __attribute__((vector_size(16))) vec8;
@@ -405,7 +492,6 @@ typedef uint32 __attribute__((vector_size(32))) ulvec32;
typedef uint8 __attribute__((vector_size(32))) ulvec8;
#else
#define SIMD_ALIGNED(var) var
-#define SIMD_ALIGNED32(var) var
typedef int16 vec16[8];
typedef int32 vec32[4];
typedef int8 vec8[16];
@@ -441,34 +527,34 @@ struct YuvConstants {
#else
// This struct is for Intel color conversion.
struct YuvConstants {
- lvec8 kUVToB;
- lvec8 kUVToG;
- lvec8 kUVToR;
- lvec16 kUVBiasB;
- lvec16 kUVBiasG;
- lvec16 kUVBiasR;
- lvec16 kYToRgb;
+ int8 kUVToB[32];
+ int8 kUVToG[32];
+ int8 kUVToR[32];
+ int16 kUVBiasB[16];
+ int16 kUVBiasG[16];
+ int16 kUVBiasR[16];
+ int16 kYToRgb[16];
};
// Offsets into YuvConstants structure
-#define KUVTOB 0
-#define KUVTOG 32
-#define KUVTOR 64
+#define KUVTOB 0
+#define KUVTOG 32
+#define KUVTOR 64
#define KUVBIASB 96
#define KUVBIASG 128
#define KUVBIASR 160
-#define KYTORGB 192
+#define KYTORGB 192
#endif
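The K* values are byte offsets of the struct members, consumed by the assembly row functions; a hedged C11 compile-time check of that correspondence against the Intel variant of the struct defined above:

#include <assert.h>  /* static_assert (C11) */
#include <stddef.h>  /* offsetof */
static_assert(offsetof(struct YuvConstants, kUVToG) == KUVTOG, "layout");
static_assert(offsetof(struct YuvConstants, kUVBiasB) == KUVBIASB, "layout");
static_assert(offsetof(struct YuvConstants, kYToRgb) == KYTORGB, "layout");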
// Conversion matrix for YUV to RGB
-extern const struct YuvConstants kYuvI601Constants; // BT.601
-extern const struct YuvConstants kYuvJPEGConstants; // JPeg color space
-extern const struct YuvConstants kYuvH709Constants; // BT.709
+extern const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants); // BT.601
+extern const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants); // JPeg
+extern const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants); // BT.709
// Conversion matrix for YVU to BGR
-extern const struct YuvConstants kYvuI601Constants; // BT.601
-extern const struct YuvConstants kYvuJPEGConstants; // JPeg color space
-extern const struct YuvConstants kYvuH709Constants; // BT.709
+extern const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants); // BT.601
+extern const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants); // JPeg
+extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709
#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
#define OMITFP
@@ -490,60 +576,53 @@ extern const struct YuvConstants kYvuH709Constants; // BT.709
#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
#define MEMLEA(offset, base) #offset "(%q" #base ")"
-#define MEMLEA3(offset, index, scale) \
- #offset "(,%q" #index "," #scale ")"
+#define MEMLEA3(offset, index, scale) #offset "(,%q" #index "," #scale ")"
#define MEMLEA4(offset, base, index, scale) \
- #offset "(%q" #base ",%q" #index "," #scale ")"
+ #offset "(%q" #base ",%q" #index "," #scale ")"
#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15"
#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15"
-#define MEMOPREG(opcode, offset, base, index, scale, reg) \
- BUNDLELOCK \
- "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
- #opcode " (%%r15,%%r14),%%" #reg "\n" \
- BUNDLEUNLOCK
-#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
- BUNDLELOCK \
- "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
- #opcode " %%" #reg ",(%%r15,%%r14)\n" \
- BUNDLEUNLOCK
-#define MEMOPARG(opcode, offset, base, index, scale, arg) \
- BUNDLELOCK \
- "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
- #opcode " (%%r15,%%r14),%" #arg "\n" \
- BUNDLEUNLOCK
-#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
- BUNDLELOCK \
- "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
- #opcode " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" \
- BUNDLEUNLOCK
-#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
- BUNDLELOCK \
- "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
- #op " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" \
- BUNDLEUNLOCK
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+ BUNDLELOCK \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" #opcode \
+ " (%%r15,%%r14),%%" #reg "\n" BUNDLEUNLOCK
+#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
+ BUNDLELOCK \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" #opcode \
+ " %%" #reg ",(%%r15,%%r14)\n" BUNDLEUNLOCK
+#define MEMOPARG(opcode, offset, base, index, scale, arg) \
+ BUNDLELOCK \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" #opcode \
+ " (%%r15,%%r14),%" #arg "\n" BUNDLEUNLOCK
+#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
+ BUNDLELOCK \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" #opcode \
+ " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" BUNDLEUNLOCK
+#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
+ BUNDLELOCK \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" #op \
+ " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" BUNDLEUNLOCK
#else // defined(__native_client__) && defined(__x86_64__)
#define NACL_R14
#define BUNDLEALIGN
#define MEMACCESS(base) "(%" #base ")"
#define MEMACCESS2(offset, base) #offset "(%" #base ")"
#define MEMLEA(offset, base) #offset "(%" #base ")"
-#define MEMLEA3(offset, index, scale) \
- #offset "(,%" #index "," #scale ")"
+#define MEMLEA3(offset, index, scale) #offset "(,%" #index "," #scale ")"
#define MEMLEA4(offset, base, index, scale) \
- #offset "(%" #base ",%" #index "," #scale ")"
+ #offset "(%" #base ",%" #index "," #scale ")"
#define MEMMOVESTRING(s, d)
#define MEMSTORESTRING(reg, d)
#define MEMOPREG(opcode, offset, base, index, scale, reg) \
- #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n"
+ #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n"
#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
- #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
+ #opcode " %%" #reg "," #offset "(%" #base ",%" #index "," #scale ")\n"
#define MEMOPARG(opcode, offset, base, index, scale, arg) \
- #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
-#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
- #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 ",%%" \
- #reg2 "\n"
+ #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
+#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
+ #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 \
+ ",%%" #reg2 "\n"
#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
- #op " $" #sel ",%%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
+ #op " $" #sel ",%%" #reg "," #offset "(%" #base ",%" #index "," #scale ")\n"
#endif // defined(__native_client__) && defined(__x86_64__)
#if defined(__arm__) || defined(__aarch64__)
@@ -555,6 +634,57 @@ extern const struct YuvConstants kYvuH709Constants; // BT.709
#endif
#endif
+// Intel Architecture Code Analyzer (IACA) markers. Insert IACA_START and
+// IACA_END around the code to be measured and then run with
+// iaca -64 libyuv_unittest.
+// IACA_ASM_START and IACA_ASM_END are equivalents that can be used within
+// inline assembly blocks.
+// Example of iaca usage:
+// ~/iaca-lin64/bin/iaca.sh -64 -analysis LATENCY out/Release/libyuv_unittest
+
+#if defined(__x86_64__) || defined(__i386__)
+
+#define IACA_ASM_START \
+ ".byte 0x0F, 0x0B\n" \
+ " movl $111, %%ebx\n" \
+ ".byte 0x64, 0x67, 0x90\n"
+
+#define IACA_ASM_END \
+ " movl $222, %%ebx\n" \
+ ".byte 0x64, 0x67, 0x90\n" \
+ ".byte 0x0F, 0x0B\n"
+
+#define IACA_SSC_MARK(MARK_ID) \
+ __asm__ __volatile__("\n\t movl $" #MARK_ID \
+ ", %%ebx" \
+ "\n\t .byte 0x64, 0x67, 0x90" \
+ : \
+ : \
+ : "memory");
+
+#define IACA_UD_BYTES __asm__ __volatile__("\n\t .byte 0x0F, 0x0B");
+
+#else /* Visual C */
+#define IACA_UD_BYTES \
+ { __asm _emit 0x0F __asm _emit 0x0B }
+
+#define IACA_SSC_MARK(x) \
+ { __asm mov ebx, x __asm _emit 0x64 __asm _emit 0x67 __asm _emit 0x90 }
+
+#define IACA_VC64_START __writegsbyte(111, 111);
+#define IACA_VC64_END __writegsbyte(222, 222);
+#endif
+
+#define IACA_START \
+ { \
+ IACA_UD_BYTES \
+ IACA_SSC_MARK(111) \
+ }
+#define IACA_END \
+ { \
+ IACA_SSC_MARK(222) \
+ IACA_UD_BYTES \
+ }
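A hedged example of wrapping a row loop with the markers above so iaca measures just that region (the function here is hypothetical; build normally, then run the iaca command shown above on the binary):

void CopyRowMeasured(const uint8* src, uint8* dst, int width) {
  int i;
  IACA_START
  for (i = 0; i < width; ++i) {
    dst[i] = src[i];
  }
  IACA_END
}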
+
void I444ToARGBRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -580,12 +710,6 @@ void I422ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I411ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
void I422ToRGBARow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -639,6 +763,102 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToARGBRow_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToARGBRow_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_DSPR2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
+void I422ToARGBRow_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_MSA(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_MSA(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_MSA(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_MSA(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_MSA(const uint8* src_yuy2,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_MSA(const uint8* src_uyvy,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
@@ -653,30 +873,111 @@ void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width);
void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int width);
void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width);
void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void ARGBToYRow_MSA(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_MSA(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToUV444Row_NEON(const uint8* src_argb,
+ uint8* dst_u,
+ uint8* dst_v,
int width);
-void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void ARGBToUVRow_NEON(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUV444Row_MSA(const uint8* src_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVRow_MSA(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVJRow_NEON(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void BGRAToUVRow_NEON(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ABGRToUVRow_NEON(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGBAToUVRow_NEON(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGB24ToUVRow_NEON(const uint8* src_rgb24,
+ int src_stride_rgb24,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RAWToUVRow_NEON(const uint8* src_raw,
+ int src_stride_raw,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGB565ToUVRow_NEON(const uint8* src_rgb565,
+ int src_stride_rgb565,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
+ int src_stride_argb4444,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVJRow_MSA(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void BGRAToUVRow_MSA(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ABGRToUVRow_MSA(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGBAToUVRow_MSA(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGB24ToUVRow_MSA(const uint8* src_rgb24,
+ int src_stride_rgb24,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RAWToUVRow_MSA(const uint8* src_raw,
+ int src_stride_raw,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGB565ToUVRow_MSA(const uint8* src_rgb565,
+ int src_stride_rgb565,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGB1555ToUVRow_MSA(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_u,
+ uint8* dst_v,
int width);
-void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width);
-void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
- uint8* dst_u, uint8* dst_v, int width);
-void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
- uint8* dst_u, uint8* dst_v, int width);
-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
- uint8* dst_u, uint8* dst_v, int width);
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
- uint8* dst_u, uint8* dst_v, int width);
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
- uint8* dst_u, uint8* dst_v, int width);
void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width);
void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width);
void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width);
@@ -685,6 +986,37 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width);
void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width);
void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width);
void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width);
+void BGRAToYRow_MSA(const uint8* src_bgra, uint8* dst_y, int width);
+void ABGRToYRow_MSA(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToYRow_MSA(const uint8* src_rgba, uint8* dst_y, int width);
+void RGB24ToYRow_MSA(const uint8* src_rgb24, uint8* dst_y, int width);
+void RAWToYRow_MSA(const uint8* src_raw, uint8* dst_y, int width);
+void RGB565ToYRow_MSA(const uint8* src_rgb565, uint8* dst_y, int width);
+void ARGB1555ToYRow_MSA(const uint8* src_argb1555, uint8* dst_y, int width);
+void BGRAToUVRow_DSPR2(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void BGRAToYRow_DSPR2(const uint8* src_bgra, uint8* dst_y, int width);
+void ABGRToUVRow_DSPR2(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToYRow_DSPR2(const uint8* src_argb, uint8* dst_y, int width);
+void ABGRToYRow_DSPR2(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToUVRow_DSPR2(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGBAToYRow_DSPR2(const uint8* src_rgba, uint8* dst_y, int width);
+void ARGBToUVRow_DSPR2(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int width);
void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int width);
void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int width);
@@ -710,154 +1042,400 @@ void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int width);
void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int width);
void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int width);
void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int width);
-void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y,
+void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555,
+ uint8* dst_y,
int width);
-void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y,
+void BGRAToYRow_Any_DSPR2(const uint8* src_bgra, uint8* dst_y, int width);
+void ARGBToYRow_Any_DSPR2(const uint8* src_argb, uint8* dst_y, int width);
+void ABGRToYRow_Any_DSPR2(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToYRow_Any_DSPR2(const uint8* src_rgba, uint8* dst_y, int width);
+void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444,
+ uint8* dst_y,
int width);
-
-void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_AVX2(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int width);
-void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void BGRAToYRow_Any_MSA(const uint8* src_bgra, uint8* dst_y, int width);
+void ABGRToYRow_Any_MSA(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToYRow_Any_MSA(const uint8* src_rgba, uint8* dst_y, int width);
+void ARGBToYJRow_Any_MSA(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYRow_Any_MSA(const uint8* src_argb, uint8* dst_y, int width);
+void RGB24ToYRow_Any_MSA(const uint8* src_rgb24, uint8* dst_y, int width);
+void RAWToYRow_Any_MSA(const uint8* src_raw, uint8* dst_y, int width);
+void RGB565ToYRow_Any_MSA(const uint8* src_rgb565, uint8* dst_y, int width);
+void ARGB1555ToYRow_Any_MSA(const uint8* src_argb1555, uint8* dst_y, int width);
+
+void ARGBToUVRow_AVX2(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVJRow_AVX2(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVRow_SSSE3(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVJRow_SSSE3(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void BGRAToUVRow_SSSE3(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ABGRToUVRow_SSSE3(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGBAToUVRow_SSSE3(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVRow_Any_AVX2(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVJRow_Any_AVX2(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVRow_Any_SSSE3(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUV444Row_Any_NEON(const uint8* src_argb,
+ uint8* dst_u,
+ uint8* dst_v,
int width);
-void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width);
-void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24,
- uint8* dst_u, uint8* dst_v, int width);
-void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw,
- uint8* dst_u, uint8* dst_v, int width);
-void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565,
- uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Any_NEON(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUV444Row_Any_MSA(const uint8* src_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVRow_Any_MSA(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVJRow_Any_NEON(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void BGRAToUVRow_Any_NEON(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ABGRToUVRow_Any_NEON(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGBAToUVRow_Any_NEON(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24,
+ int src_stride_rgb24,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RAWToUVRow_Any_NEON(const uint8* src_raw,
+ int src_stride_raw,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565,
+ int src_stride_rgb565,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555,
int src_stride_argb1555,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444,
int src_stride_argb4444,
- uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width);
-void RGB24ToUVRow_C(const uint8* src_rgb24, int src_stride_rgb24,
- uint8* dst_u, uint8* dst_v, int width);
-void RAWToUVRow_C(const uint8* src_raw, int src_stride_raw,
- uint8* dst_u, uint8* dst_v, int width);
-void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
- uint8* dst_u, uint8* dst_v, int width);
-void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
- uint8* dst_u, uint8* dst_v, int width);
-void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVJRow_Any_MSA(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void BGRAToUVRow_Any_MSA(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ABGRToUVRow_Any_MSA(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGBAToUVRow_Any_MSA(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGB24ToUVRow_Any_MSA(const uint8* src_rgb24,
+ int src_stride_rgb24,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RAWToUVRow_Any_MSA(const uint8* src_raw,
+ int src_stride_raw,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGB565ToUVRow_Any_MSA(const uint8* src_rgb565,
+ int src_stride_rgb565,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGB1555ToUVRow_Any_MSA(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void BGRAToUVRow_Any_DSPR2(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ABGRToUVRow_Any_DSPR2(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGBAToUVRow_Any_DSPR2(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVRow_Any_DSPR2(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVRow_C(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVJRow_C(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVRow_C(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGBToUVJRow_C(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void BGRAToUVRow_C(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ABGRToUVRow_C(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGBAToUVRow_C(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGB24ToUVRow_C(const uint8* src_rgb24,
+ int src_stride_rgb24,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RAWToUVRow_C(const uint8* src_raw,
+ int src_stride_raw,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void RGB565ToUVRow_C(const uint8* src_rgb565,
+ int src_stride_rgb565,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGB1555ToUVRow_C(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void ARGB4444ToUVRow_C(const uint8* src_argb4444,
+ int src_stride_argb4444,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void ARGBToUV444Row_SSSE3(const uint8* src_argb,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void ARGBToUV444Row_C(const uint8* src_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV411Row_C(const uint8* src_argb,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width);
+void MirrorRow_MSA(const uint8* src, uint8* dst, int width);
void MirrorRow_C(const uint8* src, uint8* dst, int width);
void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width);
void MirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
void MirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_MSA(const uint8* src, uint8* dst, int width);
-void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void MirrorUVRow_SSSE3(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
int width);
-void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void MirrorUVRow_NEON(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
int width);
-void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void MirrorUVRow_DSPR2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
int width);
void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width);
void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width);
void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_MSA(const uint8* src, uint8* dst, int width);
void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_SSE2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
int width);
-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_AVX2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
int width);
-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_NEON(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
int width);
-void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_DSPR2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
int width);
-void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_Any_SSE2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
int width);
-void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_Any_AVX2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
int width);
-void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_Any_NEON(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
int width);
-void SplitUVRow_Any_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_Any_DSPR2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
int width);
-void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_C(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
int width);
-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_SSE2(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
int width);
-void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_AVX2(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
int width);
-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_NEON(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
int width);
-void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_MSA(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
+ int width);
+void MergeUVRow_Any_SSE2(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
int width);
-void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_Any_AVX2(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
int width);
-void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_Any_NEON(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
int width);
+void MergeUVRow_Any_MSA(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
+ int width);
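
// A minimal sketch of the Split/Merge contract above: SplitUVRow
// deinterleaves packed UV (as in NV12) into planar U and V, and MergeUVRow
// is its inverse. Hypothetical _Sketch names; uint8 as in
// libyuv/basic_types.h; width counts UV pairs.
static void SplitUVRow_Sketch(const uint8* src_uv, uint8* dst_u,
                              uint8* dst_v, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[0];  // even bytes are U
    dst_v[x] = src_uv[1];  // odd bytes are V
    src_uv += 2;
  }
}
static void MergeUVRow_Sketch(const uint8* src_u, const uint8* src_v,
                              uint8* dst_uv, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[0] = src_u[x];
    dst_uv[1] = src_v[x];
    dst_uv += 2;
  }
}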
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_AVX(const uint8* src, uint8* dst, int count);
@@ -874,25 +1452,35 @@ void CopyRow_16_C(const uint16* src, uint16* dst, int count);
void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBCopyAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+void ARGBCopyAlphaRow_Any_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
int width);
-void ARGBCopyAlphaRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+void ARGBCopyAlphaRow_Any_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
int width);
void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width);
void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width);
+void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width);
void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width);
-void ARGBExtractAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_a,
+void ARGBExtractAlphaRow_Any_SSE2(const uint8* src_argb,
+ uint8* dst_a,
int width);
-void ARGBExtractAlphaRow_Any_NEON(const uint8* src_argb, uint8* dst_a,
+void ARGBExtractAlphaRow_Any_AVX2(const uint8* src_argb,
+ uint8* dst_a,
+ int width);
+void ARGBExtractAlphaRow_Any_NEON(const uint8* src_argb,
+ uint8* dst_a,
int width);
void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void ARGBCopyYToAlphaRow_Any_SSE2(const uint8* src_y, uint8* dst_argb,
+void ARGBCopyYToAlphaRow_Any_SSE2(const uint8* src_y,
+ uint8* dst_argb,
int width);
-void ARGBCopyYToAlphaRow_Any_AVX2(const uint8* src_y, uint8* dst_argb,
+void ARGBCopyYToAlphaRow_Any_AVX2(const uint8* src_y,
+ uint8* dst_argb,
int width);
void SetRow_C(uint8* dst, uint8 v8, int count);
@@ -906,83 +1494,173 @@ void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count);
void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count);
void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count);
void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_Any_MSA(uint8* dst_argb, uint32 v32, int count);
// ARGBShufflers for BGRAToARGB etc.
-void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width);
-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width);
-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width);
-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width);
-void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width);
-void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width);
-void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width);
-void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width);
-void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width);
+void ARGBShuffleRow_C(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width);
+void ARGBShuffleRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width);
+void ARGBShuffleRow_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width);
+void ARGBShuffleRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width);
+void ARGBShuffleRow_NEON(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width);
+void ARGBShuffleRow_MSA(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width);
+void ARGBShuffleRow_Any_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width);
+void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width);
+void ARGBShuffleRow_Any_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width);
+void ARGBShuffleRow_Any_NEON(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width);
+void ARGBShuffleRow_Any_MSA(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width);
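
// A minimal sketch of the shuffler contract: `shuffler` holds, for each of
// the four output bytes, the index of the byte to take from the 4-byte
// source pixel, so {2, 1, 0, 3} swaps B and R. Hypothetical _Sketch name;
// the specialized rows above do the same with vector shuffles.
static void ARGBShuffleRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                  const uint8* shuffler, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = src_argb[shuffler[0]];
    dst_argb[1] = src_argb[shuffler[1]];
    dst_argb[2] = src_argb[shuffler[2]];
    dst_argb[3] = src_argb[shuffler[3]];
    src_argb += 4;
    dst_argb += 4;
  }
}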
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width);
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width);
void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width);
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int width);
-void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555,
+ uint8* dst_argb,
int width);
-void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444,
+ uint8* dst_argb,
int width);
void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int width);
-void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555,
+ uint8* dst_argb,
int width);
-void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444,
+ uint8* dst_argb,
int width);
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width);
+void RGB24ToARGBRow_MSA(const uint8* src_rgb24, uint8* dst_argb, int width);
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width);
+void RAWToARGBRow_MSA(const uint8* src_raw, uint8* dst_argb, int width);
void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width);
+void RAWToRGB24Row_MSA(const uint8* src_raw, uint8* dst_rgb24, int width);
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width);
-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+void RGB565ToARGBRow_MSA(const uint8* src_rgb565, uint8* dst_argb, int width);
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
+ uint8* dst_argb,
int width);
-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB1555ToARGBRow_MSA(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
+ uint8* dst_argb,
int width);
+void RGB24ToARGBRow_DSPR2(const uint8* src_rgb24, uint8* dst_argb, int width);
+void RAWToARGBRow_DSPR2(const uint8* src_raw, uint8* dst_argb, int width);
+void RGB565ToARGBRow_DSPR2(const uint8* src_rgb565, uint8* dst_argb, int width);
+void ARGB1555ToARGBRow_DSPR2(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_DSPR2(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width);
void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width);
void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width);
void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width);
void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width);
void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb,
+void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24,
+ uint8* dst_argb,
int width);
void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int width);
void RAWToRGB24Row_Any_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb,
+void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565,
+ uint8* dst_argb,
int width);
-void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555,
+ uint8* dst_argb,
int width);
-void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444,
+ uint8* dst_argb,
int width);
-void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb,
+void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565,
+ uint8* dst_argb,
int width);
-void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555,
+ uint8* dst_argb,
int width);
-void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444,
+ uint8* dst_argb,
int width);
-void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb,
+void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24,
+ uint8* dst_argb,
int width);
+void RGB24ToARGBRow_Any_MSA(const uint8* src_rgb24, uint8* dst_argb, int width);
void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int width);
+void RAWToARGBRow_Any_MSA(const uint8* src_raw, uint8* dst_argb, int width);
void RAWToRGB24Row_Any_NEON(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb,
+void RAWToRGB24Row_Any_MSA(const uint8* src_raw, uint8* dst_rgb24, int width);
+void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565,
+ uint8* dst_argb,
int width);
-void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb,
+void RGB565ToARGBRow_Any_MSA(const uint8* src_rgb565,
+ uint8* dst_argb,
+ int width);
+void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555,
+ uint8* dst_argb,
int width);
-void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB1555ToARGBRow_Any_MSA(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444,
+ uint8* dst_argb,
int width);
+void RGB24ToARGBRow_Any_DSPR2(const uint8* src_rgb24,
+ uint8* dst_argb,
+ int width);
+void RAWToARGBRow_Any_DSPR2(const uint8* src_raw, uint8* dst_argb, int width);
+void RGB565ToARGBRow_Any_DSPR2(const uint8* src_rgb565,
+ uint8* dst_argb,
+ int width);
+void ARGB1555ToARGBRow_Any_DSPR2(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_Any_DSPR2(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_Any_MSA(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width);
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
@@ -990,12 +1668,18 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width);
-void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width);
-void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width);
+void ARGBToRGB565DitherRow_C(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width);
+void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width);
+void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width);
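
// dither4 packs four per-column dither offsets, one byte each, repeating
// every four pixels. A hedged sketch of how the dither rows are believed
// to apply it before truncating to 5/6/5 bits (hypothetical _Sketch name):
static void ARGBToRGB565DitherRow_Sketch(const uint8* src_argb,
                                         uint8* dst_rgb,
                                         const uint32 dither4, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int d = (int)((dither4 >> ((x & 3) * 8)) & 0xff);  // this column's offset
    int b = src_argb[0] + d;
    int g = src_argb[1] + d;
    int r = src_argb[2] + d;
    if (b > 255) b = 255;
    if (g > 255) g = 255;
    if (r > 255) r = 255;
    *(uint16*)dst_rgb = (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
    src_argb += 4;
    dst_rgb += 2;
  }
}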
void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
@@ -1006,8 +1690,19 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width);
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width);
+void ARGBToRGB24Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRAWRow_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB4444Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565DitherRow_MSA(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width);
void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
@@ -1019,10 +1714,12 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_Any_MSA(const uint8* src_y, uint8* dst_argb, int width);
void I444ToARGBRow_C(const uint8* src_y,
const uint8* src_u,
@@ -1049,12 +1746,6 @@ void I422AlphaToARGBRow_C(const uint8* y_buf,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I411ToARGBRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
void NV12ToARGBRow_C(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
@@ -1176,18 +1867,6 @@ void I422ToARGBRow_SSSE3(const uint8* src_y,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I411ToARGBRow_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I411ToARGBRow_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
void NV12ToARGBRow_SSSE3(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
@@ -1332,18 +2011,6 @@ void I422AlphaToARGBRow_Any_AVX2(const uint8* y_buf,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I411ToARGBRow_Any_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
@@ -1449,108 +2116,222 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_MSA(const uint8* src_y, uint8* dst_argb, int width);
// ARGB preattenuated alpha blend.
-void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
+void ARGBBlendRow_SSSE3(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBBlendRow_NEON(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBBlendRow_C(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
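
// A minimal sketch of the preattenuated "over" blend: because src is
// premultiplied, each channel is src + dst * (1 - src_alpha), with >> 8
// as an approximation of / 255 (hypothetical _Sketch name):
static void ARGBBlendRow_Sketch(const uint8* src_argb, const uint8* src_argb1,
                                uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int a = src_argb[3];
    dst_argb[0] = (uint8)(src_argb[0] + (((256 - a) * src_argb1[0]) >> 8));
    dst_argb[1] = (uint8)(src_argb[1] + (((256 - a) * src_argb1[1]) >> 8));
    dst_argb[2] = (uint8)(src_argb[2] + (((256 - a) * src_argb1[2]) >> 8));
    dst_argb[3] = 255;
    src_argb += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}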
// Unattenuated planar alpha blend.
-void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_Any_SSSE3(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_Any_AVX2(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width);
+void BlendPlaneRow_SSSE3(const uint8* src0,
+ const uint8* src1,
+ const uint8* alpha,
+ uint8* dst,
+ int width);
+void BlendPlaneRow_Any_SSSE3(const uint8* src0,
+ const uint8* src1,
+ const uint8* alpha,
+ uint8* dst,
+ int width);
+void BlendPlaneRow_AVX2(const uint8* src0,
+ const uint8* src1,
+ const uint8* alpha,
+ uint8* dst,
+ int width);
+void BlendPlaneRow_Any_AVX2(const uint8* src0,
+ const uint8* src1,
+ const uint8* alpha,
+ uint8* dst,
+ int width);
+void BlendPlaneRow_C(const uint8* src0,
+ const uint8* src1,
+ const uint8* alpha,
+ uint8* dst,
+ int width);
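
// A sketch of the unattenuated planar blend: one plane mixed into another
// under an 8-bit alpha plane. The +255 bias is one plausible rounding
// choice (hypothetical _Sketch name):
static void BlendPlaneRow_Sketch(const uint8* src0, const uint8* src1,
                                 const uint8* alpha, uint8* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int a = alpha[x];
    dst[x] = (uint8)((src0[x] * a + src1[x] * (255 - a) + 255) >> 8);
  }
}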
// ARGB multiply images. Same API as Blend, but these require
// pointer and width alignment for SSE2.
-void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
+void ARGBMultiplyRow_C(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBMultiplyRow_SSE2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBMultiplyRow_AVX2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBMultiplyRow_NEON(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_NEON(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBMultiplyRow_MSA(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_MSA(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
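
// A sketch of the multiply rows: each byte is the product of the two
// sources scaled back to 8 bits. Expanding one operand to 16 bits with
// v * 0x0101 makes the / 255 a single >> 16 (hypothetical _Sketch name):
static void ARGBMultiplyRow_Sketch(const uint8* src_argb0,
                                   const uint8* src_argb1,
                                   uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    uint32 p = (uint32)src_argb0[i] * 0x0101u;  // repeat the byte
    dst_argb[i] = (uint8)((p * src_argb1[i]) >> 16);
  }
}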
// ARGB add images.
-void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
+void ARGBAddRow_C(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBAddRow_SSE2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBAddRow_Any_SSE2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBAddRow_AVX2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBAddRow_Any_AVX2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBAddRow_NEON(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBAddRow_Any_NEON(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBAddRow_MSA(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBAddRow_Any_MSA(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
// ARGB subtract images. Same API as Blend, but these require
// pointer and width alignment for SSE2.
-void ARGBSubtractRow_C(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
-void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width);
+void ARGBSubtractRow_C(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBSubtractRow_SSE2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_SSE2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBSubtractRow_AVX2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_AVX2(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBSubtractRow_NEON(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_NEON(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBSubtractRow_MSA(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_MSA(const uint8* src_argb,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width);
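
// The add and subtract rows above are plain per-byte saturating
// arithmetic; a minimal sketch (hypothetical _Sketch names):
static void ARGBAddRow_Sketch(const uint8* src_argb0, const uint8* src_argb1,
                              uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    int s = src_argb0[i] + src_argb1[i];
    dst_argb[i] = (uint8)(s > 255 ? 255 : s);  // clamp high
  }
}
static void ARGBSubtractRow_Sketch(const uint8* src_argb0,
                                   const uint8* src_argb1,
                                   uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    int d = src_argb0[i] - src_argb1[i];
    dst_argb[i] = (uint8)(d < 0 ? 0 : d);  // clamp low
  }
}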
void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
+void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb,
+ uint8* dst_rgb,
int width);
-void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
+void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb,
+ uint8* dst_rgb,
int width);
-void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width);
-void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width);
+void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width);
+void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width);
void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
+void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb,
+ uint8* dst_rgb,
int width);
-void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
+void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb,
+ uint8* dst_rgb,
int width);
void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
+void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb,
+ uint8* dst_rgb,
int width);
-void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
+void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb,
+ uint8* dst_rgb,
int width);
-void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width);
+void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width);
+void ARGBToRGB24Row_Any_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRAWRow_Any_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565Row_Any_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_Any_MSA(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width);
+void ARGBToARGB4444Row_Any_MSA(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width);
+void ARGBToRGB565DitherRow_Any_MSA(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width);
void I444ToARGBRow_Any_NEON(const uint8* src_y,
const uint8* src_u,
@@ -1571,12 +2352,6 @@ void I422AlphaToARGBRow_Any_NEON(const uint8* src_y,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I411ToARGBRow_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
void I422ToRGBARow_Any_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1630,175 +2405,445 @@ void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToARGBRow_Any_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_Any_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_Any_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGBRow_DSPR2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGB1555Row_Any_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I411ToARGBRow_Any_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_Any_DSPR2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToARGBRow_Any_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_Any_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_Any_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_Any_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ const uint8* src_a,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_Any_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_Any_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_Any_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_Any_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_Any_MSA(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_Any_MSA(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_Any_MSA(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_Any_MSA(const uint8* src_yuy2,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_Any_MSA(const uint8* src_uyvy,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToUVRow_NEON(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToUVRow_MSA(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void YUY2ToUV422Row_MSA(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToUVRow_C(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void YUY2ToUV422Row_C(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void YUY2ToYRow_Any_MSA(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToUVRow_Any_MSA(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void YUY2ToUV422Row_Any_MSA(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
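
// YUY2 packs Y0 U Y1 V into 4 bytes per 2 pixels. The UV422 rows sample
// chroma from one row; the UV rows take a stride and average chroma over
// two rows for 4:2:0 output. A sketch of the latter (hypothetical name):
static void YUY2ToUVRow_Sketch(const uint8* src_yuy2, int stride_yuy2,
                               uint8* dst_u, uint8* dst_v, int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    *dst_u++ = (uint8)((src_yuy2[1] + src_yuy2[stride_yuy2 + 1] + 1) >> 1);
    *dst_v++ = (uint8)((src_yuy2[3] + src_yuy2[stride_yuy2 + 3] + 1) >> 1);
    src_yuy2 += 4;
  }
}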
void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+void UYVYToUVRow_AVX2(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+void UYVYToUVRow_SSE2(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+void UYVYToUVRow_NEON(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void UYVYToUV422Row_NEON(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_MSA(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToUV422Row_MSA(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+void UYVYToUVRow_C(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void UYVYToUV422Row_C(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+void UYVYToUVRow_Any_NEON(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToYRow_Any_MSA(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_Any_MSA(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
+void UYVYToUV422Row_Any_MSA(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width);
void I422ToYUY2Row_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* dst_yuy2, int width);
+ uint8* dst_yuy2,
+ int width);
void I422ToUYVYRow_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* dst_uyvy, int width);
+ uint8* dst_uyvy,
+ int width);
void I422ToYUY2Row_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* dst_yuy2, int width);
+ uint8* dst_yuy2,
+ int width);
void I422ToUYVYRow_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* dst_uyvy, int width);
+ uint8* dst_uyvy,
+ int width);
void I422ToYUY2Row_Any_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* dst_yuy2, int width);
+ uint8* dst_yuy2,
+ int width);
void I422ToUYVYRow_Any_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* dst_uyvy, int width);
+ uint8* dst_uyvy,
+ int width);
void I422ToYUY2Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* dst_yuy2, int width);
+ uint8* dst_yuy2,
+ int width);
void I422ToUYVYRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* dst_uyvy, int width);
+ uint8* dst_uyvy,
+ int width);
void I422ToYUY2Row_Any_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* dst_yuy2, int width);
+ uint8* dst_yuy2,
+ int width);
void I422ToUYVYRow_Any_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* dst_uyvy, int width);
+ uint8* dst_uyvy,
+ int width);
+void I422ToYUY2Row_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2,
+ int width);
+void I422ToUYVYRow_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy,
+ int width);
+void I422ToYUY2Row_Any_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2,
+ int width);
+void I422ToUYVYRow_Any_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy,
+ int width);
// Effects related row functions.
void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+void ARGBAttenuateRow_MSA(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
int width);
-void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
+void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
int width);
-void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
int width);
-void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
+void ARGBAttenuateRow_Any_NEON(const uint8* src_argb,
+ uint8* dst_argb,
int width);
+void ARGBAttenuateRow_Any_MSA(const uint8* src_argb,
+ uint8* dst_argb,
+ int width);
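
// Attenuate multiplies each color channel by the pixel's own alpha,
// producing premultiplied ARGB; >> 8 with a +255 bias approximates / 255
// (hypothetical _Sketch name):
static void ARGBAttenuateRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                    int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int a = src_argb[3];
    dst_argb[0] = (uint8)((src_argb[0] * a + 255) >> 8);
    dst_argb[1] = (uint8)((src_argb[1] * a + 255) >> 8);
    dst_argb[2] = (uint8)((src_argb[2] * a + 255) >> 8);
    dst_argb[3] = (uint8)a;
    src_argb += 4;
    dst_argb += 4;
  }
}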
// Inverse table for unattenuate, shared by C and SSE2.
extern const uint32 fixed_invtbl8[256];
void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
int width);
-void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
int width);
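
// Unattenuate divides each channel by alpha via the shared reciprocal
// table. The exact table layout lives in the source; the construction
// below is only a plausible stand-in (hypothetical _Sketch name):
//   fixed_invtbl8[a] ~= (255 << 16) / a, entry 0 unused.
static void ARGBUnattenuateRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32 a = src_argb[3];
    uint32 ia = a ? (255u << 16) / a : 0;  // stand-in for fixed_invtbl8[a]
    uint32 b = (src_argb[0] * ia) >> 16;
    uint32 g = (src_argb[1] * ia) >> 16;
    uint32 r = (src_argb[2] * ia) >> 16;
    dst_argb[0] = (uint8)(b > 255 ? 255 : b);
    dst_argb[1] = (uint8)(g > 255 ? 255 : g);
    dst_argb[2] = (uint8)(r > 255 ? 255 : r);
    dst_argb[3] = (uint8)a;
    src_argb += 4;
    dst_argb += 4;
  }
}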
void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBGrayRow_MSA(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBSepiaRow_C(uint8* dst_argb, int width);
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
void ARGBSepiaRow_NEON(uint8* dst_argb, int width);
+void ARGBSepiaRow_MSA(uint8* dst_argb, int width);
-void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
- const int8* matrix_argb, int width);
-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const int8* matrix_argb, int width);
-void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
- const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_C(const uint8* src_argb,
+ uint8* dst_argb,
+ const int8* matrix_argb,
+ int width);
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ const int8* matrix_argb,
+ int width);
+void ARGBColorMatrixRow_NEON(const uint8* src_argb,
+ uint8* dst_argb,
+ const int8* matrix_argb,
+ int width);
void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
@@ -1806,134 +2851,311 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
-void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
- int interval_offset, int width);
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
- int interval_offset, int width);
-void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
- int interval_offset, int width);
+void ARGBQuantizeRow_C(uint8* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width);
+void ARGBQuantizeRow_SSE2(uint8* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width);
+void ARGBQuantizeRow_NEON(uint8* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width);
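
// Quantize posterizes in place: with scale typically a 16.16 fixed-point
// reciprocal of interval_size, each channel becomes
// (v * scale >> 16) * interval_size + interval_offset; alpha is untouched.
// A sketch (hypothetical _Sketch name):
static void ARGBQuantizeRow_Sketch(uint8* dst_argb, int scale,
                                   int interval_size, int interval_offset,
                                   int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = (uint8)((dst_argb[0] * scale >> 16) * interval_size +
                          interval_offset);
    dst_argb[1] = (uint8)((dst_argb[1] * scale >> 16) * interval_size +
                          interval_offset);
    dst_argb[2] = (uint8)((dst_argb[2] * scale >> 16) * interval_size +
                          interval_offset);
    dst_argb += 4;
  }
}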
-void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+void ARGBShadeRow_C(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
uint32 value);
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+void ARGBShadeRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
uint32 value);
-void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+void ARGBShadeRow_NEON(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
uint32 value);
+void ARGBShadeRow_MSA(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
+ uint32 value);
// Used for blur.
-void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
- int width, int area, uint8* dst, int count);
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
- const int32* previous_cumsum, int width);
+void CumulativeSumToAverageRow_SSE2(const int32* topleft,
+ const int32* botleft,
+ int width,
+ int area,
+ uint8* dst,
+ int count);
+void ComputeCumulativeSumRow_SSE2(const uint8* row,
+ int32* cumsum,
+ const int32* previous_cumsum,
+ int width);
-void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft,
- int width, int area, uint8* dst, int count);
-void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
- const int32* previous_cumsum, int width);
+void CumulativeSumToAverageRow_C(const int32* topleft,
+ const int32* botleft,
+ int width,
+ int area,
+ uint8* dst,
+ int count);
+void ComputeCumulativeSumRow_C(const uint8* row,
+ int32* cumsum,
+ const int32* previous_cumsum,
+ int width);
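
// A single-channel sketch of the summed-area pair used for blur (the real
// rows accumulate 4 channels per pixel). Each cumsum entry holds the sum
// of everything above and to the left, so any box sum is four lookups.
// Hypothetical _Sketch names; int32 as in libyuv/basic_types.h.
static void ComputeCumulativeSumRow_Sketch(const uint8* row, int32* cumsum,
                                           const int32* previous_cumsum,
                                           int width) {
  int32 sum = 0;
  int x;
  for (x = 0; x < width; ++x) {
    sum += row[x];                         // running sum along this row
    cumsum[x] = sum + previous_cumsum[x];  // plus everything above
  }
}
static void CumulativeSumToAverageRow_Sketch(const int32* topleft,
                                             const int32* botleft, int width,
                                             int area, uint8* dst, int count) {
  int x;
  for (x = 0; x < count; ++x) {
    int32 sum =
        botleft[x + width] - botleft[x] - topleft[x + width] + topleft[x];
    dst[x] = (uint8)(sum / area);
  }
}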
LIBYUV_API
-void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
- uint8* dst_argb, const float* uv_dudv, int width);
+void ARGBAffineRow_C(const uint8* src_argb,
+ int src_argb_stride,
+ uint8* dst_argb,
+ const float* uv_dudv,
+ int width);
LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
- uint8* dst_argb, const float* uv_dudv, int width);
+void ARGBAffineRow_SSE2(const uint8* src_argb,
+ int src_argb_stride,
+ uint8* dst_argb,
+ const float* uv_dudv,
+ int width);
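
// uv_dudv packs {u, v, du, dv}: a starting source coordinate and its step
// per destination pixel. A sketch of the nearest-pixel affine walk
// (hypothetical _Sketch name):
static void ARGBAffineRow_Sketch(const uint8* src_argb, int src_argb_stride,
                                 uint8* dst_argb, const float* uv_dudv,
                                 int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  int i;
  for (i = 0; i < width; ++i) {
    int x = (int)u;
    int y = (int)v;
    *(uint32*)dst_argb =
        *(const uint32*)(src_argb + y * src_argb_stride + x * 4);
    dst_argb += 4;
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}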
// Used for I420Scale, ARGBScale, and ARGBInterpolate.
-void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+void InterpolateRow_C(uint8* dst_ptr,
+ const uint8* src_ptr,
ptrdiff_t src_stride_ptr,
- int width, int source_y_fraction);
-void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride_ptr, int width,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_SSSE3(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
int source_y_fraction);
-void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_AVX2(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
int source_y_fraction);
-void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_NEON(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
int source_y_fraction);
-void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_DSPR2(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
int source_y_fraction);
-void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_MSA(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_Any_NEON(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
int source_y_fraction);
-void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_Any_SSSE3(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
int source_y_fraction);
-void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_Any_AVX2(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
int source_y_fraction);
-void InterpolateRow_Any_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_Any_DSPR2(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
int source_y_fraction);
-
-void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
+void InterpolateRow_Any_MSA(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
+ int source_y_fraction);
+
+void InterpolateRow_16_C(uint16* dst_ptr,
+ const uint16* src_ptr,
ptrdiff_t src_stride_ptr,
- int width, int source_y_fraction);
+ int width,
+ int source_y_fraction);
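
// The interpolate contract: blend the row at src_ptr with the row one
// stride below, with source_y_fraction in [0, 256) weighting the lower
// row (0 copies the top row, 128 averages). A sketch (hypothetical name;
// ptrdiff_t from <stddef.h>):
static void InterpolateRow_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                  ptrdiff_t src_stride, int width,
                                  int source_y_fraction) {
  const uint8* src_ptr1 = src_ptr + src_stride;
  int f1 = source_y_fraction;
  int f0 = 256 - f1;
  int x;
  for (x = 0; x < width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * f0 + src_ptr1[x] * f1 + 128) >> 8);
  }
}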
// Sobel images.
-void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
- uint8* dst_sobelx, int width);
-void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
- const uint8* src_y2, uint8* dst_sobelx, int width);
-void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
- const uint8* src_y2, uint8* dst_sobelx, int width);
-void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
- uint8* dst_sobely, int width);
-void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
- uint8* dst_sobely, int width);
-void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
- uint8* dst_sobely, int width);
-void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width);
-void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width);
-void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width);
-void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_y, int width);
-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_y, int width);
-void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_y, int width);
-void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width);
-void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width);
-void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width);
-void SobelRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width);
-void SobelRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width);
-void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_y, int width);
-void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_y, int width);
-void SobelXYRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width);
-void SobelXYRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width);
+void SobelXRow_C(const uint8* src_y0,
+ const uint8* src_y1,
+ const uint8* src_y2,
+ uint8* dst_sobelx,
+ int width);
+void SobelXRow_SSE2(const uint8* src_y0,
+ const uint8* src_y1,
+ const uint8* src_y2,
+ uint8* dst_sobelx,
+ int width);
+void SobelXRow_NEON(const uint8* src_y0,
+ const uint8* src_y1,
+ const uint8* src_y2,
+ uint8* dst_sobelx,
+ int width);
+void SobelYRow_C(const uint8* src_y0,
+ const uint8* src_y1,
+ uint8* dst_sobely,
+ int width);
+void SobelYRow_SSE2(const uint8* src_y0,
+ const uint8* src_y1,
+ uint8* dst_sobely,
+ int width);
+void SobelYRow_NEON(const uint8* src_y0,
+ const uint8* src_y1,
+ uint8* dst_sobely,
+ int width);
+void SobelRow_C(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelRow_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelRow_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelRow_MSA(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelToPlaneRow_C(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width);
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width);
+void SobelToPlaneRow_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width);
+void SobelToPlaneRow_MSA(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width);
+void SobelXYRow_C(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelXYRow_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelXYRow_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelXYRow_MSA(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelRow_Any_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelRow_Any_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelRow_Any_MSA(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width);
+void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width);
+void SobelToPlaneRow_Any_MSA(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width);
+void SobelXYRow_Any_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelXYRow_Any_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
+void SobelXYRow_Any_MSA(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width);
void ARGBPolynomialRow_C(const uint8* src_argb,
- uint8* dst_argb, const float* poly,
+ uint8* dst_argb,
+ const float* poly,
int width);
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
- uint8* dst_argb, const float* poly,
+ uint8* dst_argb,
+ const float* poly,
int width);
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
- uint8* dst_argb, const float* poly,
+ uint8* dst_argb,
+ const float* poly,
int width);
-void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
- const uint8* luma, uint32 lumacoeff);
-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+// Scale and convert to half float.
+void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_Any_SSE2(const uint16* src,
+ uint16* dst,
+ float scale,
+ int width);
+void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_Any_AVX2(const uint16* src,
+ uint16* dst,
+ float scale,
+ int width);
+void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_Any_F16C(const uint16* src,
+ uint16* dst,
+ float scale,
+ int width);
+void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloat1Row_Any_F16C(const uint16* src,
+ uint16* dst,
+ float scale,
+ int width);
+void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_Any_NEON(const uint16* src,
+ uint16* dst,
+ float scale,
+ int width);
+void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloat1Row_Any_NEON(const uint16* src,
+ uint16* dst,
+ float scale,
+ int width);
+
+void ARGBLumaColorTableRow_C(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
+ const uint8* luma,
+ uint32 lumacoeff);
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
int width,
- const uint8* luma, uint32 lumacoeff);
+ const uint8* luma,
+ uint32 lumacoeff);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
-#endif // INCLUDE_LIBYUV_ROW_H_ NOLINT
+#endif // INCLUDE_LIBYUV_ROW_H_
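
For orientation, the Sobel row functions declared above each combine one row of the X and Y gradient planes into an output row. A minimal sketch of the per-pixel contract, assuming the convention of the reference C implementation (clamped X+Y sum replicated into B, G, R with opaque alpha); the function name here is hypothetical:

    #include <stdint.h>

    // Sketch of the SobelRow_C behavior: grey ARGB from two gradient rows.
    static void SobelRowSketch(const uint8_t* src_sobelx,
                               const uint8_t* src_sobely,
                               uint8_t* dst_argb, int width) {
      for (int i = 0; i < width; ++i) {
        int s = src_sobelx[i] + src_sobely[i];
        if (s > 255) s = 255;       // clamp combined gradient magnitude
        dst_argb[0] = (uint8_t)s;   // B
        dst_argb[1] = (uint8_t)s;   // G
        dst_argb[2] = (uint8_t)s;   // R
        dst_argb[3] = 255;          // A, opaque
        dst_argb += 4;
      }
    }

The _Any_ variants wrap the SIMD kernels so widths that are not a multiple of the vector size fall back to a remainder path.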
diff --git a/files/include/libyuv/scale.h b/files/include/libyuv/scale.h
index 102158d1..6d6b9a85 100644
--- a/files/include/libyuv/scale.h
+++ b/files/include/libyuv/scale.h
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_SCALE_H_ // NOLINT
+#ifndef INCLUDE_LIBYUV_SCALE_H_
#define INCLUDE_LIBYUV_SCALE_H_
#include "libyuv/basic_types.h"
@@ -20,25 +20,33 @@ extern "C" {
// Supported filtering.
typedef enum FilterMode {
- kFilterNone = 0, // Point sample; Fastest.
- kFilterLinear = 1, // Filter horizontally only.
+ kFilterNone = 0, // Point sample; Fastest.
+ kFilterLinear = 1, // Filter horizontally only.
kFilterBilinear = 2, // Faster than box, but lower quality scaling down.
- kFilterBox = 3 // Highest quality.
+ kFilterBox = 3 // Highest quality.
} FilterModeEnum;
// Scale a YUV plane.
LIBYUV_API
-void ScalePlane(const uint8* src, int src_stride,
- int src_width, int src_height,
- uint8* dst, int dst_stride,
- int dst_width, int dst_height,
+void ScalePlane(const uint8* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint8* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
enum FilterMode filtering);
LIBYUV_API
-void ScalePlane_16(const uint16* src, int src_stride,
- int src_width, int src_height,
- uint16* dst, int dst_stride,
- int dst_width, int dst_height,
+void ScalePlane_16(const uint16* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint16* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
enum FilterMode filtering);
// Scales a YUV 4:2:0 image from the src width and height to the
@@ -52,42 +60,73 @@ void ScalePlane_16(const uint16* src, int src_stride,
// Returns 0 if successful.
LIBYUV_API
-int I420Scale(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- int src_width, int src_height,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int dst_width, int dst_height,
+int I420Scale(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
enum FilterMode filtering);
LIBYUV_API
-int I420Scale_16(const uint16* src_y, int src_stride_y,
- const uint16* src_u, int src_stride_u,
- const uint16* src_v, int src_stride_v,
- int src_width, int src_height,
- uint16* dst_y, int dst_stride_y,
- uint16* dst_u, int dst_stride_u,
- uint16* dst_v, int dst_stride_v,
- int dst_width, int dst_height,
+int I420Scale_16(const uint16* src_y,
+ int src_stride_y,
+ const uint16* src_u,
+ int src_stride_u,
+ const uint16* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16* dst_y,
+ int dst_stride_y,
+ uint16* dst_u,
+ int dst_stride_u,
+ uint16* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
enum FilterMode filtering);
#ifdef __cplusplus
// Legacy API. Deprecated.
LIBYUV_API
-int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
- int src_stride_y, int src_stride_u, int src_stride_v,
- int src_width, int src_height,
- uint8* dst_y, uint8* dst_u, uint8* dst_v,
- int dst_stride_y, int dst_stride_u, int dst_stride_v,
- int dst_width, int dst_height,
+int Scale(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ int src_stride_y,
+ int src_stride_u,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8* dst_y,
+ uint8* dst_u,
+ uint8* dst_v,
+ int dst_stride_y,
+ int dst_stride_u,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
LIBYUV_BOOL interpolate);
// Legacy API. Deprecated.
LIBYUV_API
-int ScaleOffset(const uint8* src_i420, int src_width, int src_height,
- uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset,
+int ScaleOffset(const uint8* src_i420,
+ int src_width,
+ int src_height,
+ uint8* dst_i420,
+ int dst_width,
+ int dst_height,
+ int dst_yoffset,
LIBYUV_BOOL interpolate);
// For testing, allow disabling of specialized scalers.
@@ -100,4 +139,4 @@ void SetUseReferenceImpl(LIBYUV_BOOL use);
} // namespace libyuv
#endif
-#endif // INCLUDE_LIBYUV_SCALE_H_ NOLINT
+#endif // INCLUDE_LIBYUV_SCALE_H_
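
A usage sketch for I420Scale as declared above, halving an I420 frame with box filtering. Buffer names and the even-dimension assumption are hypothetical; strides equal plane widths here, while real callers may have row padding:

    #include "libyuv/scale.h"

    // Halve an I420 frame; returns true on success (I420Scale returns 0).
    bool HalveI420(const uint8* src_y, const uint8* src_u, const uint8* src_v,
                   int w, int h,
                   uint8* dst_y, uint8* dst_u, uint8* dst_v) {
      const int dw = w / 2;
      const int dh = h / 2;
      return libyuv::I420Scale(src_y, w, src_u, w / 2, src_v, w / 2, w, h,
                               dst_y, dw, dst_u, dw / 2, dst_v, dw / 2,
                               dw, dh, libyuv::kFilterBox) == 0;
    }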
diff --git a/files/include/libyuv/scale_argb.h b/files/include/libyuv/scale_argb.h
index b56cf520..3d25e579 100644
--- a/files/include/libyuv/scale_argb.h
+++ b/files/include/libyuv/scale_argb.h
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ // NOLINT
+#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_
#define INCLUDE_LIBYUV_SCALE_ARGB_H_
#include "libyuv/basic_types.h"
@@ -20,32 +20,52 @@ extern "C" {
#endif
LIBYUV_API
-int ARGBScale(const uint8* src_argb, int src_stride_argb,
- int src_width, int src_height,
- uint8* dst_argb, int dst_stride_argb,
- int dst_width, int dst_height,
+int ARGBScale(const uint8* src_argb,
+ int src_stride_argb,
+ int src_width,
+ int src_height,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int dst_width,
+ int dst_height,
enum FilterMode filtering);
// Clipped scale takes destination rectangle coordinates for clip values.
LIBYUV_API
-int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
- int src_width, int src_height,
- uint8* dst_argb, int dst_stride_argb,
- int dst_width, int dst_height,
- int clip_x, int clip_y, int clip_width, int clip_height,
+int ARGBScaleClip(const uint8* src_argb,
+ int src_stride_argb,
+ int src_width,
+ int src_height,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int dst_width,
+ int dst_height,
+ int clip_x,
+ int clip_y,
+ int clip_width,
+ int clip_height,
enum FilterMode filtering);
// Scale with YUV conversion to ARGB and clipping.
LIBYUV_API
-int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
+int YUVToARGBScaleClip(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
uint32 src_fourcc,
- int src_width, int src_height,
- uint8* dst_argb, int dst_stride_argb,
+ int src_width,
+ int src_height,
+ uint8* dst_argb,
+ int dst_stride_argb,
uint32 dst_fourcc,
- int dst_width, int dst_height,
- int clip_x, int clip_y, int clip_width, int clip_height,
+ int dst_width,
+ int dst_height,
+ int clip_x,
+ int clip_y,
+ int clip_width,
+ int clip_height,
enum FilterMode filtering);
#ifdef __cplusplus
@@ -53,4 +73,4 @@ int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
} // namespace libyuv
#endif
-#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ NOLINT
+#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_
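
ARGBScaleClip, per the prototype above, computes the scale as if for the whole destination but writes only the clip rectangle, which makes tiled rendering cheap. A hedged sketch; names are hypothetical and stride is width * 4 for packed ARGB:

    #include "libyuv/scale_argb.h"

    // Produce one tile of a scaled ARGB frame; returns 0 on success.
    int ScaleTile(const uint8* src, int sw, int sh,
                  uint8* dst, int dw, int dh,
                  int tx, int ty, int tw, int th) {
      return libyuv::ARGBScaleClip(src, sw * 4, sw, sh,
                                   dst, dw * 4, dw, dh,
                                   tx, ty, tw, th,
                                   libyuv::kFilterBilinear);
    }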
diff --git a/files/include/libyuv/scale_row.h b/files/include/libyuv/scale_row.h
index df699e6c..edb46cc8 100644
--- a/files/include/libyuv/scale_row.h
+++ b/files/include/libyuv/scale_row.h
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ // NOLINT
+#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_
#define INCLUDE_LIBYUV_SCALE_ROW_H_
#include "libyuv/basic_types.h"
@@ -45,8 +45,8 @@ extern "C" {
#endif // __clang__
// Visual C 2012 required for AVX2.
-#if defined(_M_IX86) && !defined(__clang__) && \
- defined(_MSC_VER) && _MSC_VER >= 1700
+#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
+ _MSC_VER >= 1700
#define VISUALC_HAS_AVX2 1
#endif // VisualStudio >= 2012
@@ -72,8 +72,9 @@ extern "C" {
// The following are available on all x86 platforms, but
// require VS2012, clang 3.4 or gcc 4.7.
// The code supports NaCL but requires a new compiler and validator.
-#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
- defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
+ defined(GCC_HAS_AVX2))
#define HAS_SCALEADDROW_AVX2
#define HAS_SCALEROWDOWN2_AVX2
#define HAS_SCALEROWDOWN4_AVX2
@@ -94,32 +95,56 @@ extern "C" {
#endif
// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
+#if !defined(LIBYUV_DISABLE_DSPR2) && !defined(__native_client__) && \
defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_SCALEROWDOWN2_DSPR2
#define HAS_SCALEROWDOWN4_DSPR2
#define HAS_SCALEROWDOWN34_DSPR2
#define HAS_SCALEROWDOWN38_DSPR2
+#define HAS_SCALEADDROW_DSPR2
+#endif
+
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_SCALEARGBROWDOWN2_MSA
+#define HAS_SCALEARGBROWDOWNEVEN_MSA
+#define HAS_SCALEROWDOWN2_MSA
+#define HAS_SCALEROWDOWN4_MSA
+#define HAS_SCALEROWDOWN38_MSA
+#define HAS_SCALEADDROW_MSA
#endif
// Scale ARGB vertically with bilinear interpolation.
void ScalePlaneVertical(int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_argb, uint8* dst_argb,
- int x, int y, int dy,
- int bpp, enum FilterMode filtering);
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_argb,
+ uint8* dst_argb,
+ int x,
+ int y,
+ int dy,
+ int bpp,
+ enum FilterMode filtering);
void ScalePlaneVertical_16(int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint16* src_argb, uint16* dst_argb,
- int x, int y, int dy,
- int wpp, enum FilterMode filtering);
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16* src_argb,
+ uint16* dst_argb,
+ int x,
+ int y,
+ int dy,
+ int wpp,
+ enum FilterMode filtering);
// Simplify the filtering based on scale factors.
-enum FilterMode ScaleFilterReduce(int src_width, int src_height,
- int dst_width, int dst_height,
+enum FilterMode ScaleFilterReduce(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
enum FilterMode filtering);
// Divide num by div and return as 16.16 fixed point result.
@@ -137,367 +162,768 @@ int FixedDiv1_X86(int num, int div);
#endif
// Compute slope values for stepping.
-void ScaleSlope(int src_width, int src_height,
- int dst_width, int dst_height,
+void ScaleSlope(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
enum FilterMode filtering,
- int* x, int* y, int* dx, int* dy);
-
-void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst, int dst_width);
-void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst, int dst_width);
-void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst, int dst_width);
-void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst, int dst_width);
-void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst, int dst_width);
-void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst, int dst_width);
-void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* d, int dst_width);
-void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* d, int dst_width);
-void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* d, int dst_width);
-void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* d, int dst_width);
-void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx);
-void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
- int dst_width, int x, int dx);
-void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int, int);
-void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
- int dst_width, int, int);
-void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx);
-void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
- int dst_width, int x, int dx);
-void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx);
-void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
- int dst_width, int x, int dx);
-void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst, int dst_width);
+ int* x,
+ int* y,
+ int* dx,
+ int* dy);
+
+void ScaleRowDown2_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown2_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width);
+void ScaleRowDown2Linear_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown2Linear_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width);
+void ScaleRowDown2Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown2Box_Odd_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown2Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width);
+void ScaleRowDown4_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown4_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width);
+void ScaleRowDown4Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown4Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width);
+void ScaleRowDown34_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown34_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width);
+void ScaleRowDown34_0_Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* d,
+ int dst_width);
+void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* d,
+ int dst_width);
+void ScaleRowDown34_1_Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* d,
+ int dst_width);
+void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* d,
+ int dst_width);
+void ScaleCols_C(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleCols_16_C(uint16* dst_ptr,
+ const uint16* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleColsUp2_C(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int,
+ int);
+void ScaleColsUp2_16_C(uint16* dst_ptr,
+ const uint16* src_ptr,
+ int dst_width,
+ int,
+ int);
+void ScaleFilterCols_C(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleFilterCols_16_C(uint16* dst_ptr,
+ const uint16* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleFilterCols64_C(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleFilterCols64_16_C(uint16* dst_ptr,
+ const uint16* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleRowDown38_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown38_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width);
void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+ uint8* dst_ptr,
+ int dst_width);
void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
- uint16* dst_ptr, int dst_width);
-void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int dst_width);
+ uint16* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_2_Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst_ptr,
+ int dst_width);
void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
void ScaleARGBRowDown2_C(const uint8* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width);
+ uint8* dst_argb,
+ int dst_width);
void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEven_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb, int dst_width);
+ uint8* dst_argb,
+ int dst_width);
void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb, int dst_width);
-void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx);
-void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx);
-void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int, int);
-void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx);
-void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx);
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBCols_C(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols64_C(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBColsUp2_C(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int,
+ int);
+void ScaleARGBFilterCols_C(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBFilterCols64_C(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx);
// Specialized scalers for x86.
-void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-
-void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown4_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown4_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+
+void ScaleRowDown34_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+ uint8* dst_ptr,
+ int dst_width);
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+ uint8* dst_ptr,
+ int dst_width);
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-
-void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2_Any_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown4_Any_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+
+void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+ uint8* dst_ptr,
+ int dst_width);
void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+ uint8* dst_ptr,
+ int dst_width);
void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+ uint8* dst_ptr,
+ int dst_width);
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx);
-void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx);
-
+void ScaleFilterCols_SSSE3(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleColsUp2_SSE2(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
// ARGB Column functions
-void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx);
-void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx);
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx);
-void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx);
-void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx);
-void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx);
-void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx);
+void ScaleARGBCols_SSE2(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBFilterCols_NEON(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols_NEON(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols_Any_NEON(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx);
// ARGB Row functions
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
- int src_stepx, uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleARGBRowDown2_Any_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Linear_Any_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_MSA(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb, int dst_width);
+ uint8* dst_argb,
+ int dst_width);
void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb, int dst_width);
+ uint8* dst_argb,
+ int dst_width);
void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb, int dst_width);
+ uint8* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEven_Any_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_Any_MSA(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width);
// ScaleRowDown2Box also used by planar functions
// NEON downscalers with interpolation.
// Note - not static due to reuse in convert for 444 to 420.
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown2Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+
+void ScaleRowDown4_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
// to load every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+ uint8* dst_ptr,
+ int dst_width);
void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+ uint8* dst_ptr,
+ int dst_width);
void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+ uint8* dst_ptr,
+ int dst_width);
// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+ uint8* dst_ptr,
+ int dst_width);
// 32x3 -> 12x1
void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+ uint8* dst_ptr,
+ int dst_width);
// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-
-void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+ uint8* dst_ptr,
+ int dst_width);
+
+void ScaleRowDown2_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown4_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
// 32 -> 12
-void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
// 32x3 -> 12x1
-void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
// 32x2 -> 12x1
-void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx);
-
-void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx);
-
-void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* d, int dst_width);
-void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* d, int dst_width);
-void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+void ScaleFilterCols_NEON(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+
+void ScaleFilterCols_Any_NEON(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+
+void ScaleRowDown2_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown2Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown4_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown4Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown34_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* d,
+ int dst_width);
+void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* d,
+ int dst_width);
+void ScaleRowDown38_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width);
+void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width);
+void ScaleAddRow_DSPR2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_DSPR2(const uint8* src_ptr,
+ uint16* dst_ptr,
+ int src_width);
+
+void ScaleRowDown2_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown38_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown38_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown38_2_Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_3_Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleAddRow_Any_MSA(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
-#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ NOLINT
+#endif // INCLUDE_LIBYUV_SCALE_ROW_H_
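
The RowDown2Box kernels above implement the 2x2 box filter used when halving with kFilterBox. A sketch of the C contract, assuming the usual rounded-average semantics; the function name is hypothetical:

    #include <stddef.h>
    #include <stdint.h>

    // Sketch of ScaleRowDown2Box_C: each output pixel is the rounded
    // average of a 2x2 source block spanning two adjacent rows.
    static void RowDown2BoxSketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                  uint8_t* dst, int dst_width) {
      const uint8_t* s = src_ptr;               // current row
      const uint8_t* t = src_ptr + src_stride;  // next row
      for (int x = 0; x < dst_width; ++x) {
        dst[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);  // +2 rounds
        s += 2;
        t += 2;
      }
    }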
diff --git a/files/include/libyuv/version.h b/files/include/libyuv/version.h
index ca0c062e..dccc479b 100644
--- a/files/include/libyuv/version.h
+++ b/files/include/libyuv/version.h
@@ -8,9 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
+#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1602
+#define LIBYUV_VERSION 1645
-#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
+#endif // INCLUDE_LIBYUV_VERSION_H_
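
The version bump is checkable at compile time, which is handy when a build might pick up an older checkout:

    #include "libyuv/version.h"

    #if LIBYUV_VERSION < 1645
    #error "This code expects libyuv r1645 or newer."
    #endif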
diff --git a/files/include/libyuv/video_common.h b/files/include/libyuv/video_common.h
index ad934e42..f3711c42 100644
--- a/files/include/libyuv/video_common.h
+++ b/files/include/libyuv/video_common.h
@@ -10,7 +10,7 @@
// Common definitions for video, including fourcc and VideoFormat.
-#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ // NOLINT
+#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_
#define INCLUDE_LIBYUV_VIDEO_COMMON_H_
#include "libyuv/basic_types.h"
@@ -28,13 +28,13 @@ extern "C" {
// Needs to be a macro; otherwise the OS X compiler complains when the kFormat*
// constants are used in a switch.
#ifdef __cplusplus
-#define FOURCC(a, b, c, d) ( \
- (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
- (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
+#define FOURCC(a, b, c, d) \
+ ((static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
+ (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
#else
-#define FOURCC(a, b, c, d) ( \
- ((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \
- ((uint32)(c) << 16) | ((uint32)(d) << 24)) /* NOLINT */
+#define FOURCC(a, b, c, d) \
+ (((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \
+ ((uint32)(c) << 16) | ((uint32)(d) << 24)) /* NOLINT */
#endif
// Some pages discussing FourCC codes:
@@ -49,18 +49,18 @@ extern "C" {
// Secondary formats are converted in 2 steps.
// Auxiliary formats call primary converters.
enum FourCC {
- // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+ // 8 Primary YUV formats: 4 planar (I411 deprecated), 2 biplanar, 2 packed.
FOURCC_I420 = FOURCC('I', '4', '2', '0'),
FOURCC_I422 = FOURCC('I', '4', '2', '2'),
FOURCC_I444 = FOURCC('I', '4', '4', '4'),
- FOURCC_I411 = FOURCC('I', '4', '1', '1'),
+ FOURCC_I411 = FOURCC('I', '4', '1', '1'), // deprecated.
FOURCC_I400 = FOURCC('I', '4', '0', '0'),
FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
- // 2 Secondary YUV formats: row biplanar.
+ // 1 Secondary YUV format: row biplanar.
FOURCC_M420 = FOURCC('M', '4', '2', '0'),
FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated.
@@ -69,7 +69,7 @@ enum FourCC {
FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
- FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
+ FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE.
FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE.
@@ -137,7 +137,7 @@ enum FourCCBpp {
FOURCC_BPP_ABGR = 32,
FOURCC_BPP_RGBA = 32,
FOURCC_BPP_24BG = 24,
- FOURCC_BPP_RAW = 24,
+ FOURCC_BPP_RAW = 24,
FOURCC_BPP_RGBP = 16,
FOURCC_BPP_RGBO = 16,
FOURCC_BPP_R444 = 16,
@@ -170,7 +170,7 @@ enum FourCCBpp {
FOURCC_BPP_CM24 = 24,
// Match any fourcc.
- FOURCC_BPP_ANY = 0, // 0 means unknown.
+ FOURCC_BPP_ANY = 0, // 0 means unknown.
};
// Converts fourcc aliases into canonical ones.
@@ -181,4 +181,4 @@ LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc);
} // namespace libyuv
#endif
-#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ NOLINT
+#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_
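
A usage sketch for the FOURCC machinery above; CanonicalFourCC folds aliases onto the canonical codes before comparison:

    #include "libyuv/video_common.h"

    // True if the fourcc is NV12 or any alias that canonicalizes to it.
    bool IsNV12(uint32 fourcc) {
      return libyuv::CanonicalFourCC(fourcc) == libyuv::FOURCC_NV12;
    }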
diff --git a/files/infra/config/OWNERS b/files/infra/config/OWNERS
new file mode 100644
index 00000000..02eccd5e
--- /dev/null
+++ b/files/infra/config/OWNERS
@@ -0,0 +1,3 @@
+set noparent
+agable@chromium.org
+kjellander@chromium.org
diff --git a/files/infra/config/README.md b/files/infra/config/README.md
new file mode 100644
index 00000000..c036d610
--- /dev/null
+++ b/files/infra/config/README.md
@@ -0,0 +1 @@
+This directory contains configuration files for infra services.
diff --git a/files/infra/config/cq.cfg b/files/infra/config/cq.cfg
new file mode 100644
index 00000000..7a0d2d84
--- /dev/null
+++ b/files/infra/config/cq.cfg
@@ -0,0 +1,61 @@
+# Commit Queue configuration file. The documentation of the format can be found
+# at http://luci-config.appspot.com/schemas/projects/refs:cq.cfg.
+
+version: 1
+cq_name: "libyuv"
+cq_status_url: "https://chromium-cq-status.appspot.com"
+git_repo_url: "https://chromium.googlesource.com/libyuv/libyuv.git"
+
+gerrit {}
+rietveld {
+ url: "https://codereview.chromium.org"
+}
+
+
+verifiers {
+ reviewer_lgtm {
+ committer_list: "project-libyuv-committers"
+ }
+
+ try_job {
+ buckets {
+ name: "master.tryserver.libyuv"
+ builders { name: "win" }
+ builders { name: "win_rel" }
+ builders { name: "win_x64_rel" }
+ builders { name: "win_clang" }
+ builders { name: "win_clang_rel" }
+ builders { name: "win_x64_clang_rel" }
+ builders { name: "mac" }
+ builders { name: "mac_rel" }
+ builders { name: "mac_asan" }
+ builders { name: "ios" }
+ builders { name: "ios_rel" }
+ builders { name: "ios_arm64" }
+ builders { name: "ios_arm64_rel" }
+ builders { name: "linux" }
+ builders { name: "linux_rel" }
+ builders {
+ name: "linux_gcc"
+ experiment_percentage: 100
+ }
+ builders { name: "linux_memcheck" }
+ builders { name: "linux_msan" }
+ builders { name: "linux_tsan2" }
+ builders { name: "linux_asan" }
+ builders { name: "linux_msan" }
+ builders { name: "linux_ubsan" }
+ builders { name: "linux_ubsan_vptr" }
+ builders { name: "android" }
+ builders { name: "android_rel" }
+ builders { name: "android_clang" }
+ builders { name: "android_arm64" }
+ builders { name: "android_x86" }
+ builders { name: "android_x64" }
+ builders {
+ name: "android_mips"
+ experiment_percentage: 100
+ }
+ }
+ }
+}
diff --git a/files/libyuv.gni b/files/libyuv.gni
new file mode 100644
index 00000000..89e4d382
--- /dev/null
+++ b/files/libyuv.gni
@@ -0,0 +1,20 @@
+# Copyright 2016 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+import("//build_overrides/build.gni")
+import("//build/config/arm.gni")
+import("//build/config/mips.gni")
+
+declare_args() {
+ libyuv_include_tests = !build_with_chromium
+ libyuv_disable_jpeg = false
+ libyuv_use_neon = (current_cpu == "arm64" ||
+ (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon)))
+ libyuv_use_msa = (current_cpu == "mips64el" || current_cpu == "mipsel") &&
+ mips_use_msa
+}
diff --git a/files/libyuv.gyp b/files/libyuv.gyp
index db4b5490..f73a1a4b 100644
--- a/files/libyuv.gyp
+++ b/files/libyuv.gyp
@@ -18,21 +18,28 @@
},
'variables': {
'use_system_libjpeg%': 0,
- 'libyuv_disable_jpeg%': 0,
+ # Can be enabled if your jpeg has GYP support.
+ 'libyuv_disable_jpeg%': 1,
# 'chromium_code' treats libyuv as internal and increases warning level.
'chromium_code': 1,
# clang compiler default variable usable by other apps that include libyuv.
'clang%': 0,
# Link-Time Optimizations.
'use_lto%': 0,
+ 'mips_msa%': 0, # Default to msa off.
'build_neon': 0,
+ 'build_msa': 0,
'conditions': [
['(target_arch == "armv7" or target_arch == "armv7s" or \
(target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\
- and (arm_neon == 1 or arm_neon_optional == 1)',
- {
+ and (arm_neon == 1 or arm_neon_optional == 1)', {
'build_neon': 1,
}],
+ ['(target_arch == "mipsel" or target_arch == "mips64el")\
+ and (mips_msa == 1)',
+ {
+ 'build_msa': 1,
+ }],
],
},
@@ -80,6 +87,11 @@
}],
],
}],
+ ['build_msa != 0', {
+ 'defines': [
+ 'LIBYUV_MSA',
+ ],
+ }],
['OS != "ios" and libyuv_disable_jpeg != 1', {
'defines': [
'HAVE_JPEG'
@@ -109,7 +121,7 @@
# Enable the following 3 macros to turn off assembly for specified CPU.
# 'LIBYUV_DISABLE_X86',
# 'LIBYUV_DISABLE_NEON',
- # 'LIBYUV_DISABLE_MIPS',
+ # 'LIBYUV_DISABLE_DSPR2',
# Enable the following macro to build libyuv as a shared library (dll).
# 'LIBYUV_USING_SHARED_LIBRARY',
# TODO(fbarchard): Make these into gyp defines.
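
The LIBYUV_MSA define above only compiles the MSA kernels in; selection still happens at run time via TestCpuFlag. A hedged sketch, assuming cpu_id.h in this revision exposes kCpuHasNEON and kCpuHasMSA flags:

    #include "libyuv/cpu_id.h"

    // True if either SIMD path built above is usable on this CPU.
    bool HasRowSimd() {
      return libyuv::TestCpuFlag(libyuv::kCpuHasNEON) != 0 ||
             libyuv::TestCpuFlag(libyuv::kCpuHasMSA) != 0;
    }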
diff --git a/files/libyuv.gypi b/files/libyuv.gypi
index 73fdec0a..18b2feca 100644
--- a/files/libyuv.gypi
+++ b/files/libyuv.gypi
@@ -18,6 +18,7 @@
'include/libyuv/convert_from.h',
'include/libyuv/convert_from_argb.h',
'include/libyuv/cpu_id.h',
+ 'include/libyuv/macros_msa.h',
'include/libyuv/mjpeg_decoder.h',
'include/libyuv/planar_functions.h',
'include/libyuv/rotate.h',
@@ -53,14 +54,16 @@
'source/rotate_argb.cc',
'source/rotate_common.cc',
'source/rotate_gcc.cc',
- 'source/rotate_mips.cc',
+ 'source/rotate_dspr2.cc',
+ 'source/rotate_msa.cc',
'source/rotate_neon.cc',
'source/rotate_neon64.cc',
'source/rotate_win.cc',
'source/row_any.cc',
'source/row_common.cc',
'source/row_gcc.cc',
- 'source/row_mips.cc',
+ 'source/row_dspr2.cc',
+ 'source/row_msa.cc',
'source/row_neon.cc',
'source/row_neon64.cc',
'source/row_win.cc',
@@ -69,7 +72,8 @@
'source/scale_argb.cc',
'source/scale_common.cc',
'source/scale_gcc.cc',
- 'source/scale_mips.cc',
+ 'source/scale_dspr2.cc',
+ 'source/scale_msa.cc',
'source/scale_neon.cc',
'source/scale_neon64.cc',
'source/scale_win.cc',
diff --git a/files/libyuv_test.gyp b/files/libyuv_test.gyp
index 27b330f6..88860f5c 100644
--- a/files/libyuv_test.gyp
+++ b/files/libyuv_test.gyp
@@ -8,7 +8,9 @@
{
'variables': {
- 'libyuv_disable_jpeg%': 0,
+ # Can be enabled if your jpeg has GYP support.
+ 'libyuv_disable_jpeg%': 1,
+ 'mips_msa%': 0, # Default to msa off.
},
'targets': [
{
@@ -52,11 +54,6 @@
'-fexceptions',
],
}],
- [ 'OS == "ios" and target_subarch == 64', {
- 'defines': [
- 'LIBYUV_DISABLE_NEON'
- ],
- }],
[ 'OS == "ios"', {
'xcode_settings': {
'DEBUGGING_SYMBOLS': 'YES',
@@ -91,12 +88,18 @@
'LIBYUV_NEON'
],
}],
+ [ '(target_arch == "mipsel" or target_arch == "mips64el") \
+ and (mips_msa == 1)', {
+ 'defines': [
+ 'LIBYUV_MSA'
+ ],
+ }],
], # conditions
'defines': [
# Enable the following 3 macros to turn off assembly for specified CPU.
# 'LIBYUV_DISABLE_X86',
# 'LIBYUV_DISABLE_NEON',
- # 'LIBYUV_DISABLE_MIPS',
+ # 'LIBYUV_DISABLE_DSPR2',
# Enable the following macro to build libyuv as a shared library (dll).
# 'LIBYUV_USING_SHARED_LIBRARY',
],
@@ -151,12 +154,6 @@
'libyuv.gyp:libyuv',
],
'conditions': [
- [ 'OS == "ios" and target_subarch == 64', {
- 'defines': [
- 'LIBYUV_DISABLE_NEON'
- ],
- }],
-
[ 'OS != "ios" and libyuv_disable_jpeg != 1', {
'defines': [
'HAVE_JPEG',
@@ -181,40 +178,16 @@
['OS=="android"', {
'targets': [
{
- # TODO(kjellander): Figure out what to change in build/apk_test.gypi
- # to it can be used instead of the copied code below. Using it in its
- # current version was not possible, since the target starts with 'lib',
- # which somewhere confuses the variables.
- 'target_name': 'libyuv_unittest_apk',
+ 'target_name': 'yuv_unittest_apk',
'type': 'none',
'variables': {
- # These are used to configure java_apk.gypi included below.
- 'test_type': 'gtest',
- 'apk_name': 'libyuv_unittest',
- 'test_suite_name': 'libyuv_unittest',
- 'intermediate_dir': '<(PRODUCT_DIR)/libyuv_unittest_apk',
- 'input_shlib_path': '<(SHARED_LIB_DIR)/<(SHARED_LIB_PREFIX)libyuv_unittest<(SHARED_LIB_SUFFIX)',
- 'final_apk_path': '<(intermediate_dir)/libyuv_unittest-debug.apk',
- 'java_in_dir': '<(DEPTH)/testing/android/native_test/java',
- 'test_runner_path': '<(DEPTH)/util/android/test_runner.py',
- 'native_lib_target': 'libyuv_unittest',
- 'gyp_managed_install': 0,
+ 'test_suite_name': 'yuv_unittest',
+ 'input_shlib_path': '<(SHARED_LIB_DIR)/<(SHARED_LIB_PREFIX)libyuv_unittest<(SHARED_LIB_SUFFIX)',
},
'includes': [
- 'build/android/test_runner.gypi',
- 'build/java_apk.gypi',
- ],
+ 'build/apk_test.gypi',
+ ],
'dependencies': [
- '<(DEPTH)/base/base.gyp:base_java',
- # TODO(kjellander): Figure out why base_build_config_gen is needed
- # here. It really shouldn't since it's a dependency of base_java
- # above, but there's always 0 tests run if it's missing.
- '<(DEPTH)/base/base.gyp:base_build_config_gen',
- '<(DEPTH)/build/android/pylib/device/commands/commands.gyp:chromium_commands',
- '<(DEPTH)/build/android/pylib/remote/device/dummy/dummy.gyp:remote_device_dummy_apk',
- '<(DEPTH)/testing/android/appurify_support.gyp:appurify_support_java',
- '<(DEPTH)/testing/android/on_device_instrumentation.gyp:reporter_java',
- '<(DEPTH)/tools/android/android_tools.gyp:android_tools',
'libyuv_unittest',
],
},
diff --git a/files/linux.mk b/files/linux.mk
index ee5a3a70..923345ae 100644
--- a/files/linux.mk
+++ b/files/linux.mk
@@ -32,14 +32,14 @@ LOCAL_OBJ_FILES := \
source/rotate.o \
source/rotate_common.o \
source/rotate_gcc.o \
- source/rotate_mips.o \
+ source/rotate_dspr2.o \
source/rotate_neon64.o \
source/rotate_neon.o \
source/rotate_win.o \
source/row_any.o \
source/row_common.o \
source/row_gcc.o \
- source/row_mips.o \
+ source/row_dspr2.o \
source/row_neon64.o \
source/row_neon.o \
source/row_win.o \
@@ -48,7 +48,7 @@ LOCAL_OBJ_FILES := \
source/scale.o \
source/scale_common.o \
source/scale_gcc.o \
- source/scale_mips.o \
+ source/scale_dspr2.o \
source/scale_neon64.o \
source/scale_neon.o \
source/scale_win.o \
@@ -74,6 +74,8 @@ psnr: util/psnr.cc
$(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/psnr.cc util/psnr_main.cc util/ssim.cc
# A C test utility that uses libyuv conversion from C.
+# gcc 4.4 and older require -fno-exceptions to avoid link error on __gxx_personality_v0
+# CC=gcc-4.4 CXXFLAGS=-fno-exceptions CXX=g++-4.4 make -f linux.mk
cpuid: util/cpuid.c libyuv.a
$(CC) $(CFLAGS) -o $@ util/cpuid.c libyuv.a
diff --git a/files/pylintrc b/files/pylintrc
new file mode 100644
index 00000000..b8bea334
--- /dev/null
+++ b/files/pylintrc
@@ -0,0 +1,17 @@
+[MESSAGES CONTROL]
+
+# Disable the message, report, category or checker with the given id(s).
+# TODO(kjellander): Reduce this list to as small as possible.
+disable=I0010,I0011,bad-continuation,broad-except,duplicate-code,eval-used,exec-used,fixme,invalid-name,missing-docstring,no-init,no-member,too-few-public-methods,too-many-ancestors,too-many-arguments,too-many-branches,too-many-function-args,too-many-instance-attributes,too-many-lines,too-many-locals,too-many-public-methods,too-many-return-statements,too-many-statements
+
+
+[REPORTS]
+
+# Don't write out full reports, just messages.
+reports=no
+
+
+[FORMAT]
+
+# We use two spaces for indents, instead of the usual four spaces or tab.
+indent-string=' '
diff --git a/files/source/compare.cc b/files/source/compare.cc
index e3846bdf..1facd27b 100644
--- a/files/source/compare.cc
+++ b/files/source/compare.cc
@@ -32,8 +32,7 @@ LIBYUV_API
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
const int kBlockSize = 1 << 15; // 32768;
int remainder;
- uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) =
- HashDjb2_C;
+ uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
#if defined(HAS_HASHDJB2_SSE41)
if (TestCpuFlag(kCpuHasSSE41)) {
HashDjb2_SSE = HashDjb2_SSE41;
@@ -50,13 +49,13 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
src += kBlockSize;
count -= kBlockSize;
}
- remainder = (int)(count) & ~15;
+ remainder = (int)count & ~15;
if (remainder) {
seed = HashDjb2_SSE(src, remainder, seed);
src += remainder;
count -= remainder;
}
- remainder = (int)(count) & 15;
+ remainder = (int)count & 15;
if (remainder) {
seed = HashDjb2_C(src, remainder, seed);
}
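(The blocking above works because DJB2 is a plain left fold: feeding the hash of a prefix back in as the seed for the remainder reproduces the whole-buffer hash, so the SSE4.1 kernel can be handed fixed 32 KB blocks. A self-contained sketch of that property; names are illustrative:)

#include <cstdint>
#include <cstdio>

// Reference DJB2 fold: hash = hash * 33 + byte, seeded like HashDjb2_C.
static uint32_t Djb2(const uint8_t* src, size_t count, uint32_t seed) {
  uint32_t hash = seed;
  for (size_t i = 0; i < count; ++i)
    hash = hash * 33u + src[i];
  return hash;
}

int main() {
  uint8_t buf[100];
  for (int i = 0; i < 100; ++i)
    buf[i] = (uint8_t)(i * 7);
  uint32_t whole = Djb2(buf, 100, 5381);
  uint32_t blocked = Djb2(buf + 64, 36, Djb2(buf, 64, 5381));  // 64 + 36
  printf("%s\n", whole == blocked ? "equal" : "different");    // "equal"
  return 0;
}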
@@ -113,7 +112,8 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
// TODO(fbarchard): Refactor into row function.
LIBYUV_API
-uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
+uint64 ComputeSumSquareError(const uint8* src_a,
+ const uint8* src_b,
int count) {
// SumSquareError returns values 0 to 65535 for each squared difference.
// Up to 65536 of those can be summed and remain within a uint32.
@@ -142,7 +142,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
}
#endif
#ifdef _OPENMP
-#pragma omp parallel for reduction(+: sse)
+#pragma omp parallel for reduction(+ : sse)
#endif
for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
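(The uint32 comment above is what sets kBlockSize: each squared byte difference is at most 255^2 = 65025, and even 65536 of them sum to less than 2^32, so the 32768-pixel blocks handed to SumSquareError cannot overflow a 32-bit accumulator; blocks are then summed in uint64. A quick check of the bound:)

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t per_pixel_max = 255ull * 255ull;       // 65025
  uint64_t block_max = 65536ull * per_pixel_max;  // 4261478400
  printf("%llu < %llu\n", (unsigned long long)block_max,
         (unsigned long long)(1ull << 32));       // 4261478400 < 4294967296
  return 0;
}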
@@ -162,14 +162,16 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
}
LIBYUV_API
-uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
- const uint8* src_b, int stride_b,
- int width, int height) {
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a,
+ int stride_a,
+ const uint8* src_b,
+ int stride_b,
+ int width,
+ int height) {
uint64 sse = 0;
int h;
// Coalesce rows.
- if (stride_a == width &&
- stride_b == width) {
+ if (stride_a == width && stride_b == width) {
width *= height;
height = 1;
stride_a = stride_b = 0;
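(Row coalescing, used here and again in X420ToI420 below, exploits the fact that a plane whose stride equals its width has no inter-row padding, so the whole plane can be fed to the row kernel as one long row. A minimal sketch of the idiom:)

#include <cstdio>

int main() {
  int width = 640, height = 360, stride = 640;
  if (stride == width) {  // contiguous plane: fold all rows into one
    width *= height;      // 230400 pixels in a single "row"
    height = 1;
    stride = 0;           // stride is never advanced again
  }
  printf("%d row(s) of %d pixels\n", height, width);  // 1 row of 230400
  return 0;
}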
@@ -186,10 +188,10 @@ LIBYUV_API
double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
double psnr;
if (sse > 0) {
- double mse = (double)(count) / (double)(sse);
+ double mse = (double)count / (double)sse;
psnr = 10.0 * log10(255.0 * 255.0 * mse);
} else {
- psnr = kMaxPsnr; // Limit to prevent divide by 0
+ psnr = kMaxPsnr; // Limit to prevent divide by 0
}
if (psnr > kMaxPsnr)
@@ -199,45 +201,53 @@ double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
}
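(SumSquareErrorToPsnr folds MSE into one expression: MSE = sse / count and PSNR = 10*log10(255^2 / MSE), hence the 10*log10(255*255*count/sse) above. A worked example, assuming the 128 dB cap that kMaxPsnr carries in libyuv's psnr utility:)

#include <cmath>
#include <cstdint>
#include <cstdio>

// Same formula as above; the 128.0 cap mirrors kMaxPsnr (assumed value).
static double PsnrFromSse(uint64_t sse, uint64_t count) {
  if (sse == 0)
    return 128.0;  // identical planes: avoid the divide by zero
  double psnr = 10.0 * log10(255.0 * 255.0 * (double)count / (double)sse);
  return psnr > 128.0 ? 128.0 : psnr;
}

int main() {
  // A 1920x1080 plane with a total squared error of 1e6:
  printf("%.2f dB\n", PsnrFromSse(1000000, 1920ull * 1080));  // ~51.30 dB
  return 0;
}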
LIBYUV_API
-double CalcFramePsnr(const uint8* src_a, int stride_a,
- const uint8* src_b, int stride_b,
- int width, int height) {
+double CalcFramePsnr(const uint8* src_a,
+ int stride_a,
+ const uint8* src_b,
+ int stride_b,
+ int width,
+ int height) {
const uint64 samples = width * height;
- const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
- src_b, stride_b,
- width, height);
+ const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b,
+ stride_b, width, height);
return SumSquareErrorToPsnr(sse, samples);
}
LIBYUV_API
-double I420Psnr(const uint8* src_y_a, int stride_y_a,
- const uint8* src_u_a, int stride_u_a,
- const uint8* src_v_a, int stride_v_a,
- const uint8* src_y_b, int stride_y_b,
- const uint8* src_u_b, int stride_u_b,
- const uint8* src_v_b, int stride_v_b,
- int width, int height) {
- const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
- src_y_b, stride_y_b,
- width, height);
+double I420Psnr(const uint8* src_y_a,
+ int stride_y_a,
+ const uint8* src_u_a,
+ int stride_u_a,
+ const uint8* src_v_a,
+ int stride_v_a,
+ const uint8* src_y_b,
+ int stride_y_b,
+ const uint8* src_u_b,
+ int stride_u_b,
+ const uint8* src_v_b,
+ int stride_v_b,
+ int width,
+ int height) {
+ const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a, src_y_b,
+ stride_y_b, width, height);
const int width_uv = (width + 1) >> 1;
const int height_uv = (height + 1) >> 1;
- const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
- src_u_b, stride_u_b,
- width_uv, height_uv);
- const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
- src_v_b, stride_v_b,
- width_uv, height_uv);
+ const uint64 sse_u = ComputeSumSquareErrorPlane(
+ src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv);
+ const uint64 sse_v = ComputeSumSquareErrorPlane(
+ src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv);
const uint64 samples = width * height + 2 * (width_uv * height_uv);
const uint64 sse = sse_y + sse_u + sse_v;
return SumSquareErrorToPsnr(sse, samples);
}
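(The sample count above mixes one full-resolution Y plane with two chroma planes at half resolution rounded up, so odd frame sizes still count every chroma sample exactly once. The arithmetic for a 1919x1079 frame:)

#include <cstdint>
#include <cstdio>

int main() {
  int width = 1919, height = 1079;
  int width_uv = (width + 1) >> 1;    // 960
  int height_uv = (height + 1) >> 1;  // 540
  uint64_t samples = (uint64_t)width * height +
                     2ull * width_uv * height_uv;
  printf("%llu\n", (unsigned long long)samples);
  // prints 3107401 (= 1919*1079 + 2*960*540)
  return 0;
}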
-static const int64 cc1 = 26634; // (64^2*(.01*255)^2
+static const int64 cc1 = 26634; // (64^2*(.01*255)^2
static const int64 cc2 = 239708; // (64^2*(.03*255)^2
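(cc1 and cc2 are the standard SSIM stabilizers -- K1=0.01 and K2=0.03 on an 8-bit range -- pre-scaled by 64^2, the squared pixel count of the 8x8 window, so the integer math in Ssim8x8_C can skip normalizing the sums. Reproducing them:)

#include <cstdio>

int main() {
  double c1 = 64.0 * 64.0 * (0.01 * 255.0) * (0.01 * 255.0);
  double c2 = 64.0 * 64.0 * (0.03 * 255.0) * (0.03 * 255.0);
  printf("%.2f %.2f\n", c1, c2);  // 26634.24 239708.16 -> 26634, 239708
  return 0;
}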
-static double Ssim8x8_C(const uint8* src_a, int stride_a,
- const uint8* src_b, int stride_b) {
+static double Ssim8x8_C(const uint8* src_a,
+ int stride_a,
+ const uint8* src_b,
+ int stride_b) {
int64 sum_a = 0;
int64 sum_b = 0;
int64 sum_sq_a = 0;
@@ -270,12 +280,12 @@ static double Ssim8x8_C(const uint8* src_a, int stride_a,
const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
(2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
- const int64 sum_a_sq = sum_a*sum_a;
- const int64 sum_b_sq = sum_b*sum_b;
+ const int64 sum_a_sq = sum_a * sum_a;
+ const int64 sum_b_sq = sum_b * sum_b;
- const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
- (count * sum_sq_a - sum_a_sq +
- count * sum_sq_b - sum_b_sq + c2);
+ const int64 ssim_d =
+ (sum_a_sq + sum_b_sq + c1) *
+ (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2);
if (ssim_d == 0.0) {
return DBL_MAX;
@@ -288,13 +298,16 @@ static double Ssim8x8_C(const uint8* src_a, int stride_a,
// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
// block boundaries to penalize blocking artifacts.
LIBYUV_API
-double CalcFrameSsim(const uint8* src_a, int stride_a,
- const uint8* src_b, int stride_b,
- int width, int height) {
+double CalcFrameSsim(const uint8* src_a,
+ int stride_a,
+ const uint8* src_b,
+ int stride_b,
+ int width,
+ int height) {
int samples = 0;
double ssim_total = 0;
- double (*Ssim8x8)(const uint8* src_a, int stride_a,
- const uint8* src_b, int stride_b) = Ssim8x8_C;
+ double (*Ssim8x8)(const uint8* src_a, int stride_a, const uint8* src_b,
+ int stride_b) = Ssim8x8_C;
// sample point start with each 4x4 location
int i;
@@ -314,22 +327,27 @@ double CalcFrameSsim(const uint8* src_a, int stride_a,
}
LIBYUV_API
-double I420Ssim(const uint8* src_y_a, int stride_y_a,
- const uint8* src_u_a, int stride_u_a,
- const uint8* src_v_a, int stride_v_a,
- const uint8* src_y_b, int stride_y_b,
- const uint8* src_u_b, int stride_u_b,
- const uint8* src_v_b, int stride_v_b,
- int width, int height) {
- const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
- src_y_b, stride_y_b, width, height);
+double I420Ssim(const uint8* src_y_a,
+ int stride_y_a,
+ const uint8* src_u_a,
+ int stride_u_a,
+ const uint8* src_v_a,
+ int stride_v_a,
+ const uint8* src_y_b,
+ int stride_y_b,
+ const uint8* src_u_b,
+ int stride_u_b,
+ const uint8* src_v_b,
+ int stride_v_b,
+ int width,
+ int height) {
+ const double ssim_y =
+ CalcFrameSsim(src_y_a, stride_y_a, src_y_b, stride_y_b, width, height);
const int width_uv = (width + 1) >> 1;
const int height_uv = (height + 1) >> 1;
- const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
- src_u_b, stride_u_b,
+ const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, src_u_b, stride_u_b,
width_uv, height_uv);
- const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
- src_v_b, stride_v_b,
+ const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, src_v_b, stride_v_b,
width_uv, height_uv);
return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
}
diff --git a/files/source/compare_gcc.cc b/files/source/compare_gcc.cc
index 1b83edb1..64522aaa 100644
--- a/files/source/compare_gcc.cc
+++ b/files/source/compare_gcc.cc
@@ -62,30 +62,30 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
return sse;
}
-static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
+static uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
static uvec32 kHashMul0 = {
- 0x0c3525e1, // 33 ^ 15
- 0xa3476dc1, // 33 ^ 14
- 0x3b4039a1, // 33 ^ 13
- 0x4f5f0981, // 33 ^ 12
+ 0x0c3525e1, // 33 ^ 15
+ 0xa3476dc1, // 33 ^ 14
+ 0x3b4039a1, // 33 ^ 13
+ 0x4f5f0981, // 33 ^ 12
};
static uvec32 kHashMul1 = {
- 0x30f35d61, // 33 ^ 11
- 0x855cb541, // 33 ^ 10
- 0x040a9121, // 33 ^ 9
- 0x747c7101, // 33 ^ 8
+ 0x30f35d61, // 33 ^ 11
+ 0x855cb541, // 33 ^ 10
+ 0x040a9121, // 33 ^ 9
+ 0x747c7101, // 33 ^ 8
};
static uvec32 kHashMul2 = {
- 0xec41d4e1, // 33 ^ 7
- 0x4cfa3cc1, // 33 ^ 6
- 0x025528a1, // 33 ^ 5
- 0x00121881, // 33 ^ 4
+ 0xec41d4e1, // 33 ^ 7
+ 0x4cfa3cc1, // 33 ^ 6
+ 0x025528a1, // 33 ^ 5
+ 0x00121881, // 33 ^ 4
};
static uvec32 kHashMul3 = {
- 0x00008c61, // 33 ^ 3
- 0x00000441, // 33 ^ 2
- 0x00000021, // 33 ^ 1
- 0x00000001, // 33 ^ 0
+ 0x00008c61, // 33 ^ 3
+ 0x00000441, // 33 ^ 2
+ 0x00000021, // 33 ^ 1
+ 0x00000001, // 33 ^ 0
};
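(The four kHashMul vectors plus kHash16x33 are nothing more than 33^0..33^16 reduced modulo 2^32: each 16-byte iteration below computes hash = hash*33^16 + sum(src[i]*33^(15-i)) with SIMD multiplies. The table can be regenerated with a wrap-around multiply:)

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t p = 1;  // uint32_t arithmetic wraps mod 2^32, matching the hash
  for (int e = 0; e <= 16; ++e) {
    printf("33^%-2d = 0x%08x\n", e, (unsigned)p);
    p *= 33u;
  }
  // e.g. 33^8 = 0x747c7101 and 33^16 = 0x92d9e201, as in the tables above.
  return 0;
}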
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
@@ -148,4 +148,3 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
} // extern "C"
} // namespace libyuv
#endif
-
diff --git a/files/source/compare_win.cc b/files/source/compare_win.cc
index dc86fe25..b17fc8e1 100644
--- a/files/source/compare_win.cc
+++ b/files/source/compare_win.cc
@@ -21,12 +21,12 @@ extern "C" {
// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
-__declspec(naked)
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+__declspec(naked) uint32
+ SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
__asm {
- mov eax, [esp + 4] // src_a
- mov edx, [esp + 8] // src_b
- mov ecx, [esp + 12] // count
+ mov eax, [esp + 4] // src_a
+ mov edx, [esp + 8] // src_b
+ mov ecx, [esp + 12] // count
pxor xmm0, xmm0
pxor xmm5, xmm5
@@ -61,13 +61,13 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
-#pragma warning(disable: 4752)
-__declspec(naked)
-uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
+#pragma warning(disable : 4752)
+__declspec(naked) uint32
+ SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
__asm {
- mov eax, [esp + 4] // src_a
- mov edx, [esp + 8] // src_b
- mov ecx, [esp + 12] // count
+ mov eax, [esp + 4] // src_a
+ mov edx, [esp + 8] // src_b
+ mov ecx, [esp + 12] // count
vpxor ymm0, ymm0, ymm0 // sum
vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
sub edx, eax
@@ -101,65 +101,65 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
}
#endif // _MSC_VER >= 1700
-uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
+uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
uvec32 kHashMul0 = {
- 0x0c3525e1, // 33 ^ 15
- 0xa3476dc1, // 33 ^ 14
- 0x3b4039a1, // 33 ^ 13
- 0x4f5f0981, // 33 ^ 12
+ 0x0c3525e1, // 33 ^ 15
+ 0xa3476dc1, // 33 ^ 14
+ 0x3b4039a1, // 33 ^ 13
+ 0x4f5f0981, // 33 ^ 12
};
uvec32 kHashMul1 = {
- 0x30f35d61, // 33 ^ 11
- 0x855cb541, // 33 ^ 10
- 0x040a9121, // 33 ^ 9
- 0x747c7101, // 33 ^ 8
+ 0x30f35d61, // 33 ^ 11
+ 0x855cb541, // 33 ^ 10
+ 0x040a9121, // 33 ^ 9
+ 0x747c7101, // 33 ^ 8
};
uvec32 kHashMul2 = {
- 0xec41d4e1, // 33 ^ 7
- 0x4cfa3cc1, // 33 ^ 6
- 0x025528a1, // 33 ^ 5
- 0x00121881, // 33 ^ 4
+ 0xec41d4e1, // 33 ^ 7
+ 0x4cfa3cc1, // 33 ^ 6
+ 0x025528a1, // 33 ^ 5
+ 0x00121881, // 33 ^ 4
};
uvec32 kHashMul3 = {
- 0x00008c61, // 33 ^ 3
- 0x00000441, // 33 ^ 2
- 0x00000021, // 33 ^ 1
- 0x00000001, // 33 ^ 0
+ 0x00008c61, // 33 ^ 3
+ 0x00000441, // 33 ^ 2
+ 0x00000021, // 33 ^ 1
+ 0x00000001, // 33 ^ 0
};
-__declspec(naked)
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+__declspec(naked) uint32
+ HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
__asm {
- mov eax, [esp + 4] // src
- mov ecx, [esp + 8] // count
+ mov eax, [esp + 4] // src
+ mov ecx, [esp + 8] // count
movd xmm0, [esp + 12] // seed
- pxor xmm7, xmm7 // constant 0 for unpck
+ pxor xmm7, xmm7 // constant 0 for unpck
movdqa xmm6, xmmword ptr kHash16x33
wloop:
- movdqu xmm1, [eax] // src[0-15]
+ movdqu xmm1, [eax] // src[0-15]
lea eax, [eax + 16]
- pmulld xmm0, xmm6 // hash *= 33 ^ 16
+ pmulld xmm0, xmm6 // hash *= 33 ^ 16
movdqa xmm5, xmmword ptr kHashMul0
movdqa xmm2, xmm1
- punpcklbw xmm2, xmm7 // src[0-7]
+ punpcklbw xmm2, xmm7 // src[0-7]
movdqa xmm3, xmm2
- punpcklwd xmm3, xmm7 // src[0-3]
+ punpcklwd xmm3, xmm7 // src[0-3]
pmulld xmm3, xmm5
movdqa xmm5, xmmword ptr kHashMul1
movdqa xmm4, xmm2
- punpckhwd xmm4, xmm7 // src[4-7]
+ punpckhwd xmm4, xmm7 // src[4-7]
pmulld xmm4, xmm5
movdqa xmm5, xmmword ptr kHashMul2
- punpckhbw xmm1, xmm7 // src[8-15]
+ punpckhbw xmm1, xmm7 // src[8-15]
movdqa xmm2, xmm1
- punpcklwd xmm2, xmm7 // src[8-11]
+ punpcklwd xmm2, xmm7 // src[8-11]
pmulld xmm2, xmm5
movdqa xmm5, xmmword ptr kHashMul3
- punpckhwd xmm1, xmm7 // src[12-15]
+ punpckhwd xmm1, xmm7 // src[12-15]
pmulld xmm1, xmm5
- paddd xmm3, xmm4 // add 16 results
+ paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
paddd xmm1, xmm3
@@ -171,18 +171,18 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
sub ecx, 16
jg wloop
- movd eax, xmm0 // return hash
+ movd eax, xmm0 // return hash
ret
}
}
// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
-__declspec(naked)
-uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
+__declspec(naked) uint32
+ HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
__asm {
- mov eax, [esp + 4] // src
- mov ecx, [esp + 8] // count
+ mov eax, [esp + 4] // src
+ mov ecx, [esp + 8] // count
vmovd xmm0, [esp + 12] // seed
wloop:
@@ -196,7 +196,7 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
vpmulld xmm2, xmm2, xmmword ptr kHashMul2
lea eax, [eax + 16]
vpmulld xmm1, xmm1, xmmword ptr kHashMul3
- vpaddd xmm3, xmm3, xmm4 // add 16 results
+ vpaddd xmm3, xmm3, xmm4 // add 16 results
vpaddd xmm1, xmm1, xmm2
vpaddd xmm1, xmm1, xmm3
vpshufd xmm2, xmm1, 0x0e // upper 2 dwords
@@ -207,7 +207,7 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
sub ecx, 16
jg wloop
- vmovd eax, xmm0 // return hash
+ vmovd eax, xmm0 // return hash
vzeroupper
ret
}
diff --git a/files/source/convert.cc b/files/source/convert.cc
index e332bc50..f79acaca 100644
--- a/files/source/convert.cc
+++ b/files/source/convert.cc
@@ -28,31 +28,37 @@ static __inline int Abs(int v) {
}
// Any I4xx To I420 format with mirroring.
-static int I4xxToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int src_y_width, int src_y_height,
- int src_uv_width, int src_uv_height) {
+static int I4xxToI420(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int src_y_width,
+ int src_y_height,
+ int src_uv_width,
+ int src_uv_height) {
const int dst_y_width = Abs(src_y_width);
const int dst_y_height = Abs(src_y_height);
const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
- if (src_y_width == 0 || src_y_height == 0 ||
- src_uv_width == 0 || src_uv_height == 0) {
+ if (src_uv_width == 0 || src_uv_height == 0) {
return -1;
}
- ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
- dst_y, dst_stride_y, dst_y_width, dst_y_height,
- kFilterBilinear);
- ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
- dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
- kFilterBilinear);
- ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
- dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
- kFilterBilinear);
+ if (dst_y) {
+ ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
+ dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear);
+ }
+ ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
+ dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
+ ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
+ dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
return 0;
}
@@ -60,18 +66,23 @@ static int I4xxToI420(const uint8* src_y, int src_stride_y,
// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure
// it does row coalescing.
LIBYUV_API
-int I420Copy(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int I420Copy(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
- if (!src_y || !src_u || !src_v ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+ if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -98,76 +109,63 @@ int I420Copy(const uint8* src_y, int src_stride_y,
// 422 chroma is 1/2 width, 1x height
// 420 chroma is 1/2 width, 1/2 height
LIBYUV_API
-int I422ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int I422ToI420(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
const int src_uv_width = SUBSAMPLE(width, 1, 1);
- return I4xxToI420(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height,
- src_uv_width, height);
+ return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, src_uv_width, height);
}
// 444 chroma is 1x width, 1x height
// 420 chroma is 1/2 width, 1/2 height
LIBYUV_API
-int I444ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- return I4xxToI420(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height,
- width, height);
-}
-
-// 411 chroma is 1/4 width, 1x height
-// 420 chroma is 1/2 width, 1/2 height
-LIBYUV_API
-int I411ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- const int src_uv_width = SUBSAMPLE(width, 3, 2);
- return I4xxToI420(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height,
- src_uv_width, height);
+int I444ToI420(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, width, height);
}
// I400 is greyscale typically used in MJPG
LIBYUV_API
-int I400ToI420(const uint8* src_y, int src_stride_y,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int I400ToI420(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
- if (!src_y || !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+ if (!dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -177,15 +175,21 @@ int I400ToI420(const uint8* src_y, int src_stride_y,
src_y = src_y + (height - 1) * src_stride_y;
src_stride_y = -src_stride_y;
}
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128);
SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128);
return 0;
}
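(The negative-height convention used throughout these functions flips the image by starting at the last row and negating the stride, so the copy loops themselves never change. A standalone sketch of the idiom:)

#include <cstdint>
#include <cstdio>

int main() {
  uint8_t img[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};  // 3 rows x 4 cols
  int height = -3, stride = 4;
  const uint8_t* src = img;
  if (height < 0) {                     // invert the image
    height = -height;
    src = src + (height - 1) * stride;  // point at the last row
    stride = -stride;                   // walk upward
  }
  for (int y = 0; y < height; ++y)
    printf("row %d starts at %d\n", y, src[y * stride]);  // 8, then 4, then 0
  return 0;
}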
-static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
- uint8* dst, int dst_stride,
- int width, int height) {
+static void CopyPlane2(const uint8* src,
+ int src_stride_0,
+ int src_stride_1,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height) {
int y;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_SSE2)
@@ -236,27 +240,30 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
// The UV plane is half width, but 2 values, so src_stride_m420 applies to
// this as well as the two Y planes.
static int X420ToI420(const uint8* src_y,
- int src_stride_y0, int src_stride_y1,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
+ int src_stride_y0,
+ int src_stride_y1,
+ const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
- void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width) = SplitUVRow_C;
- if (!src_y || !src_uv ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+ if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
halfheight = (height + 1) >> 1;
- dst_y = dst_y + (height - 1) * dst_stride_y;
+ if (dst_y) {
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ }
dst_u = dst_u + (halfheight - 1) * dst_stride_u;
dst_v = dst_v + (halfheight - 1) * dst_stride_v;
dst_stride_y = -dst_stride_y;
@@ -264,56 +271,19 @@ static int X420ToI420(const uint8* src_y,
dst_stride_v = -dst_stride_v;
}
// Coalesce rows.
- if (src_stride_y0 == width &&
- src_stride_y1 == width &&
+ if (src_stride_y0 == width && src_stride_y1 == width &&
dst_stride_y == width) {
width *= height;
height = 1;
src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
}
// Coalesce rows.
- if (src_stride_uv == halfwidth * 2 &&
- dst_stride_u == halfwidth &&
+ if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth &&
dst_stride_v == halfwidth) {
halfwidth *= halfheight;
halfheight = 1;
src_stride_uv = dst_stride_u = dst_stride_v = 0;
}
-#if defined(HAS_SPLITUVROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- SplitUVRow = SplitUVRow_Any_SSE2;
- if (IS_ALIGNED(halfwidth, 16)) {
- SplitUVRow = SplitUVRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_SPLITUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- SplitUVRow = SplitUVRow_Any_AVX2;
- if (IS_ALIGNED(halfwidth, 32)) {
- SplitUVRow = SplitUVRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_SPLITUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- SplitUVRow = SplitUVRow_Any_NEON;
- if (IS_ALIGNED(halfwidth, 16)) {
- SplitUVRow = SplitUVRow_NEON;
- }
- }
-#endif
-#if defined(HAS_SPLITUVROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) &&
- IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
- IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
- IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
- SplitUVRow = SplitUVRow_Any_DSPR2;
- if (IS_ALIGNED(halfwidth, 16)) {
- SplitUVRow = SplitUVRow_DSPR2;
- }
- }
-#endif
if (dst_y) {
if (src_stride_y0 == src_stride_y1) {
@@ -324,75 +294,86 @@ static int X420ToI420(const uint8* src_y,
}
}
- for (y = 0; y < halfheight; ++y) {
- // Copy a row of UV.
- SplitUVRow(src_uv, dst_u, dst_v, halfwidth);
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- src_uv += src_stride_uv;
- }
+ // Split UV plane - NV12 / NV21
+ SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v,
+ halfwidth, halfheight);
+
return 0;
}
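(SplitUVPlane replaces the hand-rolled SplitUVRow dispatch this hunk deletes; per row it simply de-interleaves NV12's UVUVUV... samples into separate U and V planes, and NV21ToI420 below gets the swap for free by exchanging the destination pointers. A reference row in plain C -- illustrative, not the library's own kernel:)

#include <cstdint>
#include <cstdio>

static void SplitUVRowC(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
                        int width) {
  for (int x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x + 0];
    dst_v[x] = src_uv[2 * x + 1];
  }
}

int main() {
  const uint8_t uv[8] = {10, 20, 11, 21, 12, 22, 13, 23};  // U,V interleaved
  uint8_t u[4], v[4];
  SplitUVRowC(uv, u, v, 4);
  printf("U: %d %d %d %d  V: %d %d %d %d\n", u[0], u[1], u[2], u[3],
         v[0], v[1], v[2], v[3]);  // U: 10 11 12 13  V: 20 21 22 23
  return 0;
}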
// Convert NV12 to I420.
LIBYUV_API
-int NV12ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- return X420ToI420(src_y, src_stride_y, src_stride_y,
- src_uv, src_stride_uv,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height);
+int NV12ToI420(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return X420ToI420(src_y, src_stride_y, src_stride_y, src_uv, src_stride_uv,
+ dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, width, height);
}
// Convert NV21 to I420. Same as NV12 but u and v pointers swapped.
LIBYUV_API
-int NV21ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_vu, int src_stride_vu,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- return X420ToI420(src_y, src_stride_y, src_stride_y,
- src_vu, src_stride_vu,
- dst_y, dst_stride_y,
- dst_v, dst_stride_v,
- dst_u, dst_stride_u,
- width, height);
+int NV21ToI420(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_vu,
+ int src_stride_vu,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return X420ToI420(src_y, src_stride_y, src_stride_y, src_vu, src_stride_vu,
+ dst_y, dst_stride_y, dst_v, dst_stride_v, dst_u,
+ dst_stride_u, width, height);
}
// Convert M420 to I420.
LIBYUV_API
-int M420ToI420(const uint8* src_m420, int src_stride_m420,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int M420ToI420(const uint8* src_m420,
+ int src_stride_m420,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
- src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
+ src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, dst_y,
+ dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
width, height);
}
// Convert YUY2 to I420.
LIBYUV_API
-int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int YUY2ToI420(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int y;
- void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width) = YUY2ToUVRow_C;
- void (*YUY2ToYRow)(const uint8* src_yuy2,
- uint8* dst_y, int width) = YUY2ToYRow_C;
+ void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2, uint8* dst_u,
+ uint8* dst_v, int width) = YUY2ToUVRow_C;
+ void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) =
+ YUY2ToYRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -429,6 +410,16 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
}
}
#endif
+#if defined(HAS_YUY2TOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ YUY2ToYRow = YUY2ToYRow_Any_MSA;
+ YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToYRow = YUY2ToYRow_MSA;
+ YUY2ToUVRow = YUY2ToUVRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
@@ -448,16 +439,21 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
// Convert UYVY to I420.
LIBYUV_API
-int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int UYVYToI420(const uint8* src_uyvy,
+ int src_stride_uyvy,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int y;
- void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width) = UYVYToUVRow_C;
- void (*UYVYToYRow)(const uint8* src_uyvy,
- uint8* dst_y, int width) = UYVYToYRow_C;
+ void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy, uint8* dst_u,
+ uint8* dst_v, int width) = UYVYToUVRow_C;
+ void (*UYVYToYRow)(const uint8* src_uyvy, uint8* dst_y, int width) =
+ UYVYToYRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -494,6 +490,16 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
}
}
#endif
+#if defined(HAS_UYVYTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ UYVYToYRow = UYVYToYRow_Any_MSA;
+ UYVYToUVRow = UYVYToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToYRow = UYVYToYRow_MSA;
+ UYVYToUVRow = UYVYToUVRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
@@ -513,19 +519,22 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
// Convert ARGB to I420.
LIBYUV_API
-int ARGBToI420(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int ARGBToI420(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int y;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+ uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C;
- if (!src_argb ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -570,6 +579,38 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
}
}
#endif
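+  // Runtime dispatch: each HAS_* block promotes the row-function pointer
+  // from the portable C default (set at the top of the function) to a SIMD
+  // variant once TestCpuFlag() reports the feature. *_Any_* forms handle
+  // arbitrary widths; the unsuffixed forms need the IS_ALIGNED() check.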
+#if defined(HAS_ARGBTOYROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToYRow = ARGBToYRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
@@ -589,19 +630,22 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
// Convert BGRA to I420.
LIBYUV_API
-int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int BGRAToI420(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int y;
- void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;
+ void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra, uint8* dst_u,
+ uint8* dst_v, int width) = BGRAToUVRow_C;
void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int width) =
BGRAToYRow_C;
- if (!src_bgra ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+ if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -629,12 +673,44 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
}
#endif
#if defined(HAS_BGRATOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- BGRAToUVRow = BGRAToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- BGRAToUVRow = BGRAToUVRow_NEON;
- }
+ if (TestCpuFlag(kCpuHasNEON)) {
+ BGRAToUVRow = BGRAToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToUVRow = BGRAToUVRow_NEON;
}
+ }
+#endif
+#if defined(HAS_BGRATOYROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ BGRAToYRow = BGRAToYRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ BGRAToYRow = BGRAToYRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ BGRAToUVRow = BGRAToUVRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToUVRow = BGRAToUVRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ BGRAToYRow = BGRAToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToYRow = BGRAToYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ BGRAToUVRow = BGRAToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToUVRow = BGRAToUVRow_MSA;
+ }
+ }
#endif
for (y = 0; y < height - 1; y += 2) {
@@ -655,19 +731,22 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
// Convert ABGR to I420.
LIBYUV_API
-int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int ABGRToI420(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int y;
- void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;
+ void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr, uint8* dst_u,
+ uint8* dst_v, int width) = ABGRToUVRow_C;
void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int width) =
ABGRToYRow_C;
- if (!src_abgr ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+ if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -702,6 +781,38 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
}
}
#endif
+#if defined(HAS_ABGRTOYROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ABGRToYRow = ABGRToYRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToYRow = ABGRToYRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ABGRToUVRow = ABGRToUVRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYRow = ABGRToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToUVRow = ABGRToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
@@ -721,19 +832,22 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
// Convert RGBA to I420.
LIBYUV_API
-int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int RGBAToI420(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int y;
- void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;
+ void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba, uint8* dst_u,
+ uint8* dst_v, int width) = RGBAToUVRow_C;
void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int width) =
RGBAToYRow_C;
- if (!src_rgba ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+ if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -768,6 +882,38 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
}
}
#endif
+#if defined(HAS_RGBATOYROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ RGBAToYRow = RGBAToYRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ RGBAToYRow = RGBAToYRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ RGBAToUVRow = RGBAToUVRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGBAToYRow = RGBAToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYRow = RGBAToYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGBAToUVRow = RGBAToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
@@ -787,27 +933,31 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
// Convert RGB24 to I420.
LIBYUV_API
-int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int RGB24ToI420(const uint8* src_rgb24,
+ int src_stride_rgb24,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int y;
-#if defined(HAS_RGB24TOYROW_NEON)
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
- uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
+ uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int width) =
RGB24ToYRow_C;
#else
void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
RGB24ToARGBRow_C;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+ uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C;
#endif
- if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+ if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -829,6 +979,15 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
}
}
}
+#elif defined(HAS_RGB24TOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_MSA;
+ RGB24ToYRow = RGB24ToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYRow = RGB24ToYRow_MSA;
+ RGB24ToUVRow = RGB24ToUVRow_MSA;
+ }
+ }
// Other platforms do intermediate conversion from RGB24 to ARGB.
#else
#if defined(HAS_RGB24TOARGBROW_SSSE3)
@@ -865,63 +1024,67 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
align_buffer_64(row, kRowSize * 2);
#endif
- for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RGB24TOYROW_NEON)
- RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
- RGB24ToYRow(src_rgb24, dst_y, width);
- RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+ for (y = 0; y < height - 1; y += 2) {
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+ RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+ RGB24ToYRow(src_rgb24, dst_y, width);
+ RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
#else
- RGB24ToARGBRow(src_rgb24, row, width);
- RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+ RGB24ToARGBRow(src_rgb24, row, width);
+ RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
#endif
- src_rgb24 += src_stride_rgb24 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if defined(HAS_RGB24TOYROW_NEON)
- RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
- RGB24ToYRow(src_rgb24, dst_y, width);
+ src_rgb24 += src_stride_rgb24 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+ RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
+ RGB24ToYRow(src_rgb24, dst_y, width);
#else
- RGB24ToARGBRow(src_rgb24, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
+ RGB24ToARGBRow(src_rgb24, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
#endif
- }
-#if !defined(HAS_RGB24TOYROW_NEON)
- free_aligned_buffer_64(row);
}
+#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+ free_aligned_buffer_64(row);
+}
#endif
- return 0;
+return 0;
}
// Convert RAW to I420.
LIBYUV_API
-int RAWToI420(const uint8* src_raw, int src_stride_raw,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int RAWToI420(const uint8* src_raw,
+ int src_stride_raw,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int y;
-#if defined(HAS_RAWTOYROW_NEON)
- void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,
- uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+ void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw, uint8* dst_u,
+ uint8* dst_v, int width) = RAWToUVRow_C;
void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int width) =
RAWToYRow_C;
#else
void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
RAWToARGBRow_C;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+ uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C;
#endif
- if (!src_raw || !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+ if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -943,99 +1106,121 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
}
}
}
+#elif defined(HAS_RAWTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToUVRow = RAWToUVRow_Any_MSA;
+ RAWToYRow = RAWToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYRow = RAWToYRow_MSA;
+ RAWToUVRow = RAWToUVRow_MSA;
+ }
+ }
// Other platforms do intermediate conversion from RAW to ARGB.
#else
#if defined(HAS_RAWTOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- RAWToARGBRow = RAWToARGBRow_SSSE3;
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
}
- }
#endif
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- ARGBToYRow = ARGBToYRow_SSSE3;
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
}
- }
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
- ARGBToYRow = ARGBToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
- ARGBToYRow = ARGBToYRow_AVX2;
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
}
- }
#endif
- {
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
+#if defined(HAS_RAWTOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ RAWToARGBRow = RAWToARGBRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 4)) {
+ RAWToARGBRow = RAWToARGBRow_DSPR2;
+ }
+ }
+#endif
+ {
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
#endif
- for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RAWTOYROW_NEON)
- RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
- RAWToYRow(src_raw, dst_y, width);
- RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+ for (y = 0; y < height - 1; y += 2) {
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+ RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+ RAWToYRow(src_raw, dst_y, width);
+ RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
#else
- RAWToARGBRow(src_raw, row, width);
- RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+ RAWToARGBRow(src_raw, row, width);
+ RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
#endif
- src_raw += src_stride_raw * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if defined(HAS_RAWTOYROW_NEON)
- RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
- RAWToYRow(src_raw, dst_y, width);
+ src_raw += src_stride_raw * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+ RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
+ RAWToYRow(src_raw, dst_y, width);
#else
- RAWToARGBRow(src_raw, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
+ RAWToARGBRow(src_raw, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
#endif
- }
-#if !defined(HAS_RAWTOYROW_NEON)
- free_aligned_buffer_64(row);
}
+#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+ free_aligned_buffer_64(row);
+}
#endif
- return 0;
+return 0;
}
// Convert RGB565 to I420.
LIBYUV_API
-int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int RGB565ToI420(const uint8* src_rgb565,
+ int src_stride_rgb565,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int y;
-#if defined(HAS_RGB565TOYROW_NEON)
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
- uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;
+ uint8* dst_u, uint8* dst_v, int width) =
+ RGB565ToUVRow_C;
void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int width) =
RGB565ToYRow_C;
#else
void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
RGB565ToARGBRow_C;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+ uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C;
#endif
- if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+ if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -1057,107 +1242,130 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
}
}
}
+#elif defined(HAS_RGB565TOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_MSA;
+ RGB565ToYRow = RGB565ToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToYRow = RGB565ToYRow_MSA;
+ RGB565ToUVRow = RGB565ToUVRow_MSA;
+ }
+ }
// Other platforms do intermediate conversion from RGB565 to ARGB.
#else
#if defined(HAS_RGB565TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
- }
- }
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+ }
+ }
#endif
#if defined(HAS_RGB565TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
- }
- }
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
+ }
+ }
#endif
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
- ARGBToYRow = ARGBToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
- ARGBToYRow = ARGBToYRow_AVX2;
- }
- }
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
#endif
- {
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
+#if defined(HAS_RGB565TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_DSPR2;
+ }
+ }
+#endif
+ {
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
#endif
- for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RGB565TOYROW_NEON)
- RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
- RGB565ToYRow(src_rgb565, dst_y, width);
- RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
+ for (y = 0; y < height - 1; y += 2) {
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+ RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
+ RGB565ToYRow(src_rgb565, dst_y, width);
+ RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
#else
- RGB565ToARGBRow(src_rgb565, row, width);
- RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+ RGB565ToARGBRow(src_rgb565, row, width);
+ RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
#endif
- src_rgb565 += src_stride_rgb565 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if defined(HAS_RGB565TOYROW_NEON)
- RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
- RGB565ToYRow(src_rgb565, dst_y, width);
+ src_rgb565 += src_stride_rgb565 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+ RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
+ RGB565ToYRow(src_rgb565, dst_y, width);
#else
- RGB565ToARGBRow(src_rgb565, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
+ RGB565ToARGBRow(src_rgb565, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
#endif
- }
-#if !defined(HAS_RGB565TOYROW_NEON)
- free_aligned_buffer_64(row);
}
+#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+ free_aligned_buffer_64(row);
+}
#endif
- return 0;
+return 0;
}
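
The CPU dispatch blocks above all follow one shape: probe a capability with TestCpuFlag, switch the function pointer to the _Any_ variant (which runs the SIMD body and finishes the ragged tail in C, so it accepts any width), then upgrade to the full-SIMD kernel once the width meets its alignment requirement. A condensed sketch of that pattern, with a hypothetical kernel trio standing in for the real row functions:

#include <stdint.h>

typedef void (*RowFn)(const uint8_t* src, uint8_t* dst, int width);

// Pick a row kernel the way the #if blocks above do. c_ver, any_ver
// and aligned_ver are placeholders, not real libyuv symbols.
static RowFn ChooseRow(int width, int cpu_has_simd,
                       RowFn c_ver, RowFn any_ver, RowFn aligned_ver) {
  RowFn fn = c_ver;           // portable fallback
  if (cpu_has_simd) {
    fn = any_ver;             // SIMD body plus C tail: any width
    if ((width & 15) == 0) {  // IS_ALIGNED(width, 16)
      fn = aligned_ver;       // pure SIMD, no tail handling
    }
  }
  return fn;
}
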
// Convert ARGB1555 to I420.
LIBYUV_API
-int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int ARGB1555ToI420(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int y;
-#if defined(HAS_ARGB1555TOYROW_NEON)
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
- uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C;
+ uint8* dst_u, uint8* dst_v, int width) =
+ ARGB1555ToUVRow_C;
void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int width) =
ARGB1555ToYRow_C;
#else
void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
ARGB1555ToARGBRow_C;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+ uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C;
#endif
- if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+ if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -1179,109 +1387,124 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
}
}
}
+#elif defined(HAS_ARGB1555TOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA;
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_MSA;
+ ARGB1555ToUVRow = ARGB1555ToUVRow_MSA;
+ }
+ }
// Other platforms do intermediate conversion from ARGB1555 to ARGB.
#else
#if defined(HAS_ARGB1555TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
- }
- }
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+ }
+ }
#endif
#if defined(HAS_ARGB1555TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
- }
- }
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
+ }
+ }
#endif
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
- ARGBToYRow = ARGBToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
- ARGBToYRow = ARGBToYRow_AVX2;
- }
- }
-#endif
- {
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+ {
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
#endif
- for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_ARGB1555TOYROW_NEON)
- ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
- ARGB1555ToYRow(src_argb1555, dst_y, width);
- ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
- width);
+ for (y = 0; y < height - 1; y += 2) {
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+ ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
+ ARGB1555ToYRow(src_argb1555, dst_y, width);
+ ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
+ width);
#else
- ARGB1555ToARGBRow(src_argb1555, row, width);
- ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
- width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
- src_argb1555 += src_stride_argb1555 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if defined(HAS_ARGB1555TOYROW_NEON)
- ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
- ARGB1555ToYRow(src_argb1555, dst_y, width);
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
+ width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_argb1555 += src_stride_argb1555 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+ ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
+ ARGB1555ToYRow(src_argb1555, dst_y, width);
#else
- ARGB1555ToARGBRow(src_argb1555, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
#endif
- }
-#if !defined(HAS_ARGB1555TOYROW_NEON)
- free_aligned_buffer_64(row);
}
+#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+ free_aligned_buffer_64(row);
+}
#endif
- return 0;
+return 0;
}
// Convert ARGB4444 to I420.
LIBYUV_API
-int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int ARGB4444ToI420(const uint8* src_argb4444,
+ int src_stride_argb4444,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int y;
#if defined(HAS_ARGB4444TOYROW_NEON)
void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
- uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C;
+ uint8* dst_u, uint8* dst_v, int width) =
+ ARGB4444ToUVRow_C;
void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int width) =
ARGB4444ToYRow_C;
#else
void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
ARGB4444ToARGBRow_C;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+ uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C;
#endif
- if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+ if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -1321,6 +1544,14 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
}
}
#endif
+#if defined(HAS_ARGB4444TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
@@ -1341,18 +1572,30 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+ }
+#endif
{
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
- for (y = 0; y < height - 1; y += 2) {
+ for (y = 0; y < height - 1; y += 2) {
#if defined(HAS_ARGB4444TOYROW_NEON)
- ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
- ARGB4444ToYRow(src_argb4444, dst_y, width);
- ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
- width);
+ ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
+ ARGB4444ToYRow(src_argb4444, dst_y, width);
+ ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
+ width);
#else
ARGB4444ToARGBRow(src_argb4444, row, width);
ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
@@ -1361,25 +1604,107 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
ARGBToYRow(row, dst_y, width);
ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
#endif
- src_argb4444 += src_stride_argb4444 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
+ src_argb4444 += src_stride_argb4444 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
#if defined(HAS_ARGB4444TOYROW_NEON)
- ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
- ARGB4444ToYRow(src_argb4444, dst_y, width);
+ ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
+ ARGB4444ToYRow(src_argb4444, dst_y, width);
#else
ARGB4444ToARGBRow(src_argb4444, row, width);
ARGBToUVRow(row, 0, dst_u, dst_v, width);
ARGBToYRow(row, dst_y, width);
#endif
- }
-#if !defined(HAS_ARGB4444TOYROW_NEON)
- free_aligned_buffer_64(row);
}
+#if !defined(HAS_ARGB4444TOYROW_NEON)
+ free_aligned_buffer_64(row);
+}
#endif
+return 0;
+}
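
On platforms without a fused 16-bit-to-YUV row kernel, the three converters above stage each pair of source rows through a temporary ARGB buffer and then run the ordinary ARGB-to-Y/UV kernels on it. kRowSize rounds one staged row up to a multiple of 32 bytes so both rows stay SIMD-friendly. A minimal sketch of that rounding, in plain C:

#include <assert.h>

// Round a byte count up to the next multiple of 32, as the
// (width * 4 + 31) & ~31 expression above does for kRowSize.
static int RoundUp32(int bytes) {
  return (bytes + 31) & ~31;
}

int main(void) {
  assert(RoundUp32(4 * 1) == 32);  // 1 ARGB pixel still gets 32 bytes
  assert(RoundUp32(4 * 8) == 32);  // exactly aligned: unchanged
  assert(RoundUp32(4 * 9) == 64);  // 36 bytes -> next 32-byte step
  return 0;
}
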
+
+static void SplitPixels(const uint8* src_u,
+ int src_pixel_stride_uv,
+ uint8* dst_u,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ *dst_u = *src_u;
+ ++dst_u;
+ src_u += src_pixel_stride_uv;
+ }
+}
+
+// Convert Android420 to I420.
+LIBYUV_API
+int Android420ToI420(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ const ptrdiff_t vu_off = src_v - src_u;
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+ // Copy UV planes as is - I420
+ if (src_pixel_stride_uv == 1) {
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+ return 0;
+ // Split UV planes - NV21
+ } else if (src_pixel_stride_uv == 2 && vu_off == -1 &&
+ src_stride_u == src_stride_v) {
+ SplitUVPlane(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u,
+ halfwidth, halfheight);
+ return 0;
+ // Split UV planes - NV12
+ } else if (src_pixel_stride_uv == 2 && vu_off == 1 &&
+ src_stride_u == src_stride_v) {
+ SplitUVPlane(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v,
+ halfwidth, halfheight);
+ return 0;
+ }
+
+ for (y = 0; y < halfheight; ++y) {
+ SplitPixels(src_u, src_pixel_stride_uv, dst_u, halfwidth);
+ SplitPixels(src_v, src_pixel_stride_uv, dst_v, halfwidth);
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
return 0;
}
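
Android420ToI420 above covers the three chroma layouts Android's YUV_420_888 image format can report: pixel stride 1 is plain I420 and is copied, pixel stride 2 with adjacent U/V pointers is NV21 or NV12 (told apart by the sign of vu_off) and is split with SplitUVPlane, and anything else falls back to the per-pixel SplitPixels loop. A hedged usage sketch — the dimensions and buffer names are illustrative, only Android420ToI420 itself comes from this patch:

#include "libyuv/convert.h"

// Illustrative: a 640x480 frame whose chroma is interleaved VUVU...
// (NV21-style), as ImageReader often reports with getPixelStride() == 2.
int ToI420FromAndroid(const uint8* y, int y_stride,
                      const uint8* u, int u_stride,
                      const uint8* v, int v_stride,
                      int uv_pixel_stride,
                      uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  // With uv_pixel_stride == 2 and v == u - 1 this takes the fast
  // SplitUVPlane path; uv_pixel_stride == 1 degenerates to CopyPlane.
  return Android420ToI420(y, y_stride, u, u_stride, v, v_stride,
                          uv_pixel_stride, dst_y, 640, dst_u, 320,
                          dst_v, 320, 640, 480);
}
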
diff --git a/files/source/convert_argb.cc b/files/source/convert_argb.cc
index fb9582d6..5007bdb9 100644
--- a/files/source/convert_argb.cc
+++ b/files/source/convert_argb.cc
@@ -26,11 +26,13 @@ extern "C" {
// Copy ARGB with optional flipping
LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- if (!src_argb || !dst_argb ||
- width <= 0 || height == 0) {
+int ARGBCopy(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -40,27 +42,29 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
- CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
- width * 4, height);
+ CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width * 4,
+ height);
return 0;
}
// Convert I420 to ARGB with matrix
-static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
+static int I420ToARGBMatrix(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
const struct YuvConstants* yuvconstants,
- int width, int height) {
+ int width,
+ int height) {
int y;
- void (*I422ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I422ToARGBRow_C;
- if (!src_y || !src_u || !src_v || !dst_argb ||
- width <= 0 || height == 0) {
+ void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToARGBRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -102,6 +106,14 @@ static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y,
I422ToARGBRow = I422ToARGBRow_DSPR2;
}
#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
@@ -117,111 +129,130 @@ static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y,
// Convert I420 to ARGB.
LIBYUV_API
-int I420ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return I420ToARGBMatrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_argb, dst_stride_argb,
- &kYuvI601Constants,
- width, height);
+int I420ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
}
// Convert I420 to ABGR.
LIBYUV_API
-int I420ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height) {
- return I420ToARGBMatrix(src_y, src_stride_y,
- src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u,
- dst_abgr, dst_stride_abgr,
+int I420ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
&kYvuI601Constants, // Use Yvu matrix
width, height);
}
// Convert J420 to ARGB.
LIBYUV_API
-int J420ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return I420ToARGBMatrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_argb, dst_stride_argb,
- &kYuvJPEGConstants,
- width, height);
+int J420ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvJPEGConstants, width, height);
}
// Convert J420 to ABGR.
LIBYUV_API
-int J420ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height) {
- return I420ToARGBMatrix(src_y, src_stride_y,
- src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u,
- dst_abgr, dst_stride_abgr,
+int J420ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
&kYvuJPEGConstants, // Use Yvu matrix
width, height);
}
// Convert H420 to ARGB.
LIBYUV_API
-int H420ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return I420ToARGBMatrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_argb, dst_stride_argb,
- &kYuvH709Constants,
- width, height);
+int H420ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvH709Constants, width, height);
}
// Convert H420 to ABGR.
LIBYUV_API
-int H420ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height) {
- return I420ToARGBMatrix(src_y, src_stride_y,
- src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u,
- dst_abgr, dst_stride_abgr,
+int H420ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
&kYvuH709Constants, // Use Yvu matrix
width, height);
}
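
Every *ToABGR wrapper above reuses the ARGB matrix path by swapping the U and V plane arguments and selecting the mirrored kYvu* constants; with both the inputs and the coefficient columns exchanged, the value the kernel writes into the "B" byte is really R, which is exactly the ARGB-to-ABGR byte swap. A plain-C illustration with the classic BT.601 integer constants (clamping omitted; the real kernels live in row_common.cc and the SIMD files):

typedef struct {
  int rv, gu, gv, bu;  // per-channel chroma coefficients, x256
} YuvCoeffs;

// Classic BT.601 video-range constants.
static const YuvCoeffs kYuv601 = {409, 100, 208, 516};
// Mirrored table: U and V roles exchanged (the kYvu* idea).
static const YuvCoeffs kYvu601 = {516, 208, 100, 409};

static void YuvToRgb(const YuvCoeffs* k, int y, int u, int v,
                     int* r, int* g, int* b) {
  int c = (y - 16) * 298;
  *r = (c + k->rv * (v - 128) + 128) >> 8;
  *g = (c - k->gu * (u - 128) - k->gv * (v - 128) + 128) >> 8;
  *b = (c + k->bu * (u - 128) + 128) >> 8;
}
// YuvToRgb(&kYvu601, y, v, u, &r2, &g2, &b2) yields r2 == b, g2 == g,
// b2 == r relative to YuvToRgb(&kYuv601, y, u, v, &r, &g, &b): the
// outputs trade places, so bytes stored in ARGB order read as ABGR.
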
// Convert I422 to ARGB with matrix
-static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
+static int I422ToARGBMatrix(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
const struct YuvConstants* yuvconstants,
- int width, int height) {
+ int width,
+ int height) {
int y;
- void (*I422ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I422ToARGBRow_C;
- if (!src_y || !src_u || !src_v ||
- !dst_argb ||
- width <= 0 || height == 0) {
+ void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToARGBRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -231,10 +262,8 @@ static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y,
dst_stride_argb = -dst_stride_argb;
}
// Coalesce rows.
- if (src_stride_y == width &&
- src_stride_u * 2 == width &&
- src_stride_v * 2 == width &&
- dst_stride_argb == width * 4) {
+ if (src_stride_y == width && src_stride_u * 2 == width &&
+ src_stride_v * 2 == width && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
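
Two conventions recur through all of these entry points: a negative height flips the image by starting the destination at its last row and negating the stride, and "coalescing" collapses a tightly packed image into a single long row so the per-row loop and pointer stepping run once. A worked sketch of the coalesce test with made-up dimensions:

#include <stdio.h>

// A 6x4 I422 frame whose strides exactly match the packed data.
int main(void) {
  int width = 6, height = 4;
  int src_stride_y = 6, src_stride_u = 3, src_stride_v = 3;
  int dst_stride_argb = 6 * 4;  // packed ARGB row: width * 4 bytes

  // Same test as above: every stride equals the packed row size, so
  // rows abut in memory and the image is one contiguous run.
  if (src_stride_y == width && src_stride_u * 2 == width &&
      src_stride_v * 2 == width && dst_stride_argb == width * 4) {
    width *= height;  // 24 "pixels" in one pass
    height = 1;       // a single inner-loop iteration
    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
  }
  printf("%d x %d\n", width, height);  // prints 24 x 1
  return 0;
}
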
@@ -272,6 +301,14 @@ static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y,
I422ToARGBRow = I422ToARGBRow_DSPR2;
}
#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
@@ -285,111 +322,130 @@ static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y,
// Convert I422 to ARGB.
LIBYUV_API
-int I422ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return I422ToARGBMatrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_argb, dst_stride_argb,
- &kYuvI601Constants,
- width, height);
+int I422ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
}
// Convert I422 to ABGR.
LIBYUV_API
-int I422ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height) {
- return I422ToARGBMatrix(src_y, src_stride_y,
- src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u,
- dst_abgr, dst_stride_abgr,
+int I422ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
&kYvuI601Constants, // Use Yvu matrix
width, height);
}
// Convert J422 to ARGB.
LIBYUV_API
-int J422ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return I422ToARGBMatrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_argb, dst_stride_argb,
- &kYuvJPEGConstants,
- width, height);
+int J422ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvJPEGConstants, width, height);
}
// Convert J422 to ABGR.
LIBYUV_API
-int J422ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height) {
- return I422ToARGBMatrix(src_y, src_stride_y,
- src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u,
- dst_abgr, dst_stride_abgr,
+int J422ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
&kYvuJPEGConstants, // Use Yvu matrix
width, height);
}
// Convert H422 to ARGB.
LIBYUV_API
-int H422ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return I422ToARGBMatrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_argb, dst_stride_argb,
- &kYuvH709Constants,
- width, height);
+int H422ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvH709Constants, width, height);
}
// Convert H422 to ABGR.
LIBYUV_API
-int H422ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height) {
- return I422ToARGBMatrix(src_y, src_stride_y,
- src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u,
- dst_abgr, dst_stride_abgr,
+int H422ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
&kYvuH709Constants, // Use Yvu matrix
width, height);
}
// Convert I444 to ARGB with matrix
-static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
+static int I444ToARGBMatrix(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
const struct YuvConstants* yuvconstants,
- int width, int height) {
+ int width,
+ int height) {
int y;
- void (*I444ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I444ToARGBRow_C;
- if (!src_y || !src_u || !src_v ||
- !dst_argb ||
- width <= 0 || height == 0) {
+ void (*I444ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I444ToARGBRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -399,9 +455,7 @@ static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y,
dst_stride_argb = -dst_stride_argb;
}
// Coalesce rows.
- if (src_stride_y == width &&
- src_stride_u == width &&
- src_stride_v == width &&
+ if (src_stride_y == width && src_stride_u == width && src_stride_v == width &&
dst_stride_argb == width * 4) {
width *= height;
height = 1;
@@ -431,6 +485,22 @@ static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y,
}
}
#endif
+#if defined(HAS_I444TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ I444ToARGBRow = I444ToARGBRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444ToARGBRow = I444ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
@@ -444,138 +514,81 @@ static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y,
// Convert I444 to ARGB.
LIBYUV_API
-int I444ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return I444ToARGBMatrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_argb, dst_stride_argb,
- &kYuvI601Constants,
- width, height);
+int I444ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
}
// Convert I444 to ABGR.
LIBYUV_API
-int I444ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height) {
- return I444ToARGBMatrix(src_y, src_stride_y,
- src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u,
- dst_abgr, dst_stride_abgr,
+int I444ToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
&kYvuI601Constants, // Use Yvu matrix
width, height);
}
// Convert J444 to ARGB.
LIBYUV_API
-int J444ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return I444ToARGBMatrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_argb, dst_stride_argb,
- &kYuvJPEGConstants,
- width, height);
-}
-
-// Convert I411 to ARGB.
-LIBYUV_API
-int I411ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- int y;
- void (*I411ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I411ToARGBRow_C;
- if (!src_y || !src_u || !src_v ||
- !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_y == width &&
- src_stride_u * 4 == width &&
- src_stride_v * 4 == width &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
- }
-#if defined(HAS_I411TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I411ToARGBRow = I411ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I411TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I411ToARGBRow = I411ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I411ToARGBRow = I411ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I411TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I411ToARGBRow = I411ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I411ToARGBRow = I411ToARGBRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I411ToARGBRow(src_y, src_u, src_v, dst_argb, &kYuvI601Constants, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
+int J444ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvJPEGConstants, width, height);
}
// Convert I420 with Alpha to preattenuated ARGB.
-static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- const uint8* src_a, int src_stride_a,
- uint8* dst_argb, int dst_stride_argb,
+static int I420AlphaToARGBMatrix(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ const uint8* src_a,
+ int src_stride_a,
+ uint8* dst_argb,
+ int dst_stride_argb,
const struct YuvConstants* yuvconstants,
- int width, int height, int attenuate) {
+ int width,
+ int height,
+ int attenuate) {
int y;
- void (*I422AlphaToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
+ void (*I422AlphaToARGBRow)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, const uint8* a_buf,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) = I422AlphaToARGBRow_C;
- void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
- int width) = ARGBAttenuateRow_C;
- if (!src_y || !src_u || !src_v || !dst_argb ||
- width <= 0 || height == 0) {
+ void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+ ARGBAttenuateRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -617,6 +630,14 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y,
I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2;
}
#endif
+#if defined(HAS_I422ALPHATOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_ARGBATTENUATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
@@ -641,6 +662,14 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y,
}
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
@@ -661,49 +690,59 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y,
// Convert I420 with Alpha to ARGB.
LIBYUV_API
-int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- const uint8* src_a, int src_stride_a,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height, int attenuate) {
- return I420AlphaToARGBMatrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- src_a, src_stride_a,
- dst_argb, dst_stride_argb,
- &kYuvI601Constants,
- width, height, attenuate);
+int I420AlphaToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ const uint8* src_a,
+ int src_stride_a,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int attenuate) {
+ return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, src_a, src_stride_a, dst_argb,
+ dst_stride_argb, &kYuvI601Constants, width,
+ height, attenuate);
}
// Convert I420 with Alpha to ABGR.
LIBYUV_API
-int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- const uint8* src_a, int src_stride_a,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height, int attenuate) {
- return I420AlphaToARGBMatrix(src_y, src_stride_y,
- src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u,
- src_a, src_stride_a,
- dst_abgr, dst_stride_abgr,
- &kYvuI601Constants, // Use Yvu matrix
- width, height, attenuate);
+int I420AlphaToABGR(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ const uint8* src_a,
+ int src_stride_a,
+ uint8* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height,
+ int attenuate) {
+ return I420AlphaToARGBMatrix(
+ src_y, src_stride_y, src_v, src_stride_v, // Swap U and V
+ src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height, attenuate);
}
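
When attenuate is non-zero, each converted row is passed through an ARGBAttenuateRow variant that premultiplies the color channels by the pixel's alpha before the row is stored. A minimal scalar sketch of the idea — the rounding here (x * a / 255) is illustrative; libyuv's row kernels use cheaper shift-based approximations that may differ slightly:

#include <stdint.h>

// Premultiply one pixel's color channels by its alpha.
// libyuv ARGB byte order in memory is B, G, R, A.
static void AttenuatePixel(uint8_t bgra[4]) {
  uint32_t a = bgra[3];
  bgra[0] = (uint8_t)((bgra[0] * a) / 255);  // B
  bgra[1] = (uint8_t)((bgra[1] * a) / 255);  // G
  bgra[2] = (uint8_t)((bgra[2] * a) / 255);  // R
}
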
// Convert I400 to ARGB.
LIBYUV_API
-int I400ToARGB(const uint8* src_y, int src_stride_y,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int I400ToARGB(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
- void (*I400ToARGBRow)(const uint8* y_buf,
- uint8* rgb_buf,
- int width) = I400ToARGBRow_C;
- if (!src_y || !dst_argb ||
- width <= 0 || height == 0) {
+ void (*I400ToARGBRow)(const uint8* y_buf, uint8* rgb_buf, int width) =
+ I400ToARGBRow_C;
+ if (!src_y || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -713,8 +752,7 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
dst_stride_argb = -dst_stride_argb;
}
// Coalesce rows.
- if (src_stride_y == width &&
- dst_stride_argb == width * 4) {
+ if (src_stride_y == width && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_y = dst_stride_argb = 0;
@@ -743,6 +781,14 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
}
}
#endif
+#if defined(HAS_I400TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I400ToARGBRow = I400ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ I400ToARGBRow = I400ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I400ToARGBRow(src_y, dst_argb, width);
@@ -754,14 +800,16 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
// Convert J400 to ARGB.
LIBYUV_API
-int J400ToARGB(const uint8* src_y, int src_stride_y,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int J400ToARGB(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int width) =
J400ToARGBRow_C;
- if (!src_y || !dst_argb ||
- width <= 0 || height == 0) {
+ if (!src_y || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -771,8 +819,7 @@ int J400ToARGB(const uint8* src_y, int src_stride_y,
src_stride_y = -src_stride_y;
}
// Coalesce rows.
- if (src_stride_y == width &&
- dst_stride_argb == width * 4) {
+ if (src_stride_y == width && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_y = dst_stride_argb = 0;
@@ -801,6 +848,14 @@ int J400ToARGB(const uint8* src_y, int src_stride_y,
}
}
#endif
+#if defined(HAS_J400TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ J400ToARGBRow = J400ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ J400ToARGBRow = J400ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
J400ToARGBRow(src_y, dst_argb, width);
src_y += src_stride_y;
@@ -810,85 +865,89 @@ int J400ToARGB(const uint8* src_y, int src_stride_y,
}
// Shuffle table for converting BGRA to ARGB.
-static uvec8 kShuffleMaskBGRAToARGB = {
- 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
-};
+static uvec8 kShuffleMaskBGRAToARGB = {3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u,
+ 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u};
// Shuffle table for converting ABGR to ARGB.
-static uvec8 kShuffleMaskABGRToARGB = {
- 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
-};
+static uvec8 kShuffleMaskABGRToARGB = {2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u,
+ 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u};
// Shuffle table for converting RGBA to ARGB.
-static uvec8 kShuffleMaskRGBAToARGB = {
- 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
-};
+static uvec8 kShuffleMaskRGBAToARGB = {1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u,
+ 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u};
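
Each mask above lists, for every output byte of a 16-byte (four-pixel) block, which source byte to copy, so ARGBShuffle can convert four pixels with a single pshufb-style permute; reversing each group of four (3,2,1,0) is what turns BGRA byte order into ARGB. A scalar model of how such a mask is applied, for illustration:

#include <stdint.h>

// Scalar equivalent of the SSSE3 pshufb permute behind ARGBShuffle,
// valid for in-range mask entries (0..15).
static void ShuffleBlock16(const uint8_t src[16], const uint8_t mask[16],
                           uint8_t dst[16]) {
  int i;
  for (i = 0; i < 16; ++i) {
    dst[i] = src[mask[i]];
  }
}
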
// Convert BGRA to ARGB.
LIBYUV_API
-int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return ARGBShuffle(src_bgra, src_stride_bgra,
- dst_argb, dst_stride_argb,
- (const uint8*)(&kShuffleMaskBGRAToARGB),
- width, height);
+int BGRAToARGB(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskBGRAToARGB), width, height);
}
// Convert ARGB to BGRA (same as BGRAToARGB).
LIBYUV_API
-int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return ARGBShuffle(src_bgra, src_stride_bgra,
- dst_argb, dst_stride_argb,
- (const uint8*)(&kShuffleMaskBGRAToARGB),
- width, height);
+int ARGBToBGRA(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskBGRAToARGB), width, height);
}
// Convert ABGR to ARGB.
LIBYUV_API
-int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return ARGBShuffle(src_abgr, src_stride_abgr,
- dst_argb, dst_stride_argb,
- (const uint8*)(&kShuffleMaskABGRToARGB),
- width, height);
+int ABGRToARGB(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskABGRToARGB), width, height);
}
// Convert ARGB to ABGR (same as ABGRToARGB).
LIBYUV_API
-int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return ARGBShuffle(src_abgr, src_stride_abgr,
- dst_argb, dst_stride_argb,
- (const uint8*)(&kShuffleMaskABGRToARGB),
- width, height);
+int ARGBToABGR(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskABGRToARGB), width, height);
}
// Convert RGBA to ARGB.
LIBYUV_API
-int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return ARGBShuffle(src_rgba, src_stride_rgba,
- dst_argb, dst_stride_argb,
- (const uint8*)(&kShuffleMaskRGBAToARGB),
- width, height);
+int RGBAToARGB(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskRGBAToARGB), width, height);
}
// Convert RGB24 to ARGB.
LIBYUV_API
-int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int RGB24ToARGB(const uint8* src_rgb24,
+ int src_stride_rgb24,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
RGB24ToARGBRow_C;
- if (!src_rgb24 || !dst_argb ||
- width <= 0 || height == 0) {
+ if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -898,8 +957,7 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
src_stride_rgb24 = -src_stride_rgb24;
}
// Coalesce rows.
- if (src_stride_rgb24 == width * 3 &&
- dst_stride_argb == width * 4) {
+ if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_rgb24 = dst_stride_argb = 0;
@@ -920,6 +978,22 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
}
}
#endif
+#if defined(HAS_RGB24TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
RGB24ToARGBRow(src_rgb24, dst_argb, width);
@@ -931,14 +1005,16 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
// Convert RAW to ARGB.
LIBYUV_API
-int RAWToARGB(const uint8* src_raw, int src_stride_raw,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int RAWToARGB(const uint8* src_raw,
+ int src_stride_raw,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
RAWToARGBRow_C;
- if (!src_raw || !dst_argb ||
- width <= 0 || height == 0) {
+ if (!src_raw || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -948,8 +1024,7 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
src_stride_raw = -src_stride_raw;
}
// Coalesce rows.
- if (src_stride_raw == width * 3 &&
- dst_stride_argb == width * 4) {
+ if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_raw = dst_stride_argb = 0;
@@ -970,6 +1045,22 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
}
}
#endif
+#if defined(HAS_RAWTOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ RAWToARGBRow = RAWToARGBRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToARGBRow = RAWToARGBRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToARGBRow = RAWToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
RAWToARGBRow(src_raw, dst_argb, width);
@@ -981,14 +1072,16 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
// Convert RGB565 to ARGB.
LIBYUV_API
-int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int RGB565ToARGB(const uint8* src_rgb565,
+ int src_stride_rgb565,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int width) =
RGB565ToARGBRow_C;
- if (!src_rgb565 || !dst_argb ||
- width <= 0 || height == 0) {
+ if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -998,8 +1091,7 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
src_stride_rgb565 = -src_stride_rgb565;
}
// Coalesce rows.
- if (src_stride_rgb565 == width * 2 &&
- dst_stride_argb == width * 4) {
+ if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_rgb565 = dst_stride_argb = 0;
@@ -1028,6 +1120,22 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
}
}
#endif
+#if defined(HAS_RGB565TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
RGB565ToARGBRow(src_rgb565, dst_argb, width);
@@ -1039,14 +1147,16 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
// Convert ARGB1555 to ARGB.
LIBYUV_API
-int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int ARGB1555ToARGB(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
- int width) = ARGB1555ToARGBRow_C;
- if (!src_argb1555 || !dst_argb ||
- width <= 0 || height == 0) {
+ int width) = ARGB1555ToARGBRow_C;
+ if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -1056,8 +1166,7 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
src_stride_argb1555 = -src_stride_argb1555;
}
// Coalesce rows.
- if (src_stride_argb1555 == width * 2 &&
- dst_stride_argb == width * 4) {
+ if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_argb1555 = dst_stride_argb = 0;
@@ -1086,6 +1195,22 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
}
}
#endif
+#if defined(HAS_ARGB1555TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
@@ -1097,14 +1222,16 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
// Convert ARGB4444 to ARGB.
LIBYUV_API
-int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int ARGB4444ToARGB(const uint8* src_argb4444,
+ int src_stride_argb4444,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
- int width) = ARGB4444ToARGBRow_C;
- if (!src_argb4444 || !dst_argb ||
- width <= 0 || height == 0) {
+ int width) = ARGB4444ToARGBRow_C;
+ if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -1114,8 +1241,7 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
src_stride_argb4444 = -src_stride_argb4444;
}
// Coalesce rows.
- if (src_stride_argb4444 == width * 2 &&
- dst_stride_argb == width * 4) {
+ if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_argb4444 = dst_stride_argb = 0;
@@ -1144,6 +1270,22 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
}
}
#endif
+#if defined(HAS_ARGB4444TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
@@ -1155,18 +1297,19 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
// Convert NV12 to ARGB.
LIBYUV_API
-int NV12ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int NV12ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
- void (*NV12ToARGBRow)(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = NV12ToARGBRow_C;
- if (!src_y || !src_uv || !dst_argb ||
- width <= 0 || height == 0) {
+ void (*NV12ToARGBRow)(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ NV12ToARGBRow_C;
+ if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -1199,6 +1342,22 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
}
}
#endif
+#if defined(HAS_NV12TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
NV12ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
@@ -1213,18 +1372,19 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
// Convert NV21 to ARGB.
LIBYUV_API
-int NV21ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int NV21ToARGB(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
- void (*NV21ToARGBRow)(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = NV21ToARGBRow_C;
- if (!src_y || !src_uv || !dst_argb ||
- width <= 0 || height == 0) {
+ void (*NV21ToARGBRow)(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ NV21ToARGBRow_C;
+ if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -1257,6 +1417,14 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
}
}
#endif
+#if defined(HAS_NV21TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
NV21ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
@@ -1271,17 +1439,17 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
// Convert M420 to ARGB.
LIBYUV_API
-int M420ToARGB(const uint8* src_m420, int src_stride_m420,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int M420ToARGB(const uint8* src_m420,
+ int src_stride_m420,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
- void (*NV12ToARGBRow)(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = NV12ToARGBRow_C;
- if (!src_m420 || !dst_argb ||
- width <= 0 || height == 0) {
+ void (*NV12ToARGBRow)(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ NV12ToARGBRow_C;
+ if (!src_m420 || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -1314,6 +1482,22 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420,
}
}
#endif
+#if defined(HAS_NV12TOARGBROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
@@ -1332,17 +1516,17 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420,
// Convert YUY2 to ARGB.
LIBYUV_API
-int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int YUY2ToARGB(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
- void (*YUY2ToARGBRow)(const uint8* src_yuy2,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) =
+ void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb,
+ const struct YuvConstants* yuvconstants, int width) =
YUY2ToARGBRow_C;
- if (!src_yuy2 || !dst_argb ||
- width <= 0 || height == 0) {
+ if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -1352,8 +1536,7 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
src_stride_yuy2 = -src_stride_yuy2;
}
// Coalesce rows.
- if (src_stride_yuy2 == width * 2 &&
- dst_stride_argb == width * 4) {
+ if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_yuy2 = dst_stride_argb = 0;
@@ -1382,6 +1565,14 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
}
}
#endif
+#if defined(HAS_YUY2TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
src_yuy2 += src_stride_yuy2;
@@ -1392,17 +1583,17 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
// Convert UYVY to ARGB.
LIBYUV_API
-int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int UYVYToARGB(const uint8* src_uyvy,
+ int src_stride_uyvy,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
- void (*UYVYToARGBRow)(const uint8* src_uyvy,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) =
+ void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb,
+ const struct YuvConstants* yuvconstants, int width) =
UYVYToARGBRow_C;
- if (!src_uyvy || !dst_argb ||
- width <= 0 || height == 0) {
+ if (!src_uyvy || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -1412,8 +1603,7 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
src_stride_uyvy = -src_stride_uyvy;
}
// Coalesce rows.
- if (src_stride_uyvy == width * 2 &&
- dst_stride_argb == width * 4) {
+ if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_uyvy = dst_stride_argb = 0;
@@ -1442,6 +1632,14 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
}
}
#endif
+#if defined(HAS_UYVYTOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ UYVYToARGBRow = UYVYToARGBRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
src_uyvy += src_stride_uyvy;
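YUY2ToARGB and UYVYToARGB close out this file's MSA additions; both consume packed 4:2:2, differing only in byte order (Y0 U Y1 V versus U Y0 V Y1), and both fix the matrix to kYuvI601Constants. A hypothetical call site, with frame dimensions chosen only for illustration:

    // Hypothetical buffers: 640x480 YUY2 (2 bytes/pixel) to ARGB (4 bytes/pixel).
    static uint8 yuy2[640 * 480 * 2];
    static uint8 argb[640 * 480 * 4];
    YUY2ToARGB(yuy2, 640 * 2, argb, 640 * 4, 640, 480);  // BT.601, no flip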
diff --git a/files/source/convert_from.cc b/files/source/convert_from.cc
index 46abdebc..e6ff5243 100644
--- a/files/source/convert_from.cc
+++ b/files/source/convert_from.cc
@@ -30,107 +30,100 @@ static __inline int Abs(int v) {
}
// I420 To any I4xx YUV format with mirroring.
-static int I420ToI4xx(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int src_y_width, int src_y_height,
- int dst_uv_width, int dst_uv_height) {
+static int I420ToI4xx(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int src_y_width,
+ int src_y_height,
+ int dst_uv_width,
+ int dst_uv_height) {
const int dst_y_width = Abs(src_y_width);
const int dst_y_height = Abs(src_y_height);
const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);
const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
- if (src_y_width == 0 || src_y_height == 0 ||
- dst_uv_width <= 0 || dst_uv_height <= 0) {
+ if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 ||
+ dst_uv_height <= 0) {
return -1;
}
- ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
- dst_y, dst_stride_y, dst_y_width, dst_y_height,
- kFilterBilinear);
- ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
- dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
- kFilterBilinear);
- ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
- dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
- kFilterBilinear);
+ if (dst_y) {
+ ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
+ dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear);
+ }
+ ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
+ dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
+ ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
+ dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
return 0;
}
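I420ToI4xx now treats the Y plane as optional (a NULL dst_y skips it) and derives everything else from plane geometry: source chroma is SUBSAMPLE(w, 1, 1) by SUBSAMPLE(h, 1, 1), i.e. rounded-up halves, and ScalePlane with kFilterBilinear resamples to whatever chroma size the target format implies. Worked sizes, assuming a 5x5 I420 source:

    // w = h = 5: source chroma is ((5 + 1) >> 1) x ((5 + 1) >> 1) = 3x3.
    // I422 target chroma: 3x5 (half width, full height).
    // I444 target chroma: 5x5 (full resolution) - both reached via ScalePlane.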
// 420 chroma is 1/2 width, 1/2 height
// 422 chroma is 1/2 width, 1x height
LIBYUV_API
-int I420ToI422(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int I420ToI422(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
const int dst_uv_width = (Abs(width) + 1) >> 1;
const int dst_uv_height = Abs(height);
- return I420ToI4xx(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height,
- dst_uv_width, dst_uv_height);
+ return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, dst_uv_width,
+ dst_uv_height);
}
// 420 chroma is 1/2 width, 1/2 height
// 444 chroma is 1x width, 1x height
LIBYUV_API
-int I420ToI444(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int I420ToI444(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
const int dst_uv_width = Abs(width);
const int dst_uv_height = Abs(height);
- return I420ToI4xx(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height,
- dst_uv_width, dst_uv_height);
-}
-
-// 420 chroma is 1/2 width, 1/2 height
-// 411 chroma is 1/4 width, 1x height
-LIBYUV_API
-int I420ToI411(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- const int dst_uv_width = (Abs(width) + 3) >> 2;
- const int dst_uv_height = Abs(height);
- return I420ToI4xx(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height,
- dst_uv_width, dst_uv_height);
+ return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, dst_uv_width,
+ dst_uv_height);
}
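The I411 variant is deleted outright in this hunk, leaving I422 and I444 as the upsampling targets. A hypothetical call upsampling chroma to full resolution, buffer sizes assumed for illustration:

    // Hypothetical 64x64 frame: I420 chroma planes are 32x32.
    static uint8 src_y[64 * 64], src_u[32 * 32], src_v[32 * 32];
    static uint8 dst_y[64 * 64], dst_u[64 * 64], dst_v[64 * 64];
    I420ToI444(src_y, 64, src_u, 32, src_v, 32,
               dst_y, 64, dst_u, 64, dst_v, 64, 64, 64);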
// Copy to I400. Source can be I420,422,444,400,NV12,NV21
LIBYUV_API
-int I400Copy(const uint8* src_y, int src_stride_y,
- uint8* dst_y, int dst_stride_y,
- int width, int height) {
- if (!src_y || !dst_y ||
- width <= 0 || height == 0) {
+int I400Copy(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ if (!src_y || !dst_y || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -144,17 +137,21 @@ int I400Copy(const uint8* src_y, int src_stride_y,
}
LIBYUV_API
-int I422ToYUY2(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_yuy2, int dst_stride_yuy2,
- int width, int height) {
+int I422ToYUY2(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height) {
int y;
void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* dst_yuy2, int width) =
I422ToYUY2Row_C;
- if (!src_y || !src_u || !src_v || !dst_yuy2 ||
- width <= 0 || height == 0) {
+ if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -164,10 +161,8 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y,
dst_stride_yuy2 = -dst_stride_yuy2;
}
// Coalesce rows.
- if (src_stride_y == width &&
- src_stride_u * 2 == width &&
- src_stride_v * 2 == width &&
- dst_stride_yuy2 == width * 2) {
+ if (src_stride_y == width && src_stride_u * 2 == width &&
+ src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) {
width *= height;
height = 1;
src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
@@ -200,17 +195,21 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y,
}
LIBYUV_API
-int I420ToYUY2(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_yuy2, int dst_stride_yuy2,
- int width, int height) {
+int I420ToYUY2(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height) {
int y;
void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* dst_yuy2, int width) =
I422ToYUY2Row_C;
- if (!src_y || !src_u || !src_v || !dst_yuy2 ||
- width <= 0 || height == 0) {
+ if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -235,6 +234,14 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
}
}
#endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_MSA;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
@@ -252,17 +259,21 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
}
LIBYUV_API
-int I422ToUYVY(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_uyvy, int dst_stride_uyvy,
- int width, int height) {
+int I422ToUYVY(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_uyvy,
+ int dst_stride_uyvy,
+ int width,
+ int height) {
int y;
void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* dst_uyvy, int width) =
I422ToUYVYRow_C;
- if (!src_y || !src_u || !src_v || !dst_uyvy ||
- width <= 0 || height == 0) {
+ if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -272,10 +283,8 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
dst_stride_uyvy = -dst_stride_uyvy;
}
// Coalesce rows.
- if (src_stride_y == width &&
- src_stride_u * 2 == width &&
- src_stride_v * 2 == width &&
- dst_stride_uyvy == width * 2) {
+ if (src_stride_y == width && src_stride_u * 2 == width &&
+ src_stride_v * 2 == width && dst_stride_uyvy == width * 2) {
width *= height;
height = 1;
src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
@@ -296,6 +305,14 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
}
}
#endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
@@ -308,17 +325,21 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
}
LIBYUV_API
-int I420ToUYVY(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_uyvy, int dst_stride_uyvy,
- int width, int height) {
+int I420ToUYVY(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_uyvy,
+ int dst_stride_uyvy,
+ int width,
+ int height) {
int y;
void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* dst_uyvy, int width) =
I422ToUYVYRow_C;
- if (!src_y || !src_u || !src_v || !dst_uyvy ||
- width <= 0 || height == 0) {
+ if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -343,6 +364,14 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y,
}
}
#endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
@@ -359,113 +388,70 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y,
return 0;
}
+// TODO(fbarchard): test negative height for invert.
LIBYUV_API
-int I420ToNV12(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_uv, int dst_stride_uv,
- int width, int height) {
- int y;
- void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
- int width) = MergeUVRow_C;
- // Coalesce rows.
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
- width <= 0 || height == 0) {
+int I420ToNV12(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
+ height == 0) {
return -1;
}
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
- dst_stride_y = -dst_stride_y;
- dst_stride_uv = -dst_stride_uv;
- }
- if (src_stride_y == width &&
- dst_stride_y == width) {
- width *= height;
- height = 1;
- src_stride_y = dst_stride_y = 0;
- }
- // Coalesce rows.
- if (src_stride_u == halfwidth &&
- src_stride_v == halfwidth &&
- dst_stride_uv == halfwidth * 2) {
- halfwidth *= halfheight;
- halfheight = 1;
- src_stride_u = src_stride_v = dst_stride_uv = 0;
- }
-#if defined(HAS_MERGEUVROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- MergeUVRow_ = MergeUVRow_Any_SSE2;
- if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow_ = MergeUVRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_MERGEUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- MergeUVRow_ = MergeUVRow_Any_AVX2;
- if (IS_ALIGNED(halfwidth, 32)) {
- MergeUVRow_ = MergeUVRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_MERGEUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- MergeUVRow_ = MergeUVRow_Any_NEON;
- if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow_ = MergeUVRow_NEON;
- }
- }
-#endif
-
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- for (y = 0; y < halfheight; ++y) {
- // Merge a row of U and V into a row of UV.
- MergeUVRow_(src_u, src_v, dst_uv, halfwidth);
- src_u += src_stride_u;
- src_v += src_stride_v;
- dst_uv += dst_stride_uv;
+ int halfwidth = (width + 1) / 2;
+ int halfheight = height > 0 ? (height + 1) / 2 : (height - 1) / 2;
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
+ MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv,
+ halfwidth, halfheight);
return 0;
}
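The NV12 path no longer hand-rolls the SSE2/AVX2/NEON MergeUVRow dispatch; it defers to MergeUVPlane and keeps the sign of height so the flip convention still propagates: (height + 1) / 2 when positive, (height - 1) / 2 when negative. Worked arithmetic:

    // height =  479 -> halfheight = ( 479 + 1) / 2 =  240
    // height = -479 -> halfheight = (-479 - 1) / 2 = -240
    // so CopyPlane and MergeUVPlane both still see "negative means invert".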
LIBYUV_API
-int I420ToNV21(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_vu, int dst_stride_vu,
- int width, int height) {
- return I420ToNV12(src_y, src_stride_y,
- src_v, src_stride_v,
- src_u, src_stride_u,
- dst_y, dst_stride_y,
- dst_vu, dst_stride_vu,
+int I420ToNV21(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
width, height);
}
// Convert I420 to RGBA with matrix
-static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_rgba, int dst_stride_rgba,
+static int I420ToRGBAMatrix(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_rgba,
+ int dst_stride_rgba,
const struct YuvConstants* yuvconstants,
- int width, int height) {
+ int width,
+ int height) {
int y;
- void (*I422ToRGBARow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I422ToRGBARow_C;
- if (!src_y || !src_u || !src_v || !dst_rgba ||
- width <= 0 || height == 0) {
+ void (*I422ToRGBARow)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGBARow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -507,6 +493,14 @@ static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y,
I422ToRGBARow = I422ToRGBARow_DSPR2;
}
#endif
+#if defined(HAS_I422TORGBAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGBARow = I422ToRGBARow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
@@ -522,50 +516,58 @@ static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y,
// Convert I420 to RGBA.
LIBYUV_API
-int I420ToRGBA(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_rgba, int dst_stride_rgba,
- int width, int height) {
- return I420ToRGBAMatrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_rgba, dst_stride_rgba,
- &kYuvI601Constants,
- width, height);
+int I420ToRGBA(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgba, dst_stride_rgba,
+ &kYuvI601Constants, width, height);
}
// Convert I420 to BGRA.
LIBYUV_API
-int I420ToBGRA(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_bgra, int dst_stride_bgra,
- int width, int height) {
- return I420ToRGBAMatrix(src_y, src_stride_y,
- src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u,
- dst_bgra, dst_stride_bgra,
+int I420ToBGRA(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height) {
+ return I420ToRGBAMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_bgra, dst_stride_bgra,
&kYvuI601Constants, // Use Yvu matrix
width, height);
}
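The BGRA wrapper shows the swap trick used for every mirrored output order: pass V where the matrix path expects U (and vice versa) and select the YVU-ordered constants, and the row kernel's red math lands in the blue lane. With the nominal BT.601 coefficients (an assumption for illustration, not quoted from this diff):

    // R = Y + 1.596 * (V - 128)   and   B = Y + 2.018 * (U - 128)
    // Feeding (V, U) through kYvuI601Constants swaps which chroma drives
    // which coefficient, so the RGBA-ordered store comes out as BGRA.
    // I420ToRAW below plays the same trick against the RGB24 path.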
// Convert I420 to RGB24 with matrix
-static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_rgb24, int dst_stride_rgb24,
+static int I420ToRGB24Matrix(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_rgb24,
+ int dst_stride_rgb24,
const struct YuvConstants* yuvconstants,
- int width, int height) {
+ int width,
+ int height) {
int y;
- void (*I422ToRGB24Row)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I422ToRGB24Row_C;
- if (!src_y || !src_u || !src_v || !dst_rgb24 ||
- width <= 0 || height == 0) {
+ void (*I422ToRGB24Row)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB24Row_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -598,6 +600,14 @@ static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y,
}
}
#endif
+#if defined(HAS_I422TORGB24ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
@@ -613,50 +623,59 @@ static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y,
// Convert I420 to RGB24.
LIBYUV_API
-int I420ToRGB24(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_rgb24, int dst_stride_rgb24,
- int width, int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_rgb24, dst_stride_rgb24,
- &kYuvI601Constants,
- width, height);
+int I420ToRGB24(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvI601Constants, width, height);
}
// Convert I420 to RAW.
LIBYUV_API
-int I420ToRAW(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_raw, int dst_stride_raw,
- int width, int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y,
- src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u,
- dst_raw, dst_stride_raw,
+int I420ToRAW(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
&kYvuI601Constants, // Use Yvu matrix
width, height);
}
// Convert I420 to ARGB1555.
LIBYUV_API
-int I420ToARGB1555(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb1555, int dst_stride_argb1555,
- int width, int height) {
+int I420ToARGB1555(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb1555,
+ int dst_stride_argb1555,
+ int width,
+ int height) {
int y;
- void (*I422ToARGB1555Row)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
+ void (*I422ToARGB1555Row)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I422ToARGB1555Row_C;
- if (!src_y || !src_u || !src_v || !dst_argb1555 ||
- width <= 0 || height == 0) {
+ if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 ||
+ height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -689,6 +708,22 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
}
}
#endif
+#if defined(HAS_I422TOARGB1555ROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_DSPR2;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
@@ -703,23 +738,25 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
return 0;
}
-
// Convert I420 to ARGB4444.
LIBYUV_API
-int I420ToARGB4444(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb4444, int dst_stride_argb4444,
- int width, int height) {
+int I420ToARGB4444(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_argb4444,
+ int dst_stride_argb4444,
+ int width,
+ int height) {
int y;
- void (*I422ToARGB4444Row)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
+ void (*I422ToARGB4444Row)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I422ToARGB4444Row_C;
- if (!src_y || !src_u || !src_v || !dst_argb4444 ||
- width <= 0 || height == 0) {
+ if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 ||
+ height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -752,6 +789,22 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
}
}
#endif
+#if defined(HAS_I422TOARGB4444ROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_DSPR2;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
@@ -768,20 +821,22 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
// Convert I420 to RGB565.
LIBYUV_API
-int I420ToRGB565(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_rgb565, int dst_stride_rgb565,
- int width, int height) {
+int I420ToRGB565(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
int y;
- void (*I422ToRGB565Row)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I422ToRGB565Row_C;
- if (!src_y || !src_u || !src_v || !dst_rgb565 ||
- width <= 0 || height == 0) {
+ void (*I422ToRGB565Row)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB565Row_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -814,6 +869,14 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
}
}
#endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
@@ -829,30 +892,31 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
static const uint8 kDither565_4x4[16] = {
- 0, 4, 1, 5,
- 6, 2, 7, 3,
- 1, 5, 0, 4,
- 7, 3, 6, 2,
+ 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
};
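I420ToRGB565Dither consumes this table one row at a time: *(uint32*)(dither4x4 + ((y & 3) << 2)) packs four per-pixel offsets into one word, and the row kernel adds byte (x & 3) of it to each channel before truncating 888 to 565. Worked example, assuming little-endian byte order:

    // y = 6: (6 & 3) << 2 == 8, selecting the table row {1, 5, 0, 4}.
    // Little-endian load: dither4 = 0x04000501; pixel x adds byte (x & 3),
    // so consecutive pixels get offsets 1, 5, 0, 4 before 565 truncation.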
// Convert I420 to RGB565 with dithering.
LIBYUV_API
-int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_rgb565, int dst_stride_rgb565,
- const uint8* dither4x4, int width, int height) {
+int I420ToRGB565Dither(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_rgb565,
+ int dst_stride_rgb565,
+ const uint8* dither4x4,
+ int width,
+ int height) {
int y;
- void (*I422ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I422ToARGBRow_C;
+ void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToARGBRow_C;
void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
- if (!src_y || !src_u || !src_v || !dst_rgb565 ||
- width <= 0 || height == 0) {
+ const uint32 dither4, int width) =
+ ARGBToRGB565DitherRow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -896,6 +960,14 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
I422ToARGBRow = I422ToARGBRow_DSPR2;
}
#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
@@ -920,13 +992,22 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
}
}
#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
+ }
+ }
+#endif
{
// Allocate a row of argb.
align_buffer_64(row_argb, width * 4);
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
ARGBToRGB565DitherRow(row_argb, dst_rgb565,
- *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
+ *(uint32*)(dither4x4 + ((y & 3) << 2)),
+ width); // NOLINT
dst_rgb565 += dst_stride_rgb565;
src_y += src_stride_y;
if (y & 1) {
@@ -941,218 +1022,156 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
// Convert I420 to specified format
LIBYUV_API
-int ConvertFromI420(const uint8* y, int y_stride,
- const uint8* u, int u_stride,
- const uint8* v, int v_stride,
- uint8* dst_sample, int dst_sample_stride,
- int width, int height,
+int ConvertFromI420(const uint8* y,
+ int y_stride,
+ const uint8* u,
+ int u_stride,
+ const uint8* v,
+ int v_stride,
+ uint8* dst_sample,
+ int dst_sample_stride,
+ int width,
+ int height,
uint32 fourcc) {
uint32 format = CanonicalFourCC(fourcc);
int r = 0;
- if (!y || !u|| !v || !dst_sample ||
- width <= 0 || height == 0) {
+ if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) {
return -1;
}
switch (format) {
// Single plane formats
case FOURCC_YUY2:
- r = I420ToYUY2(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 2,
- width, height);
+ r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2, width,
+ height);
break;
case FOURCC_UYVY:
- r = I420ToUYVY(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 2,
- width, height);
+ r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2, width,
+ height);
break;
case FOURCC_RGBP:
- r = I420ToRGB565(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 2,
- width, height);
+ r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2, width,
+ height);
break;
case FOURCC_RGBO:
- r = I420ToARGB1555(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
+ r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample,
dst_sample_stride ? dst_sample_stride : width * 2,
width, height);
break;
case FOURCC_R444:
- r = I420ToARGB4444(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
+ r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
dst_sample_stride ? dst_sample_stride : width * 2,
width, height);
break;
case FOURCC_24BG:
- r = I420ToRGB24(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 3,
- width, height);
+ r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 3, width,
+ height);
break;
case FOURCC_RAW:
- r = I420ToRAW(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 3,
- width, height);
+ r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 3, width,
+ height);
break;
case FOURCC_ARGB:
- r = I420ToARGB(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 4,
- width, height);
+ r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
break;
case FOURCC_BGRA:
- r = I420ToBGRA(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 4,
- width, height);
+ r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
break;
case FOURCC_ABGR:
- r = I420ToABGR(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 4,
- width, height);
+ r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
break;
case FOURCC_RGBA:
- r = I420ToRGBA(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 4,
- width, height);
+ r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
break;
case FOURCC_I400:
- r = I400Copy(y, y_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width,
- width, height);
+ r = I400Copy(y, y_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width, width,
+ height);
break;
case FOURCC_NV12: {
uint8* dst_uv = dst_sample + width * height;
- r = I420ToNV12(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width,
- dst_uv,
- dst_sample_stride ? dst_sample_stride : width,
- width, height);
+ r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width, dst_uv,
+ dst_sample_stride ? dst_sample_stride : width, width,
+ height);
break;
}
case FOURCC_NV21: {
uint8* dst_vu = dst_sample + width * height;
- r = I420ToNV21(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width,
- dst_vu,
- dst_sample_stride ? dst_sample_stride : width,
- width, height);
+ r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width, dst_vu,
+ dst_sample_stride ? dst_sample_stride : width, width,
+ height);
break;
}
// TODO(fbarchard): Add M420.
// Triplanar formats
- // TODO(fbarchard): halfstride instead of halfwidth
case FOURCC_I420:
case FOURCC_YV12: {
- int halfwidth = (width + 1) / 2;
+ dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+ int halfstride = (dst_sample_stride + 1) / 2;
int halfheight = (height + 1) / 2;
uint8* dst_u;
uint8* dst_v;
if (format == FOURCC_YV12) {
- dst_v = dst_sample + width * height;
- dst_u = dst_v + halfwidth * halfheight;
+ dst_v = dst_sample + dst_sample_stride * height;
+ dst_u = dst_v + halfstride * halfheight;
} else {
- dst_u = dst_sample + width * height;
- dst_v = dst_u + halfwidth * halfheight;
+ dst_u = dst_sample + dst_sample_stride * height;
+ dst_v = dst_u + halfstride * halfheight;
}
- r = I420Copy(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample, width,
- dst_u, halfwidth,
- dst_v, halfwidth,
+ r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
width, height);
break;
}
case FOURCC_I422:
case FOURCC_YV16: {
- int halfwidth = (width + 1) / 2;
+ dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+ int halfstride = (dst_sample_stride + 1) / 2;
uint8* dst_u;
uint8* dst_v;
if (format == FOURCC_YV16) {
- dst_v = dst_sample + width * height;
- dst_u = dst_v + halfwidth * height;
+ dst_v = dst_sample + dst_sample_stride * height;
+ dst_u = dst_v + halfstride * height;
} else {
- dst_u = dst_sample + width * height;
- dst_v = dst_u + halfwidth * height;
+ dst_u = dst_sample + dst_sample_stride * height;
+ dst_v = dst_u + halfstride * height;
}
- r = I420ToI422(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample, width,
- dst_u, halfwidth,
- dst_v, halfwidth,
+ r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
width, height);
break;
}
case FOURCC_I444:
case FOURCC_YV24: {
+ dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
uint8* dst_u;
uint8* dst_v;
if (format == FOURCC_YV24) {
- dst_v = dst_sample + width * height;
- dst_u = dst_v + width * height;
+ dst_v = dst_sample + dst_sample_stride * height;
+ dst_u = dst_v + dst_sample_stride * height;
} else {
- dst_u = dst_sample + width * height;
- dst_v = dst_u + width * height;
+ dst_u = dst_sample + dst_sample_stride * height;
+ dst_v = dst_u + dst_sample_stride * height;
}
- r = I420ToI444(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample, width,
- dst_u, width,
- dst_v, width,
- width, height);
+ r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride, dst_u, dst_sample_stride, dst_v,
+ dst_sample_stride, width, height);
break;
}
- case FOURCC_I411: {
- int quarterwidth = (width + 3) / 4;
- uint8* dst_u = dst_sample + width * height;
- uint8* dst_v = dst_u + quarterwidth * height;
- r = I420ToI411(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample, width,
- dst_u, quarterwidth,
- dst_v, quarterwidth,
- width, height);
- break;
- }
-
// Formats not supported - MJPG, biplanar, some rgb formats.
default:
return -1; // unknown fourcc - return failure code.
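The triplanar cases above are the behavioral change in this hunk: dst_sample_stride now defaults to width when zero and the chroma stride is derived from it as (dst_sample_stride + 1) / 2, so padded destination layouts are honored instead of assuming packed halfwidth planes. A hypothetical call site, with all sizes chosen for illustration:

    // Hypothetical: 1920x1080 I420 to a YV12 sample with a 2048-byte luma
    // stride; chroma stride becomes (2048 + 1) / 2 = 1024, halfheight 540.
    static uint8 y[1920 * 1080], u[960 * 540], v[960 * 540];
    static uint8 dst[2048 * 1080 + 2 * 1024 * 540];
    ConvertFromI420(y, 1920, u, 960, v, 960,
                    dst, 2048, 1920, 1080, FOURCC_YV12);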
diff --git a/files/source/convert_from_argb.cc b/files/source/convert_from_argb.cc
index 2a8682b7..88f38279 100644
--- a/files/source/convert_from_argb.cc
+++ b/files/source/convert_from_argb.cc
@@ -22,16 +22,21 @@ extern "C" {
// ARGB little endian (bgra in memory) to I444
LIBYUV_API
-int ARGBToI444(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int ARGBToI444(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int y;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C;
void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int width) = ARGBToUV444Row_C;
+ int width) = ARGBToUV444Row_C;
if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
@@ -41,20 +46,18 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_y == width &&
- dst_stride_u == width &&
- dst_stride_v == width) {
+ if (src_stride_argb == width * 4 && dst_stride_y == width &&
+ dst_stride_u == width && dst_stride_v == width) {
width *= height;
height = 1;
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_ARGBTOUV444ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUV444Row = ARGBToUV444Row_SSSE3;
- }
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV444Row = ARGBToUV444Row_SSSE3;
+ }
}
#endif
#if defined(HAS_ARGBTOUV444ROW_NEON)
@@ -65,6 +68,14 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBTOUV444ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV444Row = ARGBToUV444Row_MSA;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -89,6 +100,22 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToYRow = ARGBToYRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToUV444Row(src_argb, dst_u, dst_v, width);
@@ -103,19 +130,22 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
// ARGB little endian (bgra in memory) to I422
LIBYUV_API
-int ARGBToI422(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int ARGBToI422(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int y;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+ uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C;
- if (!src_argb ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -125,10 +155,8 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_y == width &&
- dst_stride_u * 2 == width &&
- dst_stride_v * 2 == width) {
+ if (src_stride_argb == width * 4 && dst_stride_y == width &&
+ dst_stride_u * 2 == width && dst_stride_v * 2 == width) {
width *= height;
height = 1;
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
@@ -169,82 +197,42 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
}
}
#endif
-
- for (y = 0; y < height; ++y) {
- ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
- ARGBToYRow(src_argb, dst_y, width);
- src_argb += src_stride_argb;
- dst_y += dst_stride_y;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- return 0;
-}
-
-// ARGB little endian (bgra in memory) to I411
-LIBYUV_API
-int ARGBToI411(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
- void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int width) = ARGBToUV411Row_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
- ARGBToYRow_C;
- if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_y == width &&
- dst_stride_u * 4 == width &&
- dst_stride_v * 4 == width) {
- width *= height;
- height = 1;
- src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
- }
-#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
+#if defined(HAS_ARGBTOYROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToYRow = ARGBToYRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_DSPR2;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToYRow = ARGBToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToYRow = ARGBToYRow_AVX2;
+#if defined(HAS_ARGBTOUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_DSPR2;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_NEON;
+
+#if defined(HAS_ARGBTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
}
}
#endif
-#if defined(HAS_ARGBTOUV411ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
+#if defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
- ARGBToUV411Row = ARGBToUV411Row_NEON;
+ ARGBToUVRow = ARGBToUVRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
- ARGBToUV411Row(src_argb, dst_u, dst_v, width);
+ ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
ARGBToYRow(src_argb, dst_y, width);
src_argb += src_stride_argb;
dst_y += dst_stride_y;
@@ -255,21 +243,23 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
}
LIBYUV_API
-int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_uv, int dst_stride_uv,
- int width, int height) {
+int ARGBToNV12(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
int y;
int halfwidth = (width + 1) >> 1;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+ uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C;
void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) = MergeUVRow_C;
- if (!src_argb ||
- !dst_y || !dst_uv ||
- width <= 0 || height == 0) {
+ if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -314,6 +304,22 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -338,6 +344,30 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToYRow = ARGBToYRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow_ = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_MSA;
+ }
+ }
+#endif
{
// Allocate rows of uv.
align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
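The temporary rows are padded up to a multiple of 32 bytes, (halfwidth + 31) & ~31, so the widest row kernels can store a full final vector without overrunning, and align_buffer_64 (a libyuv helper) aligns the base pointer; the V row appears to occupy the second half of the same allocation. Worked arithmetic:

    // halfwidth = 333: (333 + 31) & ~31 = 364 & ~31 = 352 = 11 * 32,
    // so the allocation is 2 * 352 bytes - one padded row each for U and V.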
@@ -364,21 +394,23 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
// Same as NV12 but U and V swapped.
LIBYUV_API
-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_uv, int dst_stride_uv,
- int width, int height) {
+int ARGBToNV21(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
int y;
int halfwidth = (width + 1) >> 1;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+ uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C;
void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) = MergeUVRow_C;
- if (!src_argb ||
- !dst_y || !dst_uv ||
- width <= 0 || height == 0) {
+ if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -423,6 +455,22 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -447,6 +495,30 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToYRow = ARGBToYRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow_ = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_MSA;
+ }
+ }
+#endif
{
// Allocate rows of uv.
align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
@@ -473,19 +545,22 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
// Convert ARGB to YUY2.
LIBYUV_API
-int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
- uint8* dst_yuy2, int dst_stride_yuy2,
- int width, int height) {
+int ARGBToYUY2(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height) {
int y;
- void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, uint8* dst_u,
+ uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C;
void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
- const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C;
+ const uint8* src_v, uint8* dst_yuy2, int width) =
+ I422ToYUY2Row_C;
- if (!src_argb || !dst_yuy2 ||
- width <= 0 || height == 0) {
+ if (!src_argb || !dst_yuy2 || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -495,8 +570,7 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
dst_stride_yuy2 = -dst_stride_yuy2;
}
// Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_yuy2 == width * 2) {
+ if (src_stride_argb == width * 4 && dst_stride_yuy2 == width * 2) {
width *= height;
height = 1;
src_stride_argb = dst_stride_yuy2 = 0;
@@ -537,6 +611,22 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_I422TOYUY2ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
@@ -553,6 +643,30 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToYRow = ARGBToYRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_MSA;
+ }
+ }
+#endif
{
// Allocate a row of yuv.
@@ -575,19 +689,22 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
// Convert ARGB to UYVY.
LIBYUV_API
-int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
- uint8* dst_uyvy, int dst_stride_uyvy,
- int width, int height) {
+int ARGBToUYVY(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_uyvy,
+ int dst_stride_uyvy,
+ int width,
+ int height) {
int y;
- void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, uint8* dst_u,
+ uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C;
void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
- const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C;
+ const uint8* src_v, uint8* dst_uyvy, int width) =
+ I422ToUYVYRow_C;
- if (!src_argb || !dst_uyvy ||
- width <= 0 || height == 0) {
+ if (!src_argb || !dst_uyvy || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -597,8 +714,7 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
dst_stride_uyvy = -dst_stride_uyvy;
}
// Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_uyvy == width * 2) {
+ if (src_stride_argb == width * 4 && dst_stride_uyvy == width * 2) {
width *= height;
height = 1;
src_stride_argb = dst_stride_uyvy = 0;
@@ -639,6 +755,22 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_I422TOUYVYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
@@ -655,6 +787,30 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToYRow = ARGBToYRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_MSA;
+ }
+ }
+#endif
{
// Allocate a row of yuv.
@@ -677,9 +833,12 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
// Convert ARGB to I400.
LIBYUV_API
-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- int width, int height) {
+int ARGBToI400(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
int y;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C;
@@ -692,8 +851,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_y == width) {
+ if (src_stride_argb == width * 4 && dst_stride_y == width) {
width *= height;
height = 1;
src_stride_argb = dst_stride_y = 0;
@@ -722,6 +880,22 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ARGBToYRow = ARGBToYRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToYRow(src_argb, dst_y, width);
@@ -732,26 +906,29 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
}
// Shuffle table for converting ARGB to RGBA.
-static uvec8 kShuffleMaskARGBToRGBA = {
- 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
-};
+static uvec8 kShuffleMaskARGBToRGBA = {3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u,
+ 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u};
// Convert ARGB to RGBA.
LIBYUV_API
-int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgba, int dst_stride_rgba,
- int width, int height) {
- return ARGBShuffle(src_argb, src_stride_argb,
- dst_rgba, dst_stride_rgba,
- (const uint8*)(&kShuffleMaskARGBToRGBA),
- width, height);
+int ARGBToRGBA(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba,
+ (const uint8*)(&kShuffleMaskARGBToRGBA), width, height);
}
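kShuffleMaskARGBToRGBA encodes a pshufb-style byte permutation: within each
16-byte group, destination byte i is copied from source byte mask[i], so
{3, 0, 1, 2} rotates the fourth byte of each 4-byte pixel to the front. A
per-pixel C model of the semantics (illustrative only; ShufflePixel is not a
libyuv function):

    static void ShufflePixel(const uint8 src[4], uint8 dst[4],
                             const uint8 mask[4]) {
      dst[0] = src[mask[0]];  // with {3, 0, 1, 2}: dst[0] = src[3]
      dst[1] = src[mask[1]];  //                    dst[1] = src[0]
      dst[2] = src[mask[2]];  //                    dst[2] = src[1]
      dst[3] = src[mask[3]];  //                    dst[3] = src[2]
    }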
// Convert ARGB to RGB24.
LIBYUV_API
-int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgb24, int dst_stride_rgb24,
- int width, int height) {
+int ARGBToRGB24(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
int y;
void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
ARGBToRGB24Row_C;
@@ -764,8 +941,7 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_rgb24 == width * 3) {
+ if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) {
width *= height;
height = 1;
src_stride_argb = dst_stride_rgb24 = 0;
@@ -786,6 +962,14 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBTORGB24ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToRGB24Row(src_argb, dst_rgb24, width);
@@ -797,9 +981,12 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
// Convert ARGB to RAW.
LIBYUV_API
-int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
- uint8* dst_raw, int dst_stride_raw,
- int width, int height) {
+int ARGBToRAW(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
int y;
void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int width) =
ARGBToRAWRow_C;
@@ -812,8 +999,7 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_raw == width * 3) {
+ if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) {
width *= height;
height = 1;
src_stride_argb = dst_stride_raw = 0;
@@ -834,6 +1020,14 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBTORAWROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRAWRow = ARGBToRAWRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToRAWRow(src_argb, dst_raw, width);
@@ -845,20 +1039,22 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
static const uint8 kDither565_4x4[16] = {
- 0, 4, 1, 5,
- 6, 2, 7, 3,
- 1, 5, 0, 4,
- 7, 3, 6, 2,
+ 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
};
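The 16 entries form a 4x4 ordered-dither tile: adding 0..7 to each 8-bit
channel before truncating to 5 or 6 bits spreads the quantization error
spatially instead of producing banding. The row loop below fetches the four
bytes for row (y & 3) with one 32-bit load, which is what
*(uint32*)(dither4x4 + ((y & 3) << 2)) computes. A scalar model of one channel
(a sketch; the real row functions clamp and pack in SIMD):

    static uint8 DitherChannel(uint8 v, int x, int y,
                               const uint8 dither4x4[16]) {
      int sum = v + dither4x4[(y & 3) * 4 + (x & 3)];   // add 0..7
      return (uint8)(sum > 255 ? 255 : sum);  // clamp; caller truncates
    }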
// Convert ARGB to RGB565 with 4x4 dither matrix (16 bytes).
LIBYUV_API
-int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgb565, int dst_stride_rgb565,
- const uint8* dither4x4, int width, int height) {
+int ARGBToRGB565Dither(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_rgb565,
+ int dst_stride_rgb565,
+ const uint8* dither4x4,
+ int width,
+ int height) {
int y;
void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
+ const uint32 dither4, int width) =
+ ARGBToRGB565DitherRow_C;
if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
}
@@ -894,9 +1090,19 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
+ }
+ }
+#endif
+
for (y = 0; y < height; ++y) {
ARGBToRGB565DitherRow(src_argb, dst_rgb565,
- *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
+ *(uint32*)(dither4x4 + ((y & 3) << 2)),
+ width); /* NOLINT */
src_argb += src_stride_argb;
dst_rgb565 += dst_stride_rgb565;
}
@@ -906,9 +1112,12 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
// Convert ARGB to RGB565.
// TODO(fbarchard): Consider using the low-level dither function with zeros.
LIBYUV_API
-int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgb565, int dst_stride_rgb565,
- int width, int height) {
+int ARGBToRGB565(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
int y;
void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
ARGBToRGB565Row_C;
@@ -921,8 +1130,7 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_rgb565 == width * 2) {
+ if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) {
width *= height;
height = 1;
src_stride_argb = dst_stride_rgb565 = 0;
@@ -951,6 +1159,14 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBTORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToRGB565Row(src_argb, dst_rgb565, width);
@@ -962,9 +1178,12 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
// Convert ARGB to ARGB1555.
LIBYUV_API
-int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb1555, int dst_stride_argb1555,
- int width, int height) {
+int ARGBToARGB1555(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb1555,
+ int dst_stride_argb1555,
+ int width,
+ int height) {
int y;
void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
ARGBToARGB1555Row_C;
@@ -977,8 +1196,7 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_argb1555 == width * 2) {
+ if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) {
width *= height;
height = 1;
src_stride_argb = dst_stride_argb1555 = 0;
@@ -1007,6 +1225,14 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBTOARGB1555ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToARGB1555Row(src_argb, dst_argb1555, width);
@@ -1018,9 +1244,12 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
// Convert ARGB to ARGB4444.
LIBYUV_API
-int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb4444, int dst_stride_argb4444,
- int width, int height) {
+int ARGBToARGB4444(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb4444,
+ int dst_stride_argb4444,
+ int width,
+ int height) {
int y;
void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
ARGBToARGB4444Row_C;
@@ -1033,8 +1262,7 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_argb4444 == width * 2) {
+ if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) {
width *= height;
height = 1;
src_stride_argb = dst_stride_argb4444 = 0;
@@ -1063,6 +1291,14 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBTOARGB4444ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToARGB4444Row(src_argb, dst_argb4444, width);
@@ -1074,19 +1310,22 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
// Convert ARGB to J420. (JPeg full range I420).
LIBYUV_API
-int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
- uint8* dst_yj, int dst_stride_yj,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int ARGBToJ420(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_yj,
+ int dst_stride_yj,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int y;
void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
ARGBToYJRow_C;
- if (!src_argb ||
- !dst_yj || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+ if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -1129,6 +1368,22 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
@@ -1148,19 +1403,22 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
// Convert ARGB to J422. (JPeg full range I422).
LIBYUV_API
-int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
- uint8* dst_yj, int dst_stride_yj,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int ARGBToJ422(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_yj,
+ int dst_stride_yj,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int y;
void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
ARGBToYJRow_C;
- if (!src_argb ||
- !dst_yj || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+ if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -1170,10 +1428,8 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_yj == width &&
- dst_stride_u * 2 == width &&
- dst_stride_v * 2 == width) {
+ if (src_stride_argb == width * 4 && dst_stride_yj == width &&
+ dst_stride_u * 2 == width && dst_stride_v * 2 == width) {
width *= height;
height = 1;
src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0;
@@ -1212,6 +1468,22 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
@@ -1226,9 +1498,12 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
// Convert ARGB to J400.
LIBYUV_API
-int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
- uint8* dst_yj, int dst_stride_yj,
- int width, int height) {
+int ARGBToJ400(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height) {
int y;
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
ARGBToYJRow_C;
@@ -1241,8 +1516,7 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_yj == width) {
+ if (src_stride_argb == width * 4 && dst_stride_yj == width) {
width *= height;
height = 1;
src_stride_argb = dst_stride_yj = 0;
@@ -1271,6 +1545,14 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToYJRow(src_argb, dst_yj, width);
diff --git a/files/source/convert_jpeg.cc b/files/source/convert_jpeg.cc
index 90f550a2..216a9f26 100644
--- a/files/source/convert_jpeg.cc
+++ b/files/source/convert_jpeg.cc
@@ -37,13 +37,9 @@ static void JpegCopyI420(void* opaque,
const int* strides,
int rows) {
I420Buffers* dest = (I420Buffers*)(opaque);
- I420Copy(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->y, dest->y_stride,
- dest->u, dest->u_stride,
- dest->v, dest->v_stride,
- dest->w, rows);
+ I420Copy(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
+ dest->v_stride, dest->w, rows);
dest->y += rows * dest->y_stride;
dest->u += ((rows + 1) >> 1) * dest->u_stride;
dest->v += ((rows + 1) >> 1) * dest->v_stride;
@@ -55,13 +51,9 @@ static void JpegI422ToI420(void* opaque,
const int* strides,
int rows) {
I420Buffers* dest = (I420Buffers*)(opaque);
- I422ToI420(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->y, dest->y_stride,
- dest->u, dest->u_stride,
- dest->v, dest->v_stride,
- dest->w, rows);
+ I422ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
+ dest->v_stride, dest->w, rows);
dest->y += rows * dest->y_stride;
dest->u += ((rows + 1) >> 1) * dest->u_stride;
dest->v += ((rows + 1) >> 1) * dest->v_stride;
@@ -73,31 +65,9 @@ static void JpegI444ToI420(void* opaque,
const int* strides,
int rows) {
I420Buffers* dest = (I420Buffers*)(opaque);
- I444ToI420(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->y, dest->y_stride,
- dest->u, dest->u_stride,
- dest->v, dest->v_stride,
- dest->w, rows);
- dest->y += rows * dest->y_stride;
- dest->u += ((rows + 1) >> 1) * dest->u_stride;
- dest->v += ((rows + 1) >> 1) * dest->v_stride;
- dest->h -= rows;
-}
-
-static void JpegI411ToI420(void* opaque,
- const uint8* const* data,
- const int* strides,
- int rows) {
- I420Buffers* dest = (I420Buffers*)(opaque);
- I411ToI420(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->y, dest->y_stride,
- dest->u, dest->u_stride,
- dest->v, dest->v_stride,
- dest->w, rows);
+ I444ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
+ dest->v_stride, dest->w, rows);
dest->y += rows * dest->y_stride;
dest->u += ((rows + 1) >> 1) * dest->u_stride;
dest->v += ((rows + 1) >> 1) * dest->v_stride;
@@ -109,11 +79,8 @@ static void JpegI400ToI420(void* opaque,
const int* strides,
int rows) {
I420Buffers* dest = (I420Buffers*)(opaque);
- I400ToI420(data[0], strides[0],
- dest->y, dest->y_stride,
- dest->u, dest->u_stride,
- dest->v, dest->v_stride,
- dest->w, rows);
+ I400ToI420(data[0], strides[0], dest->y, dest->y_stride, dest->u,
+ dest->u_stride, dest->v, dest->v_stride, dest->w, rows);
dest->y += rows * dest->y_stride;
dest->u += ((rows + 1) >> 1) * dest->u_stride;
dest->v += ((rows + 1) >> 1) * dest->v_stride;
@@ -122,8 +89,7 @@ static void JpegI400ToI420(void* opaque,
// Query size of MJPG in pixels.
LIBYUV_API
-int MJPGSize(const uint8* sample, size_t sample_size,
- int* width, int* height) {
+int MJPGSize(const uint8* sample, size_t sample_size, int* width, int* height) {
MJpegDecoder mjpeg_decoder;
LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
if (ret) {
@@ -139,11 +105,16 @@ int MJPGSize(const uint8* sample, size_t sample_size,
LIBYUV_API
int MJPGToI420(const uint8* sample,
size_t sample_size,
- uint8* y, int y_stride,
- uint8* u, int u_stride,
- uint8* v, int v_stride,
- int w, int h,
- int dw, int dh) {
+ uint8* y,
+ int y_stride,
+ uint8* u,
+ int u_stride,
+ uint8* v,
+ int v_stride,
+ int w,
+ int h,
+ int dw,
+ int dh) {
if (sample_size == kUnknownDataSize) {
// ERROR: MJPEG frame size unknown
return -1;
@@ -152,17 +123,16 @@ int MJPGToI420(const uint8* sample,
// TODO(fbarchard): Port MJpeg to C.
MJpegDecoder mjpeg_decoder;
LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
- if (ret && (mjpeg_decoder.GetWidth() != w ||
- mjpeg_decoder.GetHeight() != h)) {
+ if (ret &&
+ (mjpeg_decoder.GetWidth() != w || mjpeg_decoder.GetHeight() != h)) {
// ERROR: MJPEG frame has unexpected dimensions
mjpeg_decoder.UnloadFrame();
return 1; // runtime failure
}
if (ret) {
- I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
+ I420Buffers bufs = {y, y_stride, u, u_stride, v, v_stride, dw, dh};
// YUV420
- if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceYCbCr &&
+ if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
mjpeg_decoder.GetVertSampFactor(0) == 2 &&
mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
@@ -171,7 +141,7 @@ int MJPGToI420(const uint8* sample,
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
- // YUV422
+ // YUV422
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
@@ -182,7 +152,7 @@ int MJPGToI420(const uint8* sample,
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
- // YUV444
+ // YUV444
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
@@ -193,18 +163,7 @@ int MJPGToI420(const uint8* sample,
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
- // YUV411
- } else if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
- // YUV400
+ // YUV400
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceGrayscale &&
mjpeg_decoder.GetNumComponents() == 1 &&
@@ -213,7 +172,7 @@ int MJPGToI420(const uint8* sample,
ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
} else {
// TODO(fbarchard): Implement conversion for any other colorspace/sample
- // factors that occur in practice. 411 is supported by libjpeg
+ // factors that occur in practice.
// ERROR: Unable to convert MJPEG frame because format is not supported
mjpeg_decoder.UnloadFrame();
return 1;
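For reference, the sampling-factor patterns the chain above still accepts,
plus the one this CL removes (vertical, horizontal factors per component):

    //    Y      U      V      layout
    //  (2,2)  (1,1)  (1,1)    YUV420 -> JpegCopyI420
    //  (1,2)  (1,1)  (1,1)    YUV422 -> JpegI422ToI420
    //  (1,1)  (1,1)  (1,1)    YUV444 -> JpegI444ToI420
    //  (1,4)  (1,1)  (1,1)    YUV411 -> removed; now returns 1
    //  grayscale, 1 component  YUV400 -> JpegI400ToI420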
@@ -231,57 +190,34 @@ struct ARGBBuffers {
};
static void JpegI420ToARGB(void* opaque,
- const uint8* const* data,
- const int* strides,
- int rows) {
- ARGBBuffers* dest = (ARGBBuffers*)(opaque);
- I420ToARGB(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->argb, dest->argb_stride,
- dest->w, rows);
- dest->argb += rows * dest->argb_stride;
- dest->h -= rows;
-}
-
-static void JpegI422ToARGB(void* opaque,
const uint8* const* data,
const int* strides,
int rows) {
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
- I422ToARGB(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->argb, dest->argb_stride,
- dest->w, rows);
+ I420ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->argb, dest->argb_stride, dest->w, rows);
dest->argb += rows * dest->argb_stride;
dest->h -= rows;
}
-static void JpegI444ToARGB(void* opaque,
+static void JpegI422ToARGB(void* opaque,
const uint8* const* data,
const int* strides,
int rows) {
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
- I444ToARGB(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->argb, dest->argb_stride,
- dest->w, rows);
+ I422ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->argb, dest->argb_stride, dest->w, rows);
dest->argb += rows * dest->argb_stride;
dest->h -= rows;
}
-static void JpegI411ToARGB(void* opaque,
+static void JpegI444ToARGB(void* opaque,
const uint8* const* data,
const int* strides,
int rows) {
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
- I411ToARGB(data[0], strides[0],
- data[1], strides[1],
- data[2], strides[2],
- dest->argb, dest->argb_stride,
- dest->w, rows);
+ I444ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->argb, dest->argb_stride, dest->w, rows);
dest->argb += rows * dest->argb_stride;
dest->h -= rows;
}
@@ -291,9 +227,7 @@ static void JpegI400ToARGB(void* opaque,
const int* strides,
int rows) {
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
- I400ToARGB(data[0], strides[0],
- dest->argb, dest->argb_stride,
- dest->w, rows);
+ I400ToARGB(data[0], strides[0], dest->argb, dest->argb_stride, dest->w, rows);
dest->argb += rows * dest->argb_stride;
dest->h -= rows;
}
@@ -303,9 +237,12 @@ static void JpegI400ToARGB(void* opaque,
LIBYUV_API
int MJPGToARGB(const uint8* sample,
size_t sample_size,
- uint8* argb, int argb_stride,
- int w, int h,
- int dw, int dh) {
+ uint8* argb,
+ int argb_stride,
+ int w,
+ int h,
+ int dw,
+ int dh) {
if (sample_size == kUnknownDataSize) {
// ERROR: MJPEG frame size unknown
return -1;
@@ -314,17 +251,16 @@ int MJPGToARGB(const uint8* sample,
// TODO(fbarchard): Port MJpeg to C.
MJpegDecoder mjpeg_decoder;
LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
- if (ret && (mjpeg_decoder.GetWidth() != w ||
- mjpeg_decoder.GetHeight() != h)) {
+ if (ret &&
+ (mjpeg_decoder.GetWidth() != w || mjpeg_decoder.GetHeight() != h)) {
// ERROR: MJPEG frame has unexpected dimensions
mjpeg_decoder.UnloadFrame();
return 1; // runtime failure
}
if (ret) {
- ARGBBuffers bufs = { argb, argb_stride, dw, dh };
+ ARGBBuffers bufs = {argb, argb_stride, dw, dh};
// YUV420
- if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceYCbCr &&
+ if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
mjpeg_decoder.GetVertSampFactor(0) == 2 &&
mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
@@ -333,7 +269,7 @@ int MJPGToARGB(const uint8* sample,
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
- // YUV422
+ // YUV422
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
@@ -344,7 +280,7 @@ int MJPGToARGB(const uint8* sample,
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
- // YUV444
+ // YUV444
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
@@ -355,18 +291,7 @@ int MJPGToARGB(const uint8* sample,
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
- // YUV411
- } else if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
- // YUV400
+ // YUV400
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceGrayscale &&
mjpeg_decoder.GetNumComponents() == 1 &&
@@ -375,7 +300,7 @@ int MJPGToARGB(const uint8* sample,
ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
} else {
// TODO(fbarchard): Implement conversion for any other colorspace/sample
- // factors that occur in practice. 411 is supported by libjpeg
+ // factors that occur in practice.
// ERROR: Unable to convert MJPEG frame because format is not supported
mjpeg_decoder.UnloadFrame();
return 1;
diff --git a/files/source/convert_to_argb.cc b/files/source/convert_to_argb.cc
index bccb34c3..63a5104b 100644
--- a/files/source/convert_to_argb.cc
+++ b/files/source/convert_to_argb.cc
@@ -29,11 +29,16 @@ extern "C" {
// sample_size is measured in bytes and is the size of the frame.
// With MJPEG it is the compressed size of the frame.
LIBYUV_API
-int ConvertToARGB(const uint8* sample, size_t sample_size,
- uint8* crop_argb, int argb_stride,
- int crop_x, int crop_y,
- int src_width, int src_height,
- int crop_width, int crop_height,
+int ConvertToARGB(const uint8* sample,
+ size_t sample_size,
+ uint8* crop_argb,
+ int argb_stride,
+ int crop_x,
+ int crop_y,
+ int src_width,
+ int src_height,
+ int crop_width,
+ int crop_height,
enum RotationMode rotation,
uint32 fourcc) {
uint32 format = CanonicalFourCC(fourcc);
@@ -49,16 +54,15 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
// and then rotate the ARGB to the final destination buffer.
// For in-place conversion, if destination crop_argb is same as source sample,
// also enable temporary buffer.
- LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
- crop_argb == sample;
+ LIBYUV_BOOL need_buf =
+ (rotation && format != FOURCC_ARGB) || crop_argb == sample;
uint8* dest_argb = crop_argb;
int dest_argb_stride = argb_stride;
uint8* rotate_buffer = NULL;
int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
- if (crop_argb == NULL || sample == NULL ||
- src_width <= 0 || crop_width <= 0 ||
- src_height == 0 || crop_height == 0) {
+ if (crop_argb == NULL || sample == NULL || src_width <= 0 ||
+ crop_width <= 0 || src_height == 0 || crop_height == 0) {
return -1;
}
if (src_height < 0) {
@@ -67,7 +71,7 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
if (need_buf) {
int argb_size = crop_width * 4 * abs_crop_height;
- rotate_buffer = (uint8*)malloc(argb_size);
+ rotate_buffer = (uint8*)malloc(argb_size); /* NOLINT */
if (!rotate_buffer) {
return 1; // Out of memory runtime error.
}
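The shape of the rotate-through-temporary flow this sets up, simplified:

    // 1. crop_argb is redirected into rotate_buffer (below), so the
    //    format conversion writes unrotated ARGB into scratch space.
    // 2. ARGBRotate then copies rotate_buffer into the caller's
    //    dest_argb with the requested rotation applied.
    // 3. free(rotate_buffer).
    // ConvertToI420 uses the same pattern with an I420 scratch buffer
    // and I420Rotate.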
@@ -79,102 +83,85 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
// Single plane formats
case FOURCC_YUY2:
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
- r = YUY2ToARGB(src, aligned_src_width * 2,
- crop_argb, argb_stride,
+ r = YUY2ToARGB(src, aligned_src_width * 2, crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_UYVY:
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
- r = UYVYToARGB(src, aligned_src_width * 2,
- crop_argb, argb_stride,
+ r = UYVYToARGB(src, aligned_src_width * 2, crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_24BG:
src = sample + (src_width * crop_y + crop_x) * 3;
- r = RGB24ToARGB(src, src_width * 3,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
+ r = RGB24ToARGB(src, src_width * 3, crop_argb, argb_stride, crop_width,
+ inv_crop_height);
break;
case FOURCC_RAW:
src = sample + (src_width * crop_y + crop_x) * 3;
- r = RAWToARGB(src, src_width * 3,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
+ r = RAWToARGB(src, src_width * 3, crop_argb, argb_stride, crop_width,
+ inv_crop_height);
break;
case FOURCC_ARGB:
- if (!need_buf && !rotation ) {
+ if (!need_buf && !rotation) {
src = sample + (src_width * crop_y + crop_x) * 4;
- r = ARGBToARGB(src, src_width * 4,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
+ r = ARGBToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
+ inv_crop_height);
}
break;
case FOURCC_BGRA:
src = sample + (src_width * crop_y + crop_x) * 4;
- r = BGRAToARGB(src, src_width * 4,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
+ r = BGRAToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
+ inv_crop_height);
break;
case FOURCC_ABGR:
src = sample + (src_width * crop_y + crop_x) * 4;
- r = ABGRToARGB(src, src_width * 4,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
+ r = ABGRToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
+ inv_crop_height);
break;
case FOURCC_RGBA:
src = sample + (src_width * crop_y + crop_x) * 4;
- r = RGBAToARGB(src, src_width * 4,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
+ r = RGBAToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
+ inv_crop_height);
break;
case FOURCC_RGBP:
src = sample + (src_width * crop_y + crop_x) * 2;
- r = RGB565ToARGB(src, src_width * 2,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
+ r = RGB565ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width,
+ inv_crop_height);
break;
case FOURCC_RGBO:
src = sample + (src_width * crop_y + crop_x) * 2;
- r = ARGB1555ToARGB(src, src_width * 2,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
+ r = ARGB1555ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width,
+ inv_crop_height);
break;
case FOURCC_R444:
src = sample + (src_width * crop_y + crop_x) * 2;
- r = ARGB4444ToARGB(src, src_width * 2,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
+ r = ARGB4444ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width,
+ inv_crop_height);
break;
case FOURCC_I400:
src = sample + src_width * crop_y + crop_x;
- r = I400ToARGB(src, src_width,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
+ r = I400ToARGB(src, src_width, crop_argb, argb_stride, crop_width,
+ inv_crop_height);
break;
// Biplanar formats
case FOURCC_NV12:
src = sample + (src_width * crop_y + crop_x);
src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
- r = NV12ToARGB(src, src_width,
- src_uv, aligned_src_width,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
+ r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, crop_argb,
+ argb_stride, crop_width, inv_crop_height);
break;
case FOURCC_NV21:
src = sample + (src_width * crop_y + crop_x);
src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
// Call NV12 but with u and v parameters swapped.
- r = NV21ToARGB(src, src_width,
- src_uv, aligned_src_width,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
+ r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, crop_argb,
+ argb_stride, crop_width, inv_crop_height);
break;
case FOURCC_M420:
src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
- r = M420ToARGB(src, src_width,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
+ r = M420ToARGB(src, src_width, crop_argb, argb_stride, crop_width,
+ inv_crop_height);
break;
// Triplanar formats
case FOURCC_I420:
@@ -186,20 +173,17 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
int halfheight = (abs_src_height + 1) / 2;
if (format == FOURCC_YV12) {
src_v = sample + src_width * abs_src_height +
- (halfwidth * crop_y + crop_x) / 2;
+ (halfwidth * crop_y + crop_x) / 2;
src_u = sample + src_width * abs_src_height +
- halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
} else {
src_u = sample + src_width * abs_src_height +
- (halfwidth * crop_y + crop_x) / 2;
+ (halfwidth * crop_y + crop_x) / 2;
src_v = sample + src_width * abs_src_height +
- halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
}
- r = I420ToARGB(src_y, src_width,
- src_u, halfwidth,
- src_v, halfwidth,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
+ r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ crop_argb, argb_stride, crop_width, inv_crop_height);
break;
}
@@ -210,14 +194,11 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
src_u = sample + src_width * abs_src_height +
- (halfwidth * crop_y + crop_x) / 2;
+ (halfwidth * crop_y + crop_x) / 2;
src_v = sample + src_width * abs_src_height +
- halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
- r = J420ToARGB(src_y, src_width,
- src_u, halfwidth,
- src_v, halfwidth,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ crop_argb, argb_stride, crop_width, inv_crop_height);
break;
}
@@ -228,21 +209,18 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
const uint8* src_v;
int halfwidth = (src_width + 1) / 2;
if (format == FOURCC_YV16) {
- src_v = sample + src_width * abs_src_height +
- halfwidth * crop_y + crop_x / 2;
+ src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
+ crop_x / 2;
src_u = sample + src_width * abs_src_height +
- halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
} else {
- src_u = sample + src_width * abs_src_height +
- halfwidth * crop_y + crop_x / 2;
+ src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
+ crop_x / 2;
src_v = sample + src_width * abs_src_height +
- halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
}
- r = I422ToARGB(src_y, src_width,
- src_u, halfwidth,
- src_v, halfwidth,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
+ r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ crop_argb, argb_stride, crop_width, inv_crop_height);
break;
}
case FOURCC_I444:
@@ -257,32 +235,14 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
}
- r = I444ToARGB(src_y, src_width,
- src_u, src_width,
- src_v, src_width,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
- }
- case FOURCC_I411: {
- int quarterwidth = (src_width + 3) / 4;
- const uint8* src_y = sample + src_width * crop_y + crop_x;
- const uint8* src_u = sample + src_width * abs_src_height +
- quarterwidth * crop_y + crop_x / 4;
- const uint8* src_v = sample + src_width * abs_src_height +
- quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
- r = I411ToARGB(src_y, src_width,
- src_u, quarterwidth,
- src_v, quarterwidth,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
+ r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+ crop_argb, argb_stride, crop_width, inv_crop_height);
break;
}
#ifdef HAVE_JPEG
case FOURCC_MJPG:
- r = MJPGToARGB(sample, sample_size,
- crop_argb, argb_stride,
- src_width, abs_src_height, crop_width, inv_crop_height);
+ r = MJPGToARGB(sample, sample_size, crop_argb, argb_stride, src_width,
+ abs_src_height, crop_width, inv_crop_height);
break;
#endif
default:
@@ -291,16 +251,14 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
if (need_buf) {
if (!r) {
- r = ARGBRotate(crop_argb, argb_stride,
- dest_argb, dest_argb_stride,
+ r = ARGBRotate(crop_argb, argb_stride, dest_argb, dest_argb_stride,
crop_width, abs_crop_height, rotation);
}
free(rotate_buffer);
} else if (rotation) {
src = sample + (src_width * crop_y + crop_x) * 4;
- r = ARGBRotate(src, src_width * 4,
- crop_argb, argb_stride,
- crop_width, inv_crop_height, rotation);
+ r = ARGBRotate(src, src_width * 4, crop_argb, argb_stride, crop_width,
+ inv_crop_height, rotation);
}
return r;
diff --git a/files/source/convert_to_i420.cc b/files/source/convert_to_i420.cc
index e5f307c4..a50689db 100644
--- a/files/source/convert_to_i420.cc
+++ b/files/source/convert_to_i420.cc
@@ -27,12 +27,18 @@ extern "C" {
LIBYUV_API
int ConvertToI420(const uint8* sample,
size_t sample_size,
- uint8* y, int y_stride,
- uint8* u, int u_stride,
- uint8* v, int v_stride,
- int crop_x, int crop_y,
- int src_width, int src_height,
- int crop_width, int crop_height,
+ uint8* y,
+ int y_stride,
+ uint8* u,
+ int u_stride,
+ uint8* v,
+ int v_stride,
+ int crop_x,
+ int crop_y,
+ int src_width,
+ int src_height,
+ int crop_width,
+ int crop_height,
enum RotationMode rotation,
uint32 fourcc) {
uint32 format = CanonicalFourCC(fourcc);
@@ -43,9 +49,10 @@ int ConvertToI420(const uint8* sample,
// TODO(nisse): Why allow crop_height < 0?
const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
int r = 0;
- LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
- format != FOURCC_NV12 && format != FOURCC_NV21 &&
- format != FOURCC_YV12) || y == sample;
+ LIBYUV_BOOL need_buf =
+ (rotation && format != FOURCC_I420 && format != FOURCC_NV12 &&
+ format != FOURCC_NV21 && format != FOURCC_YV12) ||
+ y == sample;
uint8* tmp_y = y;
uint8* tmp_u = u;
uint8* tmp_v = v;
@@ -56,8 +63,7 @@ int ConvertToI420(const uint8* sample,
const int inv_crop_height =
(src_height < 0) ? -abs_crop_height : abs_crop_height;
- if (!y || !u || !v || !sample ||
- src_width <= 0 || crop_width <= 0 ||
+ if (!y || !u || !v || !sample || src_width <= 0 || crop_width <= 0 ||
src_height == 0 || crop_height == 0) {
return -1;
}
@@ -70,7 +76,7 @@ int ConvertToI420(const uint8* sample,
if (need_buf) {
int y_size = crop_width * abs_crop_height;
int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
- rotate_buffer = (uint8*)malloc(y_size + uv_size * 2);
+ rotate_buffer = (uint8*)malloc(y_size + uv_size * 2); /* NOLINT */
if (!rotate_buffer) {
return 1; // Out of memory runtime error.
}
@@ -85,130 +91,85 @@ int ConvertToI420(const uint8* sample,
// Single plane formats
case FOURCC_YUY2:
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
- r = YUY2ToI420(src, aligned_src_width * 2,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
+ r = YUY2ToI420(src, aligned_src_width * 2, y, y_stride, u, u_stride, v,
+ v_stride, crop_width, inv_crop_height);
break;
case FOURCC_UYVY:
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
- r = UYVYToI420(src, aligned_src_width * 2,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
+ r = UYVYToI420(src, aligned_src_width * 2, y, y_stride, u, u_stride, v,
+ v_stride, crop_width, inv_crop_height);
break;
case FOURCC_RGBP:
src = sample + (src_width * crop_y + crop_x) * 2;
- r = RGB565ToI420(src, src_width * 2,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
+ r = RGB565ToI420(src, src_width * 2, y, y_stride, u, u_stride, v,
+ v_stride, crop_width, inv_crop_height);
break;
case FOURCC_RGBO:
src = sample + (src_width * crop_y + crop_x) * 2;
- r = ARGB1555ToI420(src, src_width * 2,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
+ r = ARGB1555ToI420(src, src_width * 2, y, y_stride, u, u_stride, v,
+ v_stride, crop_width, inv_crop_height);
break;
case FOURCC_R444:
src = sample + (src_width * crop_y + crop_x) * 2;
- r = ARGB4444ToI420(src, src_width * 2,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
+ r = ARGB4444ToI420(src, src_width * 2, y, y_stride, u, u_stride, v,
+ v_stride, crop_width, inv_crop_height);
break;
case FOURCC_24BG:
src = sample + (src_width * crop_y + crop_x) * 3;
- r = RGB24ToI420(src, src_width * 3,
- y, y_stride,
- u, u_stride,
- v, v_stride,
+ r = RGB24ToI420(src, src_width * 3, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_RAW:
src = sample + (src_width * crop_y + crop_x) * 3;
- r = RAWToI420(src, src_width * 3,
- y, y_stride,
- u, u_stride,
- v, v_stride,
+ r = RAWToI420(src, src_width * 3, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_ARGB:
src = sample + (src_width * crop_y + crop_x) * 4;
- r = ARGBToI420(src, src_width * 4,
- y, y_stride,
- u, u_stride,
- v, v_stride,
+ r = ARGBToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_BGRA:
src = sample + (src_width * crop_y + crop_x) * 4;
- r = BGRAToI420(src, src_width * 4,
- y, y_stride,
- u, u_stride,
- v, v_stride,
+ r = BGRAToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_ABGR:
src = sample + (src_width * crop_y + crop_x) * 4;
- r = ABGRToI420(src, src_width * 4,
- y, y_stride,
- u, u_stride,
- v, v_stride,
+ r = ABGRToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_RGBA:
src = sample + (src_width * crop_y + crop_x) * 4;
- r = RGBAToI420(src, src_width * 4,
- y, y_stride,
- u, u_stride,
- v, v_stride,
+ r = RGBAToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_I400:
src = sample + src_width * crop_y + crop_x;
- r = I400ToI420(src, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
+ r = I400ToI420(src, src_width, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
// Biplanar formats
case FOURCC_NV12:
src = sample + (src_width * crop_y + crop_x);
src_uv = sample + (src_width * src_height) +
- ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
- r = NV12ToI420Rotate(src, src_width,
- src_uv, aligned_src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height, rotation);
+ ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+ r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, y,
+ y_stride, u, u_stride, v, v_stride, crop_width,
+ inv_crop_height, rotation);
break;
case FOURCC_NV21:
src = sample + (src_width * crop_y + crop_x);
src_uv = sample + (src_width * src_height) +
- ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+ ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
// Call NV12 but with u and v parameters swapped.
- r = NV12ToI420Rotate(src, src_width,
- src_uv, aligned_src_width,
- y, y_stride,
- v, v_stride,
- u, u_stride,
- crop_width, inv_crop_height, rotation);
+ r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, y,
+ y_stride, v, v_stride, u, u_stride, crop_width,
+ inv_crop_height, rotation);
break;
case FOURCC_M420:
src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
- r = M420ToI420(src, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
+ r = M420ToI420(src, src_width, y, y_stride, u, u_stride, v, v_stride,
crop_width, inv_crop_height);
break;
// Triplanar formats
@@ -221,22 +182,18 @@ int ConvertToI420(const uint8* sample,
int halfheight = (abs_src_height + 1) / 2;
if (format == FOURCC_YV12) {
src_v = sample + src_width * abs_src_height +
- (halfwidth * crop_y + crop_x) / 2;
+ (halfwidth * crop_y + crop_x) / 2;
src_u = sample + src_width * abs_src_height +
- halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
} else {
src_u = sample + src_width * abs_src_height +
- (halfwidth * crop_y + crop_x) / 2;
+ (halfwidth * crop_y + crop_x) / 2;
src_v = sample + src_width * abs_src_height +
- halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
}
- r = I420Rotate(src_y, src_width,
- src_u, halfwidth,
- src_v, halfwidth,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height, rotation);
+ r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth, y,
+ y_stride, u, u_stride, v, v_stride, crop_width,
+ inv_crop_height, rotation);
break;
}
case FOURCC_I422:
@@ -246,23 +203,19 @@ int ConvertToI420(const uint8* sample,
const uint8* src_v;
int halfwidth = (src_width + 1) / 2;
if (format == FOURCC_YV16) {
- src_v = sample + src_width * abs_src_height +
- halfwidth * crop_y + crop_x / 2;
+ src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
+ crop_x / 2;
src_u = sample + src_width * abs_src_height +
- halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
} else {
- src_u = sample + src_width * abs_src_height +
- halfwidth * crop_y + crop_x / 2;
+ src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
+ crop_x / 2;
src_v = sample + src_width * abs_src_height +
- halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
}
- r = I422ToI420(src_y, src_width,
- src_u, halfwidth,
- src_v, halfwidth,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
+ r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth, y,
+ y_stride, u, u_stride, v, v_stride, crop_width,
+ inv_crop_height);
break;
}
case FOURCC_I444:
@@ -277,37 +230,14 @@ int ConvertToI420(const uint8* sample,
src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
}
- r = I444ToI420(src_y, src_width,
- src_u, src_width,
- src_v, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- }
- case FOURCC_I411: {
- int quarterwidth = (src_width + 3) / 4;
- const uint8* src_y = sample + src_width * crop_y + crop_x;
- const uint8* src_u = sample + src_width * abs_src_height +
- quarterwidth * crop_y + crop_x / 4;
- const uint8* src_v = sample + src_width * abs_src_height +
- quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
- r = I411ToI420(src_y, src_width,
- src_u, quarterwidth,
- src_v, quarterwidth,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
+ r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width, y,
+ y_stride, u, u_stride, v, v_stride, crop_width,
+ inv_crop_height);
break;
}
#ifdef HAVE_JPEG
case FOURCC_MJPG:
- r = MJPGToI420(sample, sample_size,
- y, y_stride,
- u, u_stride,
- v, v_stride,
+ r = MJPGToI420(sample, sample_size, y, y_stride, u, u_stride, v, v_stride,
src_width, abs_src_height, crop_width, inv_crop_height);
break;
#endif
@@ -317,13 +247,9 @@ int ConvertToI420(const uint8* sample,
if (need_buf) {
if (!r) {
- r = I420Rotate(y, y_stride,
- u, u_stride,
- v, v_stride,
- tmp_y, tmp_y_stride,
- tmp_u, tmp_u_stride,
- tmp_v, tmp_v_stride,
- crop_width, abs_crop_height, rotation);
+ r = I420Rotate(y, y_stride, u, u_stride, v, v_stride, tmp_y, tmp_y_stride,
+ tmp_u, tmp_u_stride, tmp_v, tmp_v_stride, crop_width,
+ abs_crop_height, rotation);
}
free(rotate_buffer);
}
diff --git a/files/source/cpu_id.cc b/files/source/cpu_id.cc
index 84927ebc..9ff93263 100644
--- a/files/source/cpu_id.cc
+++ b/files/source/cpu_id.cc
@@ -13,7 +13,7 @@
#if defined(_MSC_VER)
#include <intrin.h> // For __cpuidex()
#endif
-#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
#include <immintrin.h> // For _xgetbv()
@@ -44,8 +44,8 @@ extern "C" {
#endif
// Low level cpuid for X86.
-#if (defined(_M_IX86) || defined(_M_X64) || \
- defined(__i386__) || defined(__x86_64__)) && \
+#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+ defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER)
LIBYUV_API
void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
@@ -68,24 +68,24 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
if (info_ecx == 0) {
__cpuid((int*)(cpu_info), info_eax);
} else {
- cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
+ cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0u;
}
#endif
// GCC version uses inline x86 assembly.
#else // defined(_MSC_VER)
uint32 info_ebx, info_edx;
- asm volatile (
-#if defined( __i386__) && defined(__PIC__)
- // Preserve ebx for fpic 32 bit.
- "mov %%ebx, %%edi \n"
- "cpuid \n"
- "xchg %%edi, %%ebx \n"
- : "=D" (info_ebx),
+ asm volatile(
+#if defined(__i386__) && defined(__PIC__)
+ // Preserve ebx for fpic 32 bit.
+ "mov %%ebx, %%edi \n"
+ "cpuid \n"
+ "xchg %%edi, %%ebx \n"
+ : "=D"(info_ebx),
#else
- "cpuid \n"
- : "=b" (info_ebx),
+ "cpuid \n"
+ : "=b"(info_ebx),
#endif // defined( __i386__) && defined(__PIC__)
- "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
+ "+a"(info_eax), "+c"(info_ecx), "=d"(info_edx));
cpu_info[0] = info_eax;
cpu_info[1] = info_ebx;
cpu_info[2] = info_ecx;
@@ -95,6 +95,8 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
#else // (defined(_M_IX86) || defined(_M_X64) ...
LIBYUV_API
void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
+ (void)eax;
+ (void)ecx;
cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
}
#endif
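CpuId fills cpu_info[] in EAX, EBX, ECX, EDX order, which is how InitCpuFlags
indexes it below (cpu_info1[2] is CPUID.1:ECX, cpu_info7[1] is CPUID.7:EBX).
A hedged usage sketch:

    uint32 info[4] = {0, 0, 0, 0};
    CpuId(1, 0, info);                    // leaf 1, subleaf 0
    int has_sse2 = info[3] & 0x04000000;  // EDX bit 26 = SSE2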
@@ -111,20 +113,22 @@ void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
#if defined(_M_IX86) && (_MSC_VER < 1900)
#pragma optimize("g", off)
#endif
-#if (defined(_M_IX86) || defined(_M_X64) || \
- defined(__i386__) || defined(__x86_64__)) && \
+#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+ defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
-#define HAS_XGETBV
// X86 CPUs have xgetbv to detect whether the OS saves the high parts of ymm registers.
int GetXCR0() {
uint32 xcr0 = 0u;
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
#elif defined(__i386__) || defined(__x86_64__)
- asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
+ asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx");
#endif // defined(__i386__) || defined(__x86_64__)
return xcr0;
}
+#else
+// xgetbv is unavailable to query for OSXSave support. Return 0.
+#define GetXCR0() 0
#endif // defined(_M_IX86) || defined(_M_X64) ..
// Return optimization to previous setting.
#if defined(_M_IX86) && (_MSC_VER < 1900)
@@ -133,8 +137,7 @@ int GetXCR0() {
// based on libvpx arm_cpudetect.c
// For Arm, but public to allow testing on any CPU
-LIBYUV_API SAFEBUFFERS
-int ArmCpuCaps(const char* cpuinfo_name) {
+LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
char cpuinfo_line[512];
FILE* f = fopen(cpuinfo_name, "r");
if (!f) {
@@ -161,6 +164,38 @@ int ArmCpuCaps(const char* cpuinfo_name) {
return 0;
}
+LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name,
+ const char ase[]) {
+ char cpuinfo_line[512];
+ int len = (int)strlen(ase);
+ FILE* f = fopen(cpuinfo_name, "r");
+ if (!f) {
+    // Assume the ase is enabled if /proc/cpuinfo is unavailable.
+    if (strcmp(ase, " msa") == 0) {
+      return kCpuHasMSA;
+    }
+    if (strcmp(ase, " dspr2") == 0) {
+      return kCpuHasDSPR2;
+    }
+    // Unknown ase: return 0 instead of reading from a NULL FILE* below.
+    return 0;
+  }
+ while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
+ if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) {
+ char* p = strstr(cpuinfo_line, ase);
+ if (p && (p[len] == ' ' || p[len] == '\n')) {
+ fclose(f);
+ if (strcmp(ase, " msa") == 0) {
+ return kCpuHasMSA;
+ }
+ if (strcmp(ase, " dspr2") == 0) {
+ return kCpuHasDSPR2;
+ }
+ }
+ }
+ }
+ fclose(f);
+ return 0;
+}
+
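MipsCpuCaps scans /proc/cpuinfo for the "ASEs implemented" line and requires
the requested extension to appear as a whole token (followed by a space or
newline); the leading space in the ase argument is part of that match. A
hedged usage sketch, not part of this CL:

    // Returns kCpuHasMSA when " msa" is listed, 0 otherwise.
    int has_msa = (MipsCpuCaps("/proc/cpuinfo", " msa") == kCpuHasMSA);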
// CPU detect function for SIMD instruction sets.
LIBYUV_API
int cpu_info_ = 0; // cpu_info is not initialized yet.
@@ -184,39 +219,35 @@ static LIBYUV_BOOL TestEnv(const char*) {
}
#endif
-LIBYUV_API SAFEBUFFERS
-int InitCpuFlags(void) {
- // TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized.
+LIBYUV_API SAFEBUFFERS int InitCpuFlags(void) {
int cpu_info = 0;
#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
- uint32 cpu_info0[4] = { 0, 0, 0, 0 };
- uint32 cpu_info1[4] = { 0, 0, 0, 0 };
- uint32 cpu_info7[4] = { 0, 0, 0, 0 };
+ uint32 cpu_info0[4] = {0, 0, 0, 0};
+ uint32 cpu_info1[4] = {0, 0, 0, 0};
+ uint32 cpu_info7[4] = {0, 0, 0, 0};
CpuId(0, 0, cpu_info0);
CpuId(1, 0, cpu_info1);
if (cpu_info0[0] >= 7) {
CpuId(7, 0, cpu_info7);
}
- cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
+ cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
- ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
- ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
- kCpuHasX86;
+ ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0);
-#ifdef HAS_XGETBV
- // AVX requires CPU has AVX, XSAVE and OSXSave for xgetbv
+  // AVX requires the OS to save YMM registers.
if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave
((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers
- cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX;
+ cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
+ ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
+ ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0);
// Detect AVX512bw
if ((GetXCR0() & 0xe0) == 0xe0) {
cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0;
}
}
-#endif
// Environment variable overrides for testing.
if (TestEnv("LIBYUV_DISABLE_X86")) {
@@ -249,15 +280,25 @@ int InitCpuFlags(void) {
if (TestEnv("LIBYUV_DISABLE_AVX3")) {
cpu_info &= ~kCpuHasAVX3;
}
+ if (TestEnv("LIBYUV_DISABLE_F16C")) {
+ cpu_info &= ~kCpuHasF16C;
+ }
+
#endif
#if defined(__mips__) && defined(__linux__)
#if defined(__mips_dspr2)
cpu_info |= kCpuHasDSPR2;
#endif
+#if defined(__mips_msa)
+ cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa");
+#endif
cpu_info |= kCpuHasMIPS;
if (getenv("LIBYUV_DISABLE_DSPR2")) {
cpu_info &= ~kCpuHasDSPR2;
}
+ if (getenv("LIBYUV_DISABLE_MSA")) {
+ cpu_info &= ~kCpuHasMSA;
+ }
#endif
#if defined(__arm__) || defined(__aarch64__)
// gcc -mfpu=neon defines __ARM_NEON__
@@ -283,7 +324,7 @@ int InitCpuFlags(void) {
if (TestEnv("LIBYUV_DISABLE_ASM")) {
cpu_info = 0;
}
- cpu_info |= kCpuInitialized;
+ cpu_info |= kCpuInitialized;
cpu_info_ = cpu_info;
return cpu_info;
}
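Callers normally go through TestCpuFlag() rather than InitCpuFlags() directly, and the LIBYUV_DISABLE_* environment variables checked above make it easy to A/B individual SIMD paths. A sketch:

    #include "libyuv/cpu_id.h"

    bool CanUseAvx2() {
      // TestCpuFlag() lazily calls InitCpuFlags() on first use.
      return libyuv::TestCpuFlag(libyuv::kCpuHasAVX2) != 0;
    }
    // From a shell:  LIBYUV_DISABLE_AVX2=1 ./benchmark   (hypothetical binary)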
diff --git a/files/source/mjpeg_decoder.cc b/files/source/mjpeg_decoder.cc
index 50818418..b43c008b 100644
--- a/files/source/mjpeg_decoder.cc
+++ b/files/source/mjpeg_decoder.cc
@@ -21,7 +21,7 @@
#if defined(_MSC_VER)
// disable warning 4324: structure was padded due to __declspec(align())
-#pragma warning(disable:4324)
+#pragma warning(disable : 4324)
#endif
#endif
@@ -62,6 +62,7 @@ void init_source(jpeg_decompress_struct* cinfo);
void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes); // NOLINT
void term_source(jpeg_decompress_struct* cinfo);
void ErrorHandler(jpeg_common_struct* cinfo);
+void OutputHandler(jpeg_common_struct* cinfo);
MJpegDecoder::MJpegDecoder()
: has_scanline_padding_(LIBYUV_FALSE),
@@ -77,6 +78,7 @@ MJpegDecoder::MJpegDecoder()
decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
// Override standard exit()-based error handler.
error_mgr_->base.error_exit = &ErrorHandler;
+ error_mgr_->base.output_message = &OutputHandler;
#endif
decompress_struct_->client_data = NULL;
source_mgr_->init_source = &init_source;
@@ -127,7 +129,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
if (scanlines_[i]) {
delete scanlines_[i];
}
- scanlines_[i] = new uint8* [scanlines_size];
+ scanlines_[i] = new uint8*[scanlines_size];
scanlines_sizes_[i] = scanlines_size;
}
@@ -193,13 +195,11 @@ int MJpegDecoder::GetVertSampFactor(int component) {
}
int MJpegDecoder::GetHorizSubSampFactor(int component) {
- return decompress_struct_->max_h_samp_factor /
- GetHorizSampFactor(component);
+ return decompress_struct_->max_h_samp_factor / GetHorizSampFactor(component);
}
int MJpegDecoder::GetVertSubSampFactor(int component) {
- return decompress_struct_->max_v_samp_factor /
- GetVertSampFactor(component);
+ return decompress_struct_->max_v_samp_factor / GetVertSampFactor(component);
}
int MJpegDecoder::GetImageScanlinesPerImcuRow() {
@@ -243,10 +243,10 @@ LIBYUV_BOOL MJpegDecoder::UnloadFrame() {
}
// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
-LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
- uint8** planes, int dst_width, int dst_height) {
- if (dst_width != GetWidth() ||
- dst_height > GetHeight()) {
+LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8** planes,
+ int dst_width,
+ int dst_height) {
+ if (dst_width != GetWidth() || dst_height > GetHeight()) {
// ERROR: Bad dimensions
return LIBYUV_FALSE;
}
@@ -287,14 +287,13 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
for (int i = 0; i < num_outbufs_; ++i) {
// TODO(fbarchard): Compute skip to avoid this
assert(skip % GetVertSubSampFactor(i) == 0);
- int rows_to_skip =
- DivideAndRoundDown(skip, GetVertSubSampFactor(i));
- int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) -
- rows_to_skip;
+ int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+ int scanlines_to_copy =
+ GetComponentScanlinesPerImcuRow(i) - rows_to_skip;
int data_to_skip = rows_to_skip * GetComponentStride(i);
- CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i),
- planes[i], GetComponentWidth(i),
- GetComponentWidth(i), scanlines_to_copy);
+ CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), planes[i],
+ GetComponentWidth(i), GetComponentWidth(i),
+ scanlines_to_copy);
planes[i] += scanlines_to_copy * GetComponentWidth(i);
}
lines_left -= (GetImageScanlinesPerImcuRow() - skip);
@@ -303,16 +302,15 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
// Read full MCUs but cropped horizontally
for (; lines_left > GetImageScanlinesPerImcuRow();
- lines_left -= GetImageScanlinesPerImcuRow()) {
+ lines_left -= GetImageScanlinesPerImcuRow()) {
if (!DecodeImcuRow()) {
FinishDecode();
return LIBYUV_FALSE;
}
for (int i = 0; i < num_outbufs_; ++i) {
int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
- CopyPlane(databuf_[i], GetComponentStride(i),
- planes[i], GetComponentWidth(i),
- GetComponentWidth(i), scanlines_to_copy);
+ CopyPlane(databuf_[i], GetComponentStride(i), planes[i],
+ GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy);
planes[i] += scanlines_to_copy * GetComponentWidth(i);
}
}
@@ -326,19 +324,19 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
for (int i = 0; i < num_outbufs_; ++i) {
int scanlines_to_copy =
DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
- CopyPlane(databuf_[i], GetComponentStride(i),
- planes[i], GetComponentWidth(i),
- GetComponentWidth(i), scanlines_to_copy);
+ CopyPlane(databuf_[i], GetComponentStride(i), planes[i],
+ GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy);
planes[i] += scanlines_to_copy * GetComponentWidth(i);
}
}
return FinishDecode();
}
-LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
- int dst_width, int dst_height) {
- if (dst_width != GetWidth() ||
- dst_height > GetHeight()) {
+LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn,
+ void* opaque,
+ int dst_width,
+ int dst_height) {
+ if (dst_width != GetWidth() || dst_height > GetHeight()) {
// ERROR: Bad dimensions
return LIBYUV_FALSE;
}
@@ -393,7 +391,7 @@ LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
}
// Read full MCUs until we get to the crop point.
for (; lines_left >= GetImageScanlinesPerImcuRow();
- lines_left -= GetImageScanlinesPerImcuRow()) {
+ lines_left -= GetImageScanlinesPerImcuRow()) {
if (!DecodeImcuRow()) {
FinishDecode();
return LIBYUV_FALSE;
@@ -433,22 +431,22 @@ void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT
}
void term_source(j_decompress_ptr cinfo) {
- // Nothing to do.
+ (void)cinfo; // Nothing to do.
}
#ifdef HAVE_SETJMP
void ErrorHandler(j_common_ptr cinfo) {
- // This is called when a jpeglib command experiences an error. Unfortunately
- // jpeglib's error handling model is not very flexible, because it expects the
- // error handler to not return--i.e., it wants the program to terminate. To
- // recover from errors we use setjmp() as shown in their example. setjmp() is
- // C's implementation for the "call with current continuation" functionality
- // seen in some functional programming languages.
- // A formatted message can be output, but is unsafe for release.
+// This is called when a jpeglib command experiences an error. Unfortunately
+// jpeglib's error handling model is not very flexible, because it expects the
+// error handler to not return--i.e., it wants the program to terminate. To
+// recover from errors we use setjmp() as shown in their example. setjmp() is
+// C's implementation for the "call with current continuation" functionality
+// seen in some functional programming languages.
+// A formatted message can be output, but is unsafe for release.
#ifdef DEBUG
char buf[JMSG_LENGTH_MAX];
(*cinfo->err->format_message)(cinfo, buf);
- // ERROR: Error in jpeglib: buf
+// ERROR: Error in jpeglib: buf
#endif
SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
@@ -456,7 +454,13 @@ void ErrorHandler(j_common_ptr cinfo) {
// and causes it to return (for a second time) with value 1.
longjmp(mgr->setjmp_buffer, 1);
}
-#endif
+
+// Suppress fprintf warnings from jpeglib's default output_message.
+void OutputHandler(j_common_ptr cinfo) {
+ (void)cinfo;
+}
+
+#endif // HAVE_SETJMP
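ErrorHandler/OutputHandler follow jpeglib's standard recovery idiom: error_exit longjmps back to a setjmp point instead of letting jpeglib call exit(), and output_message is stubbed so warnings never reach stderr. A generic sketch of the idiom, not libyuv's internal SetJmpErrorMgr:

    #include <setjmp.h>
    #include <jpeglib.h>

    struct JmpErrorMgr {
      jpeg_error_mgr base;    // must be first so jpeglib's pointer can be cast
      jmp_buf setjmp_buffer;  // error_exit longjmps back here
    };
    // At decode time (hypothetical wiring):
    //   cinfo.err = jpeg_std_error(&mgr.base);
    //   mgr.base.error_exit = &ErrorHandler;       // longjmp on fatal error
    //   mgr.base.output_message = &OutputHandler;  // swallow warnings
    //   if (setjmp(mgr.setjmp_buffer)) {
    //     jpeg_abort_decompress(&cinfo);           // recover, don't exit()
    //     return LIBYUV_FALSE;
    //   }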
void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
if (num_outbufs != num_outbufs_) {
@@ -465,9 +469,9 @@ void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
// it.
DestroyOutputBuffers();
- scanlines_ = new uint8** [num_outbufs];
+ scanlines_ = new uint8**[num_outbufs];
scanlines_sizes_ = new int[num_outbufs];
- databuf_ = new uint8* [num_outbufs];
+ databuf_ = new uint8*[num_outbufs];
databuf_strides_ = new int[num_outbufs];
for (int i = 0; i < num_outbufs; ++i) {
@@ -483,13 +487,13 @@ void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
void MJpegDecoder::DestroyOutputBuffers() {
for (int i = 0; i < num_outbufs_; ++i) {
- delete [] scanlines_[i];
- delete [] databuf_[i];
+ delete[] scanlines_[i];
+ delete[] databuf_[i];
}
- delete [] scanlines_;
- delete [] databuf_;
- delete [] scanlines_sizes_;
- delete [] databuf_strides_;
+ delete[] scanlines_;
+ delete[] databuf_;
+ delete[] scanlines_sizes_;
+ delete[] databuf_strides_;
scanlines_ = NULL;
databuf_ = NULL;
scanlines_sizes_ = NULL;
@@ -535,26 +539,26 @@ void MJpegDecoder::SetScanlinePointers(uint8** data) {
inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
return (unsigned int)(GetImageScanlinesPerImcuRow()) ==
- jpeg_read_raw_data(decompress_struct_,
- scanlines_,
- GetImageScanlinesPerImcuRow());
+ jpeg_read_raw_data(decompress_struct_, scanlines_,
+ GetImageScanlinesPerImcuRow());
}
 // Helper function that recognizes the jpeg sub-sampling type.
JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
- int* subsample_x, int* subsample_y, int number_of_components) {
+ int* subsample_x,
+ int* subsample_y,
+ int number_of_components) {
if (number_of_components == 3) { // Color images.
- if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
- subsample_x[1] == 2 && subsample_y[1] == 2 &&
- subsample_x[2] == 2 && subsample_y[2] == 2) {
+ if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 &&
+ subsample_y[1] == 2 && subsample_x[2] == 2 && subsample_y[2] == 2) {
return kJpegYuv420;
} else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
- subsample_x[1] == 2 && subsample_y[1] == 1 &&
- subsample_x[2] == 2 && subsample_y[2] == 1) {
+ subsample_x[1] == 2 && subsample_y[1] == 1 &&
+ subsample_x[2] == 2 && subsample_y[2] == 1) {
return kJpegYuv422;
} else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
- subsample_x[1] == 1 && subsample_y[1] == 1 &&
- subsample_x[2] == 1 && subsample_y[2] == 1) {
+ subsample_x[1] == 1 && subsample_y[1] == 1 &&
+ subsample_x[2] == 1 && subsample_y[2] == 1) {
return kJpegYuv444;
}
} else if (number_of_components == 1) { // Grey-scale images.
@@ -567,4 +571,3 @@ JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
} // namespace libyuv
#endif // HAVE_JPEG
-
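End to end the decoder is driven LoadFrame() -> query dimensions -> DecodeToBuffers() -> UnloadFrame(). A rough usage sketch; jpeg_data, jpeg_size and the plane buffers are caller-provided:

    libyuv::MJpegDecoder decoder;
    if (decoder.LoadFrame(jpeg_data, jpeg_size)) {
      int w = decoder.GetWidth();
      int h = decoder.GetHeight();
      uint8* planes[3] = {y_plane, u_plane, v_plane};  // caller-allocated
      decoder.DecodeToBuffers(planes, w, h);
      decoder.UnloadFrame();
    }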
diff --git a/files/source/mjpeg_validate.cc b/files/source/mjpeg_validate.cc
index 9c488320..1a17dd72 100644
--- a/files/source/mjpeg_validate.cc
+++ b/files/source/mjpeg_validate.cc
@@ -24,7 +24,7 @@ static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
const uint8* it = sample;
while (it < end) {
// TODO(fbarchard): scan for 0xd9 instead.
- it = static_cast<const uint8 *>(memchr(it, 0xff, end - it));
+ it = static_cast<const uint8*>(memchr(it, 0xff, end - it));
if (it == NULL) {
break;
}
@@ -68,4 +68,3 @@ LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
} // extern "C"
} // namespace libyuv
#endif
-
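ScanEOI() above hunts for the 0xff 0xd9 end-of-image marker; ValidateJpeg() pairs it with a start-of-image check so truncated MJPEG samples are rejected before jpeglib sees them. The typical guard, with hypothetical buffer names:

    if (!libyuv::ValidateJpeg(sample, sample_size)) {
      return -1;  // missing SOI or EOI -- likely a truncated capture buffer
    }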
diff --git a/files/source/planar_functions.cc b/files/source/planar_functions.cc
index 237ab683..b8a53e85 100644
--- a/files/source/planar_functions.cc
+++ b/files/source/planar_functions.cc
@@ -26,14 +26,22 @@ extern "C" {
// Copy a plane of data
LIBYUV_API
-void CopyPlane(const uint8* src_y, int src_stride_y,
- uint8* dst_y, int dst_stride_y,
- int width, int height) {
+void CopyPlane(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
int y;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
+ }
// Coalesce rows.
- if (src_stride_y == width &&
- dst_stride_y == width) {
+ if (src_stride_y == width && dst_stride_y == width) {
width *= height;
height = 1;
src_stride_y = dst_stride_y = 0;
@@ -76,15 +84,19 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
}
}
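The negative-height handling added here follows the library-wide convention: passing -height flips the image vertically during the copy. For example, assuming 640x480 planes with packed strides:

    // Copy a bottom-up plane into a top-down one by negating height.
    libyuv::CopyPlane(src, 640, dst, 640, 640, -480);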
+// TODO(fbarchard): Consider support for negative height.
+// TODO(fbarchard): Consider stride measured in bytes.
LIBYUV_API
-void CopyPlane_16(const uint16* src_y, int src_stride_y,
- uint16* dst_y, int dst_stride_y,
- int width, int height) {
+void CopyPlane_16(const uint16* src_y,
+ int src_stride_y,
+ uint16* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
int y;
void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C;
// Coalesce rows.
- if (src_stride_y == width &&
- dst_stride_y == width) {
+ if (src_stride_y == width && dst_stride_y == width) {
width *= height;
height = 1;
src_stride_y = dst_stride_y = 0;
@@ -120,17 +132,22 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y,
// Copy I422.
LIBYUV_API
-int I422Copy(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int I422Copy(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int halfwidth = (width + 1) >> 1;
- if (!src_y || !src_u || !src_v ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+ if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -143,7 +160,10 @@ int I422Copy(const uint8* src_y, int src_stride_y,
src_stride_u = -src_stride_u;
src_stride_v = -src_stride_v;
}
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height);
CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height);
return 0;
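Note the relaxed argument check: the Y pointers may now be NULL to copy chroma only. A sketch, with hypothetical stride names:

    // Copy only the U and V planes of an I422 image; Y is left untouched.
    libyuv::I422Copy(NULL, 0, src_u, src_stride_u, src_v, src_stride_v,
                     NULL, 0, dst_u, dst_stride_u, dst_v, dst_stride_v,
                     width, height);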
@@ -151,16 +171,21 @@ int I422Copy(const uint8* src_y, int src_stride_y,
// Copy I444.
LIBYUV_API
-int I444Copy(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+int I444Copy(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -174,7 +199,9 @@ int I444Copy(const uint8* src_y, int src_stride_y,
src_stride_v = -src_stride_v;
}
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
return 0;
@@ -182,9 +209,12 @@ int I444Copy(const uint8* src_y, int src_stride_y,
// Copy I400.
LIBYUV_API
-int I400ToI400(const uint8* src_y, int src_stride_y,
- uint8* dst_y, int dst_stride_y,
- int width, int height) {
+int I400ToI400(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
if (!src_y || !dst_y || width <= 0 || height == 0) {
return -1;
}
@@ -200,11 +230,20 @@ int I400ToI400(const uint8* src_y, int src_stride_y,
// Convert I420 to I400.
LIBYUV_API
-int I420ToI400(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- int width, int height) {
+int I420ToI400(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ (void)src_u;
+ (void)src_stride_u;
+ (void)src_v;
+ (void)src_stride_v;
if (!src_y || !dst_y || width <= 0 || height == 0) {
return -1;
}
@@ -214,14 +253,159 @@ int I420ToI400(const uint8* src_y, int src_stride_y,
src_y = src_y + (height - 1) * src_stride_y;
src_stride_y = -src_stride_y;
}
+
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
return 0;
}
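The (void) casts above are the pattern this CL uses to satisfy the re-enabled unused-parameter warning while keeping the signature symmetric with the other I4xx converters. The general form:

    void Handler(int used, int unused_param) {  // hypothetical example
      (void)unused_param;  // referenced once, so -Wunused-parameter stays quiet
      DoWork(used);        // hypothetical
    }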
+// Support function for NV12 etc. UV channels.
+// Width and height are plane sizes (typically half the pixel width).
+LIBYUV_API
+void SplitUVPlane(const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) = SplitUVRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_u = dst_u + (height - 1) * dst_stride_u;
+ dst_v = dst_v + (height - 1) * dst_stride_v;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+ // Coalesce rows.
+ if (src_stride_uv == width * 2 && dst_stride_u == width &&
+ dst_stride_v == width) {
+ width *= height;
+ height = 1;
+ src_stride_uv = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_SPLITUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SplitUVRow = SplitUVRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow = SplitUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SplitUVRow = SplitUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitUVRow = SplitUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow = SplitUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_u, 4) &&
+ IS_ALIGNED(dst_stride_u, 4) && IS_ALIGNED(dst_v, 4) &&
+ IS_ALIGNED(dst_stride_v, 4)) {
+ SplitUVRow = SplitUVRow_Any_DSPR2;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow = SplitUVRow_DSPR2;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ // Copy a row of UV.
+ SplitUVRow(src_uv, dst_u, dst_v, width);
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ src_uv += src_stride_uv;
+ }
+}
+
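SplitUVPlane() de-interleaves a packed UV plane, e.g. converting NV12 chroma to I420 chroma. A sketch for 4:2:0 data, where the chroma planes are half size in each dimension:

    libyuv::SplitUVPlane(src_uv, src_stride_uv,
                         dst_u, dst_stride_u,
                         dst_v, dst_stride_v,
                         (width + 1) / 2, (height + 1) / 2);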
+LIBYUV_API
+void MergeUVPlane(const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ void (*MergeUVRow)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) = MergeUVRow_C;
+ // Coalesce rows.
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uv = dst_uv + (height - 1) * dst_stride_uv;
+ dst_stride_uv = -dst_stride_uv;
+ }
+ // Coalesce rows.
+ if (src_stride_u == width && src_stride_v == width &&
+ dst_stride_uv == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_u = src_stride_v = dst_stride_uv = 0;
+ }
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ MergeUVRow = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MergeUVRow = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ MergeUVRow = MergeUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ MergeUVRow = MergeUVRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ // Merge a row of U and V into a row of UV.
+ MergeUVRow(src_u, src_v, dst_uv, width);
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uv += dst_stride_uv;
+ }
+}
+
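MergeUVPlane() is the inverse, interleaving separate U and V planes back into a packed UV plane (I420 chroma to NV12 chroma):

    libyuv::MergeUVPlane(src_u, src_stride_u,
                         src_v, src_stride_v,
                         dst_uv, dst_stride_uv,
                         (width + 1) / 2, (height + 1) / 2);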
// Mirror a plane of data.
-void MirrorPlane(const uint8* src_y, int src_stride_y,
- uint8* dst_y, int dst_stride_y,
- int width, int height) {
+void MirrorPlane(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
int y;
void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
// Negative height means invert the image.
@@ -256,12 +440,20 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
#endif
 // TODO(fbarchard): Make Mirror on mips handle unaligned memory.
#if defined(HAS_MIRRORROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) &&
- IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
- IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) {
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_y, 4) &&
+ IS_ALIGNED(src_stride_y, 4) && IS_ALIGNED(dst_y, 4) &&
+ IS_ALIGNED(dst_stride_y, 4)) {
MirrorRow = MirrorRow_DSPR2;
}
#endif
+#if defined(HAS_MIRRORROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MirrorRow = MirrorRow_Any_MSA;
+ if (IS_ALIGNED(width, 64)) {
+ MirrorRow = MirrorRow_MSA;
+ }
+ }
+#endif
// Mirror plane
for (y = 0; y < height; ++y) {
@@ -273,17 +465,24 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
// Convert YUY2 to I422.
LIBYUV_API
-int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int YUY2ToI422(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int y;
- void (*YUY2ToUV422Row)(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width) =
- YUY2ToUV422Row_C;
+ void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+ int width) = YUY2ToUV422Row_C;
void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) =
YUY2ToYRow_C;
+ if (!src_yuy2 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -291,10 +490,9 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
src_stride_yuy2 = -src_stride_yuy2;
}
// Coalesce rows.
- if (src_stride_yuy2 == width * 2 &&
- dst_stride_y == width &&
- dst_stride_u * 2 == width &&
- dst_stride_v * 2 == width) {
+ if (src_stride_yuy2 == width * 2 && dst_stride_y == width &&
+ dst_stride_u * 2 == width && dst_stride_v * 2 == width &&
+ width * height <= 32768) {
width *= height;
height = 1;
src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
@@ -322,15 +520,23 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
#if defined(HAS_YUY2TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
YUY2ToYRow = YUY2ToYRow_Any_NEON;
- if (width >= 16) {
- YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
- }
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
YUY2ToYRow = YUY2ToYRow_NEON;
YUY2ToUV422Row = YUY2ToUV422Row_NEON;
}
}
#endif
+#if defined(HAS_YUY2TOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ YUY2ToYRow = YUY2ToYRow_Any_MSA;
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToYRow = YUY2ToYRow_MSA;
+ YUY2ToUV422Row = YUY2ToUV422Row_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
@@ -345,17 +551,24 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
// Convert UYVY to I422.
LIBYUV_API
-int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int UYVYToI422(const uint8* src_uyvy,
+ int src_stride_uyvy,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int y;
- void (*UYVYToUV422Row)(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width) =
- UYVYToUV422Row_C;
- void (*UYVYToYRow)(const uint8* src_uyvy,
- uint8* dst_y, int width) = UYVYToYRow_C;
+ void (*UYVYToUV422Row)(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+ int width) = UYVYToUV422Row_C;
+ void (*UYVYToYRow)(const uint8* src_uyvy, uint8* dst_y, int width) =
+ UYVYToYRow_C;
+ if (!src_uyvy || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -363,10 +576,9 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
src_stride_uyvy = -src_stride_uyvy;
}
// Coalesce rows.
- if (src_stride_uyvy == width * 2 &&
- dst_stride_y == width &&
- dst_stride_u * 2 == width &&
- dst_stride_v * 2 == width) {
+ if (src_stride_uyvy == width * 2 && dst_stride_y == width &&
+ dst_stride_u * 2 == width && dst_stride_v * 2 == width &&
+ width * height <= 32768) {
width *= height;
height = 1;
src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
@@ -394,15 +606,23 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
#if defined(HAS_UYVYTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
UYVYToYRow = UYVYToYRow_Any_NEON;
- if (width >= 16) {
- UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
- }
+ UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
UYVYToYRow = UYVYToYRow_NEON;
UYVYToUV422Row = UYVYToUV422Row_NEON;
}
}
#endif
+#if defined(HAS_UYVYTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ UYVYToYRow = UYVYToYRow_Any_MSA;
+ UYVYToUV422Row = UYVYToUV422Row_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToYRow = UYVYToYRow_MSA;
+ UYVYToUV422Row = UYVYToUV422Row_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
@@ -415,13 +635,82 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
return 0;
}
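The "Coalesce rows" blocks in these converters exploit packed strides: when every stride equals the row width, the planes are contiguous and the whole image can be processed as one long row. The new width * height <= 32768 cap presumably keeps that coalesced width within a range the row kernels handle safely. The pattern in general form:

    if (src_stride == width && dst_stride == width) {
      width *= height;  // treat the buffer as one very wide row
      height = 1;
      src_stride = dst_stride = 0;  // strides are unused with a single row
    }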
+// Convert YUY2 to Y.
+LIBYUV_API
+int YUY2ToY(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ int y;
+ void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) =
+ YUY2ToYRow_C;
+ if (!src_yuy2 || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+ // Coalesce rows.
+ if (src_stride_yuy2 == width * 2 && dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_yuy2 = dst_stride_y = 0;
+ }
+#if defined(HAS_YUY2TOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToYRow = YUY2ToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ YUY2ToYRow = YUY2ToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToYRow = YUY2ToYRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_y += dst_stride_y;
+ }
+ return 0;
+}
+
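The new YUY2ToY() extracts just the luma from packed YUY2, which stores 2 bytes per pixel. For packed buffers:

    libyuv::YUY2ToY(src_yuy2, width * 2,  // YUY2 stride: 2 bytes per pixel
                    dst_y, width,         // Y stride: 1 byte per pixel
                    width, height);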
// Mirror I400 with optional flipping
LIBYUV_API
-int I400Mirror(const uint8* src_y, int src_stride_y,
- uint8* dst_y, int dst_stride_y,
- int width, int height) {
- if (!src_y || !dst_y ||
- width <= 0 || height == 0) {
+int I400Mirror(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ if (!src_y || !dst_y || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -437,17 +726,24 @@ int I400Mirror(const uint8* src_y, int src_stride_y,
// Mirror I420 with optional flipping
LIBYUV_API
-int I420Mirror(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int I420Mirror(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
- if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+ if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -472,9 +768,12 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
// ARGB mirror.
LIBYUV_API
-int ARGBMirror(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int ARGBMirror(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
ARGBMirrorRow_C;
@@ -511,6 +810,14 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBMIRRORROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_MSA;
+ }
+ }
+#endif
// Mirror plane
for (y = 0; y < height; ++y) {
@@ -544,10 +851,14 @@ ARGBBlendRow GetARGBBlend() {
// Alpha Blend 2 ARGB images and store to destination.
LIBYUV_API
-int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
- const uint8* src_argb1, int src_stride_argb1,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int ARGBBlend(const uint8* src_argb0,
+ int src_stride_argb0,
+ const uint8* src_argb1,
+ int src_stride_argb1,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width) = GetARGBBlend();
@@ -561,8 +872,7 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
dst_stride_argb = -dst_stride_argb;
}
// Coalesce rows.
- if (src_stride_argb0 == width * 4 &&
- src_stride_argb1 == width * 4 &&
+ if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
dst_stride_argb == width * 4) {
width *= height;
height = 1;
@@ -580,14 +890,20 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
// Alpha Blend plane and store to destination.
LIBYUV_API
-int BlendPlane(const uint8* src_y0, int src_stride_y0,
- const uint8* src_y1, int src_stride_y1,
- const uint8* alpha, int alpha_stride,
- uint8* dst_y, int dst_stride_y,
- int width, int height) {
+int BlendPlane(const uint8* src_y0,
+ int src_stride_y0,
+ const uint8* src_y1,
+ int src_stride_y1,
+ const uint8* alpha,
+ int alpha_stride,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
int y;
void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;
+ const uint8* alpha, uint8* dst, int width) =
+ BlendPlaneRow_C;
if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) {
return -1;
}
@@ -599,10 +915,8 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0,
}
// Coalesce rows for Y plane.
- if (src_stride_y0 == width &&
- src_stride_y1 == width &&
- alpha_stride == width &&
- dst_stride_y == width) {
+ if (src_stride_y0 == width && src_stride_y1 == width &&
+ alpha_stride == width && dst_stride_y == width) {
width *= height;
height = 1;
src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0;
@@ -610,7 +924,7 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0,
#if defined(HAS_BLENDPLANEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
+ BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
BlendPlaneRow = BlendPlaneRow_SSSE3;
}
@@ -618,7 +932,7 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0,
#endif
#if defined(HAS_BLENDPLANEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- BlendPlaneRow = BlendPlaneRow_Any_AVX2;
+ BlendPlaneRow = BlendPlaneRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
BlendPlaneRow = BlendPlaneRow_AVX2;
}
@@ -638,22 +952,34 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0,
#define MAXTWIDTH 2048
// Alpha Blend YUV images and store to destination.
LIBYUV_API
-int I420Blend(const uint8* src_y0, int src_stride_y0,
- const uint8* src_u0, int src_stride_u0,
- const uint8* src_v0, int src_stride_v0,
- const uint8* src_y1, int src_stride_y1,
- const uint8* src_u1, int src_stride_u1,
- const uint8* src_v1, int src_stride_v1,
- const uint8* alpha, int alpha_stride,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+int I420Blend(const uint8* src_y0,
+ int src_stride_y0,
+ const uint8* src_u0,
+ int src_stride_u0,
+ const uint8* src_v0,
+ int src_stride_v0,
+ const uint8* src_y1,
+ int src_stride_y1,
+ const uint8* src_u1,
+ int src_stride_u1,
+ const uint8* src_v1,
+ int src_stride_v1,
+ const uint8* alpha,
+ int alpha_stride,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int y;
// Half width/height for UV.
int halfwidth = (width + 1) >> 1;
void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;
+ const uint8* alpha, uint8* dst, int width) =
+ BlendPlaneRow_C;
void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 ||
@@ -669,11 +995,8 @@ int I420Blend(const uint8* src_y0, int src_stride_y0,
}
// Blend Y plane.
- BlendPlane(src_y0, src_stride_y0,
- src_y1, src_stride_y1,
- alpha, alpha_stride,
- dst_y, dst_stride_y,
- width, height);
+ BlendPlane(src_y0, src_stride_y0, src_y1, src_stride_y1, alpha, alpha_stride,
+ dst_y, dst_stride_y, width, height);
#if defined(HAS_BLENDPLANEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
@@ -753,10 +1076,14 @@ int I420Blend(const uint8* src_y0, int src_stride_y0,
// Multiply 2 ARGB images and store to destination.
LIBYUV_API
-int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
- const uint8* src_argb1, int src_stride_argb1,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int ARGBMultiply(const uint8* src_argb0,
+ int src_stride_argb0,
+ const uint8* src_argb1,
+ int src_stride_argb1,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
int width) = ARGBMultiplyRow_C;
@@ -770,8 +1097,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
dst_stride_argb = -dst_stride_argb;
}
// Coalesce rows.
- if (src_stride_argb0 == width * 4 &&
- src_stride_argb1 == width * 4 &&
+ if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
dst_stride_argb == width * 4) {
width *= height;
height = 1;
@@ -801,6 +1127,14 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
}
}
#endif
+#if defined(HAS_ARGBMULTIPLYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_MSA;
+ }
+ }
+#endif
// Multiply plane
for (y = 0; y < height; ++y) {
@@ -814,10 +1148,14 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
// Add 2 ARGB images and store to destination.
LIBYUV_API
-int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
- const uint8* src_argb1, int src_stride_argb1,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int ARGBAdd(const uint8* src_argb0,
+ int src_stride_argb0,
+ const uint8* src_argb1,
+ int src_stride_argb1,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst,
int width) = ARGBAddRow_C;
@@ -831,8 +1169,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
dst_stride_argb = -dst_stride_argb;
}
// Coalesce rows.
- if (src_stride_argb0 == width * 4 &&
- src_stride_argb1 == width * 4 &&
+ if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
dst_stride_argb == width * 4) {
width *= height;
height = 1;
@@ -867,6 +1204,14 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
}
}
#endif
+#if defined(HAS_ARGBADDROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAddRow = ARGBAddRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAddRow = ARGBAddRow_MSA;
+ }
+ }
+#endif
// Add plane
for (y = 0; y < height; ++y) {
@@ -880,10 +1225,14 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
// Subtract 2 ARGB images and store to destination.
LIBYUV_API
-int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
- const uint8* src_argb1, int src_stride_argb1,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int ARGBSubtract(const uint8* src_argb0,
+ int src_stride_argb0,
+ const uint8* src_argb1,
+ int src_stride_argb1,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst,
int width) = ARGBSubtractRow_C;
@@ -897,8 +1246,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
dst_stride_argb = -dst_stride_argb;
}
// Coalesce rows.
- if (src_stride_argb0 == width * 4 &&
- src_stride_argb1 == width * 4 &&
+ if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
dst_stride_argb == width * 4) {
width *= height;
height = 1;
@@ -928,6 +1276,14 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
}
}
#endif
+#if defined(HAS_ARGBSUBTRACTROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBSubtractRow = ARGBSubtractRow_MSA;
+ }
+ }
+#endif
// Subtract plane
for (y = 0; y < height; ++y) {
@@ -939,21 +1295,23 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
return 0;
}
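ARGBMultiply, ARGBAdd and ARGBSubtract share the same shape: per-channel arithmetic over two ARGB sources into a destination. For example, an additive blend of two packed images:

    libyuv::ARGBAdd(src0, width * 4,  // ARGB strides are in bytes (4 per pixel)
                    src1, width * 4,
                    dst, width * 4,
                    width, height);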
// Convert I422 to RGBA with matrix
-static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_rgba, int dst_stride_rgba,
+static int I422ToRGBAMatrix(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_rgba,
+ int dst_stride_rgba,
const struct YuvConstants* yuvconstants,
- int width, int height) {
+ int width,
+ int height) {
int y;
- void (*I422ToRGBARow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I422ToRGBARow_C;
- if (!src_y || !src_u || !src_v || !dst_rgba ||
- width <= 0 || height == 0) {
+ void (*I422ToRGBARow)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGBARow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -995,6 +1353,14 @@ static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y,
I422ToRGBARow = I422ToRGBARow_DSPR2;
}
#endif
+#if defined(HAS_I422TORGBAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGBARow = I422ToRGBARow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
@@ -1008,48 +1374,55 @@ static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y,
// Convert I422 to RGBA.
LIBYUV_API
-int I422ToRGBA(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_rgba, int dst_stride_rgba,
- int width, int height) {
- return I422ToRGBAMatrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_rgba, dst_stride_rgba,
- &kYuvI601Constants,
- width, height);
+int I422ToRGBA(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgba, dst_stride_rgba,
+ &kYuvI601Constants, width, height);
}
// Convert I422 to BGRA.
LIBYUV_API
-int I422ToBGRA(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_bgra, int dst_stride_bgra,
- int width, int height) {
- return I422ToRGBAMatrix(src_y, src_stride_y,
- src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u,
- dst_bgra, dst_stride_bgra,
+int I422ToBGRA(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height) {
+ return I422ToRGBAMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_bgra, dst_stride_bgra,
&kYvuI601Constants, // Use Yvu matrix
width, height);
}
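I422ToBGRA needs no dedicated row kernels: swapping the U/V pointers and selecting the mirrored Yvu constants makes the RGBA path emit BGRA. The same *Matrix mechanism selects colorspaces; a BT.709 variant, for instance, would pass a different constants table (sketch, assuming the kYuvH709Constants table from the headers):

    return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
                            src_stride_v, dst_rgba, dst_stride_rgba,
                            &kYuvH709Constants,  // BT.709 instead of BT.601
                            width, height);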
// Convert NV12 to RGB565.
LIBYUV_API
-int NV12ToRGB565(const uint8* src_y, int src_stride_y,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_rgb565, int dst_stride_rgb565,
- int width, int height) {
+int NV12ToRGB565(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
int y;
- void (*NV12ToRGB565Row)(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = NV12ToRGB565Row_C;
- if (!src_y || !src_uv || !dst_rgb565 ||
- width <= 0 || height == 0) {
+ void (*NV12ToRGB565Row)(
+ const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C;
+ if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -1082,6 +1455,14 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
}
}
#endif
+#if defined(HAS_NV12TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width);
@@ -1096,14 +1477,16 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
// Convert RAW to RGB24.
LIBYUV_API
-int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
- uint8* dst_rgb24, int dst_stride_rgb24,
- int width, int height) {
+int RAWToRGB24(const uint8* src_raw,
+ int src_stride_raw,
+ uint8* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
int y;
void (*RAWToRGB24Row)(const uint8* src_rgb, uint8* dst_rgb24, int width) =
RAWToRGB24Row_C;
- if (!src_raw || !dst_rgb24 ||
- width <= 0 || height == 0) {
+ if (!src_raw || !dst_rgb24 || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -1113,8 +1496,7 @@ int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
src_stride_raw = -src_stride_raw;
}
// Coalesce rows.
- if (src_stride_raw == width * 3 &&
- dst_stride_rgb24 == width * 3) {
+ if (src_stride_raw == width * 3 && dst_stride_rgb24 == width * 3) {
width *= height;
height = 1;
src_stride_raw = dst_stride_rgb24 = 0;
@@ -1135,6 +1517,14 @@ int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
}
}
#endif
+#if defined(HAS_RAWTORGB24ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToRGB24Row = RAWToRGB24Row_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToRGB24Row = RAWToRGB24Row_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
RAWToRGB24Row(src_raw, dst_rgb24, width);
@@ -1145,11 +1535,13 @@ int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
}
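RAW in libyuv is 3-byte R,G,B memory order while RGB24 is B,G,R, so this conversion is a per-pixel byte swap. For packed buffers:

    libyuv::RAWToRGB24(src_raw, width * 3, dst_rgb24, width * 3, width, height);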
LIBYUV_API
-void SetPlane(uint8* dst_y, int dst_stride_y,
- int width, int height,
+void SetPlane(uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
uint32 value) {
int y;
- void (*SetRow)(uint8* dst, uint8 value, int width) = SetRow_C;
+ void (*SetRow)(uint8 * dst, uint8 value, int width) = SetRow_C;
if (height < 0) {
height = -height;
dst_y = dst_y + (height - 1) * dst_stride_y;
@@ -1192,22 +1584,26 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
// Draw a rectangle into I420
LIBYUV_API
-int I420Rect(uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int x, int y,
- int width, int height,
- int value_y, int value_u, int value_v) {
+int I420Rect(uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int x,
+ int y,
+ int width,
+ int height,
+ int value_y,
+ int value_u,
+ int value_v) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
uint8* start_y = dst_y + y * dst_stride_y + x;
uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
- if (!dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0 ||
- x < 0 || y < 0 ||
- value_y < 0 || value_y > 255 ||
- value_u < 0 || value_u > 255 ||
+ if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 ||
+ y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 ||
value_v < 0 || value_v > 255) {
return -1;
}
@@ -1220,15 +1616,16 @@ int I420Rect(uint8* dst_y, int dst_stride_y,
// Draw a rectangle into ARGB
LIBYUV_API
-int ARGBRect(uint8* dst_argb, int dst_stride_argb,
- int dst_x, int dst_y,
- int width, int height,
+int ARGBRect(uint8* dst_argb,
+ int dst_stride_argb,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height,
uint32 value) {
int y;
- void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int width) = ARGBSetRow_C;
- if (!dst_argb ||
- width <= 0 || height == 0 ||
- dst_x < 0 || dst_y < 0) {
+ void (*ARGBSetRow)(uint8 * dst_argb, uint32 value, int width) = ARGBSetRow_C;
+ if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) {
return -1;
}
if (height < 0) {
@@ -1257,6 +1654,14 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
ARGBSetRow = ARGBSetRow_X86;
}
#endif
+#if defined(HAS_ARGBSETROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBSetRow = ARGBSetRow_Any_MSA;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBSetRow = ARGBSetRow_MSA;
+ }
+ }
+#endif
// Set plane
for (y = 0; y < height; ++y) {
@@ -1280,12 +1685,15 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
// f is foreground pixel premultiplied by alpha
LIBYUV_API
-int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int ARGBAttenuate(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
- void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
- int width) = ARGBAttenuateRow_C;
+ void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+ ARGBAttenuateRow_C;
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -1295,8 +1703,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_argb == width * 4) {
+ if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_argb = dst_stride_argb = 0;
@@ -1325,6 +1732,14 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBAttenuateRow(src_argb, dst_argb, width);
@@ -1336,9 +1751,12 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
// Convert preattenuated ARGB to unattenuated ARGB.
LIBYUV_API
-int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int ARGBUnattenuate(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
int width) = ARGBUnattenuateRow_C;
@@ -1351,8 +1769,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_argb == width * 4) {
+ if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_argb = dst_stride_argb = 0;
@@ -1373,7 +1790,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
}
}
#endif
-// TODO(fbarchard): Neon version.
+ // TODO(fbarchard): Neon version.
for (y = 0; y < height; ++y) {
ARGBUnattenuateRow(src_argb, dst_argb, width);
@@ -1385,12 +1802,15 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
// Convert ARGB to Grayed ARGB.
LIBYUV_API
-int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int ARGBGrayTo(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
- void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
- int width) = ARGBGrayRow_C;
+ void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+ ARGBGrayRow_C;
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -1400,8 +1820,7 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_argb == width * 4) {
+ if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_argb = dst_stride_argb = 0;
@@ -1416,6 +1835,11 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
ARGBGrayRow = ARGBGrayRow_NEON;
}
#endif
+#if defined(HAS_ARGBGRAYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_MSA;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBGrayRow(src_argb, dst_argb, width);
@@ -1427,12 +1851,15 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
// Make a rectangle of ARGB gray scale.
LIBYUV_API
-int ARGBGray(uint8* dst_argb, int dst_stride_argb,
- int dst_x, int dst_y,
- int width, int height) {
+int ARGBGray(uint8* dst_argb,
+ int dst_stride_argb,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height) {
int y;
- void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
- int width) = ARGBGrayRow_C;
+ void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+ ARGBGrayRow_C;
uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
return -1;
@@ -1453,6 +1880,12 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb,
ARGBGrayRow = ARGBGrayRow_NEON;
}
#endif
+#if defined(HAS_ARGBGRAYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_MSA;
+ }
+#endif
+
for (y = 0; y < height; ++y) {
ARGBGrayRow(dst, dst, width);
dst += dst_stride_argb;
@@ -1462,10 +1895,14 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb,
// Make a rectangle of ARGB Sepia tone.
LIBYUV_API
-int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
- int dst_x, int dst_y, int width, int height) {
+int ARGBSepia(uint8* dst_argb,
+ int dst_stride_argb,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height) {
int y;
- void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C;
+ void (*ARGBSepiaRow)(uint8 * dst_argb, int width) = ARGBSepiaRow_C;
uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
return -1;
@@ -1486,6 +1923,12 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
ARGBSepiaRow = ARGBSepiaRow_NEON;
}
#endif
+#if defined(HAS_ARGBSEPIAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+ ARGBSepiaRow = ARGBSepiaRow_MSA;
+ }
+#endif
+
for (y = 0; y < height; ++y) {
ARGBSepiaRow(dst, width);
dst += dst_stride_argb;
@@ -1496,13 +1939,17 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
// Apply a 4x4 matrix to each ARGB pixel.
// Note: Normally for shading, but can be used to swizzle or invert.
LIBYUV_API
-int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
+int ARGBColorMatrix(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
const int8* matrix_argb,
- int width, int height) {
+ int width,
+ int height) {
int y;
void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb,
- const int8* matrix_argb, int width) = ARGBColorMatrixRow_C;
+ const int8* matrix_argb, int width) =
+ ARGBColorMatrixRow_C;
if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) {
return -1;
}
@@ -1512,8 +1959,7 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_argb == width * 4) {
+ if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_argb = dst_stride_argb = 0;
@@ -1539,13 +1985,17 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
// Apply a 4x3 matrix to each ARGB pixel.
// Deprecated.
LIBYUV_API
-int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+int RGBColorMatrix(uint8* dst_argb,
+ int dst_stride_argb,
const int8* matrix_rgb,
- int dst_x, int dst_y, int width, int height) {
+ int dst_x,
+ int dst_y,
+ int width,
+ int height) {
SIMD_ALIGNED(int8 matrix_argb[16]);
uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
- if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 ||
- dst_x < 0 || dst_y < 0) {
+ if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || dst_x < 0 ||
+ dst_y < 0) {
return -1;
}
@@ -1565,23 +2015,26 @@ int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0;
matrix_argb[15] = 64; // 1.0
- return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb,
- dst, dst_stride_argb,
- &matrix_argb[0], width, height);
+ return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb, dst,
+ dst_stride_argb, &matrix_argb[0], width, height);
}
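The matrix entries are signed fixed point with 64 representing 1.0, as the alpha row set up above shows. A sketch of an identity transform through the public entry point, assuming libyuv's B,G,R,A byte order for ARGB:

    SIMD_ALIGNED(const int8 kIdentity[16]) = {
        64, 0,  0,  0,   // output B from input B,G,R,A
        0,  64, 0,  0,   // output G
        0,  0,  64, 0,   // output R
        0,  0,  0,  64,  // output A
    };
    libyuv::ARGBColorMatrix(src, width * 4, dst, width * 4, kIdentity,
                            width, height);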
// Apply a color table to each ARGB pixel.
// Table contains 256 ARGB values.
LIBYUV_API
-int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+int ARGBColorTable(uint8* dst_argb,
+ int dst_stride_argb,
const uint8* table_argb,
- int dst_x, int dst_y, int width, int height) {
+ int dst_x,
+ int dst_y,
+ int width,
+ int height) {
int y;
- void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+ void (*ARGBColorTableRow)(uint8 * dst_argb, const uint8* table_argb,
int width) = ARGBColorTableRow_C;
uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
- if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
- dst_x < 0 || dst_y < 0) {
+ if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 ||
+ dst_y < 0) {
return -1;
}
// Coalesce rows.
@@ -1605,15 +2058,19 @@ int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
// Apply a color table to each ARGB pixel but preserve destination alpha.
// Table contains 256 ARGB values.
LIBYUV_API
-int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
+int RGBColorTable(uint8* dst_argb,
+ int dst_stride_argb,
const uint8* table_argb,
- int dst_x, int dst_y, int width, int height) {
+ int dst_x,
+ int dst_y,
+ int width,
+ int height) {
int y;
- void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+ void (*RGBColorTableRow)(uint8 * dst_argb, const uint8* table_argb,
int width) = RGBColorTableRow_C;
uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
- if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
- dst_x < 0 || dst_y < 0) {
+ if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 ||
+ dst_y < 0) {
return -1;
}
// Coalesce rows.
@@ -1644,11 +2101,17 @@ int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
// Caveat - although SSE2 saturates, the C function does not and should be used
// with care if doing anything but quantization.
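// For illustration, one plausible posterize call (assuming the C row's
// formula dst = (v * scale >> 16) * interval_size + interval_offset):
// quantize each channel to 8 levels with centered intervals:
//   ARGBQuantize(frame, frame_stride, 65536 / 32, 32, 16, 0, 0, width, height);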
LIBYUV_API
-int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
- int scale, int interval_size, int interval_offset,
- int dst_x, int dst_y, int width, int height) {
+int ARGBQuantize(uint8* dst_argb,
+ int dst_stride_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height) {
int y;
- void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
+ void (*ARGBQuantizeRow)(uint8 * dst_argb, int scale, int interval_size,
int interval_offset, int width) = ARGBQuantizeRow_C;
uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
@@ -1681,12 +2144,16 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
// Computes a table of cumulative sums for an image, where each value is the
// sum of all values above and to the left of the entry. Used by ARGBBlur.
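// Per channel the table follows the usual summed-area recurrence (a sketch
// of the math, not new code):
//   cumsum[y][x] = src[y][x] + cumsum[y-1][x] + cumsum[y][x-1]
//                - cumsum[y-1][x-1]
// so any rectangle sum costs only 4 lookups, which is what ARGBBlur exploits.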
LIBYUV_API
-int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
- int32* dst_cumsum, int dst_stride32_cumsum,
- int width, int height) {
+int ARGBComputeCumulativeSum(const uint8* src_argb,
+ int src_stride_argb,
+ int32* dst_cumsum,
+ int dst_stride32_cumsum,
+ int width,
+ int height) {
int y;
void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
- const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+ const int32* previous_cumsum, int width) =
+ ComputeCumulativeSumRow_C;
int32* previous_cumsum = dst_cumsum;
if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) {
return -1;
@@ -1711,15 +2178,22 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory
// as the buffer is treated as circular.
LIBYUV_API
-int ARGBBlur(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int32* dst_cumsum, int dst_stride32_cumsum,
- int width, int height, int radius) {
+int ARGBBlur(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int32* dst_cumsum,
+ int dst_stride32_cumsum,
+ int width,
+ int height,
+ int radius) {
int y;
- void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum,
- const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+ void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) =
+ ComputeCumulativeSumRow_C;
void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft,
- int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C;
+ int width, int area, uint8* dst,
+ int count) = CumulativeSumToAverageRow_C;
int32* cumsum_bot_row;
int32* max_cumsum_bot_row;
int32* cumsum_top_row;
@@ -1749,9 +2223,8 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb,
#endif
// Compute enough CumulativeSum for first row to be blurred. After this
// one row of CumulativeSum is updated at a time.
- ARGBComputeCumulativeSum(src_argb, src_stride_argb,
- dst_cumsum, dst_stride32_cumsum,
- width, radius);
+ ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum,
+ dst_stride32_cumsum, width, radius);
src_argb = src_argb + radius * src_stride_argb;
cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum];
@@ -1789,24 +2262,24 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb,
// Left clipped.
for (x = 0; x < radius + 1; ++x) {
- CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
- boxwidth, area, &dst_argb[x * 4], 1);
+ CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area,
+ &dst_argb[x * 4], 1);
area += (bot_y - top_y);
boxwidth += 4;
}
// Middle unclipped.
n = (width - 1) - radius - x + 1;
- CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
- boxwidth, area, &dst_argb[x * 4], n);
+ CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area,
+ &dst_argb[x * 4], n);
// Right clipped.
for (x += n; x <= width - 1; ++x) {
area -= (bot_y - top_y);
boxwidth -= 4;
CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4,
- cumsum_bot_row + (x - radius - 1) * 4,
- boxwidth, area, &dst_argb[x * 4], 1);
+ cumsum_bot_row + (x - radius - 1) * 4, boxwidth,
+ area, &dst_argb[x * 4], 1);
}
dst_argb += dst_stride_argb;
}
@@ -1815,12 +2288,16 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb,
// Multiply ARGB image by a specified ARGB value.
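// For illustration, a minimal call sketch (hypothetical buffers; each channel
// is scaled by the matching byte of value, with 0xff being roughly 1.0, so
// 0x80808080 darkens all channels to about 50%):
//   ARGBShade(src, src_stride, dst, dst_stride, width, height, 0x80808080u);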
LIBYUV_API
-int ARGBShade(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height, uint32 value) {
+int ARGBShade(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ uint32 value) {
int y;
- void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb,
- int width, uint32 value) = ARGBShadeRow_C;
+ void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) = ARGBShadeRow_C;
if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) {
return -1;
}
@@ -1830,8 +2307,7 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_argb == width * 4) {
+ if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_argb = dst_stride_argb = 0;
@@ -1846,6 +2322,11 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
ARGBShadeRow = ARGBShadeRow_NEON;
}
#endif
+#if defined(HAS_ARGBSHADEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) {
+ ARGBShadeRow = ARGBShadeRow_MSA;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBShadeRow(src_argb, dst_argb, width, value);
@@ -1857,12 +2338,17 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
// Interpolate 2 planes by specified amount (0 to 255).
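// For illustration, a minimal cross-fade sketch (hypothetical buffers):
// interpolation 0 yields src0, 255 yields (nearly) src1, 128 an even blend:
//   InterpolatePlane(frame_a, stride_a, frame_b, stride_b,
//                    out, out_stride, width, height, 128);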
LIBYUV_API
-int InterpolatePlane(const uint8* src0, int src_stride0,
- const uint8* src1, int src_stride1,
- uint8* dst, int dst_stride,
- int width, int height, int interpolation) {
+int InterpolatePlane(const uint8* src0,
+ int src_stride0,
+ const uint8* src1,
+ int src_stride1,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height,
+ int interpolation) {
int y;
- void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+ void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
if (!src0 || !src1 || !dst || width <= 0 || height == 0) {
@@ -1875,9 +2361,7 @@ int InterpolatePlane(const uint8* src0, int src_stride0,
dst_stride = -dst_stride;
}
// Coalesce rows.
- if (src_stride0 == width &&
- src_stride1 == width &&
- dst_stride == width) {
+ if (src_stride0 == width && src_stride1 == width && dst_stride == width) {
width *= height;
height = 1;
src_stride0 = src_stride1 = dst_stride = 0;
@@ -1907,14 +2391,21 @@ int InterpolatePlane(const uint8* src0, int src_stride0,
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) &&
- IS_ALIGNED(src0, 4) && IS_ALIGNED(src_stride0, 4) &&
- IS_ALIGNED(src1, 4) && IS_ALIGNED(src_stride1, 4) &&
- IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4) &&
- IS_ALIGNED(width, 4)) {
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src0, 4) &&
+ IS_ALIGNED(src_stride0, 4) && IS_ALIGNED(src1, 4) &&
+ IS_ALIGNED(src_stride1, 4) && IS_ALIGNED(dst, 4) &&
+ IS_ALIGNED(dst_stride, 4) && IS_ALIGNED(width, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
}
#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
InterpolateRow(dst, src0, src1 - src0, width, interpolation);
@@ -1927,61 +2418,71 @@ int InterpolatePlane(const uint8* src0, int src_stride0,
// Interpolate 2 ARGB images by specified amount (0 to 255).
LIBYUV_API
-int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
- const uint8* src_argb1, int src_stride_argb1,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height, int interpolation) {
- return InterpolatePlane(src_argb0, src_stride_argb0,
- src_argb1, src_stride_argb1,
- dst_argb, dst_stride_argb,
+int ARGBInterpolate(const uint8* src_argb0,
+ int src_stride_argb0,
+ const uint8* src_argb1,
+ int src_stride_argb1,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int interpolation) {
+ return InterpolatePlane(src_argb0, src_stride_argb0, src_argb1,
+ src_stride_argb1, dst_argb, dst_stride_argb,
width * 4, height, interpolation);
}
// Interpolate 2 YUV images by specified amount (0 to 255).
LIBYUV_API
-int I420Interpolate(const uint8* src0_y, int src0_stride_y,
- const uint8* src0_u, int src0_stride_u,
- const uint8* src0_v, int src0_stride_v,
- const uint8* src1_y, int src1_stride_y,
- const uint8* src1_u, int src1_stride_u,
- const uint8* src1_v, int src1_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height, int interpolation) {
+int I420Interpolate(const uint8* src0_y,
+ int src0_stride_y,
+ const uint8* src0_u,
+ int src0_stride_u,
+ const uint8* src0_v,
+ int src0_stride_v,
+ const uint8* src1_y,
+ int src1_stride_y,
+ const uint8* src1_u,
+ int src1_stride_u,
+ const uint8* src1_v,
+ int src1_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int interpolation) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
- if (!src0_y || !src0_u || !src0_v ||
- !src1_y || !src1_u || !src1_v ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+ if (!src0_y || !src0_u || !src0_v || !src1_y || !src1_u || !src1_v ||
+ !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
- InterpolatePlane(src0_y, src0_stride_y,
- src1_y, src1_stride_y,
- dst_y, dst_stride_y,
- width, height, interpolation);
- InterpolatePlane(src0_u, src0_stride_u,
- src1_u, src1_stride_u,
- dst_u, dst_stride_u,
- halfwidth, halfheight, interpolation);
- InterpolatePlane(src0_v, src0_stride_v,
- src1_v, src1_stride_v,
- dst_v, dst_stride_v,
- halfwidth, halfheight, interpolation);
+ InterpolatePlane(src0_y, src0_stride_y, src1_y, src1_stride_y, dst_y,
+ dst_stride_y, width, height, interpolation);
+ InterpolatePlane(src0_u, src0_stride_u, src1_u, src1_stride_u, dst_u,
+ dst_stride_u, halfwidth, halfheight, interpolation);
+ InterpolatePlane(src0_v, src0_stride_v, src1_v, src1_stride_v, dst_v,
+ dst_stride_v, halfwidth, halfheight, interpolation);
return 0;
}
// Shuffle ARGB channel order. e.g. BGRA to ARGB.
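// For illustration, a sketch (kShuffleBGRAToARGB is a hypothetical name; the
// 16-byte shuffler holds byte indices repeated per 4-pixel group, as the
// SSSE3 pshufb path expects):
//   static const uint8 kShuffleBGRAToARGB[16] = {3, 2, 1, 0, 7, 6, 5, 4,
//                                                11, 10, 9, 8, 15, 14, 13, 12};
//   ARGBShuffle(src_bgra, width * 4, dst_argb, width * 4,
//               kShuffleBGRAToARGB, width, height);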
LIBYUV_API
-int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_argb, int dst_stride_argb,
- const uint8* shuffler, int width, int height) {
+int ARGBShuffle(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ const uint8* shuffler,
+ int width,
+ int height) {
int y;
void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb,
const uint8* shuffler, int width) = ARGBShuffleRow_C;
- if (!src_bgra || !dst_argb ||
- width <= 0 || height == 0) {
+ if (!src_bgra || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -1991,8 +2492,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
src_stride_bgra = -src_stride_bgra;
}
// Coalesce rows.
- if (src_stride_bgra == width * 4 &&
- dst_stride_argb == width * 4) {
+ if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_bgra = dst_stride_argb = 0;
@@ -2029,6 +2529,14 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
}
}
#endif
+#if defined(HAS_ARGBSHUFFLEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBShuffleRow = ARGBShuffleRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBShuffleRow(src_bgra, dst_argb, shuffler, width);
@@ -2039,28 +2547,32 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
}
// Sobel ARGB effect.
-static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height,
+static int ARGBSobelize(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
void (*SobelRow)(const uint8* src_sobelx,
const uint8* src_sobely,
- uint8* dst, int width)) {
+ uint8* dst,
+ int width)) {
int y;
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int width) =
ARGBToYJRow_C;
- void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
- uint8* dst_sobely, int width) = SobelYRow_C;
+ void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely,
+ int width) = SobelYRow_C;
void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
const uint8* src_y2, uint8* dst_sobely, int width) =
SobelXRow_C;
const int kEdge = 16; // Extra pixels at start of row for extrude/align.
- if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
@@ -2088,6 +2600,14 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_SOBELYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
@@ -2159,9 +2679,12 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
// Sobel ARGB effect.
LIBYUV_API
-int ARGBSobel(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int ARGBSobel(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) = SobelRow_C;
#if defined(HAS_SOBELROW_SSE2)
@@ -2180,15 +2703,26 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_SOBELROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SobelRow = SobelRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ SobelRow = SobelRow_MSA;
+ }
+ }
+#endif
return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height, SobelRow);
}
// Sobel ARGB effect with planar output.
LIBYUV_API
-int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- int width, int height) {
+int ARGBSobelToPlane(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_, int width) = SobelToPlaneRow_C;
#if defined(HAS_SOBELTOPLANEROW_SSE2)
@@ -2207,16 +2741,27 @@ int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
}
}
#endif
- return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y,
- width, height, SobelToPlaneRow);
+#if defined(HAS_SOBELTOPLANEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SobelToPlaneRow = SobelToPlaneRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ SobelToPlaneRow = SobelToPlaneRow_MSA;
+ }
+ }
+#endif
+ return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width,
+ height, SobelToPlaneRow);
}
// SobelXY ARGB effect.
// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel.
LIBYUV_API
-int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int ARGBSobelXY(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) = SobelXYRow_C;
#if defined(HAS_SOBELXYROW_SSE2)
@@ -2235,32 +2780,41 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
}
}
#endif
+#if defined(HAS_SOBELXYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SobelXYRow = SobelXYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ SobelXYRow = SobelXYRow_MSA;
+ }
+ }
+#endif
return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height, SobelXYRow);
}
// Apply a 4x4 polynomial to each ARGB pixel.
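// For illustration, a sketch of the coefficient layout (assuming, as in the
// C row function, 16 floats grouped as constant, linear, quadratic and cubic
// terms, one per channel B,G,R,A). The identity polynomial:
//   static const float kIdentity[16] = {0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 1.f, 1.f,
//                                       0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
//   ARGBPolynomial(src, src_stride, dst, dst_stride, kIdentity, width, height);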
LIBYUV_API
-int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
+int ARGBPolynomial(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
const float* poly,
- int width, int height) {
+ int width,
+ int height) {
int y;
- void (*ARGBPolynomialRow)(const uint8* src_argb,
- uint8* dst_argb, const float* poly,
- int width) = ARGBPolynomialRow_C;
+ void (*ARGBPolynomialRow)(const uint8* src_argb, uint8* dst_argb,
+ const float* poly, int width) = ARGBPolynomialRow_C;
if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_argb == width * 4) {
+ if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_argb = dst_stride_argb = 0;
@@ -2285,28 +2839,103 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
return 0;
}
+// Convert plane of 16 bit shorts to half floats.
+// Source values are multiplied by scale before storing as half float.
+LIBYUV_API
+int HalfFloatPlane(const uint16* src_y,
+ int src_stride_y,
+ uint16* dst_y,
+ int dst_stride_y,
+ float scale,
+ int width,
+ int height) {
+ int y;
+ void (*HalfFloatRow)(const uint16* src, uint16* dst, float scale, int width) =
+ HalfFloatRow_C;
+ if (!src_y || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ src_stride_y >>= 1;
+ dst_stride_y >>= 1;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
+ }
+#if defined(HAS_HALFFLOATROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ HalfFloatRow = HalfFloatRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ HalfFloatRow = HalfFloatRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_HALFFLOATROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ HalfFloatRow = HalfFloatRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ HalfFloatRow = HalfFloatRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_HALFFLOATROW_F16C)
+ if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) {
+ HalfFloatRow =
+ (scale == 1.0f) ? HalfFloat1Row_Any_F16C : HalfFloatRow_Any_F16C;
+ if (IS_ALIGNED(width, 16)) {
+ HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_F16C : HalfFloatRow_F16C;
+ }
+ }
+#endif
+#if defined(HAS_HALFFLOATROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ HalfFloatRow =
+ (scale == 1.0f) ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ HalfFloatRow(src_y, dst_y, scale, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+ return 0;
+}
+
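// For illustration, a plausible use of the new HalfFloatPlane above
// (hypothetical buffers; strides are in bytes, per the >>= 1 above): convert
// 10-bit samples (0..1023) to normalized half floats:
//   HalfFloatPlane(src_y10, width * 2, dst_half, width * 2,
//                  1.0f / 1023.0f, width, height);
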
// Apply a luma color table to each ARGB pixel.
LIBYUV_API
-int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
+int ARGBLumaColorTable(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
const uint8* luma,
- int width, int height) {
+ int width,
+ int height) {
int y;
- void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb,
- int width, const uint8* luma, const uint32 lumacoeff) =
- ARGBLumaColorTableRow_C;
+ void (*ARGBLumaColorTableRow)(
+ const uint8* src_argb, uint8* dst_argb, int width, const uint8* luma,
+ const uint32 lumacoeff) = ARGBLumaColorTableRow_C;
if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_argb == width * 4) {
+ if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_argb = dst_stride_argb = 0;
@@ -2327,9 +2956,12 @@ int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
// Copy Alpha from one ARGB image to another.
LIBYUV_API
-int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int ARGBCopyAlpha(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) =
ARGBCopyAlphaRow_C;
@@ -2343,8 +2975,7 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
- if (src_stride_argb == width * 4 &&
- dst_stride_argb == width * 4) {
+ if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_argb = dst_stride_argb = 0;
@@ -2376,9 +3007,12 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
// Extract just the alpha channel from ARGB.
LIBYUV_API
-int ARGBExtractAlpha(const uint8* src_argb, int src_stride,
- uint8* dst_a, int dst_stride,
- int width, int height) {
+int ARGBExtractAlpha(const uint8* src_argb,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride,
+ int width,
+ int height) {
if (!src_argb || !dst_a || width <= 0 || height == 0) {
return -1;
}
@@ -2394,7 +3028,7 @@ int ARGBExtractAlpha(const uint8* src_argb, int src_stride,
height = 1;
src_stride = dst_stride = 0;
}
- void (*ARGBExtractAlphaRow)(const uint8 *src_argb, uint8 *dst_a, int width) =
+ void (*ARGBExtractAlphaRow)(const uint8* src_argb, uint8* dst_a, int width) =
ARGBExtractAlphaRow_C;
#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
@@ -2402,6 +3036,12 @@ int ARGBExtractAlpha(const uint8* src_argb, int src_stride,
: ARGBExtractAlphaRow_Any_SSE2;
}
#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 32) ? ARGBExtractAlphaRow_AVX2
+ : ARGBExtractAlphaRow_Any_AVX2;
+ }
+#endif
#if defined(HAS_ARGBEXTRACTALPHAROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON
@@ -2419,9 +3059,12 @@ int ARGBExtractAlpha(const uint8* src_argb, int src_stride,
// Copy a planar Y channel to the alpha channel of a destination ARGB image.
LIBYUV_API
-int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+int ARGBCopyYToAlpha(const uint8* src_y,
+ int src_stride_y,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) =
ARGBCopyYToAlphaRow_C;
@@ -2435,8 +3078,7 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
src_stride_y = -src_stride_y;
}
// Coalesce rows.
- if (src_stride_y == width &&
- dst_stride_argb == width * 4) {
+ if (src_stride_y == width && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_y = dst_stride_argb = 0;
@@ -2470,20 +3112,22 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
// directly. A SplitUVRow_Odd function could copy the remaining chroma.
LIBYUV_API
-int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_uv, int dst_stride_uv,
- int width, int height) {
+int YUY2ToNV12(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
int y;
int halfwidth = (width + 1) >> 1;
void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) = SplitUVRow_C;
- void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+ void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
- if (!src_yuy2 ||
- !dst_y || !dst_uv ||
- width <= 0 || height == 0) {
+ if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -2540,6 +3184,14 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
{
int awidth = halfwidth * 2;
@@ -2568,20 +3220,22 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
}
LIBYUV_API
-int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_uv, int dst_stride_uv,
- int width, int height) {
+int UYVYToNV12(const uint8* src_uyvy,
+ int src_stride_uyvy,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
int y;
int halfwidth = (width + 1) >> 1;
void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) = SplitUVRow_C;
- void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+ void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
- if (!src_uyvy ||
- !dst_y || !dst_uv ||
- width <= 0 || height == 0) {
+ if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -2638,6 +3292,14 @@ int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
{
int awidth = halfwidth * 2;
diff --git a/files/source/rotate.cc b/files/source/rotate.cc
index 01ea5c40..277c53b2 100644
--- a/files/source/rotate.cc
+++ b/files/source/rotate.cc
@@ -22,12 +22,20 @@ extern "C" {
#endif
LIBYUV_API
-void TransposePlane(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height) {
+void TransposePlane(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height) {
int i = height;
- void (*TransposeWx8)(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width) = TransposeWx8_C;
+#if defined(HAS_TRANSPOSEWX16_MSA)
+ void (*TransposeWx16)(const uint8* src, int src_stride, uint8* dst,
+ int dst_stride, int width) = TransposeWx16_C;
+#else
+ void (*TransposeWx8)(const uint8* src, int src_stride, uint8* dst,
+ int dst_stride, int width) = TransposeWx8_C;
+#endif
#if defined(HAS_TRANSPOSEWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
TransposeWx8 = TransposeWx8_NEON;
@@ -51,22 +59,40 @@ void TransposePlane(const uint8* src, int src_stride,
#endif
#if defined(HAS_TRANSPOSEWX8_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
- if (IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+ if (IS_ALIGNED(width, 4) && IS_ALIGNED(src, 4) &&
+ IS_ALIGNED(src_stride, 4)) {
TransposeWx8 = TransposeWx8_Fast_DSPR2;
} else {
TransposeWx8 = TransposeWx8_DSPR2;
}
}
#endif
+#if defined(HAS_TRANSPOSEWX16_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ TransposeWx16 = TransposeWx16_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ TransposeWx16 = TransposeWx16_MSA;
+ }
+ }
+#endif
+#if defined(HAS_TRANSPOSEWX16_MSA)
+ // Work across the source in 16x16 tiles
+ while (i >= 16) {
+ TransposeWx16(src, src_stride, dst, dst_stride, width);
+ src += 16 * src_stride; // Go down 16 rows.
+ dst += 16; // Move over 16 columns.
+ i -= 16;
+ }
+#else
// Work across the source in 8x8 tiles
while (i >= 8) {
TransposeWx8(src, src_stride, dst, dst_stride, width);
- src += 8 * src_stride; // Go down 8 rows.
- dst += 8; // Move over 8 columns.
+ src += 8 * src_stride; // Go down 8 rows.
+ dst += 8; // Move over 8 columns.
i -= 8;
}
+#endif
if (i > 0) {
TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
@@ -74,9 +100,12 @@ void TransposePlane(const uint8* src, int src_stride,
}
LIBYUV_API
-void RotatePlane90(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height) {
+void RotatePlane90(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height) {
// Rotate by 90 is a transpose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
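// For illustration (a sketch, not new code): transposing a bottom-up read of
//   a b        gives   c a
//   c d                d b
// which is exactly a 90 degree clockwise rotation.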
@@ -86,9 +115,12 @@ void RotatePlane90(const uint8* src, int src_stride,
}
LIBYUV_API
-void RotatePlane270(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height) {
+void RotatePlane270(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height) {
// Rotate by 270 is a transpose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
@@ -98,9 +130,12 @@ void RotatePlane270(const uint8* src, int src_stride,
}
LIBYUV_API
-void RotatePlane180(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height) {
+void RotatePlane180(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height) {
// Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width);
const uint8* src_bot = src + src_stride * (height - 1);
@@ -135,12 +170,20 @@ void RotatePlane180(const uint8* src, int src_stride,
#endif
// TODO(fbarchard): Mirror on mips handle unaligned memory.
#if defined(HAS_MIRRORROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) &&
- IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
- IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src, 4) &&
+ IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst, 4) &&
+ IS_ALIGNED(dst_stride, 4)) {
MirrorRow = MirrorRow_DSPR2;
}
#endif
+#if defined(HAS_MIRRORROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MirrorRow = MirrorRow_Any_MSA;
+ if (IS_ALIGNED(width, 64)) {
+ MirrorRow = MirrorRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
@@ -181,15 +224,24 @@ void RotatePlane180(const uint8* src, int src_stride,
}
LIBYUV_API
-void TransposeUV(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height) {
+void TransposeUV(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
int i = height;
- void (*TransposeUVWx8)(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+ void (*TransposeUVWx16)(const uint8* src, int src_stride, uint8* dst_a,
+ int dst_stride_a, uint8* dst_b, int dst_stride_b,
+ int width) = TransposeUVWx16_C;
+#else
+ void (*TransposeUVWx8)(const uint8* src, int src_stride, uint8* dst_a,
+ int dst_stride_a, uint8* dst_b, int dst_stride_b,
int width) = TransposeUVWx8_C;
+#endif
#if defined(HAS_TRANSPOSEUVWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
TransposeUVWx8 = TransposeUVWx8_NEON;
@@ -204,68 +256,92 @@ void TransposeUV(const uint8* src, int src_stride,
}
#endif
#if defined(HAS_TRANSPOSEUVWX8_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) &&
- IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) && IS_ALIGNED(src, 4) &&
+ IS_ALIGNED(src_stride, 4)) {
TransposeUVWx8 = TransposeUVWx8_DSPR2;
}
#endif
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ TransposeUVWx16 = TransposeUVWx16_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeUVWx16 = TransposeUVWx16_MSA;
+ }
+ }
+#endif
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+ // Work through the source in 16x16 tiles.
+ while (i >= 16) {
+ TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width);
+ src += 16 * src_stride; // Go down 16 rows.
+ dst_a += 16; // Move over 16 columns.
+ dst_b += 16; // Move over 16 columns.
+ i -= 16;
+ }
+#else
// Work through the source in 8x8 tiles.
while (i >= 8) {
- TransposeUVWx8(src, src_stride,
- dst_a, dst_stride_a,
- dst_b, dst_stride_b,
+ TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
width);
- src += 8 * src_stride; // Go down 8 rows.
- dst_a += 8; // Move over 8 columns.
- dst_b += 8; // Move over 8 columns.
+ src += 8 * src_stride; // Go down 8 rows.
+ dst_a += 8; // Move over 8 columns.
+ dst_b += 8; // Move over 8 columns.
i -= 8;
}
+#endif
if (i > 0) {
- TransposeUVWxH_C(src, src_stride,
- dst_a, dst_stride_a,
- dst_b, dst_stride_b,
+ TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
width, i);
}
}
LIBYUV_API
-void RotateUV90(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height) {
+void RotateUV90(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
src += src_stride * (height - 1);
src_stride = -src_stride;
- TransposeUV(src, src_stride,
- dst_a, dst_stride_a,
- dst_b, dst_stride_b,
- width, height);
+ TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
+ height);
}
LIBYUV_API
-void RotateUV270(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height) {
+void RotateUV270(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
dst_a += dst_stride_a * (width - 1);
dst_b += dst_stride_b * (width - 1);
dst_stride_a = -dst_stride_a;
dst_stride_b = -dst_stride_b;
- TransposeUV(src, src_stride,
- dst_a, dst_stride_a,
- dst_b, dst_stride_b,
- width, height);
+ TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
+ height);
}
// Rotate 180 is a horizontal and vertical flip.
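// For illustration (a sketch, not new code):
//   a b   becomes   d c
//   c d             b a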
LIBYUV_API
-void RotateUV180(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height) {
+void RotateUV180(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
int i;
void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
MirrorUVRow_C;
@@ -280,8 +356,8 @@ void RotateUV180(const uint8* src, int src_stride,
}
#endif
#if defined(HAS_MIRRORUVROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) &&
- IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src, 4) &&
+ IS_ALIGNED(src_stride, 4)) {
MirrorUVRow = MirrorUVRow_DSPR2;
}
#endif
@@ -298,9 +374,12 @@ void RotateUV180(const uint8* src, int src_stride,
}
LIBYUV_API
-int RotatePlane(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height,
+int RotatePlane(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height,
enum RotationMode mode) {
if (!src || width <= 0 || height == 0 || !dst) {
return -1;
@@ -316,24 +395,16 @@ int RotatePlane(const uint8* src, int src_stride,
switch (mode) {
case kRotate0:
// copy frame
- CopyPlane(src, src_stride,
- dst, dst_stride,
- width, height);
+ CopyPlane(src, src_stride, dst, dst_stride, width, height);
return 0;
case kRotate90:
- RotatePlane90(src, src_stride,
- dst, dst_stride,
- width, height);
+ RotatePlane90(src, src_stride, dst, dst_stride, width, height);
return 0;
case kRotate270:
- RotatePlane270(src, src_stride,
- dst, dst_stride,
- width, height);
+ RotatePlane270(src, src_stride, dst, dst_stride, width, height);
return 0;
case kRotate180:
- RotatePlane180(src, src_stride,
- dst, dst_stride,
- width, height);
+ RotatePlane180(src, src_stride, dst, dst_stride, width, height);
return 0;
default:
break;
@@ -342,18 +413,25 @@ int RotatePlane(const uint8* src, int src_stride,
}
LIBYUV_API
-int I420Rotate(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height,
+int I420Rotate(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
enum RotationMode mode) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
- if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
- !dst_y || !dst_u || !dst_v) {
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+ !dst_u || !dst_v) {
return -1;
}
@@ -372,45 +450,29 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
switch (mode) {
case kRotate0:
// copy frame
- return I420Copy(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height);
+ return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height);
case kRotate90:
- RotatePlane90(src_y, src_stride_y,
- dst_y, dst_stride_y,
- width, height);
- RotatePlane90(src_u, src_stride_u,
- dst_u, dst_stride_u,
- halfwidth, halfheight);
- RotatePlane90(src_v, src_stride_v,
- dst_v, dst_stride_v,
- halfwidth, halfheight);
+ RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ halfheight);
+ RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ halfheight);
return 0;
case kRotate270:
- RotatePlane270(src_y, src_stride_y,
- dst_y, dst_stride_y,
- width, height);
- RotatePlane270(src_u, src_stride_u,
- dst_u, dst_stride_u,
- halfwidth, halfheight);
- RotatePlane270(src_v, src_stride_v,
- dst_v, dst_stride_v,
- halfwidth, halfheight);
+ RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ halfheight);
+ RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ halfheight);
return 0;
case kRotate180:
- RotatePlane180(src_y, src_stride_y,
- dst_y, dst_stride_y,
- width, height);
- RotatePlane180(src_u, src_stride_u,
- dst_u, dst_stride_u,
- halfwidth, halfheight);
- RotatePlane180(src_v, src_stride_v,
- dst_v, dst_stride_v,
- halfwidth, halfheight);
+ RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ halfheight);
+ RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ halfheight);
return 0;
default:
break;
@@ -419,17 +481,23 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
}
LIBYUV_API
-int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height,
+int NV12ToI420Rotate(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_uv,
+ int src_stride_uv,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
enum RotationMode mode) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
- if (!src_y || !src_uv || width <= 0 || height == 0 ||
- !dst_y || !dst_u || !dst_v) {
+ if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u ||
+ !dst_v) {
return -1;
}
@@ -446,38 +514,23 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
switch (mode) {
case kRotate0:
// copy frame
- return NV12ToI420(src_y, src_stride_y,
- src_uv, src_stride_uv,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
+ return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y,
+ dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
width, height);
case kRotate90:
- RotatePlane90(src_y, src_stride_y,
- dst_y, dst_stride_y,
- width, height);
- RotateUV90(src_uv, src_stride_uv,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- halfwidth, halfheight);
+ RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, halfwidth, halfheight);
return 0;
case kRotate270:
- RotatePlane270(src_y, src_stride_y,
- dst_y, dst_stride_y,
- width, height);
- RotateUV270(src_uv, src_stride_uv,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- halfwidth, halfheight);
+ RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, halfwidth, halfheight);
return 0;
case kRotate180:
- RotatePlane180(src_y, src_stride_y,
- dst_y, dst_stride_y,
- width, height);
- RotateUV180(src_uv, src_stride_uv,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- halfwidth, halfheight);
+ RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, halfwidth, halfheight);
return 0;
default:
break;
diff --git a/files/source/rotate_any.cc b/files/source/rotate_any.cc
index 31a74c31..562096b9 100644
--- a/files/source/rotate_any.cc
+++ b/files/source/rotate_any.cc
@@ -18,16 +18,16 @@ namespace libyuv {
extern "C" {
#endif
-#define TANY(NAMEANY, TPOS_SIMD, MASK) \
- void NAMEANY(const uint8* src, int src_stride, \
- uint8* dst, int dst_stride, int width) { \
- int r = width & MASK; \
- int n = width - r; \
- if (n > 0) { \
- TPOS_SIMD(src, src_stride, dst, dst_stride, n); \
- } \
- TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);\
- }
+#define TANY(NAMEANY, TPOS_SIMD, MASK) \
+ void NAMEANY(const uint8* src, int src_stride, uint8* dst, int dst_stride, \
+ int width) { \
+ int r = width & MASK; \
+ int n = width - r; \
+ if (n > 0) { \
+ TPOS_SIMD(src, src_stride, dst, dst_stride, n); \
+ } \
+ TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \
+ }
#ifdef HAS_TRANSPOSEWX8_NEON
TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
@@ -41,22 +41,22 @@ TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
#ifdef HAS_TRANSPOSEWX8_DSPR2
TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7)
#endif
+#ifdef HAS_TRANSPOSEWX16_MSA
+TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15)
+#endif
#undef TANY
#define TUVANY(NAMEANY, TPOS_SIMD, MASK) \
- void NAMEANY(const uint8* src, int src_stride, \
- uint8* dst_a, int dst_stride_a, \
- uint8* dst_b, int dst_stride_b, int width) { \
- int r = width & MASK; \
- int n = width - r; \
- if (n > 0) { \
- TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, \
- n); \
- } \
- TransposeUVWx8_C(src + n * 2, src_stride, \
- dst_a + n * dst_stride_a, dst_stride_a, \
- dst_b + n * dst_stride_b, dst_stride_b, r); \
- }
+ void NAMEANY(const uint8* src, int src_stride, uint8* dst_a, \
+ int dst_stride_a, uint8* dst_b, int dst_stride_b, int width) { \
+ int r = width & MASK; \
+ int n = width - r; \
+ if (n > 0) { \
+ TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \
+ } \
+ TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a, \
+ dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \
+ }
#ifdef HAS_TRANSPOSEUVWX8_NEON
TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
@@ -67,14 +67,12 @@ TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
#ifdef HAS_TRANSPOSEUVWX8_DSPR2
TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7)
#endif
+#ifdef HAS_TRANSPOSEUVWX16_MSA
+TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7)
+#endif
#undef TUVANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
-
-
-
-
-
diff --git a/files/source/rotate_argb.cc b/files/source/rotate_argb.cc
index 787c0ad1..b458d8fa 100644
--- a/files/source/rotate_argb.cc
+++ b/files/source/rotate_argb.cc
@@ -22,29 +22,44 @@ extern "C" {
// ARGBScale has a function to copy pixels to a row, striding each source
// pixel by a constant.
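// For illustration (a sketch of why that helps): with src_stepx equal to the
// source stride in pixels (src_stride / 4 for ARGB), each "even" read steps
// down one source row, so a source column lands in a destination row, which
// is the transpose that ARGBRotate90/270 need.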
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(_M_IX86) || \
- (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || \
+ (defined(__x86_64__) && !defined(__native_client__)) || \
+ defined(__i386__))
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
-void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
- int src_stepx, uint8* dst_ptr, int dst_width);
+void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr,
+ int src_stride,
+ int src_stepx,
+ uint8* dst_ptr,
+ int dst_width);
#endif
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SCALEARGBROWDOWNEVEN_NEON
-void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
- int src_stepx, uint8* dst_ptr, int dst_width);
+void ScaleARGBRowDownEven_NEON(const uint8* src_ptr,
+ int src_stride,
+ int src_stepx,
+ uint8* dst_ptr,
+ int dst_width);
#endif
-void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
- int src_stepx, uint8* dst_ptr, int dst_width);
+void ScaleARGBRowDownEven_C(const uint8* src_ptr,
+ int,
+ int src_stepx,
+ uint8* dst_ptr,
+ int dst_width);
-static void ARGBTranspose(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width, int height) {
+static void ARGBTranspose(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height) {
int i;
int src_pixel_step = src_stride >> 2;
void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
- int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
+ int src_step, uint8* dst_ptr, int dst_width) =
+ ScaleARGBRowDownEven_C;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest.
ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
@@ -63,8 +78,12 @@ static void ARGBTranspose(const uint8* src, int src_stride,
}
}
-void ARGBRotate90(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width, int height) {
+void ARGBRotate90(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height) {
// Rotate by 90 is an ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
@@ -73,8 +92,12 @@ void ARGBRotate90(const uint8* src, int src_stride,
ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
}
-void ARGBRotate270(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width, int height) {
+void ARGBRotate270(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height) {
// Rotate by 270 is an ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
@@ -83,8 +106,12 @@ void ARGBRotate270(const uint8* src, int src_stride,
ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
}
-void ARGBRotate180(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width, int height) {
+void ARGBRotate180(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height) {
// Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width * 4);
const uint8* src_bot = src + src_stride * (height - 1);
@@ -118,6 +145,14 @@ void ARGBRotate180(const uint8* src, int src_stride,
}
}
#endif
+#if defined(HAS_ARGBMIRRORROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
@@ -146,9 +181,9 @@ void ARGBRotate180(const uint8* src, int src_stride,
// Odd height will harmlessly mirror the middle row twice.
for (y = 0; y < half_height; ++y) {
- ARGBMirrorRow(src, row, width); // Mirror first row into a buffer
+ ARGBMirrorRow(src, row, width); // Mirror first row into a buffer
ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row
- CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last
+ CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last
src += src_stride;
dst += dst_stride;
src_bot -= src_stride;
@@ -158,8 +193,12 @@ void ARGBRotate180(const uint8* src, int src_stride,
}
LIBYUV_API
-int ARGBRotate(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb, int width, int height,
+int ARGBRotate(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
enum RotationMode mode) {
if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
return -1;
@@ -175,23 +214,19 @@ int ARGBRotate(const uint8* src_argb, int src_stride_argb,
switch (mode) {
case kRotate0:
// copy frame
- return ARGBCopy(src_argb, src_stride_argb,
- dst_argb, dst_stride_argb,
+ return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
case kRotate90:
- ARGBRotate90(src_argb, src_stride_argb,
- dst_argb, dst_stride_argb,
- width, height);
+ ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+ height);
return 0;
case kRotate270:
- ARGBRotate270(src_argb, src_stride_argb,
- dst_argb, dst_stride_argb,
- width, height);
+ ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+ height);
return 0;
case kRotate180:
- ARGBRotate180(src_argb, src_stride_argb,
- dst_argb, dst_stride_argb,
- width, height);
+ ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+ height);
return 0;
default:
break;
diff --git a/files/source/rotate_common.cc b/files/source/rotate_common.cc
index b33a9a0c..cdd231fa 100644
--- a/files/source/rotate_common.cc
+++ b/files/source/rotate_common.cc
@@ -16,8 +16,11 @@ namespace libyuv {
extern "C" {
#endif
-void TransposeWx8_C(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width) {
+void TransposeWx8_C(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width) {
int i;
for (i = 0; i < width; ++i) {
dst[0] = src[0 * src_stride];
@@ -33,9 +36,13 @@ void TransposeWx8_C(const uint8* src, int src_stride,
}
}
-void TransposeUVWx8_C(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b, int width) {
+void TransposeUVWx8_C(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width) {
int i;
for (i = 0; i < width; ++i) {
dst_a[0] = src[0 * src_stride + 0];
@@ -60,9 +67,12 @@ void TransposeUVWx8_C(const uint8* src, int src_stride,
}
}
-void TransposeWxH_C(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height) {
+void TransposeWxH_C(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width,
+ int height) {
int i;
for (i = 0; i < width; ++i) {
int j;
@@ -72,10 +82,14 @@ void TransposeWxH_C(const uint8* src, int src_stride,
}
}
-void TransposeUVWxH_C(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height) {
+void TransposeUVWxH_C(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
int i;
for (i = 0; i < width * 2; i += 2) {
int j;
diff --git a/files/source/rotate_mips.cc b/files/source/rotate_dspr2.cc
index 1e8ce251..2dce9107 100644
--- a/files/source/rotate_mips.cc
+++ b/files/source/rotate_dspr2.cc
@@ -18,18 +18,20 @@ namespace libyuv {
extern "C" {
#endif
-#if !defined(LIBYUV_DISABLE_MIPS) && \
- defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
- (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-void TransposeWx8_DSPR2(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width) {
- __asm__ __volatile__ (
+#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
+ (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+void TransposeWx8_DSPR2(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width) {
+ __asm__ __volatile__(
".set push \n"
".set noreorder \n"
- "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
- "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
- "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
+ "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
+ "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
+ "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
"addu $t3, $t2, %[src_stride] \n"
"addu $t5, $t4, %[src_stride] \n"
"addu $t6, $t2, $t4 \n"
@@ -38,8 +40,8 @@ void TransposeWx8_DSPR2(const uint8* src, int src_stride,
"or $t0, $t0, $t1 \n"
"bnez $t0, 11f \n"
" subu $t7, $t9, %[src_stride] \n"
-//dst + dst_stride word aligned
- "1: \n"
+ // dst + dst_stride word aligned
+ "1: \n"
"lbu $t0, 0(%[src]) \n"
"lbux $t1, %[src_stride](%[src]) \n"
"lbux $t8, $t2(%[src]) \n"
@@ -65,8 +67,8 @@ void TransposeWx8_DSPR2(const uint8* src, int src_stride,
"bnez %[width], 1b \n"
" addu %[dst], %[dst], %[dst_stride] \n"
"b 2f \n"
-//dst + dst_stride unaligned
- "11: \n"
+ // dst + dst_stride unaligned
+ "11: \n"
"lbu $t0, 0(%[src]) \n"
"lbux $t1, %[src_stride](%[src]) \n"
"lbux $t8, $t2(%[src]) \n"
@@ -92,23 +94,20 @@ void TransposeWx8_DSPR2(const uint8* src, int src_stride,
"swr $s1, 4(%[dst]) \n"
"swl $s1, 7(%[dst]) \n"
"bnez %[width], 11b \n"
- "addu %[dst], %[dst], %[dst_stride] \n"
- "2: \n"
+ "addu %[dst], %[dst], %[dst_stride] \n"
+ "2: \n"
".set pop \n"
- :[src] "+r" (src),
- [dst] "+r" (dst),
- [width] "+r" (width)
- :[src_stride] "r" (src_stride),
- [dst_stride] "r" (dst_stride)
- : "t0", "t1", "t2", "t3", "t4", "t5",
- "t6", "t7", "t8", "t9",
- "s0", "s1"
- );
+ : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
+ : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1");
}
-void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width) {
- __asm__ __volatile__ (
+void TransposeWx8_Fast_DSPR2(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width) {
+ __asm__ __volatile__(
".set noat \n"
".set push \n"
".set noreorder \n"
@@ -126,67 +125,67 @@ void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
"or $t0, $t0, $t1 \n"
"bnez $t0, 11f \n"
" subu $t7, $t9, %[src_stride] \n"
-//dst + dst_stride word aligned
+ // dst + dst_stride word aligned
"1: \n"
"lw $t0, 0(%[src]) \n"
"lwx $t1, %[src_stride](%[src]) \n"
"lwx $t8, $t2(%[src]) \n"
"lwx $t9, $t3(%[src]) \n"
-// t0 = | 30 | 20 | 10 | 00 |
-// t1 = | 31 | 21 | 11 | 01 |
-// t8 = | 32 | 22 | 12 | 02 |
-// t9 = | 33 | 23 | 13 | 03 |
+ // t0 = | 30 | 20 | 10 | 00 |
+ // t1 = | 31 | 21 | 11 | 01 |
+ // t8 = | 32 | 22 | 12 | 02 |
+ // t9 = | 33 | 23 | 13 | 03 |
"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"
- // s0 = | 21 | 01 | 20 | 00 |
- // s1 = | 23 | 03 | 22 | 02 |
- // s2 = | 31 | 11 | 30 | 10 |
- // s3 = | 33 | 13 | 32 | 12 |
+ // s0 = | 21 | 01 | 20 | 00 |
+ // s1 = | 23 | 03 | 22 | 02 |
+ // s2 = | 31 | 11 | 30 | 10 |
+ // s3 = | 33 | 13 | 32 | 12 |
"precr.qb.ph $s4, $s1, $s0 \n"
"precrq.qb.ph $s5, $s1, $s0 \n"
"precr.qb.ph $s6, $s3, $s2 \n"
"precrq.qb.ph $s7, $s3, $s2 \n"
- // s4 = | 03 | 02 | 01 | 00 |
- // s5 = | 23 | 22 | 21 | 20 |
- // s6 = | 13 | 12 | 11 | 10 |
- // s7 = | 33 | 32 | 31 | 30 |
+ // s4 = | 03 | 02 | 01 | 00 |
+ // s5 = | 23 | 22 | 21 | 20 |
+ // s6 = | 13 | 12 | 11 | 10 |
+ // s7 = | 33 | 32 | 31 | 30 |
"lwx $t0, $t4(%[src]) \n"
"lwx $t1, $t5(%[src]) \n"
"lwx $t8, $t6(%[src]) \n"
"lwx $t9, $t7(%[src]) \n"
-// t0 = | 34 | 24 | 14 | 04 |
-// t1 = | 35 | 25 | 15 | 05 |
-// t8 = | 36 | 26 | 16 | 06 |
-// t9 = | 37 | 27 | 17 | 07 |
+ // t0 = | 34 | 24 | 14 | 04 |
+ // t1 = | 35 | 25 | 15 | 05 |
+ // t8 = | 36 | 26 | 16 | 06 |
+ // t9 = | 37 | 27 | 17 | 07 |
"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"
- // s0 = | 25 | 05 | 24 | 04 |
- // s1 = | 27 | 07 | 26 | 06 |
- // s2 = | 35 | 15 | 34 | 14 |
- // s3 = | 37 | 17 | 36 | 16 |
+ // s0 = | 25 | 05 | 24 | 04 |
+ // s1 = | 27 | 07 | 26 | 06 |
+ // s2 = | 35 | 15 | 34 | 14 |
+ // s3 = | 37 | 17 | 36 | 16 |
"precr.qb.ph $t0, $s1, $s0 \n"
"precrq.qb.ph $t1, $s1, $s0 \n"
"precr.qb.ph $t8, $s3, $s2 \n"
"precrq.qb.ph $t9, $s3, $s2 \n"
- // t0 = | 07 | 06 | 05 | 04 |
- // t1 = | 27 | 26 | 25 | 24 |
- // t8 = | 17 | 16 | 15 | 14 |
- // t9 = | 37 | 36 | 35 | 34 |
+ // t0 = | 07 | 06 | 05 | 04 |
+ // t1 = | 27 | 26 | 25 | 24 |
+ // t8 = | 17 | 16 | 15 | 14 |
+ // t9 = | 37 | 36 | 35 | 34 |
"addu $s0, %[dst], %[dst_stride] \n"
"addu $s1, $s0, %[dst_stride] \n"
@@ -207,67 +206,67 @@ void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
"bnez $AT, 1b \n"
" addu %[dst], $s2, %[dst_stride] \n"
"b 2f \n"
-//dst + dst_stride unaligned
+ // dst + dst_stride unaligned
"11: \n"
"lw $t0, 0(%[src]) \n"
"lwx $t1, %[src_stride](%[src]) \n"
"lwx $t8, $t2(%[src]) \n"
"lwx $t9, $t3(%[src]) \n"
-// t0 = | 30 | 20 | 10 | 00 |
-// t1 = | 31 | 21 | 11 | 01 |
-// t8 = | 32 | 22 | 12 | 02 |
-// t9 = | 33 | 23 | 13 | 03 |
+ // t0 = | 30 | 20 | 10 | 00 |
+ // t1 = | 31 | 21 | 11 | 01 |
+ // t8 = | 32 | 22 | 12 | 02 |
+ // t9 = | 33 | 23 | 13 | 03 |
"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"
- // s0 = | 21 | 01 | 20 | 00 |
- // s1 = | 23 | 03 | 22 | 02 |
- // s2 = | 31 | 11 | 30 | 10 |
- // s3 = | 33 | 13 | 32 | 12 |
+ // s0 = | 21 | 01 | 20 | 00 |
+ // s1 = | 23 | 03 | 22 | 02 |
+ // s2 = | 31 | 11 | 30 | 10 |
+ // s3 = | 33 | 13 | 32 | 12 |
"precr.qb.ph $s4, $s1, $s0 \n"
"precrq.qb.ph $s5, $s1, $s0 \n"
"precr.qb.ph $s6, $s3, $s2 \n"
"precrq.qb.ph $s7, $s3, $s2 \n"
- // s4 = | 03 | 02 | 01 | 00 |
- // s5 = | 23 | 22 | 21 | 20 |
- // s6 = | 13 | 12 | 11 | 10 |
- // s7 = | 33 | 32 | 31 | 30 |
+ // s4 = | 03 | 02 | 01 | 00 |
+ // s5 = | 23 | 22 | 21 | 20 |
+ // s6 = | 13 | 12 | 11 | 10 |
+ // s7 = | 33 | 32 | 31 | 30 |
"lwx $t0, $t4(%[src]) \n"
"lwx $t1, $t5(%[src]) \n"
"lwx $t8, $t6(%[src]) \n"
"lwx $t9, $t7(%[src]) \n"
-// t0 = | 34 | 24 | 14 | 04 |
-// t1 = | 35 | 25 | 15 | 05 |
-// t8 = | 36 | 26 | 16 | 06 |
-// t9 = | 37 | 27 | 17 | 07 |
+ // t0 = | 34 | 24 | 14 | 04 |
+ // t1 = | 35 | 25 | 15 | 05 |
+ // t8 = | 36 | 26 | 16 | 06 |
+ // t9 = | 37 | 27 | 17 | 07 |
"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"
- // s0 = | 25 | 05 | 24 | 04 |
- // s1 = | 27 | 07 | 26 | 06 |
- // s2 = | 35 | 15 | 34 | 14 |
- // s3 = | 37 | 17 | 36 | 16 |
+ // s0 = | 25 | 05 | 24 | 04 |
+ // s1 = | 27 | 07 | 26 | 06 |
+ // s2 = | 35 | 15 | 34 | 14 |
+ // s3 = | 37 | 17 | 36 | 16 |
"precr.qb.ph $t0, $s1, $s0 \n"
"precrq.qb.ph $t1, $s1, $s0 \n"
"precr.qb.ph $t8, $s3, $s2 \n"
"precrq.qb.ph $t9, $s3, $s2 \n"
- // t0 = | 07 | 06 | 05 | 04 |
- // t1 = | 27 | 26 | 25 | 24 |
- // t8 = | 17 | 16 | 15 | 14 |
- // t9 = | 37 | 36 | 35 | 34 |
+ // t0 = | 07 | 06 | 05 | 04 |
+ // t1 = | 27 | 26 | 25 | 24 |
+ // t8 = | 17 | 16 | 15 | 14 |
+ // t9 = | 37 | 36 | 35 | 34 |
"addu $s0, %[dst], %[dst_stride] \n"
"addu $s1, $s0, %[dst_stride] \n"
@@ -298,34 +297,33 @@ void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
"2: \n"
".set pop \n"
".set at \n"
- :[src] "+r" (src),
- [dst] "+r" (dst),
- [width] "+r" (width)
- :[src_stride] "r" (src_stride),
- [dst_stride] "r" (dst_stride)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
- "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
- );
+ : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
+ : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
+ "s2", "s3", "s4", "s5", "s6", "s7");
}
-void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
+void TransposeUVWx8_DSPR2(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
int width) {
- __asm__ __volatile__ (
+ __asm__ __volatile__(
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
- " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
- "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
- "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
+ " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
+ "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
+ "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
"addu $t3, $t2, %[src_stride] \n"
"addu $t5, $t4, %[src_stride] \n"
"addu $t6, $t2, $t4 \n"
"subu $t7, $t9, %[src_stride] \n"
"srl $t1, %[width], 1 \n"
-// check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
+ // check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
"andi $t0, %[dst_a], 0x3 \n"
"andi $t8, %[dst_b], 0x3 \n"
"or $t0, $t0, $t8 \n"
@@ -335,52 +333,52 @@ void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
"or $t0, $t0, $t8 \n"
"bnez $t0, 11f \n"
" nop \n"
-// dst + dst_stride word aligned (both a & b dst addresses)
- "1: \n"
- "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
- "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
+ // dst + dst_stride word aligned (both a & b dst addresses)
+ "1: \n"
+ "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
+ "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
"addu $s5, %[dst_a], %[dst_stride_a] \n"
- "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
- "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
+ "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
+ "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
"addu $s6, %[dst_b], %[dst_stride_b] \n"
- "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
- "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
- "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
"sll $t0, $t0, 16 \n"
- "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
+ "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
"sll $t9, $t9, 16 \n"
- "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
+ "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
"sw $s3, 0($s5) \n"
"sw $s4, 0($s6) \n"
- "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
- "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
- "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
- "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
- "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
+ "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
+ "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
+ "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
+ "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
"sw $s3, 0(%[dst_a]) \n"
"sw $s4, 0(%[dst_b]) \n"
- "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
- "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
- "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
"sll $t0, $t0, 16 \n"
- "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
+ "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
"sll $t9, $t9, 16 \n"
- "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
+ "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
"sw $s3, 4($s5) \n"
"sw $s4, 4($s6) \n"
- "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
"addiu %[src], 4 \n"
"addiu $t1, -1 \n"
@@ -394,59 +392,59 @@ void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
"b 2f \n"
" nop \n"
-// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
- "11: \n"
- "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
- "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
+ // dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
+ "11: \n"
+ "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
+ "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
"addu $s5, %[dst_a], %[dst_stride_a] \n"
- "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
- "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
+ "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
+ "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
"addu $s6, %[dst_b], %[dst_stride_b] \n"
- "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
- "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
- "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
"sll $t0, $t0, 16 \n"
- "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
+ "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
"sll $t9, $t9, 16 \n"
- "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
+ "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
"swr $s3, 0($s5) \n"
"swl $s3, 3($s5) \n"
"swr $s4, 0($s6) \n"
"swl $s4, 3($s6) \n"
- "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
- "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
- "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
- "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
- "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
+ "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
+ "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
+ "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
+ "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
"swr $s3, 0(%[dst_a]) \n"
"swl $s3, 3(%[dst_a]) \n"
"swr $s4, 0(%[dst_b]) \n"
"swl $s4, 3(%[dst_b]) \n"
- "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
- "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
- "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
"sll $t0, $t0, 16 \n"
- "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
+ "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
"sll $t9, $t9, 16 \n"
- "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
+ "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
"swr $s3, 4($s5) \n"
"swl $s3, 7($s5) \n"
"swr $s4, 4($s6) \n"
"swl $s4, 7($s6) \n"
- "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
"addiu %[src], 4 \n"
"addiu $t1, -1 \n"
@@ -462,18 +460,11 @@ void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
"2: \n"
".set pop \n"
- : [src] "+r" (src),
- [dst_a] "+r" (dst_a),
- [dst_b] "+r" (dst_b),
- [width] "+r" (width),
- [src_stride] "+r" (src_stride)
- : [dst_stride_a] "r" (dst_stride_a),
- [dst_stride_b] "r" (dst_stride_b)
- : "t0", "t1", "t2", "t3", "t4", "t5",
- "t6", "t7", "t8", "t9",
- "s0", "s1", "s2", "s3",
- "s4", "s5", "s6"
- );
+ : [src] "+r"(src), [dst_a] "+r"(dst_a), [dst_b] "+r"(dst_b),
+ [width] "+r"(width), [src_stride] "+r"(src_stride)
+ : [dst_stride_a] "r"(dst_stride_a), [dst_stride_b] "r"(dst_stride_b)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
+ "s2", "s3", "s4", "s5", "s6");
}
#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
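Editor's note: the reflowed constraint lists above are GCC's named-operand extended-asm syntax: "+r" marks a register operand that is both read and written, plain "r" an input, and the trailing list names every register the asm clobbers so the compiler will not keep live values there. A hedged, minimal illustration of the same syntax (the MIPS addu and the function name are stand-ins, not code from this file):

    static inline int AddU(int a, int b) {
      int r;
      __asm__("addu %[r], %[a], %[b]"
              : [r] "=r"(r)               // output operand
              : [a] "r"(a), [b] "r"(b));  // input operands, no clobbers
      return r;
    }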
diff --git a/files/source/rotate_gcc.cc b/files/source/rotate_gcc.cc
index cbe870ca..85b41dd8 100644
--- a/files/source/rotate_gcc.cc
+++ b/files/source/rotate_gcc.cc
@@ -22,342 +22,348 @@ extern "C" {
// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
#if defined(HAS_TRANSPOSEWX8_SSSE3)
-void TransposeWx8_SSSE3(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width) {
- asm volatile (
- // Read in the data from the source pointer.
- // First round of bit swap.
- LABELALIGN
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "movq (%0,%3),%%xmm1 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "movq (%0),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "palignr $0x8,%%xmm1,%%xmm1 \n"
- "movq (%0,%3),%%xmm3 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "movq (%0),%%xmm4 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "movq (%0,%3),%%xmm5 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "movq (%0),%%xmm6 \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq (%0,%3),%%xmm7 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "neg %3 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "lea 0x8(%0,%3,8),%0 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "neg %3 \n"
- // Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "palignr $0x8,%%xmm2,%%xmm2 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "movdqa %%xmm5,%%xmm7 \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- // Third round of bit swap.
- // Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "palignr $0x8,%%xmm4,%%xmm4 \n"
- "movq %%xmm4,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "movq %%xmm2,(%1) \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movq %%xmm6,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "movq %%xmm1,(%1) \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq %%xmm5,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movq %%xmm3,(%1) \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "sub $0x8,%2 \n"
- "movq %%xmm7,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "r"((intptr_t)(dst_stride)) // %4
- : "memory", "cc",
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
+void TransposeWx8_SSSE3(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width) {
+ asm volatile(
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movq (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "movq (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movq (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "movq (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movq (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "lea 0x8(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "neg %3 \n"
+ // Second round of bit swap.
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "sub $0x8,%2 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // defined(HAS_TRANSPOSEWX8_SSSE3)
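Editor's note: the three "rounds of bit swap" in the asm above are the classic log2(8) unpack passes: interleave bytes, then 16-bit pairs, then 32-bit quads, after which each register holds two transposed rows. A sketch of the same idea with SSE2 intrinsics; this mirrors the technique, not the file's exact register scheduling, and the function name is made up:

    #include <emmintrin.h>  // SSE2
    #include <stdint.h>

    // Transpose one 8x8 byte block; dst row j receives src column j.
    static void Transpose8x8_SSE2(const uint8_t* src, int src_stride,
                                  uint8_t* dst, int dst_stride) {
      __m128i r[8], t[4], u[4], v[4];
      for (int i = 0; i < 8; ++i)
        r[i] = _mm_loadl_epi64((const __m128i*)(src + i * src_stride));
      // Round 1: interleave bytes -> a0 b0 a1 b1 ...
      t[0] = _mm_unpacklo_epi8(r[0], r[1]);
      t[1] = _mm_unpacklo_epi8(r[2], r[3]);
      t[2] = _mm_unpacklo_epi8(r[4], r[5]);
      t[3] = _mm_unpacklo_epi8(r[6], r[7]);
      // Round 2: interleave 16-bit pairs -> a0 b0 c0 d0 ...
      u[0] = _mm_unpacklo_epi16(t[0], t[1]);
      u[1] = _mm_unpackhi_epi16(t[0], t[1]);
      u[2] = _mm_unpacklo_epi16(t[2], t[3]);
      u[3] = _mm_unpackhi_epi16(t[2], t[3]);
      // Round 3: interleave 32-bit quads -> each v[i] holds two output rows.
      v[0] = _mm_unpacklo_epi32(u[0], u[2]);  // columns 0 and 1
      v[1] = _mm_unpackhi_epi32(u[0], u[2]);  // columns 2 and 3
      v[2] = _mm_unpacklo_epi32(u[1], u[3]);  // columns 4 and 5
      v[3] = _mm_unpackhi_epi32(u[1], u[3]);  // columns 6 and 7
      for (int i = 0; i < 4; ++i) {
        _mm_storel_epi64((__m128i*)(dst + (2 * i + 0) * dst_stride), v[i]);
        _mm_storel_epi64((__m128i*)(dst + (2 * i + 1) * dst_stride),
                         _mm_srli_si128(v[i], 8));
      }
    }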
// Transpose 16x8. 64 bit
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
-void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width) {
- asm volatile (
- // Read in the data from the source pointer.
- // First round of bit swap.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%0,%3),%%xmm1 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm0,%%xmm8 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm8 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm8,%%xmm9 \n"
- "palignr $0x8,%%xmm1,%%xmm1 \n"
- "palignr $0x8,%%xmm9,%%xmm9 \n"
- "movdqu (%0,%3),%%xmm3 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm2,%%xmm10 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "punpckhbw %%xmm3,%%xmm10 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "movdqa %%xmm10,%%xmm11 \n"
- "movdqu (%0),%%xmm4 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "palignr $0x8,%%xmm11,%%xmm11 \n"
- "movdqu (%0,%3),%%xmm5 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm4,%%xmm12 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "punpckhbw %%xmm5,%%xmm12 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "movdqa %%xmm12,%%xmm13 \n"
- "movdqu (%0),%%xmm6 \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "palignr $0x8,%%xmm13,%%xmm13 \n"
- "movdqu (%0,%3),%%xmm7 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm6,%%xmm14 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "punpckhbw %%xmm7,%%xmm14 \n"
- "neg %3 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "movdqa %%xmm14,%%xmm15 \n"
- "lea 0x10(%0,%3,8),%0 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- "neg %3 \n"
- // Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "palignr $0x8,%%xmm2,%%xmm2 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "movdqa %%xmm5,%%xmm7 \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "punpcklwd %%xmm10,%%xmm8 \n"
- "punpcklwd %%xmm11,%%xmm9 \n"
- "movdqa %%xmm8,%%xmm10 \n"
- "movdqa %%xmm9,%%xmm11 \n"
- "palignr $0x8,%%xmm10,%%xmm10 \n"
- "palignr $0x8,%%xmm11,%%xmm11 \n"
- "punpcklwd %%xmm14,%%xmm12 \n"
- "punpcklwd %%xmm15,%%xmm13 \n"
- "movdqa %%xmm12,%%xmm14 \n"
- "movdqa %%xmm13,%%xmm15 \n"
- "palignr $0x8,%%xmm14,%%xmm14 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- // Third round of bit swap.
- // Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "palignr $0x8,%%xmm4,%%xmm4 \n"
- "movq %%xmm4,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "movq %%xmm2,(%1) \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movq %%xmm6,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "movq %%xmm1,(%1) \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq %%xmm5,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movq %%xmm3,(%1) \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "movq %%xmm7,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm12,%%xmm8 \n"
- "movq %%xmm8,(%1) \n"
- "movdqa %%xmm8,%%xmm12 \n"
- "palignr $0x8,%%xmm12,%%xmm12 \n"
- "movq %%xmm12,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm14,%%xmm10 \n"
- "movdqa %%xmm10,%%xmm14 \n"
- "movq %%xmm10,(%1) \n"
- "palignr $0x8,%%xmm14,%%xmm14 \n"
- "punpckldq %%xmm13,%%xmm9 \n"
- "movq %%xmm14,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm9,%%xmm13 \n"
- "movq %%xmm9,(%1) \n"
- "palignr $0x8,%%xmm13,%%xmm13 \n"
- "movq %%xmm13,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm15,%%xmm11 \n"
- "movq %%xmm11,(%1) \n"
- "movdqa %%xmm11,%%xmm15 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- "sub $0x10,%2 \n"
- "movq %%xmm15,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "r"((intptr_t)(dst_stride)) // %4
- : "memory", "cc",
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
- "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
- );
+void TransposeWx8_Fast_SSSE3(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width) {
+ asm volatile(
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm9 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "palignr $0x8,%%xmm9,%%xmm9 \n"
+ "movdqu (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm2,%%xmm10 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm10 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm10,%%xmm11 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "movdqu (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm4,%%xmm12 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm12 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movdqa %%xmm12,%%xmm13 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movdqu (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm6,%%xmm14 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "punpckhbw %%xmm7,%%xmm14 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "movdqa %%xmm14,%%xmm15 \n"
+ "lea 0x10(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "neg %3 \n"
+ // Second round of bit swap.
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "punpcklwd %%xmm10,%%xmm8 \n"
+ "punpcklwd %%xmm11,%%xmm9 \n"
+ "movdqa %%xmm8,%%xmm10 \n"
+ "movdqa %%xmm9,%%xmm11 \n"
+ "palignr $0x8,%%xmm10,%%xmm10 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "punpcklwd %%xmm14,%%xmm12 \n"
+ "punpcklwd %%xmm15,%%xmm13 \n"
+ "movdqa %%xmm12,%%xmm14 \n"
+ "movdqa %%xmm13,%%xmm15 \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm12,%%xmm8 \n"
+ "movq %%xmm8,(%1) \n"
+ "movdqa %%xmm8,%%xmm12 \n"
+ "palignr $0x8,%%xmm12,%%xmm12 \n"
+ "movq %%xmm12,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm14,%%xmm10 \n"
+ "movdqa %%xmm10,%%xmm14 \n"
+ "movq %%xmm10,(%1) \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "punpckldq %%xmm13,%%xmm9 \n"
+ "movq %%xmm14,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm9,%%xmm13 \n"
+ "movq %%xmm9,(%1) \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movq %%xmm13,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm15,%%xmm11 \n"
+ "movq %%xmm11,(%1) \n"
+ "movdqa %%xmm11,%%xmm15 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "sub $0x10,%2 \n"
+ "movq %%xmm15,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+ "xmm15");
}
#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
// Transpose UV 8x8. 64 bit.
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
-void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b, int width) {
- asm volatile (
- // Read in the data from the source pointer.
- // First round of bit swap.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%0,%4),%%xmm1 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm0,%%xmm8 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu (%0,%4),%%xmm3 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm2,%%xmm8 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "punpckhbw %%xmm3,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm3 \n"
- "movdqu (%0),%%xmm4 \n"
- "movdqu (%0,%4),%%xmm5 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm4,%%xmm8 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "punpckhbw %%xmm5,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm5 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu (%0,%4),%%xmm7 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm6,%%xmm8 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "neg %4 \n"
- "lea 0x10(%0,%4,8),%0 \n"
- "punpckhbw %%xmm7,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm7 \n"
- "neg %4 \n"
- // Second round of bit swap.
- "movdqa %%xmm0,%%xmm8 \n"
- "movdqa %%xmm1,%%xmm9 \n"
- "punpckhwd %%xmm2,%%xmm8 \n"
- "punpckhwd %%xmm3,%%xmm9 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm8,%%xmm2 \n"
- "movdqa %%xmm9,%%xmm3 \n"
- "movdqa %%xmm4,%%xmm8 \n"
- "movdqa %%xmm5,%%xmm9 \n"
- "punpckhwd %%xmm6,%%xmm8 \n"
- "punpckhwd %%xmm7,%%xmm9 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm8,%%xmm6 \n"
- "movdqa %%xmm9,%%xmm7 \n"
- // Third round of bit swap.
- // Write to the destination pointer.
- "movdqa %%xmm0,%%xmm8 \n"
- "punpckldq %%xmm4,%%xmm0 \n"
- "movlpd %%xmm0,(%1) \n" // Write back U channel
- "movhpd %%xmm0,(%2) \n" // Write back V channel
- "punpckhdq %%xmm4,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm2,%%xmm8 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movlpd %%xmm2,(%1) \n"
- "movhpd %%xmm2,(%2) \n"
- "punpckhdq %%xmm6,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm1,%%xmm8 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movlpd %%xmm1,(%1) \n"
- "movhpd %%xmm1,(%2) \n"
- "punpckhdq %%xmm5,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm3,%%xmm8 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movlpd %%xmm3,(%1) \n"
- "movhpd %%xmm3,(%2) \n"
- "punpckhdq %%xmm7,%%xmm8 \n"
- "sub $0x8,%3 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst_a), // %1
- "+r"(dst_b), // %2
- "+r"(width) // %3
- : "r"((intptr_t)(src_stride)), // %4
- "r"((intptr_t)(dst_stride_a)), // %5
- "r"((intptr_t)(dst_stride_b)) // %6
- : "memory", "cc",
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
- "xmm8", "xmm9"
- );
+void TransposeUVWx8_SSE2(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width) {
+ asm volatile(
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%4),%%xmm1 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu (%0,%4),%%xmm3 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm3 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "movdqu (%0,%4),%%xmm5 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm5 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu (%0,%4),%%xmm7 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm6,%%xmm8 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %4 \n"
+ "lea 0x10(%0,%4,8),%0 \n"
+ "punpckhbw %%xmm7,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm7 \n"
+ "neg %4 \n"
+ // Second round of bit swap.
+ "movdqa %%xmm0,%%xmm8 \n"
+ "movdqa %%xmm1,%%xmm9 \n"
+ "punpckhwd %%xmm2,%%xmm8 \n"
+ "punpckhwd %%xmm3,%%xmm9 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm2 \n"
+ "movdqa %%xmm9,%%xmm3 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "movdqa %%xmm5,%%xmm9 \n"
+ "punpckhwd %%xmm6,%%xmm8 \n"
+ "punpckhwd %%xmm7,%%xmm9 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm8,%%xmm6 \n"
+ "movdqa %%xmm9,%%xmm7 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n" // Write back U channel
+ "movhpd %%xmm0,(%2) \n" // Write back V channel
+ "punpckhdq %%xmm4,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movlpd %%xmm2,(%1) \n"
+ "movhpd %%xmm2,(%2) \n"
+ "punpckhdq %%xmm6,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm1,%%xmm8 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movlpd %%xmm1,(%1) \n"
+ "movhpd %%xmm1,(%2) \n"
+ "punpckhdq %%xmm5,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm3,%%xmm8 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movlpd %%xmm3,(%1) \n"
+ "movhpd %%xmm3,(%2) \n"
+ "punpckhdq %%xmm7,%%xmm8 \n"
+ "sub $0x8,%3 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_a), // %1
+ "+r"(dst_b), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride)), // %4
+ "r"((intptr_t)(dst_stride_a)), // %5
+ "r"((intptr_t)(dst_stride_b)) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7", "xmm8", "xmm9");
}
#endif // defined(HAS_TRANSPOSEUVWX8_SSE2)
#endif // defined(__x86_64__) || defined(__i386__)
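Editor's note: one detail worth flagging in the constraint lists above is that strides go in as (intptr_t) casts. The asm uses them in 64-bit addressing modes such as "lea (%0,%3,2),%0", so widening the 32-bit int before the asm keeps operand size and register size in agreement on x86-64. A sketch of the same idea in plain C (the helper name is illustrative):

    #include <stdint.h>

    static const uint8_t* AdvanceTwoRows(const uint8_t* src, int src_stride) {
      intptr_t stride = (intptr_t)src_stride;  // widen once, outside the asm
      return src + 2 * stride;                 // mirrors "lea (%0,%3,2),%0"
    }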
diff --git a/files/source/rotate_msa.cc b/files/source/rotate_msa.cc
new file mode 100644
index 00000000..8907765a
--- /dev/null
+++ b/files/source/rotate_msa.cc
@@ -0,0 +1,250 @@
+/*
+ * Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+
+// This module is for GCC MSA
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include "libyuv/macros_msa.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define ILVRL_B(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ out0 = (v16u8)__msa_ilvr_b((v16i8)in1, (v16i8)in0); \
+ out1 = (v16u8)__msa_ilvl_b((v16i8)in1, (v16i8)in0); \
+ out2 = (v16u8)__msa_ilvr_b((v16i8)in3, (v16i8)in2); \
+ out3 = (v16u8)__msa_ilvl_b((v16i8)in3, (v16i8)in2); \
+ }
+
+#define ILVRL_H(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ out0 = (v16u8)__msa_ilvr_h((v8i16)in1, (v8i16)in0); \
+ out1 = (v16u8)__msa_ilvl_h((v8i16)in1, (v8i16)in0); \
+ out2 = (v16u8)__msa_ilvr_h((v8i16)in3, (v8i16)in2); \
+ out3 = (v16u8)__msa_ilvl_h((v8i16)in3, (v8i16)in2); \
+ }
+
+#define ILVRL_W(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ out0 = (v16u8)__msa_ilvr_w((v4i32)in1, (v4i32)in0); \
+ out1 = (v16u8)__msa_ilvl_w((v4i32)in1, (v4i32)in0); \
+ out2 = (v16u8)__msa_ilvr_w((v4i32)in3, (v4i32)in2); \
+ out3 = (v16u8)__msa_ilvl_w((v4i32)in3, (v4i32)in2); \
+ }
+
+#define ILVRL_D(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ out0 = (v16u8)__msa_ilvr_d((v2i64)in1, (v2i64)in0); \
+ out1 = (v16u8)__msa_ilvl_d((v2i64)in1, (v2i64)in0); \
+ out2 = (v16u8)__msa_ilvr_d((v2i64)in3, (v2i64)in2); \
+ out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \
+ }
+
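Editor's note: the four ILVRL_* macros above wrap MSA's interleave-right/interleave-left instruction pairs at byte, halfword, word, and doubleword granularity. Under my reading of the ILVR/ILVL semantics (even lanes from the second intrinsic argument, odd lanes from the first), a scalar model of the byte case looks like this; the function and parameter names are made up for illustration:

    #include <stdint.h>

    // ilvr: alternate the low halves of two 16-byte vectors; ilvl: the highs.
    static void IlvrlB(const uint8_t in0[16], const uint8_t in1[16],
                       uint8_t out_r[16], uint8_t out_l[16]) {
      for (int i = 0; i < 8; ++i) {
        out_r[2 * i]     = in0[i];      // even lanes from the first input
        out_r[2 * i + 1] = in1[i];      // odd lanes from the second
        out_l[2 * i]     = in0[8 + i];  // same pattern on the high halves
        out_l[2 * i + 1] = in1[8 + i];
      }
    }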
+void TransposeWx16_C(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width) {
+ TransposeWx8_C(src, src_stride, dst, dst_stride, width);
+ TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride,
+ width);
+}
+
+void TransposeUVWx16_C(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width) {
+ TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width);
+ TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8),
+ dst_stride_a, (dst_b + 8), dst_stride_b, width);
+}
+
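Editor's note: the two _C helpers above build a 16-row transpose from two 8-row passes. Source rows 0-7 become bytes 0-7 of each destination row, and rows 8-15 land 8 bytes further in, which is why the second call offsets the source by 8 * src_stride and the destination by 8. In index form the composition is equivalent to this sketch (standard types, hypothetical name):

    #include <stdint.h>

    static void TransposeWx16_Ref(const uint8_t* src, int src_stride,
                                  uint8_t* dst, int dst_stride, int width) {
      for (int x = 0; x < width; ++x)
        for (int y = 0; y < 16; ++y)  // y in [0,8) is pass one, [8,16) pass two
          dst[x * dst_stride + y] = src[y * src_stride + x];
    }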
+void TransposeWx16_MSA(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width) {
+ int x;
+ const uint8* s;
+ v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
+ v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
+
+ for (x = 0; x < width; x += 16) {
+ s = src;
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+ ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+ ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
+ ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
+ ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+ ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+ ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
+ res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0);
+ res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0);
+ ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
+ ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+ dst += dst_stride * 4;
+ res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1);
+ res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1);
+ ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
+ ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+ dst += dst_stride * 4;
+ res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2);
+ res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2);
+ ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
+ ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+ dst += dst_stride * 4;
+ res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3);
+ res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3);
+ ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
+ ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+ src += 16;
+ dst += dst_stride * 4;
+ }
+}
+
+void TransposeUVWx16_MSA(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int width) {
+ int x;
+ const uint8* s;
+ v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
+ v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
+
+ for (x = 0; x < width; x += 8) {
+ s = src;
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+ ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+ ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
+ ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
+ ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+ ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ s += src_stride;
+ ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+ ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
+ res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0);
+ res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0);
+ ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
+ ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+ ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+ dst_a += dst_stride_a * 2;
+ dst_b += dst_stride_b * 2;
+ res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1);
+ res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1);
+ ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
+ ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+ ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+ dst_a += dst_stride_a * 2;
+ dst_b += dst_stride_b * 2;
+ res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2);
+ res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2);
+ ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
+ ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+ ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+ dst_a += dst_stride_a * 2;
+ dst_b += dst_stride_b * 2;
+ res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3);
+ res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3);
+ ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
+ ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+ ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+ src += 16;
+ dst_a += dst_stride_a * 2;
+ dst_b += dst_stride_b * 2;
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
diff --git a/files/source/rotate_neon.cc b/files/source/rotate_neon.cc
index 1c22b472..41ec34ec 100644
--- a/files/source/rotate_neon.cc
+++ b/files/source/rotate_neon.cc
@@ -21,11 +21,13 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
-static uvec8 kVTbl4x4Transpose =
- { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+static uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
+ 2, 6, 10, 14, 3, 7, 11, 15};
-void TransposeWx8_NEON(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
+void TransposeWx8_NEON(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
int width) {
const uint8* src_temp;
asm volatile (
@@ -240,12 +242,15 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
);
}
-static uvec8 kVTbl4x4TransposeDi =
- { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
+static uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11,
+ 4, 12, 5, 13, 6, 14, 7, 15};
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
+void TransposeUVWx8_NEON(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
int width) {
const uint8* src_temp;
asm volatile (
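Editor's note: the kVTbl4x4Transpose tables reformatted above drive NEON's vtbl byte permute. Reading a 16-byte vector as a row-major 4x4 matrix, placing index 4*c + r at output slot 4*r + c yields the transpose, so {0, 4, 8, 12, ...} gathers column 0 into output row 0. A scalar model of the table lookup (names are illustrative):

    #include <stdint.h>

    // Scalar model of vtbl: out[i] = v[idx[i]] over a 16-byte table.
    static void Tbl16(const uint8_t v[16], const uint8_t idx[16],
                      uint8_t out[16]) {
      for (int i = 0; i < 16; ++i)
        out[i] = v[idx[i]];
    }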
diff --git a/files/source/rotate_neon64.cc b/files/source/rotate_neon64.cc
index 1ab448f3..3cf17930 100644
--- a/files/source/rotate_neon64.cc
+++ b/files/source/rotate_neon64.cc
@@ -21,13 +21,16 @@ extern "C" {
// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-static uvec8 kVTbl4x4Transpose =
- { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
-
-void TransposeWx8_NEON(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width) {
+static uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
+ 2, 6, 10, 14, 3, 7, 11, 15};
+
+void TransposeWx8_NEON(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width) {
const uint8* src_temp;
- int64 width64 = (int64) width; // Work around clang 3.4 warning.
+ int64 width64 = (int64)width; // Work around clang 3.4 warning.
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
@@ -247,16 +250,19 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
);
}
-static uint8 kVTbl4x4TransposeDi[32] =
- { 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
- 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};
+static uint8 kVTbl4x4TransposeDi[32] = {
+ 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
+ 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
+void TransposeUVWx8_NEON(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
int width) {
const uint8* src_temp;
- int64 width64 = (int64) width; // Work around clang 3.4 warning.
+ int64 width64 = (int64)width; // Work around clang 3.4 warning.
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
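Editor's note: the int64 width64 cast above exists because the AArch64 asm consumes width through a 64-bit register, and handing a 32-bit int to a "+r" constraint drew a warning from clang 3.4. Widening before the asm keeps the C operand and the register width in agreement; a sketch, using a standard type for libyuv's int64:

    #include <stdint.h>

    static int64_t WidenForAsm(int width) {
      return (int64_t)width;  // pass the widened value to the "+r" operand
    }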
diff --git a/files/source/rotate_win.cc b/files/source/rotate_win.cc
index 1300fc0f..201643e7 100644
--- a/files/source/rotate_win.cc
+++ b/files/source/rotate_win.cc
@@ -19,15 +19,17 @@ extern "C" {
// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
-__declspec(naked)
-void TransposeWx8_SSSE3(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width) {
+__declspec(naked) void TransposeWx8_SSSE3(const uint8* src,
+ int src_stride,
+ uint8* dst,
+ int dst_stride,
+ int width) {
__asm {
push edi
push esi
push ebp
- mov eax, [esp + 12 + 4] // src
- mov edi, [esp + 12 + 8] // src_stride
+ mov eax, [esp + 12 + 4] // src
+ mov edi, [esp + 12 + 8] // src_stride
mov edx, [esp + 12 + 12] // dst
mov esi, [esp + 12 + 16] // dst_stride
mov ecx, [esp + 12 + 20] // width
@@ -110,18 +112,20 @@ void TransposeWx8_SSSE3(const uint8* src, int src_stride,
}
}
-__declspec(naked)
-void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int w) {
+__declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
+ int src_stride,
+ uint8* dst_a,
+ int dst_stride_a,
+ uint8* dst_b,
+ int dst_stride_b,
+ int w) {
__asm {
push ebx
push esi
push edi
push ebp
- mov eax, [esp + 16 + 4] // src
- mov edi, [esp + 16 + 8] // src_stride
+ mov eax, [esp + 16 + 4] // src
+ mov edi, [esp + 16 + 8] // src_stride
mov edx, [esp + 16 + 12] // dst_a
mov esi, [esp + 16 + 16] // dst_stride_a
mov ebx, [esp + 16 + 20] // dst_b
@@ -134,8 +138,8 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
align 4
convertloop:
- // Read in the data from the source pointer.
- // First round of bit swap.
+ // Read in the data from the source pointer.
+ // First round of bit swap.
movdqu xmm0, [eax]
movdqu xmm1, [eax + edi]
lea eax, [eax + 2 * edi]
@@ -162,7 +166,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
lea eax, [eax + 2 * edi]
movdqu [esp], xmm5 // backup xmm5
neg edi
- movdqa xmm5, xmm6 // use xmm5 as temp register.
+ movdqa xmm5, xmm6 // use xmm5 as temp register.
punpcklbw xmm6, xmm7
punpckhbw xmm5, xmm7
movdqa xmm7, xmm5
@@ -183,7 +187,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
movdqa xmm6, xmm5
movdqu xmm5, [esp] // restore xmm5
movdqu [esp], xmm6 // backup xmm6
- movdqa xmm6, xmm5 // use xmm6 as temp register.
+ movdqa xmm6, xmm5 // use xmm6 as temp register.
punpcklwd xmm5, xmm7
punpckhwd xmm6, xmm7
movdqa xmm7, xmm6
@@ -200,7 +204,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm4
lea ebx, [ebx + 2 * ebp]
- movdqa xmm0, xmm2 // use xmm0 as the temp register.
+ movdqa xmm0, xmm2 // use xmm0 as the temp register.
punpckldq xmm2, xmm6
movlpd qword ptr [edx], xmm2
movhpd qword ptr [ebx], xmm2
@@ -209,7 +213,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
- movdqa xmm0, xmm1 // use xmm0 as the temp register.
+ movdqa xmm0, xmm1 // use xmm0 as the temp register.
punpckldq xmm1, xmm5
movlpd qword ptr [edx], xmm1
movhpd qword ptr [ebx], xmm1
@@ -218,7 +222,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
- movdqa xmm0, xmm3 // use xmm0 as the temp register.
+ movdqa xmm0, xmm3 // use xmm0 as the temp register.
punpckldq xmm3, xmm7
movlpd qword ptr [edx], xmm3
movhpd qword ptr [ebx], xmm3
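Editor's note: on the __declspec(naked) reformatting above, a naked function gets no compiler prologue or epilogue, so the __asm block owns the stack frame and fetches arguments manually as [esp + pushes*4 + offset]; the "+ 12" and "+ 16" offsets above account for three and four pushed registers. A hedged, minimal sketch of the pattern (MSVC, 32-bit x86; the function is made up):

    __declspec(naked) int AddTwo(int a, int b) {
      __asm {
        mov eax, [esp + 4]   // a; the return address sits at [esp]
        add eax, [esp + 8]   // b; nothing pushed yet, so offsets are small
        ret                  // no epilogue is generated for us
      }
    }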
diff --git a/files/source/row_any.cc b/files/source/row_any.cc
index 494164fd..74a6621f 100644
--- a/files/source/row_any.cc
+++ b/files/source/row_any.cc
@@ -23,26 +23,26 @@ extern "C" {
#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
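Editor's note: SS is a round-up shift, i.e. SS(width, shift) computes ceil(width / 2^shift), which is how the wrappers below size subsampled chroma tails. Two worked values:

    // SS(5, 1) = (5 + 2 - 1) >> 1 = 3   -> 5 luma pixels need 3 chroma bytes
    // SS(8, 1) = (8 + 2 - 1) >> 1 = 4   -> exact multiples are unchanged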
// Any 4 planes to 1 with yuvconstants
-#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
- void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
- const uint8* a_buf, uint8* dst_ptr, \
- const struct YuvConstants* yuvconstants, int width) { \
- SIMD_ALIGNED(uint8 temp[64 * 5]); \
- memset(temp, 0, 64 * 4); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \
- } \
- memcpy(temp, y_buf + n, r); \
- memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- memcpy(temp + 192, a_buf + n, r); \
- ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \
- yuvconstants, MASK + 1); \
- memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \
- SS(r, DUVSHIFT) * BPP); \
- }
+#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
+ const uint8* a_buf, uint8* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 5]); \
+ memset(temp, 0, 64 * 4); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(temp, y_buf + n, r); \
+ memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(temp + 192, a_buf + n, r); \
+ ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \
+ yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \
+ SS(r, DUVSHIFT) * BPP); \
+ }
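Editor's note: the ANY* family all follow the shape reformatted above: run the SIMD kernel over the largest MASK-aligned prefix, copy the ragged tail into zero-padded scratch lanes spaced 64 bytes apart, run the kernel once more at its full granularity, then copy back only the valid output bytes. A reduced single-plane model of the pattern; SimdKernel, the 16-pixel granularity, and the plain (unaligned) scratch array are assumptions for illustration:

    #include <stdint.h>
    #include <string.h>

    enum { kMask = 15 };  // stand-in kernel handles 16 pixels per call

    static void SimdKernel(const uint8_t* src, uint8_t* dst, int width) {
      memcpy(dst, src, (size_t)width);  // stand-in for a real SIMD row op
    }

    void AnyRow(const uint8_t* src, uint8_t* dst, int width) {
      uint8_t temp[64 * 2] = {0};  // zero padding keeps the tail defined
      int r = width & kMask;       // leftover pixels
      int n = width & ~kMask;      // largest width the kernel accepts
      if (n > 0) {
        SimdKernel(src, dst, n);
      }
      memcpy(temp, src + n, r);                // stage the tail
      SimdKernel(temp, temp + 64, kMask + 1);  // full-granularity run
      memcpy(dst + n, temp + 64, r);           // keep only the valid bytes
    }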
#ifdef HAS_I422ALPHATOARGBROW_SSSE3
ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
@@ -53,26 +53,29 @@ ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)
#ifdef HAS_I422ALPHATOARGBROW_NEON
ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
#endif
+#ifdef HAS_I422ALPHATOARGBROW_MSA
+ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7)
+#endif
#undef ANY41C
// Any 3 planes to 1.
-#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
- void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
- uint8* dst_ptr, int width) { \
- SIMD_ALIGNED(uint8 temp[64 * 4]); \
- memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \
- } \
- memcpy(temp, y_buf + n, r); \
- memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \
- memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
- SS(r, DUVSHIFT) * BPP); \
- }
+#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
+ uint8* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 4]); \
+ memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \
+ } \
+ memcpy(temp, y_buf + n, r); \
+ memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
+ SS(r, DUVSHIFT) * BPP); \
+ }
#ifdef HAS_I422TOYUY2ROW_SSE2
ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
@@ -80,9 +83,15 @@ ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
#ifdef HAS_I422TOYUY2ROW_NEON
ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
#endif
+#ifdef HAS_I422TOYUY2ROW_MSA
+ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31)
+#endif
#ifdef HAS_I422TOUYVYROW_NEON
ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
#endif
+#ifdef HAS_I422TOUYVYROW_MSA
+ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31)
+#endif
#ifdef HAS_BLENDPLANEROW_AVX2
ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
#endif
@@ -95,35 +104,31 @@ ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
// on arm that subsamples 444 to 422 internally.
// Any 3 planes to 1 with yuvconstants
#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
- void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
- uint8* dst_ptr, const struct YuvConstants* yuvconstants, \
- int width) { \
- SIMD_ALIGNED(uint8 temp[64 * 4]); \
- memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
- } \
- memcpy(temp, y_buf + n, r); \
- memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- if (width & 1) { \
- temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \
- temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \
- } \
- ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, \
- yuvconstants, MASK + 1); \
- memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
- SS(r, DUVSHIFT) * BPP); \
- }
+ void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
+ uint8* dst_ptr, const struct YuvConstants* yuvconstants, \
+ int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 4]); \
+ memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(temp, y_buf + n, r); \
+ memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ if (width & 1) { \
+ temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \
+ temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \
+ } \
+ ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
+ SS(r, DUVSHIFT) * BPP); \
+ }
#ifdef HAS_I422TOARGBROW_SSSE3
ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
#endif
-#ifdef HAS_I411TOARGBROW_SSSE3
-ANY31C(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)
-#endif
#ifdef HAS_I444TOARGBROW_SSSE3
ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
@@ -144,9 +149,6 @@ ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
#ifdef HAS_I444TOARGBROW_AVX2
ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
#endif
-#ifdef HAS_I411TOARGBROW_AVX2
-ANY31C(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)
-#endif
#ifdef HAS_I422TOARGB4444ROW_AVX2
ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7)
#endif
@@ -159,32 +161,46 @@ ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7)
#ifdef HAS_I422TOARGBROW_NEON
ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
-ANY31C(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7)
ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
#endif
+#ifdef HAS_I422TOARGBROW_DSPR2
+ANY31C(I444ToARGBRow_Any_DSPR2, I444ToARGBRow_DSPR2, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_DSPR2, I422ToARGBRow_DSPR2, 1, 0, 4, 7)
+ANY31C(I422ToARGB4444Row_Any_DSPR2, I422ToARGB4444Row_DSPR2, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_DSPR2, I422ToARGB1555Row_DSPR2, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGBROW_MSA
+ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7)
+ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15)
+ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
+#endif
#undef ANY31C
// Any 2 planes to 1.
-#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
- void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \
- uint8* dst_ptr, int width) { \
- SIMD_ALIGNED(uint8 temp[64 * 3]); \
- memset(temp, 0, 64 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
- } \
- memcpy(temp, y_buf + n * SBPP, r * SBPP); \
- memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
- SS(r, UVSHIFT) * SBPP2); \
- ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
- }
+#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
+ void NAMEANY(const uint8* y_buf, const uint8* uv_buf, uint8* dst_ptr, \
+ int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 3]); \
+ memset(temp, 0, 64 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
+ } \
+ memcpy(temp, y_buf + n * SBPP, r * SBPP); \
+ memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
+ SS(r, UVSHIFT) * SBPP2); \
+ ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
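// For reference, ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
// below expands (whitespace aside) to a wrapper of this shape:
//   void MergeUVRow_Any_SSE2(const uint8* y_buf, const uint8* uv_buf,
//                            uint8* dst_ptr, int width) {
//     SIMD_ALIGNED(uint8 temp[64 * 3]);
//     memset(temp, 0, 64 * 2);
//     int r = width & 15;
//     int n = width & ~15;
//     if (n > 0) MergeUVRow_SSE2(y_buf, uv_buf, dst_ptr, n);
//     memcpy(temp, y_buf + n, r);
//     memcpy(temp + 64, uv_buf + n, r);
//     MergeUVRow_SSE2(temp, temp + 64, temp + 128, 16);
//     memcpy(dst_ptr + n * 2, temp + 128, r * 2);
//   }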
// Merge functions.
#ifdef HAS_MERGEUVROW_SSE2
@@ -196,6 +212,9 @@ ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
#ifdef HAS_MERGEUVROW_NEON
ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
#endif
+#ifdef HAS_MERGEUVROW_MSA
+ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15)
+#endif
// Math functions.
#ifdef HAS_ARGBMULTIPLYROW_SSE2
@@ -225,44 +244,61 @@ ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7)
#ifdef HAS_ARGBSUBTRACTROW_NEON
ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
#endif
+#ifdef HAS_ARGBMULTIPLYROW_MSA
+ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBADDROW_MSA
+ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_MSA
+ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7)
+#endif
#ifdef HAS_SOBELROW_SSE2
ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
#endif
#ifdef HAS_SOBELROW_NEON
ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
#endif
+#ifdef HAS_SOBELROW_MSA
+ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15)
+#endif
#ifdef HAS_SOBELTOPLANEROW_SSE2
ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
#endif
#ifdef HAS_SOBELTOPLANEROW_NEON
ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
#endif
+#ifdef HAS_SOBELTOPLANEROW_MSA
+ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31)
+#endif
#ifdef HAS_SOBELXYROW_SSE2
ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
#endif
#ifdef HAS_SOBELXYROW_NEON
ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
#endif
+#ifdef HAS_SOBELXYROW_MSA
+ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15)
+#endif
#undef ANY21
// Any 2 planes to 1 with yuvconstants
-#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
- void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \
- uint8* dst_ptr, const struct YuvConstants* yuvconstants, \
- int width) { \
- SIMD_ALIGNED(uint8 temp[64 * 3]); \
- memset(temp, 0, 64 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \
- } \
- memcpy(temp, y_buf + n * SBPP, r * SBPP); \
- memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
- SS(r, UVSHIFT) * SBPP2); \
- ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
- }
+#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
+ void NAMEANY(const uint8* y_buf, const uint8* uv_buf, uint8* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 3]); \
+ memset(temp, 0, 64 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(temp, y_buf + n * SBPP, r * SBPP); \
+ memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
+ SS(r, UVSHIFT) * SBPP2); \
+ ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
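// Parameter key, inferred from the uses below: UVSHIFT is the horizontal
// subsampling shift of the second plane (1 for the half-width UV plane of
// NV12/NV21), SBPP is bytes per pixel in the first plane, SBPP2 bytes per
// sample in the second (2 for interleaved UV), and BPP bytes per output
// pixel. So an NV12ToARGBRow wrapper reads n Y bytes, (n >> 1) * 2 = n UV
// bytes, and writes n * 4 ARGB bytes for the SIMD portion.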
// Biplanar to RGB.
#ifdef HAS_NV12TOARGBROW_SSSE3
@@ -274,6 +310,12 @@ ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
#ifdef HAS_NV12TOARGBROW_NEON
ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
#endif
+#ifdef HAS_NV12TOARGBROW_DSPR2
+ANY21C(NV12ToARGBRow_Any_DSPR2, NV12ToARGBRow_DSPR2, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_MSA
+ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7)
+#endif
#ifdef HAS_NV21TOARGBROW_SSSE3
ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
#endif
@@ -283,6 +325,9 @@ ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
#ifdef HAS_NV21TOARGBROW_NEON
ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
#endif
+#ifdef HAS_NV21TOARGBROW_MSA
+ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7)
+#endif
#ifdef HAS_NV12TORGB565ROW_SSSE3
ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
#endif
@@ -292,22 +337,25 @@ ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
#ifdef HAS_NV12TORGB565ROW_NEON
ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
#endif
+#ifdef HAS_NV12TORGB565ROW_MSA
+ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7)
+#endif
#undef ANY21C
// Any 1 to 1.
-#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
- void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
- SIMD_ALIGNED(uint8 temp[128 * 2]); \
- memset(temp, 0, 128); /* for YUY2 and msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_ptr, n); \
- } \
- memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
- ANY_SIMD(temp, temp + 128, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
- }
+#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8 temp[128 * 2]); \
+ memset(temp, 0, 128); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+ ANY_SIMD(temp, temp + 128, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
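// Why the memset above: the scratch copy only fills SS(r, UVSHIFT) * SBPP
// bytes, but the kernel always consumes a full MASK + 1 block, so packed
// 4:2:2 readers such as YUY2ToYRow would otherwise read uninitialized
// bytes and MemorySanitizer would flag it. Only the first r * BPP output
// bytes are copied back, so the extra lanes never reach dst_ptr.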
#ifdef HAS_COPYROW_AVX
ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
@@ -372,9 +420,21 @@ ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
#endif
+#if defined(HAS_ARGBTORGB24ROW_MSA)
+ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15)
+ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7)
+ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15)
+ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15)
+#endif
#if defined(HAS_RAWTORGB24ROW_NEON)
ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
#endif
+#if defined(HAS_RAWTORGB24ROW_MSA)
+ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15)
+#endif
#ifdef HAS_ARGBTOYROW_AVX2
ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
#endif
@@ -403,30 +463,57 @@ ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
#ifdef HAS_ARGBTOYROW_NEON
ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
#endif
+#ifdef HAS_ARGBTOYROW_MSA
+ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15)
+#endif
#ifdef HAS_ARGBTOYJROW_NEON
ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
#endif
+#ifdef HAS_ARGBTOYJROW_MSA
+ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15)
+#endif
#ifdef HAS_BGRATOYROW_NEON
ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
#endif
+#ifdef HAS_BGRATOYROW_MSA
+ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15)
+#endif
#ifdef HAS_ABGRTOYROW_NEON
ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
#endif
+#ifdef HAS_ABGRTOYROW_MSA
+ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7)
+#endif
#ifdef HAS_RGBATOYROW_NEON
ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
#endif
+#ifdef HAS_RGBATOYROW_MSA
+ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15)
+#endif
#ifdef HAS_RGB24TOYROW_NEON
ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
#endif
+#ifdef HAS_RGB24TOYROW_MSA
+ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15)
+#endif
#ifdef HAS_RAWTOYROW_NEON
ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
#endif
+#ifdef HAS_RAWTOYROW_MSA
+ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15)
+#endif
#ifdef HAS_RGB565TOYROW_NEON
ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
#endif
+#ifdef HAS_RGB565TOYROW_MSA
+ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15)
+#endif
#ifdef HAS_ARGB1555TOYROW_NEON
ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
#endif
+#ifdef HAS_ARGB1555TOYROW_MSA
+ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15)
+#endif
#ifdef HAS_ARGB4444TOYROW_NEON
ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
#endif
@@ -434,23 +521,71 @@ ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
#endif
#ifdef HAS_UYVYTOYROW_NEON
-ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15)
+ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOYROW_MSA
+ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_MSA
+ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31)
#endif
#ifdef HAS_RGB24TOARGBROW_NEON
ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
#endif
+#ifdef HAS_RGB24TOARGBROW_MSA
+ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15)
+#endif
#ifdef HAS_RAWTOARGBROW_NEON
ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
#endif
+#ifdef HAS_RAWTOARGBROW_MSA
+ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15)
+#endif
#ifdef HAS_RGB565TOARGBROW_NEON
ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
#endif
+#ifdef HAS_RGB565TOARGBROW_MSA
+ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
#ifdef HAS_ARGB1555TOARGBROW_NEON
ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
#endif
+#ifdef HAS_ARGB1555TOARGBROW_MSA
+ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
#ifdef HAS_ARGB4444TOARGBROW_NEON
ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
#endif
+#ifdef HAS_RGB24TOARGBROW_DSPR2
+ANY11(RGB24ToARGBRow_Any_DSPR2, RGB24ToARGBRow_DSPR2, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RAWTOARGBROW_DSPR2
+ANY11(RAWToARGBRow_Any_DSPR2, RAWToARGBRow_DSPR2, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RGB565TOARGBROW_DSPR2
+ANY11(RGB565ToARGBRow_Any_DSPR2, RGB565ToARGBRow_DSPR2, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_DSPR2
+ANY11(ARGB1555ToARGBRow_Any_DSPR2, ARGB1555ToARGBRow_DSPR2, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_DSPR2
+ANY11(ARGB4444ToARGBRow_Any_DSPR2, ARGB4444ToARGBRow_DSPR2, 0, 2, 4, 7)
+#endif
+#ifdef HAS_BGRATOYROW_DSPR2
+ANY11(BGRAToYRow_Any_DSPR2, BGRAToYRow_DSPR2, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBTOYROW_DSPR2
+ANY11(ARGBToYRow_Any_DSPR2, ARGBToYRow_DSPR2, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ABGRTOYROW_DSPR2
+ANY11(ABGRToYRow_Any_DSPR2, ABGRToYRow_DSPR2, 0, 4, 1, 7)
+#endif
+#ifdef HAS_RGBATOYROW_DSPR2
+ANY11(RGBAToYRow_Any_DSPR2, RGBAToYRow_DSPR2, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_MSA
+ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
#ifdef HAS_ARGBATTENUATEROW_SSSE3
ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
#endif
@@ -466,29 +601,35 @@ ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7)
#ifdef HAS_ARGBATTENUATEROW_NEON
ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
#endif
+#ifdef HAS_ARGBATTENUATEROW_MSA
+ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7)
+#endif
#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 32)
+#endif
#ifdef HAS_ARGBEXTRACTALPHAROW_NEON
ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
#endif
#undef ANY11
// Any 1 to 1 blended. Destination is read, modify, write.
-#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
- void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
- SIMD_ALIGNED(uint8 temp[128 * 2]); \
- memset(temp, 0, 128 * 2); /* for YUY2 and msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_ptr, n); \
- } \
- memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
- memcpy(temp + 128, dst_ptr + n * BPP, r * BPP); \
- ANY_SIMD(temp, temp + 128, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
- }
+#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8 temp[128 * 2]); \
+ memset(temp, 0, 128 * 2); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+ memcpy(temp + 128, dst_ptr + n * BPP, r * BPP); \
+ ANY_SIMD(temp, temp + 128, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
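// ANY11B adds one step to the ANY11 shape: the current destination tail is
// copied into temp + 128 before the remainder pass, because blend kernels
// such as ARGBCopyAlphaRow read the destination as well as the source, and
// the read-modify-write result for the last r pixels must see real dst
// bytes rather than zeros.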
#ifdef HAS_ARGBCOPYALPHAROW_AVX2
ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
@@ -505,32 +646,51 @@ ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
#undef ANY11B
// Any 1 to 1 with parameter.
-#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
- void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
- T shuffler, int width) { \
- SIMD_ALIGNED(uint8 temp[64 * 2]); \
- memset(temp, 0, 64); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_ptr, shuffler, n); \
- } \
- memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
- ANY_SIMD(temp, temp + 64, shuffler, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
- }
+#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, T shuffler, int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 2]); \
+ memset(temp, 0, 64); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, shuffler, n); \
+ } \
+ memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
+ ANY_SIMD(temp, temp + 64, shuffler, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
+ }
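// T is the type of the extra per-call parameter forwarded untouched to the
// kernel: `const uint32` packs four dither values for ARGBToRGB565DitherRow
// below, while `const uint8*` points at the 16-byte shuffle table used by
// ARGBShuffleRow.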
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
-ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2,
- const uint32, 4, 2, 3)
+ANY11P(ARGBToRGB565DitherRow_Any_SSE2,
+ ARGBToRGB565DitherRow_SSE2,
+ const uint32,
+ 4,
+ 2,
+ 3)
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
-ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2,
- const uint32, 4, 2, 7)
+ANY11P(ARGBToRGB565DitherRow_Any_AVX2,
+ ARGBToRGB565DitherRow_AVX2,
+ const uint32,
+ 4,
+ 2,
+ 7)
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
-ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON,
- const uint32, 4, 2, 7)
+ANY11P(ARGBToRGB565DitherRow_Any_NEON,
+ ARGBToRGB565DitherRow_NEON,
+ const uint32,
+ 4,
+ 2,
+ 7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ANY11P(ARGBToRGB565DitherRow_Any_MSA,
+ ARGBToRGB565DitherRow_MSA,
+ const uint32,
+ 4,
+ 2,
+ 7)
#endif
#ifdef HAS_ARGBSHUFFLEROW_SSE2
ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3)
@@ -544,23 +704,58 @@ ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15)
#ifdef HAS_ARGBSHUFFLEROW_NEON
ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
#endif
+#ifdef HAS_ARGBSHUFFLEROW_MSA
+ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8*, 4, 4, 7)
+#endif
#undef ANY11P
+// Any 1 to 1 with parameter and shorts. BPP measures in shorts.
+#define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
+ void NAMEANY(const uint16* src_ptr, uint16* dst_ptr, T shuffler, \
+ int width) { \
+ SIMD_ALIGNED(uint16 temp[32 * 2]); \
+ memset(temp, 0, 64); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, shuffler, n); \
+ } \
+ memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
+ ANY_SIMD(temp, temp + 64, shuffler, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
+ }
+
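// A unit subtlety in ANY11P16, worth noting before reusing the pattern:
// temp is uint16, so the pointer offsets (src_ptr + n * SBPP, temp + 64)
// count 16-bit elements, while the memcpy lengths (r * SBPP, r * BPP)
// count bytes. With temp declared as 32 * 2 = 64 elements, temp + 64
// appears to assume byte addressing; keep the two unit systems straight
// when adapting this wrapper.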
+#ifdef HAS_HALFFLOATROW_SSE2
+ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 1, 1, 7)
+#endif
+#ifdef HAS_HALFFLOATROW_AVX2
+ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 1, 1, 15)
+#endif
+#ifdef HAS_HALFFLOATROW_F16C
+ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 1, 1, 15)
+ANY11P16(HalfFloat1Row_Any_F16C, HalfFloat1Row_F16C, float, 1, 1, 15)
+#endif
+#ifdef HAS_HALFFLOATROW_NEON
+ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 1, 1, 7)
+ANY11P16(HalfFloat1Row_Any_NEON, HalfFloat1Row_NEON, float, 1, 1, 7)
+#endif
+#undef ANY11P16
+
// Any 1 to 1 with yuvconstants
-#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
- void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
- const struct YuvConstants* yuvconstants, int width) { \
- SIMD_ALIGNED(uint8 temp[128 * 2]); \
- memset(temp, 0, 128); /* for YUY2 and msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \
- } \
- memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
- ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
- }
+#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8 temp[128 * 2]); \
+ memset(temp, 0, 128); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+ ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
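// The YUY2/UYVY uses below show how UVSHIFT and SBPP combine for packed
// 4:2:2: with UVSHIFT 1 and SBPP 4, the source advance is (n >> 1) * 4 =
// n * 2 bytes for n pixels, i.e. two bytes per pixel, exactly the
// YUY2/UYVY macropixel density of 4 bytes per 2 pixels.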
#if defined(HAS_YUY2TOARGBROW_SSSE3)
ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
@@ -573,25 +768,28 @@ ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
#endif
+#if defined(HAS_YUY2TOARGBROW_MSA)
+ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7)
+#endif
#undef ANY11C
// Any 1 to 1 interpolate. Takes 2 rows of source via stride.
#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
- void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
- ptrdiff_t src_stride_ptr, int width, \
- int source_y_fraction) { \
- SIMD_ALIGNED(uint8 temp[64 * 3]); \
- memset(temp, 0, 64 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \
- } \
- memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
- memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \
- ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \
- memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
- }
+ void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, \
+ int width, int source_y_fraction) { \
+ SIMD_ALIGNED(uint8 temp[64 * 3]); \
+ memset(temp, 0, 64 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \
+ } \
+ memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
+ memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \
+ ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
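// ANY11T wrappers feed kernels that blend two source rows (src_ptr and
// src_ptr + src_stride_ptr) vertically by source_y_fraction / 256. A
// scalar sketch of that blend (assumed form; the real work happens in the
// per-architecture InterpolateRow kernels):
static void InterpolateRowSketch(uint8* dst, const uint8* row0,
                                 const uint8* row1, int width, int frac) {
  int i;
  for (i = 0; i < width; ++i) {
    dst[i] = (uint8)((row0[i] * (256 - frac) + row1[i] * frac) >> 8);
  }
}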
#ifdef HAS_INTERPOLATEROW_AVX2
ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
@@ -605,22 +803,25 @@ ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
#ifdef HAS_INTERPOLATEROW_DSPR2
ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3)
#endif
+#ifdef HAS_INTERPOLATEROW_MSA
+ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31)
+#endif
#undef ANY11T
// Any 1 to 1 mirror.
-#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \
- void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
- SIMD_ALIGNED(uint8 temp[64 * 2]); \
- memset(temp, 0, 64); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \
- } \
- memcpy(temp, src_ptr, r * BPP); \
- ANY_SIMD(temp, temp + 64, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \
- }
+#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 2]); \
+ memset(temp, 0, 64); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \
+ } \
+ memcpy(temp, src_ptr, r* BPP); \
+ ANY_SIMD(temp, temp + 64, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \
+ }
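// Mirroring flips the usual split: the SIMD kernel starts at
// src_ptr + r * BPP so the last n source pixels land reversed at the
// front of dst, while the first r source pixels are replayed through temp
// and copied to the destination tail from temp + 64 + (MASK + 1 - r) * BPP,
// the end of the reversed scratch block. (The `r* BPP` spelling above is a
// clang-format artifact inside the macro; it still parses as r * BPP, as
// does the similar `v* f` in row_common.cc's SHADE macros below.)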
#ifdef HAS_MIRRORROW_AVX2
ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
@@ -631,6 +832,9 @@ ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
#ifdef HAS_MIRRORROW_NEON
ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
#endif
+#ifdef HAS_MIRRORROW_MSA
+ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
+#endif
#ifdef HAS_ARGBMIRRORROW_AVX2
ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
#endif
@@ -640,20 +844,23 @@ ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
#ifdef HAS_ARGBMIRRORROW_NEON
ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3)
#endif
+#ifdef HAS_ARGBMIRRORROW_MSA
+ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
+#endif
#undef ANY11M
// Any 1 plane. (memset)
-#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \
- void NAMEANY(uint8* dst_ptr, T v32, int width) { \
- SIMD_ALIGNED(uint8 temp[64]); \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(dst_ptr, v32, n); \
- } \
- ANY_SIMD(temp, v32, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp, r * BPP); \
- }
+#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \
+ void NAMEANY(uint8* dst_ptr, T v32, int width) { \
+ SIMD_ALIGNED(uint8 temp[64]); \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(dst_ptr, v32, n); \
+ } \
+ ANY_SIMD(temp, v32, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp, r * BPP); \
+ }
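// ANY1 kernels only write, so there is no source copy and no msan memset:
// the remainder is produced by letting the kernel fill a full MASK + 1
// block of temp and copying the first r * BPP bytes to the tail of dst.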
#ifdef HAS_SETROW_X86
ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3)
@@ -664,43 +871,26 @@ ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15)
#ifdef HAS_ARGBSETROW_NEON
ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3)
#endif
+#ifdef HAS_ARGBSETROW_MSA
+ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32, 4, 3)
+#endif
#undef ANY1
// Any 1 to 2. Outputs UV planes.
-#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \
- void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\
- SIMD_ALIGNED(uint8 temp[128 * 3]); \
- memset(temp, 0, 128); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_u, dst_v, n); \
- } \
- memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
- /* repeat last 4 bytes for 422 subsampler */ \
- if ((width & 1) && BPP == 4 && DUVSHIFT == 1) { \
- memcpy(temp + SS(r, UVSHIFT) * BPP, \
- temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
- } \
- /* repeat last 4 - 12 bytes for 411 subsampler */ \
- if (((width & 3) == 1) && BPP == 4 && DUVSHIFT == 2) { \
- memcpy(temp + SS(r, UVSHIFT) * BPP, \
- temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
- memcpy(temp + SS(r, UVSHIFT) * BPP + BPP, \
- temp + SS(r, UVSHIFT) * BPP - BPP, BPP * 2); \
- } \
- if (((width & 3) == 2) && BPP == 4 && DUVSHIFT == 2) { \
- memcpy(temp + SS(r, UVSHIFT) * BPP, \
- temp + SS(r, UVSHIFT) * BPP - BPP * 2, BPP * 2); \
- } \
- if (((width & 3) == 3) && BPP == 4 && DUVSHIFT == 2) { \
- memcpy(temp + SS(r, UVSHIFT) * BPP, \
- temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
- } \
- ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
- memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \
- memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \
- }
+#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) { \
+ SIMD_ALIGNED(uint8 temp[128 * 3]); \
+ memset(temp, 0, 128); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_u, dst_v, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+ ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
+ memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \
+ memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \
+ }
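// The replication branches deleted above (repeat last 4 bytes for the 422
// subsampler, 4 - 12 bytes for the 411 subsampler) were tied to odd-width
// 4:2:2/4:1:1 handling; with ARGBToUV411Row removed in this change (see
// the ARGBToUV411Row_C deletion in row_common.cc below), the wrapper drops
// them and leans on the zero-filled scratch for the tail samples.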
#ifdef HAS_SPLITUVROW_SSE2
ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
@@ -727,37 +917,41 @@ ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
#endif
#ifdef HAS_YUY2TOUV422ROW_NEON
ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
-ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31)
ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
#endif
+#ifdef HAS_YUY2TOUV422ROW_MSA
+ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15)
+ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
+#endif
#undef ANY12
// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
// 128 byte row allows for 32 avx ARGB pixels.
-#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
- void NAMEANY(const uint8* src_ptr, int src_stride_ptr, \
- uint8* dst_u, uint8* dst_v, int width) { \
- SIMD_ALIGNED(uint8 temp[128 * 4]); \
- memset(temp, 0, 128 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \
- } \
- memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
- memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
- SS(r, UVSHIFT) * BPP); \
- if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */\
- memcpy(temp + SS(r, UVSHIFT) * BPP, \
- temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
- memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \
- temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
- } \
- ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \
- memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \
- memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \
- }
+#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, int src_stride_ptr, uint8* dst_u, \
+ uint8* dst_v, int width) { \
+ SIMD_ALIGNED(uint8 temp[128 * 4]); \
+ memset(temp, 0, 128 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+ memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
+ SS(r, UVSHIFT) * BPP); \
+ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
+ memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
+ BPP); \
+ memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \
+ temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
+ } \
+ ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \
+ memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \
+ memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \
+ }
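// ANY12S feeds kernels that average a 2x2 block per U/V output, so it
// copies the remainder from both source rows (src_ptr and src_ptr +
// src_stride_ptr) into scratch rows 128 bytes apart, and for odd widths at
// UVSHIFT == 0 replicates the last pixel of each row so the pairwise
// average stays within the copied data. The 2x2 average itself is the one
// spelled out by MAKEROWY in row_common.cc below.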
#ifdef HAS_ARGBTOUVROW_AVX2
ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
@@ -783,30 +977,57 @@ ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15)
#ifdef HAS_ARGBTOUVROW_NEON
ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
#endif
+#ifdef HAS_ARGBTOUVROW_MSA
+ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31)
+#endif
#ifdef HAS_ARGBTOUVJROW_NEON
ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
#endif
+#ifdef HAS_ARGBTOUVJROW_MSA
+ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31)
+#endif
#ifdef HAS_BGRATOUVROW_NEON
ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
#endif
+#ifdef HAS_BGRATOUVROW_MSA
+ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31)
+#endif
#ifdef HAS_ABGRTOUVROW_NEON
ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
#endif
+#ifdef HAS_ABGRTOUVROW_MSA
+ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31)
+#endif
#ifdef HAS_RGBATOUVROW_NEON
ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
#endif
+#ifdef HAS_RGBATOUVROW_MSA
+ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31)
+#endif
#ifdef HAS_RGB24TOUVROW_NEON
ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
#endif
+#ifdef HAS_RGB24TOUVROW_MSA
+ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15)
+#endif
#ifdef HAS_RAWTOUVROW_NEON
ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
#endif
+#ifdef HAS_RAWTOUVROW_MSA
+ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15)
+#endif
#ifdef HAS_RGB565TOUVROW_NEON
ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
#endif
+#ifdef HAS_RGB565TOUVROW_MSA
+ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15)
+#endif
#ifdef HAS_ARGB1555TOUVROW_NEON
ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
#endif
+#ifdef HAS_ARGB1555TOUVROW_MSA
+ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15)
+#endif
#ifdef HAS_ARGB4444TOUVROW_NEON
ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
#endif
@@ -816,6 +1037,24 @@ ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
#ifdef HAS_UYVYTOUVROW_NEON
ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
#endif
+#ifdef HAS_BGRATOUVROW_DSPR2
+ANY12S(BGRAToUVRow_Any_DSPR2, BGRAToUVRow_DSPR2, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_DSPR2
+ANY12S(ABGRToUVRow_Any_DSPR2, ABGRToUVRow_DSPR2, 0, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_DSPR2
+ANY12S(RGBAToUVRow_Any_DSPR2, RGBAToUVRow_DSPR2, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_DSPR2
+ANY12S(ARGBToUVRow_Any_DSPR2, ARGBToUVRow_DSPR2, 0, 4, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_MSA
+ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31)
+#endif
+#ifdef HAS_UYVYTOUVROW_MSA
+ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31)
+#endif
#undef ANY12S
#ifdef __cplusplus
diff --git a/files/source/row_common.cc b/files/source/row_common.cc
index 32d2f686..bf953eef 100644
--- a/files/source/row_common.cc
+++ b/files/source/row_common.cc
@@ -40,7 +40,7 @@ static __inline uint32 Abs(int32 v) {
int m = v >> 31;
return (v + m) ^ m;
}
-#else // USE_BRANCHLESS
+#else // USE_BRANCHLESS
static __inline int32 clamp0(int32 v) {
return (v < 0) ? 0 : v;
}
@@ -129,7 +129,8 @@ void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
}
}
-void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
+void ARGB1555ToARGBRow_C(const uint8* src_argb1555,
+ uint8* dst_argb,
int width) {
int x;
for (x = 0; x < width; ++x) {
@@ -146,7 +147,8 @@ void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
}
}
-void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB4444ToARGBRow_C(const uint8* src_argb4444,
+ uint8* dst_argb,
int width) {
int x;
for (x = 0; x < width; ++x) {
@@ -200,8 +202,8 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
uint8 b1 = src_argb[4] >> 3;
uint8 g1 = src_argb[5] >> 2;
uint8 r1 = src_argb[6] >> 3;
- WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
- (b1 << 16) | (g1 << 21) | (r1 << 27));
+ WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
+ (r1 << 27));
dst_rgb += 4;
src_argb += 8;
}
@@ -221,8 +223,10 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
// endian will not affect order of the original matrix. But the dither4
// will contain the first pixel in the lower byte for little endian
// or the upper byte for big endian.
-void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width) {
+void ARGBToRGB565DitherRow_C(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
int dither0 = ((const unsigned char*)(&dither4))[x & 3];
@@ -233,8 +237,8 @@ void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
- WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
- (b1 << 16) | (g1 << 21) | (r1 << 27));
+ WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
+ (r1 << 27));
dst_rgb += 4;
src_argb += 8;
}
@@ -258,9 +262,8 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
uint8 g1 = src_argb[5] >> 3;
uint8 r1 = src_argb[6] >> 3;
uint8 a1 = src_argb[7] >> 7;
- *(uint32*)(dst_rgb) =
- b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
- (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
+ *(uint32*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
+ (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
dst_rgb += 4;
src_argb += 8;
}
@@ -269,8 +272,7 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
uint8 g0 = src_argb[1] >> 3;
uint8 r0 = src_argb[2] >> 3;
uint8 a0 = src_argb[3] >> 7;
- *(uint16*)(dst_rgb) =
- b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
+ *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
}
}
@@ -285,9 +287,8 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
uint8 g1 = src_argb[5] >> 4;
uint8 r1 = src_argb[6] >> 4;
uint8 a1 = src_argb[7] >> 4;
- *(uint32*)(dst_rgb) =
- b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
- (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
+ *(uint32*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | (b1 << 16) |
+ (g1 << 20) | (r1 << 24) | (a1 << 28);
dst_rgb += 4;
src_argb += 8;
}
@@ -296,13 +297,12 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
uint8 g0 = src_argb[1] >> 4;
uint8 r0 = src_argb[2] >> 4;
uint8 a0 = src_argb[3] >> 4;
- *(uint16*)(dst_rgb) =
- b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
+ *(uint16*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
}
}
static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
- return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
+ return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
}
static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
@@ -312,41 +312,45 @@ static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
}
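// Worked check of RGBToY: the coefficients are BT.601 studio-swing scaled
// by 64, with 0x1080 folding in the +16 offset plus rounding. White
// (r = g = b = 255) gives (66 + 129 + 25) * 255 = 56100, + 0x1080 (4224)
// = 60324, >> 8 = 235, the BT.601 maximum for Y; black gives
// 0x1080 >> 8 = 16, the minimum.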
-#define MAKEROWY(NAME, R, G, B, BPP) \
-void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
- int x; \
- for (x = 0; x < width; ++x) { \
- dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
- src_argb0 += BPP; \
- dst_y += 1; \
- } \
-} \
-void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \
- uint8* dst_u, uint8* dst_v, int width) { \
- const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
- int x; \
- for (x = 0; x < width - 1; x += 2) { \
- uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] + \
- src_rgb1[B] + src_rgb1[B + BPP]) >> 2; \
- uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] + \
- src_rgb1[G] + src_rgb1[G + BPP]) >> 2; \
- uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] + \
- src_rgb1[R] + src_rgb1[R + BPP]) >> 2; \
- dst_u[0] = RGBToU(ar, ag, ab); \
- dst_v[0] = RGBToV(ar, ag, ab); \
- src_rgb0 += BPP * 2; \
- src_rgb1 += BPP * 2; \
- dst_u += 1; \
- dst_v += 1; \
- } \
- if (width & 1) { \
- uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
- uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
- uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
- dst_u[0] = RGBToU(ar, ag, ab); \
- dst_v[0] = RGBToV(ar, ag, ab); \
- } \
-}
+// ARGBToY_C and ARGBToUV_C
+#define MAKEROWY(NAME, R, G, B, BPP) \
+ void NAME##ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
+ src_rgb1[B + BPP]) >> \
+ 2; \
+ uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
+ src_rgb1[G + BPP]) >> \
+ 2; \
+ uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
+ src_rgb1[R + BPP]) >> \
+ 2; \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
+ uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
+ uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ } \
+ }
MAKEROWY(ARGB, 2, 1, 0, 4)
MAKEROWY(BGRA, 1, 2, 3, 4)
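// In each MAKEROWY invocation, R, G and B are the byte offsets of the
// channels inside a BPP-byte pixel: MAKEROWY(ARGB, 2, 1, 0, 4) reads the
// little-endian B, G, R, A memory order, while MAKEROWY(RAW, 0, 1, 2, 3)
// reads a 3-byte R, G, B layout. One invocation stamps out both
// NAMEToYRow_C and NAMEToUVRow_C for that format.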
@@ -382,7 +386,7 @@ MAKEROWY(RAW, 0, 1, 2, 3)
// r 0.50000 * 255 = 127.5 = 127
static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
- return (38 * r + 75 * g + 15 * b + 64) >> 7;
+ return (38 * r + 75 * g + 15 * b + 64) >> 7;
}
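// Worked check of RGBToYJ (JPEG full range): the coefficients sum to 128
// and are applied with >> 7, a +64 rounding term, and no +16 offset, so
// white gives 128 * 255 + 64 = 32704, >> 7 = 255 and black gives
// 64 >> 7 = 0, covering the full 0 - 255 range.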
static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
@@ -394,41 +398,42 @@ static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
#define AVGB(a, b) (((a) + (b) + 1) >> 1)
-#define MAKEROWYJ(NAME, R, G, B, BPP) \
-void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
- int x; \
- for (x = 0; x < width; ++x) { \
- dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
- src_argb0 += BPP; \
- dst_y += 1; \
- } \
-} \
-void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \
- uint8* dst_u, uint8* dst_v, int width) { \
- const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
- int x; \
- for (x = 0; x < width - 1; x += 2) { \
- uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
- AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
- uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
- AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
- uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
- AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
- dst_u[0] = RGBToUJ(ar, ag, ab); \
- dst_v[0] = RGBToVJ(ar, ag, ab); \
- src_rgb0 += BPP * 2; \
- src_rgb1 += BPP * 2; \
- dst_u += 1; \
- dst_v += 1; \
- } \
- if (width & 1) { \
- uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]); \
- uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]); \
- uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]); \
- dst_u[0] = RGBToUJ(ar, ag, ab); \
- dst_v[0] = RGBToVJ(ar, ag, ab); \
- } \
-}
+// ARGBToYJ_C and ARGBToUVJ_C
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+ void NAME##ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
+ AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
+ uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
+ AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
+ uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
+ AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
+ dst_u[0] = RGBToUJ(ar, ag, ab); \
+ dst_v[0] = RGBToVJ(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]); \
+ uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]); \
+ uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]); \
+ dst_u[0] = RGBToUJ(ar, ag, ab); \
+ dst_v[0] = RGBToVJ(ar, ag, ab); \
+ } \
+ }
MAKEROWYJ(ARGB, 2, 1, 0, 4)
#undef MAKEROWYJ
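// Note the rounding difference from MAKEROWY: the J variants build the 2x2
// chroma average from nested AVGB calls, each of which rounds via
// (a + b + 1) >> 1, whereas MAKEROWY's ToUVRow truncates the 4-pixel sum
// with >> 2.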
@@ -478,8 +483,11 @@ void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
}
}
-void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
- uint8* dst_u, uint8* dst_v, int width) {
+void RGB565ToUVRow_C(const uint8* src_rgb565,
+ int src_stride_rgb565,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
int x;
for (x = 0; x < width - 1; x += 2) {
@@ -525,8 +533,11 @@ void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
}
}
-void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
- uint8* dst_u, uint8* dst_v, int width) {
+void ARGB1555ToUVRow_C(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
int x;
for (x = 0; x < width - 1; x += 2) {
@@ -573,8 +584,11 @@ void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
}
}
-void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
- uint8* dst_u, uint8* dst_v, int width) {
+void ARGB4444ToUVRow_C(const uint8* src_argb4444,
+ int src_stride_argb4444,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
int x;
for (x = 0; x < width - 1; x += 2) {
@@ -622,7 +636,9 @@ void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
}
void ARGBToUV444Row_C(const uint8* src_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
int x;
for (x = 0; x < width; ++x) {
uint8 ab = src_argb[0];
@@ -636,41 +652,6 @@ void ARGBToUV444Row_C(const uint8* src_argb,
}
}
-void ARGBToUV411Row_C(const uint8* src_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- int x;
- for (x = 0; x < width - 3; x += 4) {
- uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
- uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
- uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
- dst_u[0] = RGBToU(ar, ag, ab);
- dst_v[0] = RGBToV(ar, ag, ab);
- src_argb += 16;
- dst_u += 1;
- dst_v += 1;
- }
- // Odd width handling mimics 'any' function which replicates last pixel.
- if ((width & 3) == 3) {
- uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[8]) >> 2;
- uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[9]) >> 2;
- uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[10]) >> 2;
- dst_u[0] = RGBToU(ar, ag, ab);
- dst_v[0] = RGBToV(ar, ag, ab);
- } else if ((width & 3) == 2) {
- uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
- uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
- uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
- dst_u[0] = RGBToU(ar, ag, ab);
- dst_v[0] = RGBToV(ar, ag, ab);
- } else if ((width & 3) == 1) {
- uint8 ab = src_argb[0];
- uint8 ag = src_argb[1];
- uint8 ar = src_argb[2];
- dst_u[0] = RGBToU(ar, ag, ab);
- dst_v[0] = RGBToV(ar, ag, ab);
- }
-}
-
void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
int x;
for (x = 0; x < width; ++x) {
@@ -702,22 +683,28 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) {
// Apply color matrix to a row of image. Matrix is signed.
// TODO(fbarchard): Consider adding rounding (+32).
-void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
- const int8* matrix_argb, int width) {
+void ARGBColorMatrixRow_C(const uint8* src_argb,
+ uint8* dst_argb,
+ const int8* matrix_argb,
+ int width) {
int x;
for (x = 0; x < width; ++x) {
int b = src_argb[0];
int g = src_argb[1];
int r = src_argb[2];
int a = src_argb[3];
- int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
- r * matrix_argb[2] + a * matrix_argb[3]) >> 6;
- int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
- r * matrix_argb[6] + a * matrix_argb[7]) >> 6;
- int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
- r * matrix_argb[10] + a * matrix_argb[11]) >> 6;
- int sa = (b * matrix_argb[12] + g * matrix_argb[13] +
- r * matrix_argb[14] + a * matrix_argb[15]) >> 6;
+ int sb = (b * matrix_argb[0] + g * matrix_argb[1] + r * matrix_argb[2] +
+ a * matrix_argb[3]) >>
+ 6;
+ int sg = (b * matrix_argb[4] + g * matrix_argb[5] + r * matrix_argb[6] +
+ a * matrix_argb[7]) >>
+ 6;
+ int sr = (b * matrix_argb[8] + g * matrix_argb[9] + r * matrix_argb[10] +
+ a * matrix_argb[11]) >>
+ 6;
+ int sa = (b * matrix_argb[12] + g * matrix_argb[13] + r * matrix_argb[14] +
+ a * matrix_argb[15]) >>
+ 6;
dst_argb[0] = Clamp(sb);
dst_argb[1] = Clamp(sg);
dst_argb[2] = Clamp(sr);
@@ -757,8 +744,11 @@ void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
}
}
-void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
- int interval_offset, int width) {
+void ARGBQuantizeRow_C(uint8* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
int x;
for (x = 0; x < width; ++x) {
int b = dst_argb[0];
@@ -772,9 +762,11 @@ void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
}
#define REPEAT8(v) (v) | ((v) << 8)
-#define SHADE(f, v) v * f >> 24
+#define SHADE(f, v) v* f >> 24
-void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+void ARGBShadeRow_C(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
uint32 value) {
const uint32 b_scale = REPEAT8(value & 0xff);
const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
@@ -799,10 +791,12 @@ void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
#undef SHADE
#define REPEAT8(v) (v) | ((v) << 8)
-#define SHADE(f, v) v * f >> 16
+#define SHADE(f, v) v* f >> 16
-void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+void ARGBMultiplyRow_C(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
int i;
for (i = 0; i < width; ++i) {
const uint32 b = REPEAT8(src_argb0[0]);
@@ -827,8 +821,10 @@ void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
#define SHADE(f, v) clamp255(v + f)
-void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+void ARGBAddRow_C(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
int i;
for (i = 0; i < width; ++i) {
const int b = src_argb0[0];
@@ -852,8 +848,10 @@ void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
#define SHADE(f, v) clamp0(f - v)
-void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+void ARGBSubtractRow_C(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
int i;
for (i = 0; i < width; ++i) {
const int b = src_argb0[0];
@@ -876,8 +874,11 @@ void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
#undef SHADE
// Sobel functions which mimic SSSE3.
-void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
- uint8* dst_sobelx, int width) {
+void SobelXRow_C(const uint8* src_y0,
+ const uint8* src_y1,
+ const uint8* src_y2,
+ uint8* dst_sobelx,
+ int width) {
int i;
for (i = 0; i < width; ++i) {
int a = src_y0[i];
@@ -894,8 +895,10 @@ void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
}
}
-void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
- uint8* dst_sobely, int width) {
+void SobelYRow_C(const uint8* src_y0,
+ const uint8* src_y1,
+ uint8* dst_sobely,
+ int width) {
int i;
for (i = 0; i < width; ++i) {
int a = src_y0[i + 0];
@@ -912,8 +915,10 @@ void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
}
}
-void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
+void SobelRow_C(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
int i;
for (i = 0; i < width; ++i) {
int r = src_sobelx[i];
@@ -927,8 +932,10 @@ void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
}
}
-void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_y, int width) {
+void SobelToPlaneRow_C(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width) {
int i;
for (i = 0; i < width; ++i) {
int r = src_sobelx[i];
@@ -938,8 +945,10 @@ void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
}
}
-void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
+void SobelXYRow_C(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
int i;
for (i = 0; i < width; ++i) {
int r = src_sobelx[i];
@@ -974,75 +983,69 @@ void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
// B = (Y - 16) * 1.164 - U * -2.018
// Y contribution to R,G,B. Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
// U and V contributions to R,G,B.
#define UB -128 /* max(-128, round(-2.018 * 64)) */
-#define UG 25 /* round(0.391 * 64) */
-#define VG 52 /* round(0.813 * 64) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
#define VR -102 /* round(-1.596 * 64) */
// Bias values to subtract 16 from Y and 128 from U and V.
-#define BB (UB * 128 + YGB)
+#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
-
-#if defined(__aarch64__)
-const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
- { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
- { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
- { UG, VG, UG, VG, UG, VG, UG, VG },
- { UG, VG, UG, VG, UG, VG, UG, VG },
- { BB, BG, BR, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
- { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
- { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
- { VG, UG, VG, UG, VG, UG, VG, UG },
- { VG, UG, VG, UG, VG, UG, VG, UG },
- { BR, BG, BB, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
-#elif defined(__arm__)
-const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
- { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
- { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
- { BB, BG, BR, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
- { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
- { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
- { BR, BG, BB, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
+#define BR (VR * 128 + YGB)
+
+#if defined(__aarch64__) // 64 bit arm
+const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+ {UG, VG, UG, VG, UG, VG, UG, VG},
+ {UG, VG, UG, VG, UG, VG, UG, VG},
+ {BB, BG, BR, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+ {VG, UG, VG, UG, VG, UG, VG, UG},
+ {VG, UG, VG, UG, VG, UG, VG, UG},
+ {BR, BG, BB, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
+#elif defined(__arm__) // 32 bit arm
+const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+ {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
+ {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
+ {BB, BG, BR, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
+ {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
+ {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
+ {BR, BG, BB, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
#else
-const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
- { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
- UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
- { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
- { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
- 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
- { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
- { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
- { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
- { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
-const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
- { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
- VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
- { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
- { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
- 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
- { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
- { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
- { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
- { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
+const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+ {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
+ {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
+ {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
+ {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
+ {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
+ {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
#endif
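// The three YuvConstants layouts mirror how each SIMD flavor loads them:
// AArch64 and 32-bit ARM keep short vectors of negated UB/VR and UG/VG
// pairs sized for NEON registers, while the generic (x86) branch stores
// 32-byte rows with coefficients pre-interleaved for the SSSE3/AVX2
// multiply-add kernels. The kYvu* tables just swap the U and V columns so
// the same kernels serve NV21/YV12 ordering. Sanity check on the scale:
// YG = 18997 is round(1.164 * 64 * 65536 / 257) = round(18996.8).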
#undef BB
@@ -1062,74 +1065,68 @@ const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
// Y contribution to R,G,B. Scale and bias.
#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-#define YGB 32 /* 64 / 2 */
+#define YGB 32 /* 64 / 2 */
// U and V contributions to R,G,B.
#define UB -113 /* round(-1.77200 * 64) */
-#define UG 22 /* round(0.34414 * 64) */
-#define VG 46 /* round(0.71414 * 64) */
-#define VR -90 /* round(-1.40200 * 64) */
+#define UG 22 /* round(0.34414 * 64) */
+#define VG 46 /* round(0.71414 * 64) */
+#define VR -90 /* round(-1.40200 * 64) */
// Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128 + YGB)
+#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
+#define BR (VR * 128 + YGB)
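A quick numeric check of these full-range constants (a standalone sketch, not part of the CL): BB folds the -128 chroma bias and the rounding term into a single add, so the 6-bit fixed-point path reproduces B = Y + 1.772 * (U - 128) to within truncation.

#include <stdio.h>

int main(void) {
  const int YG = 16320, YGB = 32, UB = -113;  /* constants as defined above */
  const int BB = UB * 128 + YGB;              /* -14432: chroma bias folded in */
  int y = 128, u = 180;
  int y1 = (int)(((unsigned)(y * 0x0101) * YG) >> 16);  /* ~= y * 64 */
  int b = (-(u * UB) + y1 + BB) >> 6;         /* prints 220 vs 220.14 */
  printf("fixed B = %d, float B = %.2f\n", b, y + 1.772 * (u - 128.0));
  return 0;
}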
#if defined(__aarch64__)
-const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
- { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
- { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
- { UG, VG, UG, VG, UG, VG, UG, VG },
- { UG, VG, UG, VG, UG, VG, UG, VG },
- { BB, BG, BR, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
- { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
- { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
- { VG, UG, VG, UG, VG, UG, VG, UG },
- { VG, UG, VG, UG, VG, UG, VG, UG },
- { BR, BG, BB, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
+const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+ {UG, VG, UG, VG, UG, VG, UG, VG},
+ {UG, VG, UG, VG, UG, VG, UG, VG},
+ {BB, BG, BR, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+ {VG, UG, VG, UG, VG, UG, VG, UG},
+ {VG, UG, VG, UG, VG, UG, VG, UG},
+ {BR, BG, BB, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
#elif defined(__arm__)
-const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
- { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
- { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
- { BB, BG, BR, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
- { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
- { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
- { BR, BG, BB, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
+const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
+ {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
+ {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
+ {BB, BG, BR, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
+ {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
+ {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
+ {BR, BG, BB, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
#else
-const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
- { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
- UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
- { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
- { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
- 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
- { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
- { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
- { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
- { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
-const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
- { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
- VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
- { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
- { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
- 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
- { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
- { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
- { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
- { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
+const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
+ {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
+ {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
+ {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
+ {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
+ {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
+ {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
#endif
#undef BB
@@ -1143,81 +1140,76 @@ const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
#undef YG
// BT.709 YUV to RGB reference
-// * R = Y - V * -1.28033
-// * G = Y - U * 0.21482 - V * 0.38059
-// * B = Y - U * -2.12798
+// R = (Y - 16) * 1.164 - V * -1.793
+// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533
+// B = (Y - 16) * 1.164 - U * -2.112
+// See also http://www.equasys.de/colorconversion.html
// Y contribution to R,G,B. Scale and bias.
-#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-#define YGB 32 /* 64 / 2 */
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
-// TODO(fbarchard): Find way to express 2.12 instead of 2.0.
+// TODO(fbarchard): Find way to express 2.112 instead of 2.0.
// U and V contributions to R,G,B.
-#define UB -128 /* max(-128, round(-2.12798 * 64)) */
-#define UG 14 /* round(0.21482 * 64) */
-#define VG 24 /* round(0.38059 * 64) */
-#define VR -82 /* round(-1.28033 * 64) */
+#define UB -128 /* max(-128, round(-2.112 * 64)) */
+#define UG 14 /* round(0.213 * 64) */
+#define VG 34 /* round(0.533 * 64) */
+#define VR -115 /* round(-1.793 * 64) */
// Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128 + YGB)
+#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
+#define BR (VR * 128 + YGB)
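For reference, the arithmetic behind these: VR = round(-1.793 * 64) = round(-114.75) = -115, VG = round(0.533 * 64) = round(34.1) = 34, and YGB = 1.164 * 64 * -16 + 32 = -1159.9, i.e. -1160, which folds the -16 luma offset into the rounding bias so no separate subtract is needed per pixel.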
#if defined(__aarch64__)
-const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
- { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
- { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
- { UG, VG, UG, VG, UG, VG, UG, VG },
- { UG, VG, UG, VG, UG, VG, UG, VG },
- { BB, BG, BR, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
- { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
- { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
- { VG, UG, VG, UG, VG, UG, VG, UG },
- { VG, UG, VG, UG, VG, UG, VG, UG },
- { BR, BG, BB, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
+const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+ {UG, VG, UG, VG, UG, VG, UG, VG},
+ {UG, VG, UG, VG, UG, VG, UG, VG},
+ {BB, BG, BR, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+ {VG, UG, VG, UG, VG, UG, VG, UG},
+ {VG, UG, VG, UG, VG, UG, VG, UG},
+ {BR, BG, BB, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
#elif defined(__arm__)
-const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
- { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
- { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
- { BB, BG, BR, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
- { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
- { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
- { BR, BG, BB, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
+const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
+ {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
+ {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
+ {BB, BG, BR, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
+ {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
+ {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
+ {BR, BG, BB, 0, 0, 0, 0, 0},
+ {0x0101 * YG, 0, 0, 0}};
#else
-const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
- { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
- UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
- { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
- { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
- 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
- { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
- { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
- { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
- { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
-const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
- { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
- VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
- { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
- { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
- 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
- { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
- { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
- { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
- { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
+const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
+ {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
+ {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
+ {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
+ {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
+ {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
+ {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
#endif
#undef BB
@@ -1231,8 +1223,12 @@ const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
#undef YG
// C reference code that mimics the YUV assembly.
-static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
- uint8* b, uint8* g, uint8* r,
+static __inline void YuvPixel(uint8 y,
+ uint8 u,
+ uint8 v,
+ uint8* b,
+ uint8* g,
+ uint8* r,
const struct YuvConstants* yuvconstants) {
#if defined(__aarch64__)
int ub = -yuvconstants->kUVToRB[0];
@@ -1264,13 +1260,13 @@ static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
#endif
uint32 y1 = (uint32)(y * 0x0101 * yg) >> 16;
- *b = Clamp((int32)(-(u * ub ) + y1 + bb) >> 6);
+ *b = Clamp((int32)(-(u * ub) + y1 + bb) >> 6);
*g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6);
- *r = Clamp((int32)(-( v * vr) + y1 + br) >> 6);
+ *r = Clamp((int32)(-(v * vr) + y1 + br) >> 6);
}
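The 0x0101 multiply replicates the 8-bit luma into both bytes of a 16-bit value (y * 257), cancelling the / 257 baked into YG, so y1 comes out near y * coefficient * 64. Worked example with the limited-range YG = 18997 defined just below and y = 235: (235 * 257 * 18997) >> 16 = 17506, against the float value 235 * 1.164 * 64 = 17506.6.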
// Y contribution to R,G,B. Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
// C reference code that mimics the YUV assembly.
@@ -1310,8 +1306,8 @@ void I444ToARGBRow_C(const uint8* src_y,
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
}
}
@@ -1324,8 +1320,8 @@ void I444ToARGBRow_C(const uint8* src_y,
int width) {
int x;
for (x = 0; x < width; ++x) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
src_y += 1;
src_u += 1;
@@ -1344,11 +1340,11 @@ void I422ToARGBRow_C(const uint8* src_y,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
- YuvPixel(src_y[1], src_u[0], src_v[0],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+ YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255;
src_y += 2;
src_u += 1;
@@ -1356,8 +1352,8 @@ void I422ToARGBRow_C(const uint8* src_y,
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
}
}
@@ -1371,11 +1367,11 @@ void I422AlphaToARGBRow_C(const uint8* src_y,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = src_a[0];
- YuvPixel(src_y[1], src_u[0], src_v[0],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+ YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
rgb_buf[7] = src_a[1];
src_y += 2;
src_u += 1;
@@ -1384,8 +1380,8 @@ void I422AlphaToARGBRow_C(const uint8* src_y,
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = src_a[0];
}
}
@@ -1398,18 +1394,18 @@ void I422ToRGB24Row_C(const uint8* src_y,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
- YuvPixel(src_y[1], src_u[0], src_v[0],
- rgb_buf + 3, rgb_buf + 4, rgb_buf + 5, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4,
+ rgb_buf + 5, yuvconstants);
src_y += 2;
src_u += 1;
src_v += 1;
rgb_buf += 6; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
}
}
@@ -1435,8 +1431,8 @@ void I422ToARGB4444Row_C(const uint8* src_y,
b1 = b1 >> 4;
g1 = g1 >> 4;
r1 = r1 >> 4;
- *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
- (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000;
+ *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) |
+ (g1 << 20) | (r1 << 24) | 0xf000f000;
src_y += 2;
src_u += 1;
src_v += 1;
@@ -1447,8 +1443,7 @@ void I422ToARGB4444Row_C(const uint8* src_y,
b0 = b0 >> 4;
g0 = g0 >> 4;
r0 = r0 >> 4;
- *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
- 0xf000;
+ *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000;
}
}
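Worked example of the 4444 packing above: a pixel with b = 255, g = 128, r = 0 becomes b0 = 15, g0 = 8, r0 = 0 after the >> 4, so the stored halfword is 0xf | (8 << 4) | (0 << 8) | 0xf000 = 0xf08f, the alpha nibble being forced to full. The 1555 and 565 variants below are the same idea with 5- and 6-bit fields.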
@@ -1474,8 +1469,8 @@ void I422ToARGB1555Row_C(const uint8* src_y,
b1 = b1 >> 3;
g1 = g1 >> 3;
r1 = r1 >> 3;
- *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
- (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000;
+ *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) |
+ (g1 << 21) | (r1 << 26) | 0x80008000;
src_y += 2;
src_u += 1;
src_v += 1;
@@ -1486,8 +1481,7 @@ void I422ToARGB1555Row_C(const uint8* src_y,
b0 = b0 >> 3;
g0 = g0 >> 3;
r0 = r0 >> 3;
- *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
- 0x8000;
+ *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000;
}
}
@@ -1513,8 +1507,8 @@ void I422ToRGB565Row_C(const uint8* src_y,
b1 = b1 >> 3;
g1 = g1 >> 2;
r1 = r1 >> 3;
- *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
- (b1 << 16) | (g1 << 21) | (r1 << 27);
+ *(uint32*)(dst_rgb565) =
+ b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27);
src_y += 2;
src_u += 1;
src_v += 1;
@@ -1529,48 +1523,6 @@ void I422ToRGB565Row_C(const uint8* src_y,
}
}
-void I411ToARGBRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- int x;
- for (x = 0; x < width - 3; x += 4) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
- rgb_buf[3] = 255;
- YuvPixel(src_y[1], src_u[0], src_v[0],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
- rgb_buf[7] = 255;
- YuvPixel(src_y[2], src_u[0], src_v[0],
- rgb_buf + 8, rgb_buf + 9, rgb_buf + 10, yuvconstants);
- rgb_buf[11] = 255;
- YuvPixel(src_y[3], src_u[0], src_v[0],
- rgb_buf + 12, rgb_buf + 13, rgb_buf + 14, yuvconstants);
- rgb_buf[15] = 255;
- src_y += 4;
- src_u += 1;
- src_v += 1;
- rgb_buf += 16; // Advance 4 pixels.
- }
- if (width & 2) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
- rgb_buf[3] = 255;
- YuvPixel(src_y[1], src_u[0], src_v[0],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
- rgb_buf[7] = 255;
- src_y += 2;
- rgb_buf += 8; // Advance 2 pixels.
- }
- if (width & 1) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
- rgb_buf[3] = 255;
- }
-}
-
void NV12ToARGBRow_C(const uint8* src_y,
const uint8* src_uv,
uint8* rgb_buf,
@@ -1578,19 +1530,19 @@ void NV12ToARGBRow_C(const uint8* src_y,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], src_uv[0], src_uv[1],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
- YuvPixel(src_y[1], src_uv[0], src_uv[1],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+ YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255;
src_y += 2;
src_uv += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(src_y[0], src_uv[0], src_uv[1],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
}
}
@@ -1602,19 +1554,19 @@ void NV21ToARGBRow_C(const uint8* src_y,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], src_vu[1], src_vu[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
- YuvPixel(src_y[1], src_vu[1], src_vu[0],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+ YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255;
src_y += 2;
src_vu += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(src_y[0], src_vu[1], src_vu[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
}
}
@@ -1640,8 +1592,8 @@ void NV12ToRGB565Row_C(const uint8* src_y,
b1 = b1 >> 3;
g1 = g1 >> 2;
r1 = r1 >> 3;
- *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
- (b1 << 16) | (g1 << 21) | (r1 << 27);
+ *(uint32*)(dst_rgb565) =
+ b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27);
src_y += 2;
src_uv += 2;
dst_rgb565 += 4; // Advance 2 pixels.
@@ -1661,18 +1613,18 @@ void YUY2ToARGBRow_C(const uint8* src_yuy2,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
- YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+ YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255;
src_yuy2 += 4;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
}
}
@@ -1683,18 +1635,18 @@ void UYVYToARGBRow_C(const uint8* src_uyvy,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
- YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+ YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255;
src_uyvy += 4;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
}
}
@@ -1707,11 +1659,11 @@ void I422ToRGBARow_C(const uint8* src_y,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2,
+ rgb_buf + 3, yuvconstants);
rgb_buf[0] = 255;
- YuvPixel(src_y[1], src_u[0], src_v[0],
- rgb_buf + 5, rgb_buf + 6, rgb_buf + 7, yuvconstants);
+ YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6,
+ rgb_buf + 7, yuvconstants);
rgb_buf[4] = 255;
src_y += 2;
src_u += 1;
@@ -1719,8 +1671,8 @@ void I422ToRGBARow_C(const uint8* src_y,
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2,
+ rgb_buf + 3, yuvconstants);
rgb_buf[0] = 255;
}
}
@@ -1800,7 +1752,9 @@ void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
}
}
-void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_C(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
@@ -1837,8 +1791,11 @@ void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
}
// Filter 2 rows of YUY2 UV's (422) into U and V (420).
-void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+void YUY2ToUVRow_C(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
// Output a row of UV values, filtering 2 rows of YUY2.
int x;
for (x = 0; x < width; x += 2) {
@@ -1852,7 +1809,9 @@ void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
// Copy row of YUY2 UV's (422) into U and V (422).
void YUY2ToUV422Row_C(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
// Output a row of UV values.
int x;
for (x = 0; x < width; x += 2) {
@@ -1879,8 +1838,11 @@ void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
}
// Filter 2 rows of UYVY UV's (422) into U and V (420).
-void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+void UYVYToUVRow_C(const uint8* src_uyvy,
+ int src_stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
// Output a row of UV values.
int x;
for (x = 0; x < width; x += 2) {
@@ -1894,7 +1856,9 @@ void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
// Copy row of UYVY UV's (422) into U and V (422).
void UYVYToUV422Row_C(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
// Output a row of UV values.
int x;
for (x = 0; x < width; x += 2) {
@@ -1925,8 +1889,10 @@ void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
// Blend src_argb0 over src_argb1 and store to dst_argb.
// dst_argb may be src_argb0 or src_argb1.
// This code mimics the SSSE3 version for better testability.
-void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+void ARGBBlendRow_C(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
uint32 fb = src_argb0[0];
@@ -1973,9 +1939,12 @@ void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
}
#undef BLEND
-#define UBLEND(f, b, a) (((a) * f) + ((255 - a) * b) + 255) >> 8
-void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width) {
+#define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8
+void BlendPlaneRow_C(const uint8* src0,
+ const uint8* src1,
+ const uint8* alpha,
+ uint8* dst,
+ int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
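Worked example of the UBLEND macro above: with a = 255 and f = 200, (255 * 200 + 0 * b + 255) >> 8 = 51255 >> 8 = 200, and with a = 0 it returns b exactly; the + 255 biases the truncation so both alpha endpoints are lossless.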
@@ -2039,38 +2008,43 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
#define T(a) 0x01000000 + (0x10000 / a)
const uint32 fixed_invtbl8[256] = {
- 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
- T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
- T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
- T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
- T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
- T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
- T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
- T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
- T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
- T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
- T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
- T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
- T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
- T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
- T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
- T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
- T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
- T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
- T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
- T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
- T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
- T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
- T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
- T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
- T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
- T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
- T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
- T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
- T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
- T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
- T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
- T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
+ 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06),
+ T(0x07), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d),
+ T(0x0e), T(0x0f), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14),
+ T(0x15), T(0x16), T(0x17), T(0x18), T(0x19), T(0x1a), T(0x1b),
+ T(0x1c), T(0x1d), T(0x1e), T(0x1f), T(0x20), T(0x21), T(0x22),
+ T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), T(0x28), T(0x29),
+ T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), T(0x30),
+ T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
+ T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e),
+ T(0x3f), T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45),
+ T(0x46), T(0x47), T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c),
+ T(0x4d), T(0x4e), T(0x4f), T(0x50), T(0x51), T(0x52), T(0x53),
+ T(0x54), T(0x55), T(0x56), T(0x57), T(0x58), T(0x59), T(0x5a),
+ T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), T(0x60), T(0x61),
+ T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), T(0x68),
+ T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
+ T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76),
+ T(0x77), T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d),
+ T(0x7e), T(0x7f), T(0x80), T(0x81), T(0x82), T(0x83), T(0x84),
+ T(0x85), T(0x86), T(0x87), T(0x88), T(0x89), T(0x8a), T(0x8b),
+ T(0x8c), T(0x8d), T(0x8e), T(0x8f), T(0x90), T(0x91), T(0x92),
+ T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), T(0x98), T(0x99),
+ T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), T(0xa0),
+ T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
+ T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae),
+ T(0xaf), T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5),
+ T(0xb6), T(0xb7), T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc),
+ T(0xbd), T(0xbe), T(0xbf), T(0xc0), T(0xc1), T(0xc2), T(0xc3),
+ T(0xc4), T(0xc5), T(0xc6), T(0xc7), T(0xc8), T(0xc9), T(0xca),
+ T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), T(0xd0), T(0xd1),
+ T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), T(0xd8),
+ T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
+ T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6),
+ T(0xe7), T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed),
+ T(0xee), T(0xef), T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4),
+ T(0xf5), T(0xf6), T(0xf7), T(0xf8), T(0xf9), T(0xfa), T(0xfb),
+ T(0xfc), T(0xfd), T(0xfe), 0x01000100};
#undef T
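Each entry packs two 8.8 fixed-point shorts: 0x0100 (1.0) on top and 0x10000 / a, i.e. 256/a in 8.8, below, which is what lets ARGBUnattenuateRow_C replace the per-channel divide by alpha with a multiply. A small standalone check:

#include <stdio.h>

int main(void) {
  unsigned a = 2;
  unsigned t = 0x01000000u + (0x10000u / a);  /* same formula as T(a) */
  printf("hi = %.3f  lo = %.3f\n",            /* 1.000 and 128.000 */
         (t >> 16) / 256.0, (t & 0xffffu) / 256.0);
  return 0;
}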
void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
@@ -2094,8 +2068,10 @@ void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
}
}
-void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
- const int32* previous_cumsum, int width) {
+void ComputeCumulativeSumRow_C(const uint8* row,
+ int32* cumsum,
+ const int32* previous_cumsum,
+ int width) {
int32 row_sum[4] = {0, 0, 0, 0};
int x;
for (x = 0; x < width; ++x) {
@@ -2103,15 +2079,19 @@ void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
row_sum[1] += row[x * 4 + 1];
row_sum[2] += row[x * 4 + 2];
row_sum[3] += row[x * 4 + 3];
- cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0];
- cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1];
- cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2];
- cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3];
+ cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0];
+ cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1];
+ cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2];
+ cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3];
}
}
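Together with CumulativeSumToAverageRow_C below, this is the classic summed-area table: cumsum[x] becomes the sum of all pixels above and to the left, so the average over any box can be read back from just the top-left (tl) and bottom-left (bl) row pointers with two adds and two subtracts, scaled by ooa = 1 / area instead of a per-pixel divide.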
-void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
- int w, int area, uint8* dst, int count) {
+void CumulativeSumToAverageRow_C(const int32* tl,
+ const int32* bl,
+ int w,
+ int area,
+ uint8* dst,
+ int count) {
float ooa = 1.0f / area;
int i;
for (i = 0; i < count; ++i) {
@@ -2127,8 +2107,11 @@ void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
// Copy pixels from rotated source to destination row with a slope.
LIBYUV_API
-void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
- uint8* dst_argb, const float* uv_dudv, int width) {
+void ARGBAffineRow_C(const uint8* src_argb,
+ int src_argb_stride,
+ uint8* dst_argb,
+ const float* uv_dudv,
+ int width) {
int i;
// Render a row of pixels from source into a buffer.
float uv[2];
@@ -2138,8 +2121,7 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
int x = (int)(uv[0]);
int y = (int)(uv[1]);
*(uint32*)(dst_argb) =
- *(const uint32*)(src_argb + y * src_argb_stride +
- x * 4);
+ *(const uint32*)(src_argb + y * src_argb_stride + x * 4);
dst_argb += 4;
uv[0] += uv_dudv[2];
uv[1] += uv_dudv[3];
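In other words, uv_dudv presumably carries {u0, v0, du, dv}: destination pixel i samples the source at (u, v) = (u0 + i * du, v0 + i * dv), truncated to integers, so one row of a rotated or scaled image becomes a simple strided gather.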
@@ -2147,16 +2129,20 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
}
// Blend 2 rows into 1.
-static void HalfRow_C(const uint8* src_uv, ptrdiff_t src_uv_stride,
- uint8* dst_uv, int width) {
+static void HalfRow_C(const uint8* src_uv,
+ ptrdiff_t src_uv_stride,
+ uint8* dst_uv,
+ int width) {
int x;
for (x = 0; x < width; ++x) {
dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
}
}
-static void HalfRow_16_C(const uint16* src_uv, ptrdiff_t src_uv_stride,
- uint16* dst_uv, int width) {
+static void HalfRow_16_C(const uint16* src_uv,
+ ptrdiff_t src_uv_stride,
+ uint16* dst_uv,
+ int width) {
int x;
for (x = 0; x < width; ++x) {
dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
@@ -2164,10 +2150,12 @@ static void HalfRow_16_C(const uint16* src_uv, ptrdiff_t src_uv_stride,
}
// C version 2x2 -> 2x1.
-void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+void InterpolateRow_C(uint8* dst_ptr,
+ const uint8* src_ptr,
ptrdiff_t src_stride,
- int width, int source_y_fraction) {
- int y1_fraction = source_y_fraction ;
+ int width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
const uint8* src_ptr1 = src_ptr + src_stride;
int x;
@@ -2194,9 +2182,11 @@ void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
}
}
-void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
+void InterpolateRow_16_C(uint16* dst_ptr,
+ const uint16* src_ptr,
ptrdiff_t src_stride,
- int width, int source_y_fraction) {
+ int width,
+ int source_y_fraction) {
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
const uint16* src_ptr1 = src_ptr + src_stride;
@@ -2222,8 +2212,10 @@ void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
}
// Use first 4 shuffler values to reorder ARGB channels.
-void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width) {
+void ARGBShuffleRow_C(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width) {
int index0 = shuffler[0];
int index1 = shuffler[1];
int index2 = shuffler[2];
@@ -2248,7 +2240,8 @@ void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
void I422ToYUY2Row_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* dst_frame, int width) {
+ uint8* dst_frame,
+ int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
dst_frame[0] = src_y[0];
@@ -2271,7 +2264,8 @@ void I422ToYUY2Row_C(const uint8* src_y,
void I422ToUYVYRow_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* dst_frame, int width) {
+ uint8* dst_frame,
+ int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
dst_frame[0] = src_u[0];
@@ -2291,7 +2285,6 @@ void I422ToUYVYRow_C(const uint8* src_y,
}
}
-
void ARGBPolynomialRow_C(const uint8* src_argb,
uint8* dst_argb,
const float* poly,
@@ -2332,8 +2325,30 @@ void ARGBPolynomialRow_C(const uint8* src_argb,
}
}
-void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
- const uint8* luma, uint32 lumacoeff) {
+// Samples assumed to be unsigned in low 9, 10 or 12 bits. The scale factor
+// adjusts the source integer range to the desired half-float range.
+
+// This magic constant is 2^-112. Multiplying by this
+// is the same as subtracting 112 from the exponent, which
+// is the difference in exponent bias between 32-bit and
+// 16-bit floats. Once we've done this subtraction, we can
+// simply extract the low bits of the exponent and the high
+// bits of the mantissa from our float and we're done.
+
+void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width) {
+ int i;
+ float mult = 1.9259299444e-34f * scale;
+ for (i = 0; i < width; ++i) {
+ float value = src[i] * mult;
+ dst[i] = (uint16)((*(uint32_t*)&value) >> 13);
+ }
+}
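A concrete check of the trick (standalone sketch): 1.9259299444e-34f is exactly 2^-112 as a float, so with scale = 1.0f an input of 1 becomes the float 2^-112, whose bit pattern is (127 - 112) << 23 = 0x07800000; shifting right by 13 leaves 0x3c00, the half-float encoding of 1.0.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  float value = 1 * (1.9259299444e-34f * 1.0f);  /* == 2^-112 exactly */
  uint32_t bits;
  memcpy(&bits, &value, sizeof(bits));           /* well-defined type pun */
  printf("0x%04x\n", (unsigned)(bits >> 13));    /* prints 0x3c00 */
  return 0;
}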
+
+void ARGBLumaColorTableRow_C(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
+ const uint8* luma,
+ uint32 lumacoeff) {
uint32 bc = lumacoeff & 0xff;
uint32 gc = (lumacoeff >> 8) & 0xff;
uint32 rc = (lumacoeff >> 16) & 0xff;
@@ -2341,15 +2356,17 @@ void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
int i;
for (i = 0; i < width - 1; i += 2) {
// Luminance in rows, color values in columns.
- const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
- src_argb[2] * rc) & 0x7F00u) + luma;
+ const uint8* luma0 =
+ ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) +
+ luma;
const uint8* luma1;
dst_argb[0] = luma0[src_argb[0]];
dst_argb[1] = luma0[src_argb[1]];
dst_argb[2] = luma0[src_argb[2]];
dst_argb[3] = src_argb[3];
- luma1 = ((src_argb[4] * bc + src_argb[5] * gc +
- src_argb[6] * rc) & 0x7F00u) + luma;
+ luma1 =
+ ((src_argb[4] * bc + src_argb[5] * gc + src_argb[6] * rc) & 0x7F00u) +
+ luma;
dst_argb[4] = luma1[src_argb[4]];
dst_argb[5] = luma1[src_argb[5]];
dst_argb[6] = luma1[src_argb[6]];
@@ -2359,8 +2376,9 @@ void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
}
if (width & 1) {
// Luminance in rows, color values in columns.
- const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
- src_argb[2] * rc) & 0x7F00u) + luma;
+ const uint8* luma0 =
+ ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) +
+ luma;
dst_argb[0] = luma0[src_argb[0]];
dst_argb[1] = luma0[src_argb[1]];
dst_argb[2] = luma0[src_argb[2]];
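The & 0x7F00u is the row select: the weighted sum of B, G and R (lumacoeff packs the three weights one byte each) is masked down to a multiple of 256, choosing one of 128 rows of 256 bytes in the luma table, and each channel is then looked up within that row while alpha passes through.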
@@ -2504,7 +2522,7 @@ void I422ToRGB565Row_AVX2(const uint8* src_y,
uint8* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width) {
- SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2530,7 +2548,7 @@ void I422ToARGB1555Row_AVX2(const uint8* src_y,
const struct YuvConstants* yuvconstants,
int width) {
// Row buffer for intermediate ARGB pixels.
- SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2556,7 +2574,7 @@ void I422ToARGB4444Row_AVX2(const uint8* src_y,
const struct YuvConstants* yuvconstants,
int width) {
// Row buffer for intermediate ARGB pixels.
- SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2576,13 +2594,13 @@ void I422ToARGB4444Row_AVX2(const uint8* src_y,
#if defined(HAS_I422TORGB24ROW_AVX2)
void I422ToRGB24Row_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb24,
- const struct YuvConstants* yuvconstants,
- int width) {
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
// Row buffer for intermediate ARGB pixels.
- SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2604,7 +2622,7 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y,
const struct YuvConstants* yuvconstants,
int width) {
// Row buffer for intermediate ARGB pixels.
- SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
diff --git a/files/source/row_dspr2.cc b/files/source/row_dspr2.cc
new file mode 100644
index 00000000..466dd5d9
--- /dev/null
+++ b/files/source/row_dspr2.cc
@@ -0,0 +1,1721 @@
+/*
+ * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips__) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+#ifdef HAS_COPYROW_MIPS
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
+ __asm__ __volatile__(
+ ".set noreorder \n"
+ ".set noat \n"
+ "slti $at, %[count], 8 \n"
+ "bne $at ,$zero, $last8 \n"
+ "xor $t8, %[src], %[dst] \n"
+ "andi $t8, $t8, 0x3 \n"
+
+ "bne $t8, $zero, unaligned \n"
+ "negu $a3, %[dst] \n"
+ // make dst/src aligned
+ "andi $a3, $a3, 0x3 \n"
+ "beq $a3, $zero, $chk16w \n"
+ // after alignment, count is the remaining byte count
+ "subu %[count], %[count], $a3 \n"
+
+ "lwr $t8, 0(%[src]) \n"
+ "addu %[src], %[src], $a3 \n"
+ "swr $t8, 0(%[dst]) \n"
+ "addu %[dst], %[dst], $a3 \n"
+
+ // Now the dst/src are mutually word-aligned with word-aligned addresses
+ "$chk16w: \n"
+ "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
+ // t8 is the byte count after 64-byte chunks
+ "beq %[count], $t8, chk8w \n"
+ // There will be at most 1 32-byte chunk after it
+ "subu $a3, %[count], $t8 \n" // the reminder
+ // Here a3 counts bytes in 16w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // Now a3 is the final dst after 64-byte chunks
+ "addu $t0, %[dst], %[count] \n"
+ // t0 is the "past the end" address
+
+ // When the loop uses "pref 30,x(a1)", the address a1+x must not be
+ // past the "t0-32" address.
+ // This means: for x=128 the last "safe" a1 address is "t0-160".
+ // Alternatively, for x=64 the last "safe" a1 address is "t0-96".
+ // We will use "pref 30,128(a1)", so "t0-160" is the limit.
+ "subu $t9, $t0, 160 \n"
+ // t9 is the "last safe pref 30,128(a1)" address
+ "pref 0, 0(%[src]) \n" // first line of src
+ "pref 0, 32(%[src]) \n" // second line of src
+ "pref 0, 64(%[src]) \n"
+ "pref 30, 32(%[dst]) \n"
+ // In case the a1 > t9 don't use "pref 30" at all
+ "sgtu $v1, %[dst], $t9 \n"
+ "bgtz $v1, $loop16w \n"
+ "nop \n"
+ // otherwise, start with using pref30
+ "pref 30, 64(%[dst]) \n"
+ "$loop16w: \n"
+ "pref 0, 96(%[src]) \n"
+ "lw $t0, 0(%[src]) \n"
+ "bgtz $v1, $skip_pref30_96 \n" // skip
+ "lw $t1, 4(%[src]) \n"
+ "pref 30, 96(%[dst]) \n" // continue
+ "$skip_pref30_96: \n"
+ "lw $t2, 8(%[src]) \n"
+ "lw $t3, 12(%[src]) \n"
+ "lw $t4, 16(%[src]) \n"
+ "lw $t5, 20(%[src]) \n"
+ "lw $t6, 24(%[src]) \n"
+ "lw $t7, 28(%[src]) \n"
+ "pref 0, 128(%[src]) \n"
+ // bring the next lines of src, addr 128
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "lw $t0, 32(%[src]) \n"
+ "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)
+ "lw $t1, 36(%[src]) \n"
+ "pref 30, 128(%[dst]) \n" // set dest, addr 128
+ "$skip_pref30_128: \n"
+ "lw $t2, 40(%[src]) \n"
+ "lw $t3, 44(%[src]) \n"
+ "lw $t4, 48(%[src]) \n"
+ "lw $t5, 52(%[src]) \n"
+ "lw $t6, 56(%[src]) \n"
+ "lw $t7, 60(%[src]) \n"
+ "pref 0, 160(%[src]) \n"
+ // bring the next lines of src, addr 160
+ "sw $t0, 32(%[dst]) \n"
+ "sw $t1, 36(%[dst]) \n"
+ "sw $t2, 40(%[dst]) \n"
+ "sw $t3, 44(%[dst]) \n"
+ "sw $t4, 48(%[dst]) \n"
+ "sw $t5, 52(%[dst]) \n"
+ "sw $t6, 56(%[dst]) \n"
+ "sw $t7, 60(%[dst]) \n"
+
+ "addiu %[dst], %[dst], 64 \n" // adding 64 to dest
+ "sgtu $v1, %[dst], $t9 \n"
+ "bne %[dst], $a3, $loop16w \n"
+ " addiu %[src], %[src], 64 \n" // adding 64 to src
+ "move %[count], $t8 \n"
+
+ // Here we have src and dest word-aligned but less than 64-bytes to go
+
+ "chk8w: \n"
+ "pref 0, 0x0(%[src]) \n"
+ "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
+ // t8 is the remainder count past 32 bytes
+ "beq %[count], $t8, chk1w \n"
+ // count==t8 means no 32-byte chunk
+ " nop \n"
+
+ "lw $t0, 0(%[src]) \n"
+ "lw $t1, 4(%[src]) \n"
+ "lw $t2, 8(%[src]) \n"
+ "lw $t3, 12(%[src]) \n"
+ "lw $t4, 16(%[src]) \n"
+ "lw $t5, 20(%[src]) \n"
+ "lw $t6, 24(%[src]) \n"
+ "lw $t7, 28(%[src]) \n"
+ "addiu %[src], %[src], 32 \n"
+
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "addiu %[dst], %[dst], 32 \n"
+
+ "chk1w: \n"
+ "andi %[count], $t8, 0x3 \n"
+ // now count is the remainder past 1w chunks
+ "beq %[count], $t8, $last8 \n"
+ " subu $a3, $t8, %[count] \n"
+ // a3 is count of bytes in 1w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // now a3 is the dst address past the 1w chunks
+ // copying in words (4-byte chunks)
+ "$wordCopy_loop: \n"
+ "lw $t3, 0(%[src]) \n"
+ // the first t3 may equal t0 ... optimize?
+ "addiu %[src], %[src],4 \n"
+ "addiu %[dst], %[dst],4 \n"
+ "bne %[dst], $a3,$wordCopy_loop \n"
+ " sw $t3, -4(%[dst]) \n"
+
+ // For the last (<8) bytes
+ "$last8: \n"
+ "blez %[count], leave \n"
+ " addu $a3, %[dst], %[count] \n" // a3 -last dst address
+ "$last8loop: \n"
+ "lb $v1, 0(%[src]) \n"
+ "addiu %[src], %[src], 1 \n"
+ "addiu %[dst], %[dst], 1 \n"
+ "bne %[dst], $a3, $last8loop \n"
+ " sb $v1, -1(%[dst]) \n"
+
+ "leave: \n"
+ " j $ra \n"
+ " nop \n"
+
+ //
+ // UNALIGNED case
+ //
+
+ "unaligned: \n"
+ // got here with a3="negu a1"
+ "andi $a3, $a3, 0x3 \n" // a1 is word aligned?
+ "beqz $a3, $ua_chk16w \n"
+ " subu %[count], %[count], $a3 \n"
+ // bytes left after initial a3 bytes
+ "lwr $v1, 0(%[src]) \n"
+ "lwl $v1, 3(%[src]) \n"
+ "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3
+ "swr $v1, 0(%[dst]) \n"
+ "addu %[dst], %[dst], $a3 \n"
+ // below the dst will be word aligned (NOTE1)
+ "$ua_chk16w: \n"
+ "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
+ // t8 is the byte count after 64-byte chunks
+ "beq %[count], $t8, ua_chk8w \n"
+ // if a2==t8, no 64-byte chunks
+ // There will be at most 1 32-byte chunk after it
+ "subu $a3, %[count], $t8 \n" // the reminder
+ // Here a3 counts bytes in 16w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // Now a3 is the final dst after 64-byte chunks
+ "addu $t0, %[dst], %[count] \n" // t0 "past the end"
+ "subu $t9, $t0, 160 \n"
+ // t9 is the "last safe pref 30,128(a1)" address
+ "pref 0, 0(%[src]) \n" // first line of src
+ "pref 0, 32(%[src]) \n" // second line addr 32
+ "pref 0, 64(%[src]) \n"
+ "pref 30, 32(%[dst]) \n"
+ // safe, as we have at least 64 bytes ahead
+ // In case the a1 > t9 don't use "pref 30" at all
+ "sgtu $v1, %[dst], $t9 \n"
+ "bgtz $v1, $ua_loop16w \n"
+ // skip "pref 30,64(a1)" for too short arrays
+ " nop \n"
+ // otherwise, start with using pref30
+ "pref 30, 64(%[dst]) \n"
+ "$ua_loop16w: \n"
+ "pref 0, 96(%[src]) \n"
+ "lwr $t0, 0(%[src]) \n"
+ "lwl $t0, 3(%[src]) \n"
+ "lwr $t1, 4(%[src]) \n"
+ "bgtz $v1, $ua_skip_pref30_96 \n"
+ " lwl $t1, 7(%[src]) \n"
+ "pref 30, 96(%[dst]) \n"
+ // continue setting up the dest, addr 96
+ "$ua_skip_pref30_96: \n"
+ "lwr $t2, 8(%[src]) \n"
+ "lwl $t2, 11(%[src]) \n"
+ "lwr $t3, 12(%[src]) \n"
+ "lwl $t3, 15(%[src]) \n"
+ "lwr $t4, 16(%[src]) \n"
+ "lwl $t4, 19(%[src]) \n"
+ "lwr $t5, 20(%[src]) \n"
+ "lwl $t5, 23(%[src]) \n"
+ "lwr $t6, 24(%[src]) \n"
+ "lwl $t6, 27(%[src]) \n"
+ "lwr $t7, 28(%[src]) \n"
+ "lwl $t7, 31(%[src]) \n"
+ "pref 0, 128(%[src]) \n"
+ // bring the next lines of src, addr 128
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "lwr $t0, 32(%[src]) \n"
+ "lwl $t0, 35(%[src]) \n"
+ "lwr $t1, 36(%[src]) \n"
+ "bgtz $v1, ua_skip_pref30_128 \n"
+ " lwl $t1, 39(%[src]) \n"
+ "pref 30, 128(%[dst]) \n"
+ // continue setting up the dest, addr 128
+ "ua_skip_pref30_128: \n"
+
+ "lwr $t2, 40(%[src]) \n"
+ "lwl $t2, 43(%[src]) \n"
+ "lwr $t3, 44(%[src]) \n"
+ "lwl $t3, 47(%[src]) \n"
+ "lwr $t4, 48(%[src]) \n"
+ "lwl $t4, 51(%[src]) \n"
+ "lwr $t5, 52(%[src]) \n"
+ "lwl $t5, 55(%[src]) \n"
+ "lwr $t6, 56(%[src]) \n"
+ "lwl $t6, 59(%[src]) \n"
+ "lwr $t7, 60(%[src]) \n"
+ "lwl $t7, 63(%[src]) \n"
+ "pref 0, 160(%[src]) \n"
+ // bring the next lines of src, addr 160
+ "sw $t0, 32(%[dst]) \n"
+ "sw $t1, 36(%[dst]) \n"
+ "sw $t2, 40(%[dst]) \n"
+ "sw $t3, 44(%[dst]) \n"
+ "sw $t4, 48(%[dst]) \n"
+ "sw $t5, 52(%[dst]) \n"
+ "sw $t6, 56(%[dst]) \n"
+ "sw $t7, 60(%[dst]) \n"
+
+ "addiu %[dst],%[dst],64 \n" // adding 64 to dest
+ "sgtu $v1,%[dst],$t9 \n"
+ "bne %[dst],$a3,$ua_loop16w \n"
+ " addiu %[src],%[src],64 \n" // adding 64 to src
+ "move %[count],$t8 \n"
+
+ // Here we have src and dest word-aligned but less than 64-bytes to go
+
+ "ua_chk8w: \n"
+ "pref 0, 0x0(%[src]) \n"
+ "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
+ // t8 is the remainder count
+ "beq %[count], $t8, $ua_chk1w \n"
+ // when count==t8, no 32-byte chunk
+
+ "lwr $t0, 0(%[src]) \n"
+ "lwl $t0, 3(%[src]) \n"
+ "lwr $t1, 4(%[src]) \n"
+ "lwl $t1, 7(%[src]) \n"
+ "lwr $t2, 8(%[src]) \n"
+ "lwl $t2, 11(%[src]) \n"
+ "lwr $t3, 12(%[src]) \n"
+ "lwl $t3, 15(%[src]) \n"
+ "lwr $t4, 16(%[src]) \n"
+ "lwl $t4, 19(%[src]) \n"
+ "lwr $t5, 20(%[src]) \n"
+ "lwl $t5, 23(%[src]) \n"
+ "lwr $t6, 24(%[src]) \n"
+ "lwl $t6, 27(%[src]) \n"
+ "lwr $t7, 28(%[src]) \n"
+ "lwl $t7, 31(%[src]) \n"
+ "addiu %[src], %[src], 32 \n"
+
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "addiu %[dst], %[dst], 32 \n"
+
+ "$ua_chk1w: \n"
+ "andi %[count], $t8, 0x3 \n"
+ // now count is the remainder past 1w chunks
+ "beq %[count], $t8, ua_smallCopy \n"
+ "subu $a3, $t8, %[count] \n"
+ // a3 is count of bytes in 1w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // now a3 is the dst address past the 1w chunks
+
+ // copying in words (4-byte chunks)
+ "$ua_wordCopy_loop: \n"
+ "lwr $v1, 0(%[src]) \n"
+ "lwl $v1, 3(%[src]) \n"
+ "addiu %[src], %[src], 4 \n"
+ "addiu %[dst], %[dst], 4 \n"
+ // note: dst=a1 is word aligned here, see NOTE1
+ "bne %[dst], $a3, $ua_wordCopy_loop \n"
+ " sw $v1,-4(%[dst]) \n"
+
+ // Now less than 4 bytes (value in count) left to copy
+ "ua_smallCopy: \n"
+ "beqz %[count], leave \n"
+ " addu $a3, %[dst], %[count] \n" // a3 = last dst address
+ "$ua_smallCopy_loop: \n"
+ "lb $v1, 0(%[src]) \n"
+ "addiu %[src], %[src], 1 \n"
+ "addiu %[dst], %[dst], 1 \n"
+ "bne %[dst],$a3,$ua_smallCopy_loop \n"
+ " sb $v1, -1(%[dst]) \n"
+
+ "j $ra \n"
+ " nop \n"
+ ".set at \n"
+ ".set reorder \n"
+ : [dst] "+r"(dst), [src] "+r"(src)
+ : [count] "r"(count)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "a3", "v1",
+ "at");
+}
+#endif // HAS_COPYROW_MIPS
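Stripped of the pref cache hints and the aligned/unaligned split, the control flow above is a chunked copy. A portable sketch of the same structure (CopyRow_Sketch is a hypothetical name; the asm earns its keep through the prefetches):

#include <stdint.h>
#include <string.h>

void CopyRow_Sketch(const uint8_t* src, uint8_t* dst, int count) {
  while (count >= 64) {  /* $loop16w: 16 words per iteration */
    memcpy(dst, src, 64);
    src += 64;
    dst += 64;
    count -= 64;
  }
  while (count >= 4) {   /* $wordCopy_loop */
    memcpy(dst, src, 4);
    src += 4;
    dst += 4;
    count -= 4;
  }
  while (count-- > 0) {  /* $last8loop */
    *dst++ = *src++;
  }
}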
+
+// DSPR2 functions
+#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
+ (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32) && \
+ (__mips_isa_rev < 6)
+
+void SplitUVRow_DSPR2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "srl $t4, %[width], 4 \n" // multiplies of 16
+ "blez $t4, 2f \n"
+ " andi %[width], %[width], 0xf \n" // residual
+
+ "1: \n"
+ "addiu $t4, $t4, -1 \n"
+ "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
+ "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2
+ "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4
+ "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6
+ "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8
+ "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 |
+ // U10
+ "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 |
+ // U12
+ "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 |
+ // U14
+ "addiu %[src_uv], %[src_uv], 32 \n"
+ "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
+ "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
+ "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
+ "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
+ "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
+ "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
+ "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 |
+ // V12
+ "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 |
+ // U12
+ "sw $t9, 0(%[dst_v]) \n"
+ "sw $t0, 0(%[dst_u]) \n"
+ "sw $t1, 4(%[dst_v]) \n"
+ "sw $t2, 4(%[dst_u]) \n"
+ "sw $t3, 8(%[dst_v]) \n"
+ "sw $t5, 8(%[dst_u]) \n"
+ "sw $t6, 12(%[dst_v]) \n"
+ "sw $t7, 12(%[dst_u]) \n"
+ "addiu %[dst_v], %[dst_v], 16 \n"
+ "bgtz $t4, 1b \n"
+ " addiu %[dst_u], %[dst_u], 16 \n"
+
+ "beqz %[width], 3f \n"
+ " nop \n"
+
+ "2: \n"
+ "lbu $t0, 0(%[src_uv]) \n"
+ "lbu $t1, 1(%[src_uv]) \n"
+ "addiu %[src_uv], %[src_uv], 2 \n"
+ "addiu %[width], %[width], -1 \n"
+ "sb $t0, 0(%[dst_u]) \n"
+ "sb $t1, 0(%[dst_v]) \n"
+ "addiu %[dst_u], %[dst_u], 1 \n"
+ "bgtz %[width], 2b \n"
+ " addiu %[dst_v], %[dst_v], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_uv] "+r"(src_uv), [width] "+r"(width), [dst_u] "+r"(dst_u),
+ [dst_v] "+r"(dst_v)
+ :
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
+}
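The precr.qb.ph / precrq.qb.ph pair does the deinterleave: each gathers one byte from every halfword of its two operands. A C model of the U path (illustrative; the V path mirrors it with the high bytes):

#include <stdint.h>
#include <stdio.h>

/* Low byte of each halfword of (hi, lo):
   hi = V3|U3|V2|U2, lo = V1|U1|V0|U0  ->  U3|U2|U1|U0. */
static uint32_t precr_qb_ph(uint32_t hi, uint32_t lo) {
  return ((hi & 0x00ff0000u) << 8) | ((hi & 0x000000ffu) << 16) |
         ((lo & 0x00ff0000u) >> 8) | (lo & 0x000000ffu);
}

int main(void) {
  printf("0x%08x\n", precr_qb_ph(0xf3a3f2a2u, 0xf1a1f0a0u));  /* 0xa3a2a1a0 */
  return 0;
}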
+
+void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t4, %[width], 4 \n" // multiplies of 16
+ "andi $t5, %[width], 0xf \n"
+ "blez $t4, 2f \n"
+ " addu %[src], %[src], %[width] \n" // src += width
+
+ "1: \n"
+ "lw $t0, -16(%[src]) \n" // |3|2|1|0|
+ "lw $t1, -12(%[src]) \n" // |7|6|5|4|
+ "lw $t2, -8(%[src]) \n" // |11|10|9|8|
+ "lw $t3, -4(%[src]) \n" // |15|14|13|12|
+ "wsbh $t0, $t0 \n" // |2|3|0|1|
+ "wsbh $t1, $t1 \n" // |6|7|4|5|
+ "wsbh $t2, $t2 \n" // |10|11|8|9|
+ "wsbh $t3, $t3 \n" // |14|15|12|13|
+ "rotr $t0, $t0, 16 \n" // |0|1|2|3|
+ "rotr $t1, $t1, 16 \n" // |4|5|6|7|
+ "rotr $t2, $t2, 16 \n" // |8|9|10|11|
+ "rotr $t3, $t3, 16 \n" // |12|13|14|15|
+ "addiu %[src], %[src], -16 \n"
+ "addiu $t4, $t4, -1 \n"
+ "sw $t3, 0(%[dst]) \n" // |15|14|13|12|
+ "sw $t2, 4(%[dst]) \n" // |11|10|9|8|
+ "sw $t1, 8(%[dst]) \n" // |7|6|5|4|
+ "sw $t0, 12(%[dst]) \n" // |3|2|1|0|
+ "bgtz $t4, 1b \n"
+ " addiu %[dst], %[dst], 16 \n"
+ "beqz $t5, 3f \n"
+ " nop \n"
+
+ "2: \n"
+ "lbu $t0, -1(%[src]) \n"
+ "addiu $t5, $t5, -1 \n"
+ "addiu %[src], %[src], -1 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "bgez $t5, 2b \n"
+ " addiu %[dst], %[dst], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src] "+r"(src), [dst] "+r"(dst)
+ : [width] "r"(width)
+ : "t0", "t1", "t2", "t3", "t4", "t5");
+}
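The wsbh + rotr 16 pair is a full byte reversal: wsbh swaps the bytes within each halfword (|3|2|1|0| -> |2|3|0|1|) and the 16-bit rotate then swaps the halfwords. The same word transform in C (illustrative sketch):

#include <stdint.h>
#include <stdio.h>

static uint32_t reverse_bytes(uint32_t x) {
  x = ((x & 0x00ff00ffu) << 8) | ((x >> 8) & 0x00ff00ffu);  /* wsbh */
  return (x << 16) | (x >> 16);                             /* rotr 16 */
}

int main(void) {
  printf("0x%08x\n", reverse_bytes(0x03020100u));  /* prints 0x00010203 */
  return 0;
}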
+
+void MirrorUVRow_DSPR2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ int y;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+
+ "addu $t4, %[width], %[width] \n"
+ "srl %[x], %[width], 4 \n"
+ "andi %[y], %[width], 0xf \n"
+ "blez %[x], 2f \n"
+ " addu %[src_uv], %[src_uv], $t4 \n"
+
+ "1: \n"
+ "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
+ "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
+ "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|
+ "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12|
+ "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16|
+ "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20|
+ "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24|
+ "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28|
+
+ "rotr $t0, $t0, 16 \n" // |1|0|3|2|
+ "rotr $t1, $t1, 16 \n" // |5|4|7|6|
+ "rotr $t2, $t2, 16 \n" // |9|8|11|10|
+ "rotr $t3, $t3, 16 \n" // |13|12|15|14|
+ "rotr $t4, $t4, 16 \n" // |17|16|19|18|
+ "rotr $t6, $t6, 16 \n" // |21|20|23|22|
+ "rotr $t7, $t7, 16 \n" // |25|24|27|26|
+ "rotr $t8, $t8, 16 \n" // |29|28|31|30|
+ "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6|
+ "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7|
+ "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14|
+ "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15|
+ "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22|
+ "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23|
+ "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30|
+ "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31|
+ "addiu %[src_uv], %[src_uv], -32 \n"
+ "addiu %[x], %[x], -1 \n"
+ "swr $t4, 0(%[dst_u]) \n"
+ "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24|
+ "swr $t6, 0(%[dst_v]) \n"
+ "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25|
+ "swr $t2, 4(%[dst_u]) \n"
+ "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16|
+ "swr $t3, 4(%[dst_v]) \n"
+ "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17|
+ "swr $t0, 8(%[dst_u]) \n"
+ "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8|
+ "swr $t1, 8(%[dst_v]) \n"
+ "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9|
+ "swr $t9, 12(%[dst_u]) \n"
+ "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0|
+ "swr $t5, 12(%[dst_v]) \n"
+ "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1|
+ "addiu %[dst_v], %[dst_v], 16 \n"
+ "bgtz %[x], 1b \n"
+ " addiu %[dst_u], %[dst_u], 16 \n"
+ "beqz %[y], 3f \n"
+ " nop \n"
+ "b 2f \n"
+ " nop \n"
+
+ "2: \n"
+ "lbu $t0, -2(%[src_uv]) \n"
+ "lbu $t1, -1(%[src_uv]) \n"
+ "addiu %[src_uv], %[src_uv], -2 \n"
+ "addiu %[y], %[y], -1 \n"
+ "sb $t0, 0(%[dst_u]) \n"
+ "sb $t1, 0(%[dst_v]) \n"
+ "addiu %[dst_u], %[dst_u], 1 \n"
+ "bgtz %[y], 2b \n"
+ " addiu %[dst_v], %[dst_v], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_uv] "+r"(src_uv), [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v),
+ [x] "=&r"(x), [y] "=&r"(y)
+ : [width] "r"(width)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t7", "t8", "t9");
+}
+
+void I422ToARGBRow_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ uint32 tmp_ub = yuvconstants->kUVToB[0];
+ uint32 tmp_ug = yuvconstants->kUVToG[0];
+ uint32 tmp_vg = yuvconstants->kUVToG[1];
+ uint32 tmp_vr = yuvconstants->kUVToR[1];
+ uint32 tmp_bb = yuvconstants->kUVBiasB[0];
+ uint32 tmp_bg = yuvconstants->kUVBiasG[0];
+ uint32 tmp_br = yuvconstants->kUVBiasR[0];
+ uint32 yg = yuvconstants->kYToRgb[0];
+ uint32 tmp_yg;
+ uint32 tmp_mask = 0x7fff7fff;
+ tmp_bb = ((uint32)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
+ tmp_bg = ((uint32)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
+ tmp_br = ((uint32)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
+ tmp_yg = ((uint32)(yg & 0xffff) << 16) | (yg & 0xffff);
+ tmp_ub = ~(((uint32)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
+ tmp_ug = ((uint32)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
+ tmp_vg = ((uint32)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
+ tmp_vr = ~(((uint32)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
+ yg = yg * 0x0101;
+
+ for (x = 0; x < width - 1; x += 2) {
+ uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lbu %[tmp_t7], 0(%[src_y]) \n"
+ "lbu %[tmp_t1], 1(%[src_y]) \n"
+ "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
+ "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
+ "lbu %[tmp_t2], 0(%[src_u]) \n"
+ "lbu %[tmp_t3], 0(%[src_v]) \n"
+ "replv.ph %[tmp_t2], %[tmp_t2] \n"
+ "replv.ph %[tmp_t3], %[tmp_t3] \n"
+ "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
+ "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
+ "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
+ "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
+ "srl %[tmp_t7], %[tmp_t7], 16 \n"
+ "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
+ "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
+ "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
+ "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
+ "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
+ "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
+ "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
+ "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
+ "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
+ "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
+ "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
+ "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
+ "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
+ "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
+ "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
+ "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
+ "precrq.ph.w %[tmp_t9], %[tmp_t8], %[tmp_t7] \n"
+ "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
+ "precr.qb.ph %[tmp_t8], %[tmp_t9], %[tmp_t7] \n"
+ "precrq.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
+ "sw %[tmp_t8], 0(%[rgb_buf]) \n"
+ "sw %[tmp_t7], 4(%[rgb_buf]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
+ : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
+ [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), [yg] "r"(yg),
+ [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb),
+ [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg),
+ [rgb_buf] "r"(rgb_buf), [tmp_mask] "r"(tmp_mask));
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+}
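+
+// Per pixel pair this follows the usual libyuv fixed-point recipe -- a
+// rough scalar sketch, where UB/UG/VG/VR and BB/BG/BR stand for the
+// widened constants prepared above (tmp_ub and tmp_vr hold negated values):
+//   y32 = (y * 0x0101 * yg) >> 16;
+//   b = clamp8((y32 - u * UB + BB) >> 6);
+//   g = clamp8((y32 - (u * UG + v * VG) + BG) >> 6);
+//   r = clamp8((y32 - v * VR + BR) >> 6);
+// The shra.ph 6 / shll_s.ph 7 / precrqu_s.qb.ph sequence performs the
+// >> 6 plus the clamp to [0, 255], and the 0x7fff7fff mask saturates to
+// an opaque 0xff alpha byte.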
+
+// Bilinear filter 8x2 -> 8x1
+void InterpolateRow_DSPR2(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ int y0_fraction = 256 - source_y_fraction;
+ const uint8* src_ptr1 = src_ptr + src_stride;
+
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+
+ "replv.ph $t0, %[y0_fraction] \n"
+ "replv.ph $t1, %[source_y_fraction] \n"
+
+ "1: \n"
+ "lw $t2, 0(%[src_ptr]) \n"
+ "lw $t3, 0(%[src_ptr1]) \n"
+ "lw $t4, 4(%[src_ptr]) \n"
+ "lw $t5, 4(%[src_ptr1]) \n"
+ "muleu_s.ph.qbl $t6, $t2, $t0 \n"
+ "muleu_s.ph.qbr $t7, $t2, $t0 \n"
+ "muleu_s.ph.qbl $t8, $t3, $t1 \n"
+ "muleu_s.ph.qbr $t9, $t3, $t1 \n"
+ "muleu_s.ph.qbl $t2, $t4, $t0 \n"
+ "muleu_s.ph.qbr $t3, $t4, $t0 \n"
+ "muleu_s.ph.qbl $t4, $t5, $t1 \n"
+ "muleu_s.ph.qbr $t5, $t5, $t1 \n"
+ "addq.ph $t6, $t6, $t8 \n"
+ "addq.ph $t7, $t7, $t9 \n"
+ "addq.ph $t2, $t2, $t4 \n"
+ "addq.ph $t3, $t3, $t5 \n"
+ "shra_r.ph $t6, $t6, 8 \n"
+ "shra_r.ph $t7, $t7, 8 \n"
+ "shra_r.ph $t2, $t2, 8 \n"
+ "shra_r.ph $t3, $t3, 8 \n"
+ "precr.qb.ph $t6, $t6, $t7 \n"
+ "precr.qb.ph $t2, $t2, $t3 \n"
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[src_ptr1], %[src_ptr1], 8 \n"
+ "addiu %[dst_width], %[dst_width], -8 \n"
+ "sw $t6, 0(%[dst_ptr]) \n"
+ "sw $t2, 4(%[dst_ptr]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " addiu %[dst_ptr], %[dst_ptr], 8 \n"
+
+ ".set pop \n"
+ : [dst_ptr] "+r"(dst_ptr), [src_ptr1] "+r"(src_ptr1),
+ [src_ptr] "+r"(src_ptr), [dst_width] "+r"(dst_width)
+ : [source_y_fraction] "r"(source_y_fraction),
+ [y0_fraction] "r"(y0_fraction), [src_stride] "r"(src_stride)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
+}
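+
+// Scalar sketch of one output byte, with f = source_y_fraction and src1
+// the row at src_ptr + src_stride (shra_r.ph supplies the +128 rounding):
+//   dst[x] = (src[x] * (256 - f) + src1[x] * f + 128) >> 8;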
+
+void RGB24ToARGBRow_DSPR2(const uint8* src_rgb24, uint8* dst_argb, int width) {
+ int x;
+ uint32 tmp_mask = 0xff;
+ uint32 tmp_t1;
+ for (x = 0; x < (width - 1); ++x) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "ulw %[tmp_t1], 0(%[src_rgb24]) \n"
+ "addiu %[dst_argb], %[dst_argb], 4 \n"
+ "addiu %[src_rgb24], %[src_rgb24], 3 \n"
+ "ins %[tmp_t1], %[tmp_mask], 24, 8 \n"
+ "sw %[tmp_t1], -4(%[dst_argb]) \n"
+ ".set pop \n"
+ : [src_rgb24] "+r"(src_rgb24), [dst_argb] "+r"(dst_argb),
+ [tmp_t1] "=&r"(tmp_t1)
+ : [tmp_mask] "r"(tmp_mask)
+ : "memory");
+ }
+ uint8 b = src_rgb24[0];
+ uint8 g = src_rgb24[1];
+ uint8 r = src_rgb24[2];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = 255u;
+}
+
+void RAWToARGBRow_DSPR2(const uint8* src_raw, uint8* dst_argb, int width) {
+ int x;
+ uint32 tmp_mask = 0xff;
+ uint32 tmp_t1, tmp_t2;
+ for (x = 0; x < (width - 1); ++x) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "ulw %[tmp_t1], 0(%[src_raw]) \n"
+ "addiu %[dst_argb], %[dst_argb], 4 \n"
+ "addiu %[src_raw], %[src_raw], 3 \n"
+ "srl %[tmp_t2], %[tmp_t1], 16 \n"
+ "ins %[tmp_t1], %[tmp_mask], 24, 8 \n"
+ "ins %[tmp_t1], %[tmp_t1], 16, 8 \n"
+ "ins %[tmp_t1], %[tmp_t2], 0, 8 \n"
+ "sw %[tmp_t1], -4(%[dst_argb]) \n"
+ ".set pop \n"
+ : [src_raw] "+r"(src_raw), [dst_argb] "+r"(dst_argb),
+ [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2)
+ : [tmp_mask] "r"(tmp_mask)
+ : "memory");
+ }
+ uint8 r = src_raw[0];
+ uint8 g = src_raw[1];
+ uint8 b = src_raw[2];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = 255u;
+}
+
+void RGB565ToARGBRow_DSPR2(const uint8* src_rgb565,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ uint32 tmp_mask = 0xff;
+ uint32 tmp_t1, tmp_t2, tmp_t3;
+ for (x = 0; x < width; ++x) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lhu %[tmp_t1], 0(%[src_rgb565]) \n"
+ "addiu %[dst_argb], %[dst_argb], 4 \n"
+ "addiu %[src_rgb565], %[src_rgb565], 2 \n"
+ "sll %[tmp_t2], %[tmp_t1], 8 \n"
+ "ins %[tmp_t2], %[tmp_mask], 24,8 \n"
+ "ins %[tmp_t2], %[tmp_t1], 3, 16 \n"
+ "ins %[tmp_t2], %[tmp_t1], 5, 11 \n"
+ "srl %[tmp_t3], %[tmp_t1], 9 \n"
+ "ins %[tmp_t2], %[tmp_t3], 8, 2 \n"
+ "ins %[tmp_t2], %[tmp_t1], 3, 5 \n"
+ "srl %[tmp_t3], %[tmp_t1], 2 \n"
+ "ins %[tmp_t2], %[tmp_t3], 0, 3 \n"
+ "sw %[tmp_t2], -4(%[dst_argb]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [src_rgb565] "+r"(src_rgb565),
+ [dst_argb] "+r"(dst_argb)
+ : [tmp_mask] "r"(tmp_mask));
+ }
+}
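+
+// The sll/ins/srl sequence above widens each 5- or 6-bit channel to 8 bits
+// by replicating its top bits -- i.e. the usual scalar expansion:
+//   b8 = (b5 << 3) | (b5 >> 2);
+//   g8 = (g6 << 2) | (g6 >> 4);
+//   r8 = (r5 << 3) | (r5 >> 2);
+//   argb = 0xff000000u | (r8 << 16) | (g8 << 8) | b8;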
+
+void ARGB1555ToARGBRow_DSPR2(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ uint32 tmp_t1, tmp_t2, tmp_t3;
+ for (x = 0; x < width; ++x) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lh %[tmp_t1], 0(%[src_argb1555]) \n"
+ "addiu %[dst_argb], %[dst_argb], 4 \n"
+ "addiu %[src_argb1555], %[src_argb1555], 2 \n"
+ "sll %[tmp_t2], %[tmp_t1], 9 \n"
+ "ins %[tmp_t2], %[tmp_t1], 4, 15 \n"
+ "ins %[tmp_t2], %[tmp_t1], 6, 10 \n"
+ "srl %[tmp_t3], %[tmp_t1], 7 \n"
+ "ins %[tmp_t2], %[tmp_t3], 8, 3 \n"
+ "ins %[tmp_t2], %[tmp_t1], 3, 5 \n"
+ "srl %[tmp_t3], %[tmp_t1], 2 \n"
+ "ins %[tmp_t2], %[tmp_t3], 0, 3 \n"
+ "sw %[tmp_t2], -4(%[dst_argb]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [src_argb1555] "+r"(src_argb1555),
+ [dst_argb] "+r"(dst_argb)
+ :);
+ }
+}
+
+void ARGB4444ToARGBRow_DSPR2(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ uint32 tmp_t1;
+ for (x = 0; x < width; ++x) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lh %[tmp_t1], 0(%[src_argb4444]) \n"
+ "addiu %[dst_argb], %[dst_argb], 4 \n"
+ "addiu %[src_argb4444], %[src_argb4444], 2 \n"
+ "ins %[tmp_t1], %[tmp_t1], 16, 16 \n"
+ "ins %[tmp_t1], %[tmp_t1], 12, 16 \n"
+ "ins %[tmp_t1], %[tmp_t1], 8, 12 \n"
+ "ins %[tmp_t1], %[tmp_t1], 4, 8 \n"
+ "sw %[tmp_t1], -4(%[dst_argb]) \n"
+ ".set pop \n"
+ : [src_argb4444] "+r"(src_argb4444), [dst_argb] "+r"(dst_argb),
+ [tmp_t1] "=&r"(tmp_t1));
+ }
+}
+
+void I444ToARGBRow_DSPR2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ uint32 tmp_ub = yuvconstants->kUVToB[0];
+ uint32 tmp_ug = yuvconstants->kUVToG[0];
+ uint32 tmp_vg = yuvconstants->kUVToG[1];
+ uint32 tmp_vr = yuvconstants->kUVToR[1];
+ uint32 tmp_bb = yuvconstants->kUVBiasB[0];
+ uint32 tmp_bg = yuvconstants->kUVBiasG[0];
+ uint32 tmp_br = yuvconstants->kUVBiasR[0];
+ uint32 yg = yuvconstants->kYToRgb[0];
+ uint32 tmp_mask = 0x7fff7fff;
+ uint32 tmp_yg;
+
+ tmp_bb = ((uint32)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
+ tmp_bg = ((uint32)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
+ tmp_br = ((uint32)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
+ tmp_yg = ((uint32)(yg & 0xffff) << 16) | (yg & 0xffff);
+ tmp_ub = ~(((uint32)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
+ tmp_ug = ((uint32)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
+ tmp_vg = ((uint32)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
+ tmp_vr = ~(((uint32)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
+ yg = yg * 0x0101;
+
+ for (x = 0; x < width - 1; x += 2) {
+ uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lbu %[tmp_t7], 0(%[y_buf]) \n"
+ "lbu %[tmp_t1], 1(%[y_buf]) \n"
+ "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
+ "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
+ "lh %[tmp_t2], 0(%[u_buf]) \n"
+ "lh %[tmp_t3], 0(%[v_buf]) \n"
+ "preceu.ph.qbr %[tmp_t2], %[tmp_t2] \n"
+ "preceu.ph.qbr %[tmp_t3], %[tmp_t3] \n"
+ "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
+ "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
+ "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
+ "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
+ "srl %[tmp_t7], %[tmp_t7], 16 \n"
+ "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
+ "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
+ "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
+ "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
+ "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
+ "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
+ "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
+ "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
+ "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
+ "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
+ "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
+ "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
+ "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
+ "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
+ "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
+ "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
+ "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
+ "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
+ "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
+ "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
+ "sw %[tmp_t8], 0(%[rgb_buf]) \n"
+ "sw %[tmp_t7], 4(%[rgb_buf]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
+ : [y_buf] "r"(y_buf), [yg] "r"(yg), [u_buf] "r"(u_buf),
+ [v_buf] "r"(v_buf), [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug),
+ [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb),
+ [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg),
+ [rgb_buf] "r"(rgb_buf), [tmp_mask] "r"(tmp_mask));
+ y_buf += 2;
+ u_buf += 2;
+ v_buf += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+}
+
+void I422ToARGB4444Row_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ uint32 tmp_ub = yuvconstants->kUVToB[0];
+ uint32 tmp_ug = yuvconstants->kUVToG[0];
+ uint32 tmp_vg = yuvconstants->kUVToG[1];
+ uint32 tmp_vr = yuvconstants->kUVToR[1];
+ uint32 tmp_bb = yuvconstants->kUVBiasB[0];
+ uint32 tmp_bg = yuvconstants->kUVBiasG[0];
+ uint32 tmp_br = yuvconstants->kUVBiasR[0];
+ uint32 yg = yuvconstants->kYToRgb[0];
+ uint32 tmp_yg;
+ uint32 tmp_mask = 0x7fff7fff;
+ tmp_bb = ((uint32)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
+ tmp_bg = ((uint32)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
+ tmp_br = ((uint32)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
+ tmp_yg = ((uint32)(yg & 0xffff) << 16) | (yg & 0xffff);
+ tmp_ub = ~(((uint32)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
+ tmp_ug = ((uint32)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
+ tmp_vg = ((uint32)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
+ tmp_vr = ~(((uint32)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
+ yg = yg * 0x0101;
+
+ for (x = 0; x < width - 1; x += 2) {
+ uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lbu %[tmp_t7], 0(%[src_y]) \n"
+ "lbu %[tmp_t1], 1(%[src_y]) \n"
+ "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
+ "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
+ "lbu %[tmp_t2], 0(%[src_u]) \n"
+ "lbu %[tmp_t3], 0(%[src_v]) \n"
+ "replv.ph %[tmp_t2], %[tmp_t2] \n"
+ "replv.ph %[tmp_t3], %[tmp_t3] \n"
+ "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
+ "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
+ "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
+ "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
+ "srl %[tmp_t7], %[tmp_t7], 16 \n"
+ "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
+ "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
+ "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
+ "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
+ "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
+ "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
+ "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
+ "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
+ "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
+ "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
+ "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
+ "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
+ "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
+ "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
+ "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
+ "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
+ "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
+ "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
+ "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
+ "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
+ "shrl.qb %[tmp_t1], %[tmp_t8], 4 \n"
+ "shrl.qb %[tmp_t2], %[tmp_t7], 4 \n"
+ "shrl.ph %[tmp_t8], %[tmp_t1], 4 \n"
+ "shrl.ph %[tmp_t7], %[tmp_t2], 4 \n"
+ "or %[tmp_t8], %[tmp_t8], %[tmp_t1] \n"
+ "or %[tmp_t7], %[tmp_t7], %[tmp_t2] \n"
+ "precr.qb.ph %[tmp_t8], %[tmp_t7], %[tmp_t8] \n"
+ "sw %[tmp_t8], 0(%[dst_argb4444]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
+ : [dst_argb4444] "r"(dst_argb4444), [yg] "r"(yg), [src_u] "r"(src_u),
+ [src_v] "r"(src_v), [src_y] "r"(src_y), [tmp_ub] "r"(tmp_ub),
+ [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr),
+ [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br),
+ [tmp_yg] "r"(tmp_yg), [tmp_mask] "r"(tmp_mask));
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ dst_argb4444 += 4; // Advance 2 pixels.
+ }
+}
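+
+// Packing sketch: the shrl.qb/shrl.ph/or steps keep the high nibble of
+// each 8-bit channel, which per pixel is the usual
+//   pixel4444 = ((a8 >> 4) << 12) | ((r8 >> 4) << 8) |
+//               ((g8 >> 4) << 4) | (b8 >> 4);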
+
+void I422ToARGB1555Row_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ uint32 tmp_ub = yuvconstants->kUVToB[0];
+ uint32 tmp_ug = yuvconstants->kUVToG[0];
+ uint32 tmp_vg = yuvconstants->kUVToG[1];
+ uint32 tmp_vr = yuvconstants->kUVToR[1];
+ uint32 tmp_bb = yuvconstants->kUVBiasB[0];
+ uint32 tmp_bg = yuvconstants->kUVBiasG[0];
+ uint32 tmp_br = yuvconstants->kUVBiasR[0];
+ uint32 yg = yuvconstants->kYToRgb[0];
+ uint32 tmp_yg;
+ uint32 tmp_mask = 0x80008000;
+ tmp_bb = ((uint32)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
+ tmp_bg = ((uint32)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
+ tmp_br = ((uint32)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
+ tmp_yg = ((uint32)(yg & 0xffff) << 16) | (yg & 0xffff);
+ tmp_ub = ~(((uint32)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
+ tmp_ug = ((uint32)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
+ tmp_vg = ((uint32)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
+ tmp_vr = ~(((uint32)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
+ yg = yg * 0x0101;
+
+ for (x = 0; x < width - 1; x += 2) {
+ uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lbu %[tmp_t7], 0(%[src_y]) \n"
+ "lbu %[tmp_t1], 1(%[src_y]) \n"
+ "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
+ "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
+ "lbu %[tmp_t2], 0(%[src_u]) \n"
+ "lbu %[tmp_t3], 0(%[src_v]) \n"
+ "replv.ph %[tmp_t2], %[tmp_t2] \n"
+ "replv.ph %[tmp_t3], %[tmp_t3] \n"
+ "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
+ "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
+ "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
+ "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
+ "srl %[tmp_t7], %[tmp_t7], 16 \n"
+ "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
+ "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
+ "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
+ "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
+ "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
+ "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
+ "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
+ "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
+ "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
+ "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
+ "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
+ "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
+ "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
+ "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
+ "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
+ "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
+ "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
+ "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
+ "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
+ "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
+ "ins %[tmp_t3], %[tmp_t8], 7, 24 \n"
+ "ins %[tmp_t3], %[tmp_t8], 10, 16 \n"
+ "ins %[tmp_t3], %[tmp_t8], 13, 8 \n"
+ "ins %[tmp_t4], %[tmp_t7], 7, 24 \n"
+ "ins %[tmp_t4], %[tmp_t7], 10, 16 \n"
+ "ins %[tmp_t4], %[tmp_t7], 13, 8 \n"
+ "precrq.ph.w %[tmp_t8], %[tmp_t4], %[tmp_t3] \n"
+ "or %[tmp_t8], %[tmp_t8], %[tmp_mask]\n"
+ "sw %[tmp_t8], 0(%[dst_argb1555]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
+ : [dst_argb1555] "r"(dst_argb1555), [yg] "r"(yg), [src_u] "r"(src_u),
+ [src_v] "r"(src_v), [src_y] "r"(src_y), [tmp_ub] "r"(tmp_ub),
+ [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr),
+ [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br),
+ [tmp_yg] "r"(tmp_yg), [tmp_mask] "r"(tmp_mask));
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ dst_argb1555 += 4; // Advance 2 pixels.
+ }
+}
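+
+// Packing sketch: the ins/precrq.ph.w block keeps the top 5 bits of each
+// channel and the final or with 0x80008000 forces the alpha bit, giving
+//   pixel1555 = 0x8000 | ((r8 >> 3) << 10) | ((g8 >> 3) << 5) | (b8 >> 3);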
+
+void NV12ToARGBRow_DSPR2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ uint32 tmp_ub = yuvconstants->kUVToB[0];
+ uint32 tmp_ug = yuvconstants->kUVToG[0];
+ uint32 tmp_vg = yuvconstants->kUVToG[1];
+ uint32 tmp_vr = yuvconstants->kUVToR[1];
+ uint32 tmp_bb = yuvconstants->kUVBiasB[0];
+ uint32 tmp_bg = yuvconstants->kUVBiasG[0];
+ uint32 tmp_br = yuvconstants->kUVBiasR[0];
+ uint32 yg = yuvconstants->kYToRgb[0];
+ uint32 tmp_mask = 0x7fff7fff;
+ uint32 tmp_yg;
+ tmp_bb = ((uint32)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
+ tmp_bg = ((uint32)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
+ tmp_br = ((uint32)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
+ tmp_yg = ((uint32)(yg & 0xffff) << 16) | (yg & 0xffff);
+ tmp_ub = ~(((uint32)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
+ tmp_ug = ((uint32)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
+ tmp_vg = ((uint32)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
+ tmp_vr = ~(((uint32)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
+ yg = yg * 0x0101;
+
+ for (x = 0; x < width - 1; x += 2) {
+ uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lbu %[tmp_t7], 0(%[src_y]) \n"
+ "lbu %[tmp_t1], 1(%[src_y]) \n"
+ "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
+ "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
+ "lbu %[tmp_t2], 0(%[src_uv]) \n"
+ "lbu %[tmp_t3], 1(%[src_uv]) \n"
+ "replv.ph %[tmp_t2], %[tmp_t2] \n"
+ "replv.ph %[tmp_t3], %[tmp_t3] \n"
+ "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
+ "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
+ "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
+ "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
+ "srl %[tmp_t7], %[tmp_t7], 16 \n"
+ "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
+ "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
+ "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
+ "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
+ "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
+ "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
+ "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
+ "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
+ "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
+ "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
+ "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
+ "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
+ "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
+ "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
+ "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
+ "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
+ "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
+ "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
+ "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
+ "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
+ "sw %[tmp_t8], 0(%[rgb_buf]) \n"
+ "sw %[tmp_t7], 4(%[rgb_buf]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
+ : [src_y] "r"(src_y), [src_uv] "r"(src_uv), [yg] "r"(yg),
+ [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg),
+ [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg),
+ [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg), [rgb_buf] "r"(rgb_buf),
+ [tmp_mask] "r"(tmp_mask));
+
+ src_y += 2;
+ src_uv += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+}
+
+void BGRAToUVRow_DSPR2(const uint8* src_rgb0,
+ int src_stride_rgb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
+ int x;
+ int const1 = 0xffda0000;
+ int const2 = 0x0070ffb6;
+ int const3 = 0x00700000;
+ int const4 = 0xffeeffa2;
+ int const5 = 0x100;
+ for (x = 0; x < width - 1; x += 2) {
+ int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ int tmp_t6, tmp_t7, tmp_t8;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lw %[tmp_t1], 0(%[src_rgb0]) \n"
+ "lw %[tmp_t2], 4(%[src_rgb0]) \n"
+ "lw %[tmp_t3], 0(%[src_rgb1]) \n"
+ "lw %[tmp_t4], 4(%[src_rgb1]) \n"
+ "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
+ "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
+ "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
+ "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
+ "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
+ "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
+ "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
+ "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
+ "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
+ "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
+ "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
+ "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
+ "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
+ "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
+ "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
+ "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
+ "mult $ac0, %[const5], %[const5] \n"
+ "mult $ac1, %[const5], %[const5] \n"
+ "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
+ "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
+ "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
+ "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
+ "extr_r.w %[tmp_t7], $ac0, 9 \n"
+ "extr_r.w %[tmp_t8], $ac1, 9 \n"
+ "addiu %[dst_u], %[dst_u], 1 \n"
+ "addiu %[dst_v], %[dst_v], 1 \n"
+ "addiu %[src_rgb0], %[src_rgb0], 8 \n"
+ "addiu %[src_rgb1], %[src_rgb1], 8 \n"
+ "sb %[tmp_t7], -1(%[dst_u]) \n"
+ "sb %[tmp_t8], -1(%[dst_v]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+ [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
+ [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
+ : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
+ [const4] "r"(const4), [const5] "r"(const5)
+ : "hi", "lo", "$ac1lo", "$ac1hi");
+ }
+}
+
+void BGRAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
+ int x;
+ int const1 = 0x00420000;
+ int const2 = 0x00190081;
+ int const5 = 0x40;
+ for (x = 0; x < width; x += 4) {
+ int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ int tmp_t6, tmp_t7, tmp_t8;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lw %[tmp_t1], 0(%[src_argb0]) \n"
+ "lw %[tmp_t2], 4(%[src_argb0]) \n"
+ "lw %[tmp_t3], 8(%[src_argb0]) \n"
+ "lw %[tmp_t4], 12(%[src_argb0]) \n"
+ "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
+ "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
+ "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
+ "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
+ "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
+ "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
+ "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
+ "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
+ "mult $ac0, %[const5], %[const5] \n"
+ "mult $ac1, %[const5], %[const5] \n"
+ "mult $ac2, %[const5], %[const5] \n"
+ "mult $ac3, %[const5], %[const5] \n"
+ "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
+ "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
+ "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
+ "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
+ "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
+ "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
+ "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
+ "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
+ "extr_r.w %[tmp_t1], $ac0, 8 \n"
+ "extr_r.w %[tmp_t2], $ac1, 8 \n"
+ "extr_r.w %[tmp_t3], $ac2, 8 \n"
+ "extr_r.w %[tmp_t4], $ac3, 8 \n"
+ "addiu %[src_argb0],%[src_argb0], 16 \n"
+ "addiu %[dst_y], %[dst_y], 4 \n"
+ "sb %[tmp_t1], -4(%[dst_y]) \n"
+ "sb %[tmp_t2], -3(%[dst_y]) \n"
+ "sb %[tmp_t3], -2(%[dst_y]) \n"
+ "sb %[tmp_t4], -1(%[dst_y]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+ [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
+ : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
+ : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
+ "$ac3hi");
+ }
+}
+
+void ABGRToUVRow_DSPR2(const uint8* src_rgb0,
+ int src_stride_rgb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
+ int x;
+ int const1 = 0xffb6ffda;
+ int const2 = 0x00000070;
+ int const3 = 0xffa20070;
+ int const4 = 0x0000ffee;
+ int const5 = 0x100;
+
+ for (x = 0; x < width - 1; x += 2) {
+ int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ int tmp_t6, tmp_t7, tmp_t8;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lw %[tmp_t1], 0(%[src_rgb0]) \n"
+ "lw %[tmp_t2], 4(%[src_rgb0]) \n"
+ "lw %[tmp_t3], 0(%[src_rgb1]) \n"
+ "lw %[tmp_t4], 4(%[src_rgb1]) \n"
+ "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
+ "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
+ "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
+ "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
+ "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
+ "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
+ "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
+ "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
+ "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
+ "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
+ "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
+ "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
+ "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
+ "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
+ "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
+ "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
+ "mult $ac0, %[const5], %[const5] \n"
+ "mult $ac1, %[const5], %[const5] \n"
+ "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
+ "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
+ "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
+ "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
+ "extr_r.w %[tmp_t7], $ac0, 9 \n"
+ "extr_r.w %[tmp_t8], $ac1, 9 \n"
+ "addiu %[dst_u], %[dst_u], 1 \n"
+ "addiu %[dst_v], %[dst_v], 1 \n"
+ "addiu %[src_rgb0], %[src_rgb0], 8 \n"
+ "addiu %[src_rgb1], %[src_rgb1], 8 \n"
+ "sb %[tmp_t7], -1(%[dst_u]) \n"
+ "sb %[tmp_t8], -1(%[dst_v]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+ [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
+ [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
+ : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
+ [const4] "r"(const4), [const5] "r"(const5)
+ : "hi", "lo", "$ac1lo", "$ac1hi");
+ }
+}
+
+void ARGBToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
+ int x;
+ int const1 = 0x00810019;
+ int const2 = 0x00000042;
+ int const5 = 0x40;
+ for (x = 0; x < width; x += 4) {
+ int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ int tmp_t6, tmp_t7, tmp_t8;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lw %[tmp_t1], 0(%[src_argb0]) \n"
+ "lw %[tmp_t2], 4(%[src_argb0]) \n"
+ "lw %[tmp_t3], 8(%[src_argb0]) \n"
+ "lw %[tmp_t4], 12(%[src_argb0]) \n"
+ "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
+ "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
+ "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
+ "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
+ "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
+ "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
+ "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
+ "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
+ "mult $ac0, %[const5], %[const5] \n"
+ "mult $ac1, %[const5], %[const5] \n"
+ "mult $ac2, %[const5], %[const5] \n"
+ "mult $ac3, %[const5], %[const5] \n"
+ "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
+ "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
+ "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
+ "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
+ "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
+ "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
+ "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
+ "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
+ "extr_r.w %[tmp_t1], $ac0, 8 \n"
+ "extr_r.w %[tmp_t2], $ac1, 8 \n"
+ "extr_r.w %[tmp_t3], $ac2, 8 \n"
+ "extr_r.w %[tmp_t4], $ac3, 8 \n"
+ "addiu %[dst_y], %[dst_y], 4 \n"
+ "addiu %[src_argb0],%[src_argb0], 16 \n"
+ "sb %[tmp_t1], -4(%[dst_y]) \n"
+ "sb %[tmp_t2], -3(%[dst_y]) \n"
+ "sb %[tmp_t3], -2(%[dst_y]) \n"
+ "sb %[tmp_t4], -1(%[dst_y]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+ [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
+ : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
+ : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
+ "$ac3hi");
+ }
+}
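+
+// Scalar sketch of the sum each accumulator computes (BT.601 studio
+// swing; the 0x40 * 0x40 preload plus extr_r.w rounding gives +0x1080):
+//   y = (66 * r + 129 * g + 25 * b + 0x1080) >> 8;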
+
+void ABGRToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
+ int x;
+ int const1 = 0x00810042;
+ int const2 = 0x00000019;
+ int const5 = 0x40;
+ for (x = 0; x < width; x += 4) {
+ int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ int tmp_t6, tmp_t7, tmp_t8;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lw %[tmp_t1], 0(%[src_argb0]) \n"
+ "lw %[tmp_t2], 4(%[src_argb0]) \n"
+ "lw %[tmp_t3], 8(%[src_argb0]) \n"
+ "lw %[tmp_t4], 12(%[src_argb0]) \n"
+ "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
+ "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
+ "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
+ "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
+ "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
+ "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
+ "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
+ "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
+ "mult $ac0, %[const5], %[const5] \n"
+ "mult $ac1, %[const5], %[const5] \n"
+ "mult $ac2, %[const5], %[const5] \n"
+ "mult $ac3, %[const5], %[const5] \n"
+ "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
+ "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
+ "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
+ "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
+ "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
+ "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
+ "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
+ "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
+ "extr_r.w %[tmp_t1], $ac0, 8 \n"
+ "extr_r.w %[tmp_t2], $ac1, 8 \n"
+ "extr_r.w %[tmp_t3], $ac2, 8 \n"
+ "extr_r.w %[tmp_t4], $ac3, 8 \n"
+ "addiu %[src_argb0],%[src_argb0], 16 \n"
+ "addiu %[dst_y], %[dst_y], 4 \n"
+ "sb %[tmp_t1], -4(%[dst_y]) \n"
+ "sb %[tmp_t2], -3(%[dst_y]) \n"
+ "sb %[tmp_t3], -2(%[dst_y]) \n"
+ "sb %[tmp_t4], -1(%[dst_y]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+ [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
+ : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
+ : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
+ "$ac3hi");
+ }
+}
+
+void RGBAToUVRow_DSPR2(const uint8* src_rgb0,
+ int src_stride_rgb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
+ int x;
+ int const1 = 0xffb60070;
+ int const2 = 0x0000ffda;
+ int const3 = 0xffa2ffee;
+ int const4 = 0x00000070;
+ int const5 = 0x100;
+
+ for (x = 0; x < width - 1; x += 2) {
+ int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ int tmp_t6, tmp_t7, tmp_t8;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "ulw %[tmp_t1], 0+1(%[src_rgb0]) \n"
+ "ulw %[tmp_t2], 4+1(%[src_rgb0]) \n"
+ "ulw %[tmp_t3], 0+1(%[src_rgb1]) \n"
+ "ulw %[tmp_t4], 4+1(%[src_rgb1]) \n"
+ "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
+ "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
+ "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
+ "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
+ "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
+ "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
+ "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
+ "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
+ "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
+ "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
+ "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
+ "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
+ "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
+ "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
+ "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
+ "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
+ "mult $ac0, %[const5], %[const5] \n"
+ "mult $ac1, %[const5], %[const5] \n"
+ "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
+ "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
+ "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
+ "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
+ "extr_r.w %[tmp_t7], $ac0, 9 \n"
+ "extr_r.w %[tmp_t8], $ac1, 9 \n"
+ "addiu %[src_rgb0], %[src_rgb0], 8 \n"
+ "addiu %[src_rgb1], %[src_rgb1], 8 \n"
+ "addiu %[dst_u], %[dst_u], 1 \n"
+ "addiu %[dst_v], %[dst_v], 1 \n"
+ "sb %[tmp_t7], -1(%[dst_u]) \n"
+ "sb %[tmp_t8], -1(%[dst_v]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+ [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
+ [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
+ : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
+ [const4] "r"(const4), [const5] "r"(const5)
+ : "hi", "lo", "$ac1lo", "$ac1hi");
+ }
+}
+
+void RGBAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
+ int x;
+ int const1 = 0x00420081;
+ int const2 = 0x00190000;
+ int const5 = 0x40;
+ for (x = 0; x < width; x += 4) {
+ int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ int tmp_t6, tmp_t7, tmp_t8;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lw %[tmp_t1], 0(%[src_argb0]) \n"
+ "lw %[tmp_t2], 4(%[src_argb0]) \n"
+ "lw %[tmp_t3], 8(%[src_argb0]) \n"
+ "lw %[tmp_t4], 12(%[src_argb0]) \n"
+ "preceu.ph.qbl %[tmp_t5], %[tmp_t1] \n"
+ "preceu.ph.qbr %[tmp_t1], %[tmp_t1] \n"
+ "preceu.ph.qbl %[tmp_t6], %[tmp_t2] \n"
+ "preceu.ph.qbr %[tmp_t2], %[tmp_t2] \n"
+ "preceu.ph.qbl %[tmp_t7], %[tmp_t3] \n"
+ "preceu.ph.qbr %[tmp_t3], %[tmp_t3] \n"
+ "preceu.ph.qbl %[tmp_t8], %[tmp_t4] \n"
+ "preceu.ph.qbr %[tmp_t4], %[tmp_t4] \n"
+ "mult $ac0, %[const5], %[const5] \n"
+ "mult $ac1, %[const5], %[const5] \n"
+ "mult $ac2, %[const5], %[const5] \n"
+ "mult $ac3, %[const5], %[const5] \n"
+ "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
+ "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
+ "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
+ "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
+ "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
+ "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
+ "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
+ "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
+ "extr_r.w %[tmp_t1], $ac0, 8 \n"
+ "extr_r.w %[tmp_t2], $ac1, 8 \n"
+ "extr_r.w %[tmp_t3], $ac2, 8 \n"
+ "extr_r.w %[tmp_t4], $ac3, 8 \n"
+ "addiu %[dst_y], %[dst_y], 4 \n"
+ "addiu %[src_argb0],%[src_argb0], 16 \n"
+ "sb %[tmp_t1], -4(%[dst_y]) \n"
+ "sb %[tmp_t2], -3(%[dst_y]) \n"
+ "sb %[tmp_t3], -2(%[dst_y]) \n"
+ "sb %[tmp_t4], -1(%[dst_y]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+ [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
+ : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
+ : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
+ "$ac3hi");
+ }
+}
+
+void ARGBToUVRow_DSPR2(const uint8* src_rgb0,
+ int src_stride_rgb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
+ int x;
+ int const1 = 0xffb60070;
+ int const2 = 0x0000ffda;
+ int const3 = 0xffa2ffee;
+ int const4 = 0x00000070;
+ int const5 = 0x100;
+
+ for (x = 0; x < width - 1; x += 2) {
+ int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+ int tmp_t6, tmp_t7, tmp_t8;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lw %[tmp_t1], 0(%[src_rgb0]) \n"
+ "lw %[tmp_t2], 4(%[src_rgb0]) \n"
+ "lw %[tmp_t3], 0(%[src_rgb1]) \n"
+ "lw %[tmp_t4], 4(%[src_rgb1]) \n"
+ "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
+ "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
+ "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
+ "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
+ "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
+ "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
+ "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
+ "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
+ "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
+ "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
+ "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
+ "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
+ "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
+ "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
+ "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
+ "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
+ "mult $ac0, %[const5], %[const5] \n"
+ "mult $ac1, %[const5], %[const5] \n"
+ "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
+ "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
+ "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
+ "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
+ "extr_r.w %[tmp_t7], $ac0, 9 \n"
+ "extr_r.w %[tmp_t8], $ac1, 9 \n"
+ "addiu %[src_rgb0], %[src_rgb0], 8 \n"
+ "addiu %[src_rgb1], %[src_rgb1], 8 \n"
+ "addiu %[dst_u], %[dst_u], 1 \n"
+ "addiu %[dst_v], %[dst_v], 1 \n"
+ "sb %[tmp_t7], -1(%[dst_u]) \n"
+ "sb %[tmp_t8], -1(%[dst_v]) \n"
+ ".set pop \n"
+ : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+ [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+ [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+ [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
+ [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
+ : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
+ [const4] "r"(const4), [const5] "r"(const5)
+ : "hi", "lo", "$ac1lo", "$ac1hi");
+ }
+}
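+
+// Scalar sketch: the addu.ph/shrl.ph block averages each channel over a
+// 2x2 quad, then the dpaq_s.w.ph dot products evaluate the usual chroma
+// equations (the 0x100 * 0x100 preload plus extr_r.w 9 gives +0x8080):
+//   u = (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
+//   v = (112 * r - 94 * g - 18 * b + 0x8080) >> 8;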
+
+#endif // __mips_dsp_rev >= 2
+
+#endif // defined(__mips__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/row_gcc.cc b/files/source/row_gcc.cc
index 1ac7ef1a..8735070b 100644
--- a/files/source/row_gcc.cc
+++ b/files/source/row_gcc.cc
@@ -1,4 +1,3 @@
-// VERSION 2
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
@@ -23,165 +22,133 @@ extern "C" {
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
// Constants for ARGB
-static vec8 kARGBToY = {
- 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
+static vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
+ 13, 65, 33, 0, 13, 65, 33, 0};
// JPeg full range.
-static vec8 kARGBToYJ = {
- 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
-};
+static vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
+ 15, 75, 38, 0, 15, 75, 38, 0};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
-static vec8 kARGBToU = {
- 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
-};
+static vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
+ 112, -74, -38, 0, 112, -74, -38, 0};
-static vec8 kARGBToUJ = {
- 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
-};
+static vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
+ 127, -84, -43, 0, 127, -84, -43, 0};
static vec8 kARGBToV = {
- -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+ -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};
-static vec8 kARGBToVJ = {
- -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
-};
+static vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
+ -20, -107, 127, 0, -20, -107, 127, 0};
// Constants for BGRA
-static vec8 kBGRAToY = {
- 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
-};
+static vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
+ 0, 33, 65, 13, 0, 33, 65, 13};
-static vec8 kBGRAToU = {
- 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
-};
+static vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
+ 0, -38, -74, 112, 0, -38, -74, 112};
-static vec8 kBGRAToV = {
- 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
-};
+static vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
+ 0, 112, -94, -18, 0, 112, -94, -18};
// Constants for ABGR
-static vec8 kABGRToY = {
- 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
-};
+static vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
+ 33, 65, 13, 0, 33, 65, 13, 0};
-static vec8 kABGRToU = {
- -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
-};
+static vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
+ -38, -74, 112, 0, -38, -74, 112, 0};
-static vec8 kABGRToV = {
- 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
-};
+static vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
+ 112, -94, -18, 0, 112, -94, -18, 0};
// Constants for RGBA.
-static vec8 kRGBAToY = {
- 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
-};
+static vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
+ 0, 13, 65, 33, 0, 13, 65, 33};
-static vec8 kRGBAToU = {
- 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
-};
+static vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
+ 0, 112, -74, -38, 0, 112, -74, -38};
-static vec8 kRGBAToV = {
- 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
-};
+static vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
+ 0, -18, -94, 112, 0, -18, -94, 112};
-static uvec8 kAddY16 = {
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
+static uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+ 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
// 7 bit fixed point 0.5.
-static vec16 kAddYJ64 = {
- 64, 64, 64, 64, 64, 64, 64, 64
-};
+static vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
-static uvec8 kAddUV128 = {
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+static uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-static uvec16 kAddUVJ128 = {
- 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
-};
+static uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+ 0x8080u, 0x8080u, 0x8080u, 0x8080u};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
#ifdef HAS_RGB24TOARGBROW_SSSE3
// Shuffle table for converting RGB24 to ARGB.
-static uvec8 kShuffleMaskRGB24ToARGB = {
- 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
-};
+static uvec8 kShuffleMaskRGB24ToARGB = {0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u,
+ 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
// Shuffle table for converting RAW to ARGB.
-static uvec8 kShuffleMaskRAWToARGB = {
- 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
-};
+static uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
+ 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
- 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+ 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
// Shuffle table for converting RAW to RGB24. Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
- 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+ 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
// Shuffle table for converting RAW to RGB24. Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
- 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+ 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
// Shuffle table for converting ARGB to RGB24.
static uvec8 kShuffleMaskARGBToRGB24 = {
- 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
-};
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
// Shuffle table for converting ARGB to RAW.
static uvec8 kShuffleMaskARGBToRAW = {
- 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
-};
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
static uvec8 kShuffleMaskARGBToRGB24_0 = {
- 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
-};
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
// YUY2 shuf 16 Y to 32 Y.
-static const lvec8 kShuffleYUY2Y = {
- 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
- 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
-};
+static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
+ 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
+ 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
// YUY2 shuf 8 UV to 16 UV.
-static const lvec8 kShuffleYUY2UV = {
- 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
- 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
-};
+static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
+ 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
+ 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
// UYVY shuf 16 Y to 32 Y.
-static const lvec8 kShuffleUYVYY = {
- 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
- 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
-};
+static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
+ 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
+ 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
// UYVY shuf 8 UV to 16 UV.
-static const lvec8 kShuffleUYVYUV = {
- 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
- 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
-};
+static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
+ 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
+ 4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
- 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
- 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+ 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+ 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif // HAS_RGB24TOARGBROW_SSSE3
@@ -191,7 +158,7 @@ void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0x18,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
"movq " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(0x8,0) ",%0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
@@ -220,7 +187,7 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
"pslld $0x18,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
@@ -258,7 +225,7 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
"pslld $0x18,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
@@ -296,7 +263,7 @@ void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
"movdqa %4,%%xmm4 \n"
"movdqa %5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x4,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x8,0) ",%%xmm2 \n"
@@ -338,7 +305,7 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
"sub %0,%1 \n"
"sub %0,%1 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
@@ -385,7 +352,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
"sub %0,%1 \n"
"sub %0,%1 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
@@ -429,7 +396,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
"sub %0,%1 \n"
"sub %0,%1 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"pand %%xmm4,%%xmm0 \n"
@@ -461,7 +428,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
asm volatile (
"movdqa %3,%%xmm6 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
@@ -499,7 +466,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
asm volatile (
"movdqa %3,%%xmm6 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
@@ -543,7 +510,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0xb,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
@@ -569,98 +536,99 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
);
}
-void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst,
- const uint32 dither4, int width) {
- asm volatile (
- "movd %3,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm6 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "punpcklwd %%xmm6,%%xmm6 \n"
- "punpckhwd %%xmm7,%%xmm7 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psrld $0x1b,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1a,%%xmm4 \n"
- "pslld $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0xb,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "paddusb %%xmm6,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pslld $0x8,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x5,%%xmm2 \n"
- "psrad $0x10,%%xmm0 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pand %%xmm4,%%xmm2 \n"
- "pand %%xmm5,%%xmm0 \n"
- "por %%xmm2,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(dither4) // %3
- : "memory", "cc",
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
+void ARGBToRGB565DitherRow_SSE2(const uint8* src,
+ uint8* dst,
+ const uint32 dither4,
+ int width) {
+ asm volatile(
+ "movd %3,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm6 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "punpcklwd %%xmm6,%%xmm6 \n"
+ "punpckhwd %%xmm7,%%xmm7 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "paddusb %%xmm6,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(dither4) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
-void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst,
- const uint32 dither4, int width) {
- asm volatile (
- "vbroadcastss %3,%%xmm6 \n"
- "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
- "vpermq $0xd8,%%ymm6,%%ymm6 \n"
- "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
- "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
- "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
- "vpslld $0x5,%%ymm4,%%ymm4 \n"
- "vpslld $0xb,%%ymm3,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
- "vpsrld $0x5,%%ymm0,%%ymm2 \n"
- "vpsrld $0x3,%%ymm0,%%ymm1 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm4,%%ymm2,%%ymm2 \n"
- "vpand %%ymm3,%%ymm1,%%ymm1 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpor %%ymm2,%%ymm1,%%ymm1 \n"
- "vpor %%ymm1,%%ymm0,%%ymm0 \n"
- "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(dither4) // %3
- : "memory", "cc",
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
+void ARGBToRGB565DitherRow_AVX2(const uint8* src,
+ uint8* dst,
+ const uint32 dither4,
+ int width) {
+ asm volatile(
+ "vbroadcastss %3,%%xmm6 \n"
+ "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
+ "vpermq $0xd8,%%ymm6,%%ymm6 \n"
+ "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
+ "vpslld $0x5,%%ymm4,%%ymm4 \n"
+ "vpslld $0xb,%%ymm3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x5,%%ymm0,%%ymm2 \n"
+ "vpsrld $0x3,%%ymm0,%%ymm1 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpand %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(dither4) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBTORGB565DITHERROW_AVX2
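// For reference, a scalar sketch of what the dither rows above compute.
// ARGBToRGB565DitherRow_Reference is illustrative only and not part of this
// file: each column adds its dither byte with unsigned saturation (the
// paddusb/vpaddusb step), then packs 8:8:8 ARGB down to 5:6:5.
static void ARGBToRGB565DitherRow_Reference(const uint8* src,
                                            uint8* dst,
                                            const uint32 dither4,
                                            int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int d = (dither4 >> ((x & 3) * 8)) & 0xff;  // dither byte for column x&3
    int b = src[0] + d;  // ARGB is stored B,G,R,A in memory
    int g = src[1] + d;
    int r = src[2] + d;
    b = b > 255 ? 255 : b;  // saturate like paddusb
    g = g > 255 ? 255 : g;
    r = r > 255 ? 255 : r;
    *(uint16*)(dst) = (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
    src += 4;
    dst += 2;
  }
}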
-
void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
@@ -671,8 +639,9 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
"pslld $0xa,%%xmm6 \n"
"pcmpeqb %%xmm7,%%xmm7 \n"
"pslld $0xf,%%xmm7 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
@@ -708,8 +677,9 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
"psllw $0xc,%%xmm4 \n"
"movdqa %%xmm4,%%xmm3 \n"
"psrlw $0x8,%%xmm3 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm3,%%xmm0 \n"
@@ -737,8 +707,9 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
asm volatile (
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
@@ -775,8 +746,9 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
asm volatile (
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
@@ -809,9 +781,7 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
#ifdef HAS_ARGBTOYROW_AVX2
// vpermd to restore linear order after the lane-interleaving vphaddw +
// vpackuswb.
-static const lvec32 kPermdARGBToY_AVX = {
- 0, 4, 1, 5, 2, 6, 3, 7
-};
+static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
@@ -819,8 +789,9 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
"vmovdqu %5,%%ymm6 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
"vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
@@ -860,8 +831,9 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
"vmovdqu %5,%%ymm6 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
"vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
@@ -896,15 +868,19 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
#endif // HAS_ARGBTOYJROW_AVX2
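
// Informal note on the luma math in the rows above, assuming the usual
// libyuv BT.601 tables are bound to %3/%4: ARGBToYRow computes, per pixel,
// roughly y = ((13 * b + 65 * g + 33 * r) >> 7) + 16. pmaddubsw and phaddw
// form the weighted sum, psrlw $0x7 rescales it, and the final paddb applies
// the +16 bias. The YJ (full range) variants use slightly larger weights
// with a rounding add before the shift and no +16 bias.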
#ifdef HAS_ARGBTOUVROW_SSSE3
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUVRow_SSSE3(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"movdqa %5,%%xmm3 \n"
"movdqa %6,%%xmm4 \n"
"movdqa %7,%%xmm5 \n"
"sub %1,%2 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
@@ -961,18 +937,21 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
#ifdef HAS_ARGBTOUVROW_AVX2
// vpshufb table (byte pairs forming shorts) to reorder the result of
// vphaddw + vpackuswb.
static const lvec8 kShufARGBToUV_AVX = {
- 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
- 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
-};
-void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
+void ARGBToUVRow_AVX2(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"vbroadcastf128 %5,%%ymm5 \n"
"vbroadcastf128 %6,%%ymm6 \n"
"vbroadcastf128 %7,%%ymm7 \n"
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
"vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
@@ -981,7 +960,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
- "lea " MEMLEA(0x80,0) ",%0 \n"
+ "lea " MEMLEA(0x80,0) ",%0 \n"
"vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
"vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
"vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
@@ -1004,9 +983,9 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
"vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
@@ -1024,15 +1003,19 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
#endif // HAS_ARGBTOUVROW_AVX2
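
// Subsample note for the UV rows: the pavgb against the next row (reached
// through the stride operand %4) plus the shufps/pavgb pair average a 2x2
// pixel block before the U/V weights are applied, which is what produces
// 2x2-subsampled chroma. A final 0x80 add re-centers the signed U/V values
// on 128 (assuming the usual kAddUV128-style constant in the last operand).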
#ifdef HAS_ARGBTOUVJROW_AVX2
-void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUVJRow_AVX2(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"vbroadcastf128 %5,%%ymm5 \n"
"vbroadcastf128 %6,%%ymm6 \n"
"vbroadcastf128 %7,%%ymm7 \n"
- "sub %1,%2 \n"
+ "sub %1,%2 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
"vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
@@ -1085,15 +1068,19 @@ void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
#endif // HAS_ARGBTOUVJROW_AVX2
#ifdef HAS_ARGBTOUVJROW_SSSE3
-void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"movdqa %5,%%xmm3 \n"
"movdqa %6,%%xmm4 \n"
"movdqa %7,%%xmm5 \n"
"sub %1,%2 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
@@ -1149,15 +1136,18 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
#endif // HAS_ARGBTOUVJROW_SSSE3
#ifdef HAS_ARGBTOUV444ROW_SSSE3
-void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void ARGBToUV444Row_SSSE3(const uint8* src_argb,
+ uint8* dst_u,
+ uint8* dst_v,
int width) {
asm volatile (
"movdqa %4,%%xmm3 \n"
"movdqa %5,%%xmm4 \n"
"movdqa %6,%%xmm5 \n"
"sub %1,%2 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
@@ -1209,8 +1199,9 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
@@ -1239,15 +1230,19 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
);
}
-void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width) {
+void BGRAToUVRow_SSSE3(const uint8* src_bgra0,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"movdqa %5,%%xmm3 \n"
"movdqa %6,%%xmm4 \n"
"movdqa %7,%%xmm5 \n"
"sub %1,%2 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
@@ -1304,8 +1299,9 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
@@ -1338,8 +1334,9 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
@@ -1368,15 +1365,19 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
);
}
-void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width) {
+void ABGRToUVRow_SSSE3(const uint8* src_abgr0,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"movdqa %5,%%xmm3 \n"
"movdqa %6,%%xmm4 \n"
"movdqa %7,%%xmm5 \n"
"sub %1,%2 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
@@ -1429,15 +1430,19 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
);
}
-void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width) {
+void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"movdqa %5,%%xmm3 \n"
"movdqa %6,%%xmm4 \n"
"movdqa %7,%%xmm5 \n"
"sub %1,%2 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
@@ -1493,8 +1498,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
// Read 8 UV from 444
-#define READYUV444 \
- "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+#define READYUV444 \
+ "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
"lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
"punpcklbw %%xmm1,%%xmm0 \n" \
@@ -1503,8 +1508,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
// Read 4 UV from 422, upsample to 8 UV
-#define READYUV422 \
- "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+#define READYUV422 \
+ "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
"lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
"punpcklbw %%xmm1,%%xmm0 \n" \
@@ -1514,8 +1519,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
-#define READYUVA422 \
- "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+#define READYUVA422 \
+ "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
"lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
"punpcklbw %%xmm1,%%xmm0 \n" \
@@ -1526,29 +1531,9 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \
"lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n"
-// Read 2 UV from 411, upsample to 8 UV.
-// reading 4 bytes is an msan violation.
-// "movd " MEMACCESS([u_buf]) ",%%xmm0 \n"
-// MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)
-// pinsrw fails with drmemory
-// __asm pinsrw xmm0, [esi], 0 /* U */
-// __asm pinsrw xmm1, [esi + edi], 0 /* V */
-#define READYUV411_TEMP \
- "movzwl " MEMACCESS([u_buf]) ",%[temp] \n" \
- "movd %[temp],%%xmm0 \n" \
- MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) " \n" \
- "movd %[temp],%%xmm1 \n" \
- "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
- "punpckldq %%xmm0,%%xmm0 \n" \
- "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "punpcklbw %%xmm4,%%xmm4 \n" \
- "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
-
// Read 4 UV from NV12, upsample to 8 UV
-#define READNV12 \
- "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
+#define READNV12 \
+ "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
"lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
"punpcklwd %%xmm0,%%xmm0 \n" \
"movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
@@ -1556,8 +1541,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
// Read 4 VU from NV21, upsample to 8 UV
-#define READNV21 \
- "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
+#define READNV21 \
+ "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
"lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \
"pshufb %[kShuffleNV21], %%xmm0 \n" \
"movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
@@ -1565,24 +1550,24 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
-#define READYUY2 \
- "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \
+#define READYUY2 \
+ "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \
"pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
"movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \
"pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
"lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n"
// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
-#define READUYVY \
- "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \
+#define READUYVY \
+ "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \
"pshufb %[kShuffleUYVYY], %%xmm4 \n" \
"movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \
"pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
"lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n"
#if defined(__x86_64__)
-#define YUVTORGB_SETUP(yuvconstants) \
- "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \
+#define YUVTORGB_SETUP(yuvconstants) \
+ "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \
"movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \
"movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \
"movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \
@@ -1590,37 +1575,37 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \
"movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n"
// Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB(yuvconstants) \
- "movdqa %%xmm0,%%xmm1 \n" \
- "movdqa %%xmm0,%%xmm2 \n" \
- "movdqa %%xmm0,%%xmm3 \n" \
- "movdqa %%xmm11,%%xmm0 \n" \
- "pmaddubsw %%xmm8,%%xmm1 \n" \
- "psubw %%xmm1,%%xmm0 \n" \
- "movdqa %%xmm12,%%xmm1 \n" \
- "pmaddubsw %%xmm9,%%xmm2 \n" \
- "psubw %%xmm2,%%xmm1 \n" \
- "movdqa %%xmm13,%%xmm2 \n" \
- "pmaddubsw %%xmm10,%%xmm3 \n" \
- "psubw %%xmm3,%%xmm2 \n" \
- "pmulhuw %%xmm14,%%xmm4 \n" \
- "paddsw %%xmm4,%%xmm0 \n" \
- "paddsw %%xmm4,%%xmm1 \n" \
- "paddsw %%xmm4,%%xmm2 \n" \
- "psraw $0x6,%%xmm0 \n" \
- "psraw $0x6,%%xmm1 \n" \
- "psraw $0x6,%%xmm2 \n" \
- "packuswb %%xmm0,%%xmm0 \n" \
- "packuswb %%xmm1,%%xmm1 \n" \
- "packuswb %%xmm2,%%xmm2 \n"
+#define YUVTORGB(yuvconstants) \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "movdqa %%xmm0,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm3 \n" \
+ "movdqa %%xmm11,%%xmm0 \n" \
+ "pmaddubsw %%xmm8,%%xmm1 \n" \
+ "psubw %%xmm1,%%xmm0 \n" \
+ "movdqa %%xmm12,%%xmm1 \n" \
+ "pmaddubsw %%xmm9,%%xmm2 \n" \
+ "psubw %%xmm2,%%xmm1 \n" \
+ "movdqa %%xmm13,%%xmm2 \n" \
+ "pmaddubsw %%xmm10,%%xmm3 \n" \
+ "psubw %%xmm3,%%xmm2 \n" \
+ "pmulhuw %%xmm14,%%xmm4 \n" \
+ "paddsw %%xmm4,%%xmm0 \n" \
+ "paddsw %%xmm4,%%xmm1 \n" \
+ "paddsw %%xmm4,%%xmm2 \n" \
+ "psraw $0x6,%%xmm0 \n" \
+ "psraw $0x6,%%xmm1 \n" \
+ "psraw $0x6,%%xmm2 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" \
+ "packuswb %%xmm1,%%xmm1 \n" \
+ "packuswb %%xmm2,%%xmm2 \n"
#define YUVTORGB_REGS \
- "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
#else
#define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB(yuvconstants) \
- "movdqa %%xmm0,%%xmm1 \n" \
+#define YUVTORGB(yuvconstants) \
+ "movdqa %%xmm0,%%xmm1 \n" \
"movdqa %%xmm0,%%xmm2 \n" \
"movdqa %%xmm0,%%xmm3 \n" \
"movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \
@@ -1646,8 +1631,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
#endif
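
// A scalar sketch of the fixed-point conversion YUVTORGB performs for one
// pixel; the lowercase names below are illustrative stand-ins for the rows
// of struct YuvConstants, not code in this file:
//   int y1 = (y * 0x0101 * yg) >> 16;               // the pmulhuw step
//   b = clamp0to255((bb - u * ub + y1) >> 6);
//   g = clamp0to255((bg - (u * ug + v * vg) + y1) >> 6);
//   r = clamp0to255((br - v * vr + y1) >> 6);
// The SIMD keeps each channel in 16-bit lanes and gets the 0..255 clamp for
// free from psraw followed by packuswb.
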
// Store 8 ARGB values.
-#define STOREARGB \
- "punpcklbw %%xmm1,%%xmm0 \n" \
+#define STOREARGB \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
"punpcklbw %%xmm5,%%xmm2 \n" \
"movdqa %%xmm0,%%xmm1 \n" \
"punpcklwd %%xmm2,%%xmm0 \n" \
@@ -1657,8 +1642,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"
// Store 8 RGBA values.
-#define STORERGBA \
- "pcmpeqb %%xmm5,%%xmm5 \n" \
+#define STORERGBA \
+ "pcmpeqb %%xmm5,%%xmm5 \n" \
"punpcklbw %%xmm2,%%xmm1 \n" \
"punpcklbw %%xmm0,%%xmm5 \n" \
"movdqa %%xmm5,%%xmm0 \n" \
@@ -1678,8 +1663,9 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
YUVTORGB_SETUP(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV444
YUVTORGB(yuvconstants)
STOREARGB
@@ -1707,8 +1693,9 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
"movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
"movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
"sub %[u_buf],%[v_buf] \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB(yuvconstants)
"punpcklbw %%xmm1,%%xmm0 \n"
@@ -1728,7 +1715,7 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
-#if defined(__i386__) && defined(__pic__)
+#if defined(__i386__)
[width]"+m"(width) // %[width]
#else
[width]"+rm"(width) // %[width]
@@ -1751,8 +1738,9 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
YUVTORGB_SETUP(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB(yuvconstants)
STOREARGB
@@ -1777,11 +1765,13 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
+ // clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
READYUVA422
YUVTORGB(yuvconstants)
STOREARGB
@@ -1792,7 +1782,7 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
[v_buf]"+r"(v_buf), // %[v_buf]
[a_buf]"+r"(a_buf), // %[a_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
-#if defined(__i386__) && defined(__pic__)
+#if defined(__i386__)
[width]"+m"(width) // %[width]
#else
[width]"+rm"(width) // %[width]
@@ -1801,55 +1791,22 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
: "memory", "cc", NACL_R14 YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
+ // clang-format on
}
#endif // HAS_I422ALPHATOARGBROW_SSSE3
-#ifdef HAS_I411TOARGBROW_SSSE3
-void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- int temp;
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- READYUV411_TEMP
- YUVTORGB(yuvconstants)
- STOREARGB
- "subl $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [temp]"=&r"(temp), // %[temp]
-#if defined(__i386__) && defined(__pic__)
- [width]"+m"(width) // %[width]
-#else
- [width]"+rm"(width) // %[width]
-#endif
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", NACL_R14 YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-#endif
-
void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* uv_buf,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
+ // clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
"pcmpeqb %%xmm5,%%xmm5 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
READNV12
YUVTORGB(yuvconstants)
STOREARGB
@@ -1863,6 +1820,7 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
: "memory", "cc", YUVTORGB_REGS // Does not use r14.
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
+ // clang-format on
}
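
// Layout note: NV12 interleaves chroma as U,V pairs, which READNV12 doubles
// with a plain punpcklwd; NV21 stores V,U instead, so READNV21 reorders the
// bytes with the kShuffleNV21 table.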
void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
@@ -1870,11 +1828,13 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
+ // clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
"pcmpeqb %%xmm5,%%xmm5 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
READNV21
YUVTORGB(yuvconstants)
STOREARGB
@@ -1889,17 +1849,20 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
: "memory", "cc", YUVTORGB_REGS // Does not use r14.
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
+ // clang-format on
}
void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
+ // clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
"pcmpeqb %%xmm5,%%xmm5 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
READYUY2
YUVTORGB(yuvconstants)
STOREARGB
@@ -1914,17 +1877,20 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
: "memory", "cc", YUVTORGB_REGS // Does not use r14.
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
+ // clang-format on
}
void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
+ // clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
"pcmpeqb %%xmm5,%%xmm5 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
READUYVY
YUVTORGB(yuvconstants)
STOREARGB
@@ -1939,6 +1905,7 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
: "memory", "cc", YUVTORGB_REGS // Does not use r14.
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
+ // clang-format on
}
void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
@@ -1951,8 +1918,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
YUVTORGB_SETUP(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB(yuvconstants)
STORERGBA
@@ -1972,8 +1940,8 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
#endif // HAS_I422TOARGBROW_SSSE3
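
// Subsampling note: READYUV444 consumes one U and one V byte per pixel,
// while READYUV422 reads only 4 of each and doubles every interleaved UV
// pair with punpcklwd so a single chroma sample covers two pixels. The AVX2
// macros below do the same at twice the width.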
// Read 16 UV from 444
-#define READYUV444_AVX2 \
- "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+#define READYUV444_AVX2 \
+ "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \
MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \
"lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \
"vpermq $0xd8,%%ymm0,%%ymm0 \n" \
@@ -1985,8 +1953,8 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
"lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
// Read 8 UV from 422, upsample to 16 UV.
-#define READYUV422_AVX2 \
- "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+#define READYUV422_AVX2 \
+ "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
"lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
@@ -1998,8 +1966,8 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
"lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
-#define READYUVA422_AVX2 \
- "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+#define READYUVA422_AVX2 \
+ "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
"lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
@@ -2013,23 +1981,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
"vpermq $0xd8,%%ymm5,%%ymm5 \n" \
"lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n"
-// Read 4 UV from 411, upsample to 16 UV.
-#define READYUV411_AVX2 \
- "vmovd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
- MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1) \
- "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" \
- "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
- "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
- "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
-
// Read 8 UV from NV12, upsample to 16 UV.
-#define READNV12_AVX2 \
- "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
+#define READNV12_AVX2 \
+ "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
"lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \
"vpermq $0xd8,%%ymm0,%%ymm0 \n" \
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
@@ -2039,8 +1993,8 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
"lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
// Read 8 VU from NV21, upsample to 16 UV.
-#define READNV21_AVX2 \
- "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
+#define READNV21_AVX2 \
+ "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
"lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \
"vpermq $0xd8,%%ymm0,%%ymm0 \n" \
"vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
@@ -2050,53 +2004,57 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
"lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
-#define READYUY2_AVX2 \
- "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \
+#define READYUY2_AVX2 \
+ "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \
"vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
"vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \
"vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
"lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n"
// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
-#define READUYVY_AVX2 \
- "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \
+#define READUYVY_AVX2 \
+ "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \
"vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
"vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \
"vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
"lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n"
#if defined(__x86_64__)
-#define YUVTORGB_SETUP_AVX2(yuvconstants) \
- "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \
+#define YUVTORGB_SETUP_AVX2(yuvconstants) \
+ "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \
"vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \
"vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \
"vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \
"vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \
"vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \
"vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n"
-#define YUVTORGB_AVX2(yuvconstants) \
- "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
- "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
- "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
- "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
- "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
- "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
- "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
- "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
- "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
- "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
- "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
- "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
- "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
- "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
+
+#define YUVTORGB_AVX2(yuvconstants) \
+ "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
+ "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
+ "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
+ "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
+ "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
+ "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
+ "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
+ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
+ "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
+ "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
+ "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
+ "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
+ "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
+ "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
+
#define YUVTORGB_REGS_AVX2 \
- "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+
#else // Convert 16 pixels: 16 UV and 16 Y.
+
#define YUVTORGB_SETUP_AVX2(yuvconstants)
-#define YUVTORGB_AVX2(yuvconstants) \
- "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \
+#define YUVTORGB_AVX2(yuvconstants) \
+ "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \
"vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \
"vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \
"vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \
@@ -2119,8 +2077,8 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
#endif
// Store 16 ARGB values.
-#define STOREARGB_AVX2 \
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+#define STOREARGB_AVX2 \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
"vpermq $0xd8,%%ymm0,%%ymm0 \n" \
"vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
"vpermq $0xd8,%%ymm2,%%ymm2 \n" \
@@ -2143,8 +2101,9 @@ void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
YUVTORGB_SETUP_AVX2(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV444_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
@@ -2163,39 +2122,6 @@ void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
}
#endif // HAS_I444TOARGBROW_AVX2
-#ifdef HAS_I411TOARGBROW_AVX2
-// 16 pixels
-// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- LABELALIGN
- "1: \n"
- READYUV411_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-#endif // HAS_I411TOARGBROW_AVX2
-
#if defined(HAS_I422TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
@@ -2209,13 +2135,15 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
YUVTORGB_SETUP_AVX2(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
"sub $0x10,%[width] \n"
"jg 1b \n"
+
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
@@ -2233,17 +2161,19 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
+ const uint8* u_buf,
+ const uint8* v_buf,
+ const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
READYUVA422_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
@@ -2255,7 +2185,7 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
[v_buf]"+r"(v_buf), // %[v_buf]
[a_buf]"+r"(a_buf), // %[a_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
-#if defined(__i386__) && defined(__pic__)
+#if defined(__i386__)
[width]"+m"(width) // %[width]
#else
[width]"+rm"(width) // %[width]
@@ -2264,6 +2194,7 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
: "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
+ // clang-format on
}
#endif // HAS_I422ALPHATOARGBROW_AVX2
@@ -2280,8 +2211,9 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
YUVTORGB_SETUP_AVX2(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422_AVX2
YUVTORGB_AVX2(yuvconstants)
@@ -2318,11 +2250,13 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
+ // clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
READNV12_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
@@ -2337,6 +2271,7 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
: "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
"xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
+ // clang-format on
}
#endif // HAS_NV12TOARGBROW_AVX2
@@ -2348,11 +2283,13 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
+ // clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
READNV21_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
@@ -2368,6 +2305,7 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
: "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
"xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
+ // clang-format on
}
#endif // HAS_NV21TOARGBROW_AVX2
@@ -2378,11 +2316,13 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
+ // clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
READYUY2_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
@@ -2398,6 +2338,7 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
: "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
"xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
+ // clang-format on
}
#endif // HAS_YUY2TOARGBROW_AVX2
@@ -2408,11 +2349,13 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
+ // clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
READUYVY_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
@@ -2428,6 +2371,7 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
: "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
+ // clang-format on
}
#endif // HAS_UYVYTOARGBROW_AVX2
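
// Packed-422 note: YUY2 stores pixels as Y0,U,Y1,V and UYVY as U,Y0,V,Y1.
// The READYUY2/READUYVY macros split one packed load into a doubled-Y
// register and an interleaved UV register using two pshufb tables, after
// which the conversion is identical to the planar paths above.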
@@ -2442,8 +2386,9 @@ void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
"pshufd $0x0,%%xmm3,%%xmm3 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"pslld $0x18,%%xmm4 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
"movq " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(0x8,0) ",%0 \n"
@@ -2491,7 +2436,7 @@ void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
"vpslld $0x18,%%ymm4,%%ymm4 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
// Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
"vmovdqu " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
@@ -2525,16 +2470,16 @@ void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
-static uvec8 kShuffleMirror = {
- 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
+static uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile (
"movdqa %3,%%xmm5 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
"pshufb %%xmm5,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
@@ -2556,8 +2501,9 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile (
"vbroadcastf128 %3,%%ymm5 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0
"vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
"vpermq $0x4e,%%ymm0,%%ymm0 \n"
@@ -2578,18 +2524,20 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
#ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
-static uvec8 kShuffleMirrorUV = {
- 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
-};
-void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+static uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+ 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+void MirrorUVRow_SSSE3(const uint8* src,
+ uint8* dst_u,
+ uint8* dst_v,
int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile (
"movdqa %4,%%xmm1 \n"
"lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
"sub %1,%2 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(-0x10,0) ",%0 \n"
"pshufb %%xmm1,%%xmm0 \n"
@@ -2615,8 +2563,9 @@ void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile (
"lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"pshufd $0x1b,%%xmm0,%%xmm0 \n"
"lea " MEMLEA(-0x10,0) ",%0 \n"
@@ -2636,15 +2585,14 @@ void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
#ifdef HAS_ARGBMIRRORROW_AVX2
// Shuffle table for reversing the bytes.
-static const ulvec32 kARGBShuffleMirror_AVX2 = {
- 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
+static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile (
"vmovdqu %3,%%ymm5 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
@@ -2662,31 +2610,34 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
#endif // HAS_ARGBMIRRORROW_AVX2
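
// Scalar equivalent of the mirror rows above, for reference only:
//   for (x = 0; x < width; ++x) dst[x] = src[width - 1 - x];
// (bytes for MirrorRow, 32-bit pixels for the ARGB variants). The SIMD
// versions walk the source backwards 16 or 32 bytes at a time and reverse
// each block with a shuffle (kShuffleMirror, pshufd, or vpermd).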
#ifdef HAS_SPLITUVROW_AVX2
-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_AVX2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
int width) {
asm volatile (
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+
LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm0," MEMACCESS(1) " \n"
- MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2)
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -2699,30 +2650,33 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
#endif // HAS_SPLITUVROW_AVX2
#ifdef HAS_SPLITUVROW_SSE2
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_SSE2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
int width) {
asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+
LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -2735,25 +2689,28 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
#endif // HAS_SPLITUVROW_SSE2
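
// For reference, a scalar sketch of the split (illustrative only):
//   for (x = 0; x < width; ++x) {
//     dst_u[x] = src_uv[2 * x];
//     dst_v[x] = src_uv[2 * x + 1];
//   }
// The SIMD rows mask the even bytes with 0x00ff, shift down the odd bytes,
// and repack with packuswb; MergeUVRow below is the exact inverse, using
// punpcklbw/punpckhbw to re-interleave.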
#ifdef HAS_MERGEUVROW_AVX2
-void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_AVX2(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
int width) {
asm volatile (
- "sub %0,%1 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
- "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
"vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
"vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
"vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
- "lea " MEMLEA(0x40,2) ",%2 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
@@ -2766,23 +2723,26 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
#endif // HAS_MERGEUVROW_AVX2
#ifdef HAS_MERGEUVROW_SSE2
-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_SSE2(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
int width) {
asm volatile (
- "sub %0,%1 \n"
+ "sub %0,%1 \n"
+
LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm2 \n"
- "movdqu %%xmm0," MEMACCESS(2) " \n"
- "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
- "lea " MEMLEA(0x20,2) ",%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm2 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
@@ -2801,8 +2761,9 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
"jne 2f \n"
"test $0xf,%1 \n"
"jne 2f \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
@@ -2812,6 +2773,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
"sub $0x20,%2 \n"
"jg 1b \n"
"jmp 9f \n"
+
LABELALIGN
"2: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
@@ -2837,7 +2799,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
asm volatile (
LABELALIGN
- "1: \n"
+ "1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
@@ -2860,14 +2822,12 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
// Multiple of 1.
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
size_t width_tmp = (size_t)(width);
- asm volatile (
- "rep movsb " MEMMOVESTRING(0,1) " \n"
- : "+S"(src), // %0
- "+D"(dst), // %1
- "+c"(width_tmp) // %2
- :
- : "memory", "cc"
- );
+ asm volatile("rep movsb " MEMMOVESTRING(0, 1) " \n"
+ : "+S"(src), // %0
+ "+D"(dst), // %1
+ "+c"(width_tmp) // %2
+ :
+ : "memory", "cc");
}
#endif // HAS_COPYROW_ERMS
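
// CopyRow_ERMS leans on the Enhanced REP MOVSB ("erms") CPU feature, where
// a bare rep movsb is competitive with SIMD copies for row-sized buffers
// and needs no alignment or tail handling.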
@@ -2879,8 +2839,9 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
"pslld $0x18,%%xmm0 \n"
"pcmpeqb %%xmm1,%%xmm1 \n"
"psrld $0x8,%%xmm1 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
@@ -2913,8 +2874,9 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
asm volatile (
"vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
"vpsrld $0x8,%%ymm0,%%ymm0 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
@@ -2939,9 +2901,9 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
// width in pixels
void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
- asm volatile (
+ asm volatile (
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ", %%xmm0 \n"
"movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
"lea " MEMLEA(0x20, 0) ", %0 \n"
@@ -2963,6 +2925,47 @@ void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
}
#endif // HAS_ARGBEXTRACTALPHAROW_SSE2
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+static const uvec8 kShuffleAlphaShort_AVX2 = {
+ 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u,
+ 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
+
+void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) {
+ asm volatile (
+ "vmovdqa %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ", %%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20, 0) ", %%ymm1 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x40, 0) ", %%ymm2 \n"
+ "vmovdqu " MEMACCESS2(0x60, 0) ", %%ymm3 \n"
+ "lea " MEMLEA(0x80, 0) ", %0 \n"
+ "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20, %2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+rm"(width) // %2
+ : "m"(kPermdARGBToY_AVX), // %3
+ "m"(kShuffleAlphaShort_AVX2) // %4
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBEXTRACTALPHAROW_AVX2
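
// On the "mutates"/"unmutate" comments above: AVX2 pack instructions operate
// within each 128-bit lane, so vpackssdw/vpackuswb leave the two lanes
// interleaved; the closing vpermd with kPermdARGBToY_AVX (0,4,1,5,2,6,3,7)
// restores linear byte order before the store.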
+
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
@@ -2971,8 +2974,9 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
"pslld $0x18,%%xmm0 \n"
"pcmpeqb %%xmm1,%%xmm1 \n"
"psrld $0x8,%%xmm1 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movq " MEMACCESS(0) ",%%xmm2 \n"
"lea " MEMLEA(0x8,0) ",%0 \n"
"punpcklbw %%xmm2,%%xmm2 \n"
@@ -3007,8 +3011,9 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
asm volatile (
"vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
"vpsrld $0x8,%%ymm0,%%ymm0 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n"
"vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
@@ -3036,32 +3041,29 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
void SetRow_X86(uint8* dst, uint8 v8, int width) {
size_t width_tmp = (size_t)(width >> 2);
const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
- asm volatile (
- "rep stosl " MEMSTORESTRING(eax,0) " \n"
- : "+D"(dst), // %0
- "+c"(width_tmp) // %1
- : "a"(v32) // %2
- : "memory", "cc");
+ asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n"
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
}
void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
size_t width_tmp = (size_t)(width);
- asm volatile (
- "rep stosb " MEMSTORESTRING(al,0) " \n"
- : "+D"(dst), // %0
- "+c"(width_tmp) // %1
- : "a"(v8) // %2
- : "memory", "cc");
+ asm volatile("rep stosb " MEMSTORESTRING(al, 0) " \n"
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v8) // %2
+ : "memory", "cc");
}
void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
size_t width_tmp = (size_t)(width);
- asm volatile (
- "rep stosl " MEMSTORESTRING(eax,0) " \n"
- : "+D"(dst_argb), // %0
- "+c"(width_tmp) // %1
- : "a"(v32) // %2
- : "memory", "cc");
+ asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n"
+ : "+D"(dst_argb), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
}
#endif // HAS_SETROW_X86
@@ -3070,8 +3072,9 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
@@ -3091,14 +3094,18 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
);
}
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
@@ -3130,13 +3137,16 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
}
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
@@ -3166,7 +3176,7 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
asm volatile (
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
@@ -3186,14 +3196,18 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
);
}
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+void UYVYToUVRow_SSE2(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
@@ -3225,13 +3239,16 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
}
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
@@ -3264,8 +3281,9 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
asm volatile (
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
@@ -3287,14 +3305,18 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
);
}
-void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrlw $0x8,%%ymm5,%%ymm5 \n"
"sub %1,%2 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
@@ -3327,13 +3349,16 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
}
void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrlw $0x8,%%ymm5,%%ymm5 \n"
"sub %1,%2 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
@@ -3366,7 +3391,7 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
asm volatile (
LABELALIGN
- "1: \n"
+ "1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
@@ -3387,15 +3412,18 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
, "xmm0", "xmm1", "xmm5"
);
}
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+void UYVYToUVRow_AVX2(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrlw $0x8,%%ymm5,%%ymm5 \n"
"sub %1,%2 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
@@ -3428,13 +3456,16 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
}
void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrlw $0x8,%%ymm5,%%ymm5 \n"
"sub %1,%2 \n"
+
LABELALIGN
- "1: \n"
+ "1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
@@ -3467,14 +3498,14 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
-static uvec8 kShuffleAlpha = {
- 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
- 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
-};
+static uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+ 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
// Blend 8 pixels at a time
-void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+void ARGBBlendRow_SSSE3(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
asm volatile (
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $0xf,%%xmm7 \n"
@@ -3559,46 +3590,49 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psllw $0x8,%%xmm5 \n"
- "mov $0x80808080,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "mov $0x807f807f,%%eax \n"
- "movd %%eax,%%xmm7 \n"
- "pshufd $0x0,%%xmm7,%%xmm7 \n"
- "sub %2,%0 \n"
- "sub %2,%1 \n"
- "sub %2,%3 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movq (%2),%%xmm0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm0 \n"
- "movq (%0,%2,1),%%xmm1 \n"
- "movq (%1,%2,1),%%xmm2 \n"
- "punpcklbw %%xmm2,%%xmm1 \n"
- "psubb %%xmm6,%%xmm1 \n"
- "pmaddubsw %%xmm1,%%xmm0 \n"
- "paddw %%xmm7,%%xmm0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%3,%2,1) \n"
- "lea 0x8(%2),%2 \n"
- "sub $0x8,%4 \n"
- "jg 1b \n"
- : "+r"(src0), // %0
- "+r"(src1), // %1
- "+r"(alpha), // %2
- "+r"(dst), // %3
- "+rm"(width) // %4
- :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"
- );
+void BlendPlaneRow_SSSE3(const uint8* src0,
+ const uint8* src1,
+ const uint8* alpha,
+ uint8* dst,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "mov $0x807f807f,%%eax \n"
+ "movd %%eax,%%xmm7 \n"
+ "pshufd $0x0,%%xmm7,%%xmm7 \n"
+ "sub %2,%0 \n"
+ "sub %2,%1 \n"
+ "sub %2,%3 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq (%2),%%xmm0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm0 \n"
+ "movq (%0,%2,1),%%xmm1 \n"
+ "movq (%1,%2,1),%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm7,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%3,%2,1) \n"
+ "lea 0x8(%2),%2 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(alpha), // %2
+ "+r"(dst), // %3
+ "+rm"(width) // %4
+ ::"memory",
+ "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
}
#endif // HAS_BLENDPLANEROW_SSSE3
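For reference, the unsigned formula quoted above reduces to a few lines of scalar C; the signed variant is just the rearrangement that lets pmaddubsw do both multiplies at once. A sketch with a hypothetical name, not a routine from the library:

#include <stdint.h>

// Scalar form of =((A*C)+(B*(255-C))+255)/256 from the comment above.
static void BlendPlaneRow_Sketch(const uint8_t* src0, const uint8_t* src1,
                                 const uint8_t* alpha, uint8_t* dst,
                                 int width) {
  for (int i = 0; i < width; ++i) {
    int a = alpha[i];
    dst[i] = (uint8_t)((src0[i] * a + src1[i] * (255 - a) + 255) >> 8);
  }
}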
@@ -3608,67 +3642,67 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width) {
- asm volatile (
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsllw $0x8,%%ymm5,%%ymm5 \n"
- "mov $0x80808080,%%eax \n"
- "vmovd %%eax,%%xmm6 \n"
- "vbroadcastss %%xmm6,%%ymm6 \n"
- "mov $0x807f807f,%%eax \n"
- "vmovd %%eax,%%xmm7 \n"
- "vbroadcastss %%xmm7,%%ymm7 \n"
- "sub %2,%0 \n"
- "sub %2,%1 \n"
- "sub %2,%3 \n"
-
- // 32 pixel loop.
- LABELALIGN
- "1: \n"
- "vmovdqu (%2),%%ymm0 \n"
- "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
- "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
- "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
- "vmovdqu (%0,%2,1),%%ymm1 \n"
- "vmovdqu (%1,%2,1),%%ymm2 \n"
- "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
- "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
- "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
- "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%3,%2,1) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src0), // %0
- "+r"(src1), // %1
- "+r"(alpha), // %2
- "+r"(dst), // %3
- "+rm"(width) // %4
- :: "memory", "cc", "eax",
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
+void BlendPlaneRow_AVX2(const uint8* src0,
+ const uint8* src1,
+ const uint8* alpha,
+ uint8* dst,
+ int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsllw $0x8,%%ymm5,%%ymm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "vmovd %%eax,%%xmm6 \n"
+ "vbroadcastss %%xmm6,%%ymm6 \n"
+ "mov $0x807f807f,%%eax \n"
+ "vmovd %%eax,%%xmm7 \n"
+ "vbroadcastss %%xmm7,%%ymm7 \n"
+ "sub %2,%0 \n"
+ "sub %2,%1 \n"
+ "sub %2,%3 \n"
+
+ // 32 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%2),%%ymm0 \n"
+ "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0,%2,1),%%ymm1 \n"
+ "vmovdqu (%1,%2,1),%%ymm2 \n"
+ "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
+ "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
+ "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%3,%2,1) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(alpha), // %2
+ "+r"(dst), // %3
+ "+rm"(width) // %4
+ ::"memory",
+ "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_BLENDPLANEROW_AVX2
#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha
-static uvec8 kShuffleAlpha0 = {
- 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
-};
-static uvec8 kShuffleAlpha1 = {
- 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
- 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
-};
+static uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
+ 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
+static uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+ 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
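Read as 16-bit words, these tables pair each color byte with a duplicated alpha byte so a single pmulhuw per half does the scaling. The per-channel effect is approximately c * a / 255; a hedged scalar sketch (hypothetical name, and the exact SIMD rounding may differ by one):

#include <stdint.h>

// (c*257)*(a*257) >> 24 approximates c * a / 255; c<<8|c and a<<8|a are
// the duplicated-byte words the shuffle tables above construct.
static inline uint8_t AttenuateSketch(uint8_t c, uint8_t a) {
  return (uint8_t)(((uint32_t)(c * 257) * (uint32_t)(a * 257)) >> 24);
}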
// Attenuate 4 pixels at a time.
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
@@ -3679,7 +3713,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
// 4 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"pshufb %%xmm4,%%xmm0 \n"
"movdqu " MEMACCESS(0) ",%%xmm1 \n"
@@ -3714,9 +3748,9 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
-static const uvec8 kShuffleAlpha_AVX2 = {
- 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
-};
+static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
+ 128u, 128u, 14u, 15u, 14u, 15u,
+ 14u, 15u, 128u, 128u};
// Attenuate 8 pixels at a time.
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
@@ -3727,7 +3761,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
// 8 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
"vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
"vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
@@ -3757,13 +3791,14 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
int width) {
uintptr_t alpha;
asm volatile (
// 4 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movzb " MEMACCESS2(0x03,0) ",%3 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
@@ -3804,10 +3839,10 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
- 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
-};
+ 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
// Unattenuate 8 pixels at a time.
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
int width) {
uintptr_t alpha;
asm volatile (
@@ -3816,7 +3851,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
// 8 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
// replace VPGATHER
"movzb " MEMACCESS2(0x03,0) ",%3 \n"
MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
@@ -3879,7 +3914,7 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
// 8 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
@@ -3922,17 +3957,14 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone
-static vec8 kARGBToSepiaB = {
- 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
-};
+static vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
+ 17, 68, 35, 0, 17, 68, 35, 0};
-static vec8 kARGBToSepiaG = {
- 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
-};
+static vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
+ 22, 88, 45, 0, 22, 88, 45, 0};
-static vec8 kARGBToSepiaR = {
- 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
-};
+static vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
+ 24, 98, 50, 0, 24, 98, 50, 0};
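In scalar terms each table is one dot product per output channel, with coefficients ordered B, G, R to match ARGB's little-endian byte layout; the green and red sums can exceed 255, which packuswb saturates. A sketch with a hypothetical name:

#include <stdint.h>

static void ARGBSepiaRow_Sketch(uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    uint8_t* p = dst_argb + i * 4;  // Bytes are B, G, R, A.
    int b = p[0], g = p[1], r = p[2];
    int sb = (b * 17 + g * 68 + r * 35) >> 7;
    int sg = (b * 22 + g * 88 + r * 45) >> 7;
    int sr = (b * 24 + g * 98 + r * 50) >> 7;
    p[0] = (uint8_t)(sb > 255 ? 255 : sb);
    p[1] = (uint8_t)(sg > 255 ? 255 : sg);
    p[2] = (uint8_t)(sr > 255 ? 255 : sr);  // Alpha (p[3]) is untouched.
  }
}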
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
@@ -3943,7 +3975,7 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
// 8 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
"pmaddubsw %%xmm2,%%xmm0 \n"
@@ -3995,8 +4027,10 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const int8* matrix_argb, int width) {
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ const int8* matrix_argb,
+ int width) {
asm volatile (
"movdqu " MEMACCESS(3) ",%%xmm5 \n"
"pshufd $0x00,%%xmm5,%%xmm2 \n"
@@ -4006,7 +4040,7 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
// 8 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"pmaddubsw %%xmm2,%%xmm0 \n"
@@ -4058,8 +4092,11 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
- int interval_offset, int width) {
+void ARGBQuantizeRow_SSE2(uint8* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
asm volatile (
"movd %2,%%xmm2 \n"
"movd %3,%%xmm3 \n"
@@ -4076,7 +4113,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
// 4 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
@@ -4108,7 +4145,9 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
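A scalar sketch of the quantize kernel above (ARGBQuantizeRow_SSE2), assuming the usual meaning of the parameters — scale is a 16.16 fixed-point factor, typically 65536 / interval_size; names are hypothetical:

#include <stdint.h>

static void ARGBQuantizeRow_Sketch(uint8_t* dst_argb, int scale,
                                   int interval_size, int interval_offset,
                                   int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 3; ++c) {  // B, G, R; alpha is left unchanged.
      int v = dst_argb[i * 4 + c];
      dst_argb[i * 4 + c] =
          (uint8_t)((v * scale >> 16) * interval_size + interval_offset);
    }
  }
}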
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+void ARGBShadeRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
uint32 value) {
asm volatile (
"movd %3,%%xmm2 \n"
@@ -4117,7 +4156,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
// 4 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@@ -4144,14 +4183,16 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
asm volatile (
- "pxor %%xmm5,%%xmm5 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 4 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqu " MEMACCESS(1) ",%%xmm2 \n"
@@ -4182,14 +4223,16 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
asm volatile (
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
  // 8 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"vmovdqu " MEMACCESS(1) ",%%ymm3 \n"
@@ -4221,12 +4264,14 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+void ARGBAddRow_SSE2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
asm volatile (
// 4 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
@@ -4249,12 +4294,14 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+void ARGBAddRow_AVX2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
asm volatile (
  // 8 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
@@ -4277,12 +4324,14 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
-void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+void ARGBSubtractRow_SSE2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
asm volatile (
// 4 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
@@ -4305,12 +4354,14 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+void ARGBSubtractRow_AVX2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
asm volatile (
  // 8 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
@@ -4318,7 +4369,7 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
"vmovdqu %%ymm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x20,2) ",%2 \n"
"sub $0x8,%3 \n"
- "jg 1b \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -4336,8 +4387,11 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
// -1 0 1
// -2 0 2
// -1 0 1
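In scalar form the kernel is three column differences with 1-2-1 row weights, absolute-valued and saturated; the SSE2 version below computes the same result 8 pixels at a time. A sketch with a hypothetical name:

#include <stdint.h>
#include <stdlib.h>

static void SobelXRow_Sketch(const uint8_t* src_y0, const uint8_t* src_y1,
                             const uint8_t* src_y2, uint8_t* dst_sobelx,
                             int width) {
  for (int i = 0; i < width; ++i) {
    int sobel = abs((src_y0[i + 2] - src_y0[i]) +
                    2 * (src_y1[i + 2] - src_y1[i]) +
                    (src_y2[i + 2] - src_y2[i]));
    dst_sobelx[i] = (uint8_t)(sobel > 255 ? 255 : sobel);
  }
}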
-void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
- const uint8* src_y2, uint8* dst_sobelx, int width) {
+void SobelXRow_SSE2(const uint8* src_y0,
+ const uint8* src_y1,
+ const uint8* src_y2,
+ uint8* dst_sobelx,
+ int width) {
asm volatile (
"sub %0,%1 \n"
"sub %0,%2 \n"
@@ -4346,7 +4400,7 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
// 8 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"movq " MEMACCESS(0) ",%%xmm0 \n"
"movq " MEMACCESS2(0x2,0) ",%%xmm1 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
@@ -4390,8 +4444,10 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
// -1 -2 -1
// 0 0 0
// 1 2 1
-void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
- uint8* dst_sobely, int width) {
+void SobelYRow_SSE2(const uint8* src_y0,
+ const uint8* src_y1,
+ uint8* dst_sobely,
+ int width) {
asm volatile (
"sub %0,%1 \n"
"sub %0,%2 \n"
@@ -4399,7 +4455,7 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
// 8 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"movq " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
"punpcklbw %%xmm5,%%xmm0 \n"
@@ -4443,8 +4499,10 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
// R = Sobel
// G = Sobel
// B = Sobel
-void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
+void SobelRow_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
asm volatile (
"sub %0,%1 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
@@ -4452,7 +4510,7 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
@@ -4490,8 +4548,10 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_y, int width) {
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width) {
asm volatile (
"sub %0,%1 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
@@ -4499,7 +4559,7 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
@@ -4525,15 +4585,17 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// R = Sobel X
// G = Sobel
// B = Sobel Y
-void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
+void SobelXYRow_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
asm volatile (
"sub %0,%1 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
@@ -4572,8 +4634,10 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
- const int32* previous_cumsum, int width) {
+void ComputeCumulativeSumRow_SSE2(const uint8* row,
+ int32* cumsum,
+ const int32* previous_cumsum,
+ int width) {
asm volatile (
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm1,%%xmm1 \n"
@@ -4582,9 +4646,9 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
"test $0xf,%1 \n"
"jne 49f \n"
- // 4 pixel loop \n"
+ // 4 pixel loop.
LABELALIGN
- "40: \n"
+ "40: \n"
"movdqu " MEMACCESS(0) ",%%xmm2 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm2,%%xmm4 \n"
@@ -4617,13 +4681,13 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
"sub $0x4,%3 \n"
"jge 40b \n"
- "49: \n"
+ "49: \n"
"add $0x3,%3 \n"
"jl 19f \n"
- // 1 pixel loop \n"
+ // 1 pixel loop.
LABELALIGN
- "10: \n"
+ "10: \n"
"movd " MEMACCESS(0) ",%%xmm2 \n"
"lea " MEMLEA(0x4,0) ",%0 \n"
"punpcklbw %%xmm1,%%xmm2 \n"
@@ -4637,7 +4701,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
"sub $0x1,%3 \n"
"jge 10b \n"
- "19: \n"
+ "19: \n"
: "+r"(row), // %0
"+r"(cumsum), // %1
"+r"(previous_cumsum), // %2
@@ -4650,8 +4714,11 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
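The relation the SSE2 code implements, per ARGB channel: each output cell is the running sum of this row plus the cell directly above it (previous_cumsum). A scalar sketch, hypothetical name:

#include <stdint.h>

static void ComputeCumulativeSumRow_Sketch(const uint8_t* row,
                                           int32_t* cumsum,
                                           const int32_t* previous_cumsum,
                                           int width) {
  int32_t sum[4] = {0, 0, 0, 0};  // Running B, G, R, A sums for this row.
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}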
#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
- int width, int area, uint8* dst,
+void CumulativeSumToAverageRow_SSE2(const int32* topleft,
+ const int32* botleft,
+ int width,
+ int area,
+ uint8* dst,
int count) {
asm volatile (
"movd %5,%%xmm5 \n"
@@ -4672,7 +4739,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
"cvtps2dq %%xmm5,%%xmm5 \n"
"packssdw %%xmm5,%%xmm5 \n"
- // 4 pixel small loop \n"
+ // 4 pixel small loop.
LABELALIGN
"4: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
@@ -4783,8 +4850,11 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from a source image, stepping along a slope, to a row of
// the destination.
LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
- uint8* dst_argb, const float* src_dudv, int width) {
+void ARGBAffineRow_SSE2(const uint8* src_argb,
+ int src_argb_stride,
+ uint8* dst_argb,
+ const float* src_dudv,
+ int width) {
intptr_t src_argb_stride_temp = src_argb_stride;
intptr_t temp;
asm volatile (
@@ -4868,8 +4938,10 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
+void InterpolateRow_SSSE3(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
int source_y_fraction) {
asm volatile (
"sub %1,%0 \n"
@@ -4891,7 +4963,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
// General purpose row blend.
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,1,4,1,xmm2)
"movdqa %%xmm0,%%xmm1 \n"
@@ -4949,8 +5021,10 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
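Per byte, InterpolateRow's general-purpose blend is a 256-weighted average of a row and the row src_stride below it. A scalar sketch (rounding in the SIMD path may differ slightly; the name is hypothetical):

#include <stdint.h>
#include <stddef.h>

static void InterpolateRow_Sketch(uint8_t* dst, const uint8_t* src,
                                  ptrdiff_t src_stride, int width,
                                  int source_y_fraction) {
  int y1 = source_y_fraction;  // 0..255 weight of the lower row.
  int y0 = 256 - y1;           // Weight of the upper row.
  const uint8_t* src1 = src + src_stride;
  for (int i = 0; i < width; ++i) {
    dst[i] = (uint8_t)((src[i] * y0 + src1[i] * y1) >> 8);
  }
}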
#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
-void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
+void InterpolateRow_AVX2(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
int source_y_fraction) {
asm volatile (
"cmp $0x0,%3 \n"
@@ -4972,7 +5046,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
// General purpose row blend.
LABELALIGN
- "1: \n"
+ "1: \n"
"vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
"vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
@@ -5025,12 +5099,14 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width) {
+void ARGBShuffleRow_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width) {
asm volatile (
"movdqu " MEMACCESS(3) ",%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
@@ -5053,12 +5129,14 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
#ifdef HAS_ARGBSHUFFLEROW_AVX2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width) {
+void ARGBShuffleRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width) {
asm volatile (
"vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
@@ -5082,8 +5160,10 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
#ifdef HAS_ARGBSHUFFLEROW_SSE2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width) {
+void ARGBShuffleRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width) {
uintptr_t pixel_temp;
asm volatile (
"pxor %%xmm5,%%xmm5 \n"
@@ -5098,7 +5178,7 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"je 2103f \n"
LABELALIGN
- "1: \n"
+ "1: \n"
"movzb " MEMACCESS(4) ",%2 \n"
MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
"mov %b2," MEMACCESS(1) " \n"
@@ -5204,11 +5284,12 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
void I422ToYUY2Row_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* dst_frame, int width) {
- asm volatile (
+ uint8* dst_frame,
+ int width) {
+ asm volatile (
"sub %1,%2 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
"movq " MEMACCESS(1) ",%%xmm2 \n"
MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
"lea " MEMLEA(0x8,1) ",%1 \n"
@@ -5239,11 +5320,12 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
void I422ToUYVYRow_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* dst_frame, int width) {
- asm volatile (
+ uint8* dst_frame,
+ int width) {
+ asm volatile (
"sub %1,%2 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
"movq " MEMACCESS(1) ",%%xmm2 \n"
MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
"lea " MEMLEA(0x8,1) ",%1 \n"
@@ -5272,14 +5354,15 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
- uint8* dst_argb, const float* poly,
+ uint8* dst_argb,
+ const float* poly,
int width) {
asm volatile (
"pxor %%xmm3,%%xmm3 \n"
// 2 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"movq " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(0x8,0) ",%0 \n"
"punpcklbw %%xmm3,%%xmm0 \n"
@@ -5328,7 +5411,8 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
- uint8* dst_argb, const float* poly,
+ uint8* dst_argb,
+ const float* poly,
int width) {
asm volatile (
"vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"
@@ -5338,7 +5422,7 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
// 2 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels
"lea " MEMLEA(0x8,0) ",%0 \n"
"vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
@@ -5366,15 +5450,150 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
}
#endif // HAS_ARGBPOLYNOMIALROW_AVX2
+#ifdef HAS_HALFFLOATROW_SSE2
+static float kScaleBias = 1.9259299444e-34f;
+void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
+ asm volatile (
+ "pshufd $0x0,%3,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+
+  // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n" // 8 shorts
+ "add $0x10,%0 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+    "punpcklwd %%xmm5,%%xmm2 \n"  // 8 ints in xmm2/xmm3
+ "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats
+ "punpckhwd %%xmm5,%%xmm3 \n"
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "psrld $0xd,%%xmm2 \n"
+ "psrld $0xd,%%xmm3 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ MEMOPMEM(movdqu,xmm2,-0x10,0,1,1)
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "x"(scale * kScaleBias) // %3
+ : "memory", "cc",
+ "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_HALFFLOATROW_SSE2
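kScaleBias is 2^-112, which is what makes the shift-by-13 trick work: float has an 8-bit exponent biased by 127 and a 23-bit mantissa, half has 5 bits biased by 15 and 10 bits, so pre-multiplying by 2^-(127-15) rebases the exponent and the truncating psrld by 23-10 = 13 leaves the half-float bit pattern in the low 16 bits. A scalar sketch of the same idea (valid for non-negative values that land in half's normal range):

#include <stdint.h>
#include <string.h>

static uint16_t FloatToHalfSketch(float v, float scale) {
  float f = v * (scale * 1.9259299444e-34f);  // scale * 2^-112.
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));  // Reinterpret the float's bits.
  return (uint16_t)(bits >> 13);    // Truncate the mantissa from 23 to 10 bits.
}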
+
+#ifdef HAS_HALFFLOATROW_AVX2
+void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
+ asm volatile (
+ "vbroadcastss %3, %%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %0,%1 \n"
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts
+ "add $0x20,%0 \n"
+ "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
+ "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
+ "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
+ "vpsrld $0xd,%%ymm3,%%ymm3 \n"
+ "vpsrld $0xd,%%ymm2,%%ymm2 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
+ MEMOPMEM(vmovdqu,ymm2,-0x20,0,1,1)
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "x"(scale * kScaleBias) // %3
+ : "memory", "cc",
+ "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_HALFFLOATROW_AVX2
+
+#ifdef HAS_HALFFLOATROW_F16C
+void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
+ asm volatile (
+ "vbroadcastss %3, %%ymm4 \n"
+ "sub %0,%1 \n"
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints
+ "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
+ "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
+ "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
+ "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
+ MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
+ MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
+ "add $0x20,%0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "x"(scale) // %3
+ : "memory", "cc",
+ "xmm2", "xmm3", "xmm4"
+ );
+}
+#endif // HAS_HALFFLOATROW_F16C
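The same conversion written with intrinsics, for comparison — a sketch assuming AVX2 + F16C support and width a multiple of 8; the rounding immediate 3 matches the $3 in the asm (truncate toward zero):

#include <stdint.h>
#include <immintrin.h>

static void HalfFloatRow_F16C_Sketch(const uint16_t* src, uint16_t* dst,
                                     float scale, int width) {
  __m256 vscale = _mm256_set1_ps(scale);
  for (int i = 0; i < width; i += 8) {
    __m128i u16 = _mm_loadu_si128((const __m128i*)(src + i));   // 8 shorts.
    __m256 f = _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(u16));  // To floats.
    __m128i h = _mm256_cvtps_ph(_mm256_mul_ps(f, vscale), 3);   // To halves.
    _mm_storeu_si128((__m128i*)(dst + i), h);
  }
}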
+
+#ifdef HAS_HALFFLOATROW_F16C
+void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints
+ "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
+ "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
+ MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
+ MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
+ "add $0x20,%0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc",
+ "xmm2", "xmm3"
+ );
+}
+#endif // HAS_HALFFLOATROW_F16C
+
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+void ARGBColorTableRow_X86(uint8* dst_argb,
+ const uint8* table_argb,
int width) {
uintptr_t pixel_temp;
asm volatile (
// 1 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"movzb " MEMACCESS(0) ",%1 \n"
"lea " MEMLEA(0x4,0) ",%0 \n"
MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
@@ -5405,7 +5624,7 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
asm volatile (
// 1 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"movzb " MEMACCESS(0) ",%1 \n"
"lea " MEMLEA(0x4,0) ",%0 \n"
MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
@@ -5428,9 +5647,11 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
int width,
- const uint8* luma, uint32 lumacoeff) {
+ const uint8* luma,
+ uint32 lumacoeff) {
uintptr_t pixel_temp;
uintptr_t table_temp;
asm volatile (
@@ -5442,7 +5663,7 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
// 4 pixel loop.
LABELALIGN
- "1: \n"
+ "1: \n"
"movdqu " MEMACCESS(2) ",%%xmm0 \n"
"pmaddubsw %%xmm3,%%xmm0 \n"
"phaddw %%xmm0,%%xmm0 \n"
diff --git a/files/source/row_mips.cc b/files/source/row_mips.cc
deleted file mode 100644
index 285f0b5a..00000000
--- a/files/source/row_mips.cc
+++ /dev/null
@@ -1,782 +0,0 @@
-/*
- * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
- (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-#ifdef HAS_COPYROW_MIPS
-void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
- __asm__ __volatile__ (
- ".set noreorder \n"
- ".set noat \n"
- "slti $at, %[count], 8 \n"
- "bne $at ,$zero, $last8 \n"
- "xor $t8, %[src], %[dst] \n"
- "andi $t8, $t8, 0x3 \n"
-
- "bne $t8, $zero, unaligned \n"
- "negu $a3, %[dst] \n"
- // make dst/src aligned
- "andi $a3, $a3, 0x3 \n"
- "beq $a3, $zero, $chk16w \n"
- // word-aligned; now count is the remaining byte count
- "subu %[count], %[count], $a3 \n"
-
- "lwr $t8, 0(%[src]) \n"
- "addu %[src], %[src], $a3 \n"
- "swr $t8, 0(%[dst]) \n"
- "addu %[dst], %[dst], $a3 \n"
-
- // Now the dst/src are mutually word-aligned with word-aligned addresses
- "$chk16w: \n"
- "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
- // t8 is the byte count after 64-byte chunks
- "beq %[count], $t8, chk8w \n"
- // There will be at most 1 32-byte chunk after it
- "subu $a3, %[count], $t8 \n" // the remainder
- // Here a3 counts bytes in 16w chunks
- "addu $a3, %[dst], $a3 \n"
- // Now a3 is the final dst after 64-byte chunks
- "addu $t0, %[dst], %[count] \n"
- // t0 is the "past the end" address
-
- // When the loop issues "pref 30,x(a1)", a1+x must not be past
- // the "t0-32" address
- // This means: for x=128 the last "safe" a1 address is "t0-160"
- // Alternatively, for x=64 the last "safe" a1 address is "t0-96"
- // We will use "pref 30,128(a1)", so "t0-160" is the limit
- "subu $t9, $t0, 160 \n"
- // t9 is the "last safe pref 30,128(a1)" address
- "pref 0, 0(%[src]) \n" // first line of src
- "pref 0, 32(%[src]) \n" // second line of src
- "pref 0, 64(%[src]) \n"
- "pref 30, 32(%[dst]) \n"
- // In case the a1 > t9 don't use "pref 30" at all
- "sgtu $v1, %[dst], $t9 \n"
- "bgtz $v1, $loop16w \n"
- "nop \n"
- // otherwise, start with using pref30
- "pref 30, 64(%[dst]) \n"
- "$loop16w: \n"
- "pref 0, 96(%[src]) \n"
- "lw $t0, 0(%[src]) \n"
- "bgtz $v1, $skip_pref30_96 \n" // skip
- "lw $t1, 4(%[src]) \n"
- "pref 30, 96(%[dst]) \n" // continue
- "$skip_pref30_96: \n"
- "lw $t2, 8(%[src]) \n"
- "lw $t3, 12(%[src]) \n"
- "lw $t4, 16(%[src]) \n"
- "lw $t5, 20(%[src]) \n"
- "lw $t6, 24(%[src]) \n"
- "lw $t7, 28(%[src]) \n"
- "pref 0, 128(%[src]) \n"
- // bring the next lines of src, addr 128
- "sw $t0, 0(%[dst]) \n"
- "sw $t1, 4(%[dst]) \n"
- "sw $t2, 8(%[dst]) \n"
- "sw $t3, 12(%[dst]) \n"
- "sw $t4, 16(%[dst]) \n"
- "sw $t5, 20(%[dst]) \n"
- "sw $t6, 24(%[dst]) \n"
- "sw $t7, 28(%[dst]) \n"
- "lw $t0, 32(%[src]) \n"
- "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)
- "lw $t1, 36(%[src]) \n"
- "pref 30, 128(%[dst]) \n" // set dest, addr 128
- "$skip_pref30_128: \n"
- "lw $t2, 40(%[src]) \n"
- "lw $t3, 44(%[src]) \n"
- "lw $t4, 48(%[src]) \n"
- "lw $t5, 52(%[src]) \n"
- "lw $t6, 56(%[src]) \n"
- "lw $t7, 60(%[src]) \n"
- "pref 0, 160(%[src]) \n"
- // bring the next lines of src, addr 160
- "sw $t0, 32(%[dst]) \n"
- "sw $t1, 36(%[dst]) \n"
- "sw $t2, 40(%[dst]) \n"
- "sw $t3, 44(%[dst]) \n"
- "sw $t4, 48(%[dst]) \n"
- "sw $t5, 52(%[dst]) \n"
- "sw $t6, 56(%[dst]) \n"
- "sw $t7, 60(%[dst]) \n"
-
- "addiu %[dst], %[dst], 64 \n" // adding 64 to dest
- "sgtu $v1, %[dst], $t9 \n"
- "bne %[dst], $a3, $loop16w \n"
- " addiu %[src], %[src], 64 \n" // adding 64 to src
- "move %[count], $t8 \n"
-
- // Here we have src and dest word-aligned but less than 64-bytes to go
-
- "chk8w: \n"
- "pref 0, 0x0(%[src]) \n"
- "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
- // t8 is the remainder count past 32 bytes
- "beq %[count], $t8, chk1w \n"
- // count=t8,no 32-byte chunk
- " nop \n"
-
- "lw $t0, 0(%[src]) \n"
- "lw $t1, 4(%[src]) \n"
- "lw $t2, 8(%[src]) \n"
- "lw $t3, 12(%[src]) \n"
- "lw $t4, 16(%[src]) \n"
- "lw $t5, 20(%[src]) \n"
- "lw $t6, 24(%[src]) \n"
- "lw $t7, 28(%[src]) \n"
- "addiu %[src], %[src], 32 \n"
-
- "sw $t0, 0(%[dst]) \n"
- "sw $t1, 4(%[dst]) \n"
- "sw $t2, 8(%[dst]) \n"
- "sw $t3, 12(%[dst]) \n"
- "sw $t4, 16(%[dst]) \n"
- "sw $t5, 20(%[dst]) \n"
- "sw $t6, 24(%[dst]) \n"
- "sw $t7, 28(%[dst]) \n"
- "addiu %[dst], %[dst], 32 \n"
-
- "chk1w: \n"
- "andi %[count], $t8, 0x3 \n"
- // now count is the remainder past 1w chunks
- "beq %[count], $t8, $last8 \n"
- " subu $a3, $t8, %[count] \n"
- // a3 is count of bytes in 1w chunks
- "addu $a3, %[dst], $a3 \n"
- // now a3 is the dst address past the 1w chunks
- // copying in words (4-byte chunks)
- "$wordCopy_loop: \n"
- "lw $t3, 0(%[src]) \n"
- // the first t3 may be equal t0 ... optimize?
- "addiu %[src], %[src],4 \n"
- "addiu %[dst], %[dst],4 \n"
- "bne %[dst], $a3,$wordCopy_loop \n"
- " sw $t3, -4(%[dst]) \n"
-
- // For the last (<8) bytes
- "$last8: \n"
- "blez %[count], leave \n"
- " addu $a3, %[dst], %[count] \n" // a3 -last dst address
- "$last8loop: \n"
- "lb $v1, 0(%[src]) \n"
- "addiu %[src], %[src], 1 \n"
- "addiu %[dst], %[dst], 1 \n"
- "bne %[dst], $a3, $last8loop \n"
- " sb $v1, -1(%[dst]) \n"
-
- "leave: \n"
- " j $ra \n"
- " nop \n"
-
- //
- // UNALIGNED case
- //
-
- "unaligned: \n"
- // got here with a3="negu a1"
- "andi $a3, $a3, 0x3 \n" // a1 is word aligned?
- "beqz $a3, $ua_chk16w \n"
- " subu %[count], %[count], $a3 \n"
- // bytes left after initial a3 bytes
- "lwr $v1, 0(%[src]) \n"
- "lwl $v1, 3(%[src]) \n"
- "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3
- "swr $v1, 0(%[dst]) \n"
- "addu %[dst], %[dst], $a3 \n"
- // below the dst will be word aligned (NOTE1)
- "$ua_chk16w: \n"
- "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
- // t8 is the byte count after 64-byte chunks
- "beq %[count], $t8, ua_chk8w \n"
- // if a2==t8, no 64-byte chunks
- // There will be at most 1 32-byte chunk after it
- "subu $a3, %[count], $t8 \n" // the remainder
- // Here a3 counts bytes in 16w chunks
- "addu $a3, %[dst], $a3 \n"
- // Now a3 is the final dst after 64-byte chunks
- "addu $t0, %[dst], %[count] \n" // t0 "past the end"
- "subu $t9, $t0, 160 \n"
- // t9 is the "last safe pref 30,128(a1)" address
- "pref 0, 0(%[src]) \n" // first line of src
- "pref 0, 32(%[src]) \n" // second line addr 32
- "pref 0, 64(%[src]) \n"
- "pref 30, 32(%[dst]) \n"
- // safe, as we have at least 64 bytes ahead
- // In case the a1 > t9 don't use "pref 30" at all
- "sgtu $v1, %[dst], $t9 \n"
- "bgtz $v1, $ua_loop16w \n"
- // skip "pref 30,64(a1)" for too short arrays
- " nop \n"
- // otherwise, start with using pref30
- "pref 30, 64(%[dst]) \n"
- "$ua_loop16w: \n"
- "pref 0, 96(%[src]) \n"
- "lwr $t0, 0(%[src]) \n"
- "lwl $t0, 3(%[src]) \n"
- "lwr $t1, 4(%[src]) \n"
- "bgtz $v1, $ua_skip_pref30_96 \n"
- " lwl $t1, 7(%[src]) \n"
- "pref 30, 96(%[dst]) \n"
- // continue setting up the dest, addr 96
- "$ua_skip_pref30_96: \n"
- "lwr $t2, 8(%[src]) \n"
- "lwl $t2, 11(%[src]) \n"
- "lwr $t3, 12(%[src]) \n"
- "lwl $t3, 15(%[src]) \n"
- "lwr $t4, 16(%[src]) \n"
- "lwl $t4, 19(%[src]) \n"
- "lwr $t5, 20(%[src]) \n"
- "lwl $t5, 23(%[src]) \n"
- "lwr $t6, 24(%[src]) \n"
- "lwl $t6, 27(%[src]) \n"
- "lwr $t7, 28(%[src]) \n"
- "lwl $t7, 31(%[src]) \n"
- "pref 0, 128(%[src]) \n"
- // bring the next lines of src, addr 128
- "sw $t0, 0(%[dst]) \n"
- "sw $t1, 4(%[dst]) \n"
- "sw $t2, 8(%[dst]) \n"
- "sw $t3, 12(%[dst]) \n"
- "sw $t4, 16(%[dst]) \n"
- "sw $t5, 20(%[dst]) \n"
- "sw $t6, 24(%[dst]) \n"
- "sw $t7, 28(%[dst]) \n"
- "lwr $t0, 32(%[src]) \n"
- "lwl $t0, 35(%[src]) \n"
- "lwr $t1, 36(%[src]) \n"
- "bgtz $v1, ua_skip_pref30_128 \n"
- " lwl $t1, 39(%[src]) \n"
- "pref 30, 128(%[dst]) \n"
- // continue setting up the dest, addr 128
- "ua_skip_pref30_128: \n"
-
- "lwr $t2, 40(%[src]) \n"
- "lwl $t2, 43(%[src]) \n"
- "lwr $t3, 44(%[src]) \n"
- "lwl $t3, 47(%[src]) \n"
- "lwr $t4, 48(%[src]) \n"
- "lwl $t4, 51(%[src]) \n"
- "lwr $t5, 52(%[src]) \n"
- "lwl $t5, 55(%[src]) \n"
- "lwr $t6, 56(%[src]) \n"
- "lwl $t6, 59(%[src]) \n"
- "lwr $t7, 60(%[src]) \n"
- "lwl $t7, 63(%[src]) \n"
- "pref 0, 160(%[src]) \n"
- // bring the next lines of src, addr 160
- "sw $t0, 32(%[dst]) \n"
- "sw $t1, 36(%[dst]) \n"
- "sw $t2, 40(%[dst]) \n"
- "sw $t3, 44(%[dst]) \n"
- "sw $t4, 48(%[dst]) \n"
- "sw $t5, 52(%[dst]) \n"
- "sw $t6, 56(%[dst]) \n"
- "sw $t7, 60(%[dst]) \n"
-
- "addiu %[dst],%[dst],64 \n" // adding 64 to dest
- "sgtu $v1,%[dst],$t9 \n"
- "bne %[dst],$a3,$ua_loop16w \n"
- " addiu %[src],%[src],64 \n" // adding 64 to src
- "move %[count],$t8 \n"
-
- // Here we have src and dest word-aligned but less than 64-bytes to go
-
- "ua_chk8w: \n"
- "pref 0, 0x0(%[src]) \n"
- "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
- // t8 is the remainder count
- "beq %[count], $t8, $ua_chk1w \n"
- // when count==t8, no 32-byte chunk
-
- "lwr $t0, 0(%[src]) \n"
- "lwl $t0, 3(%[src]) \n"
- "lwr $t1, 4(%[src]) \n"
- "lwl $t1, 7(%[src]) \n"
- "lwr $t2, 8(%[src]) \n"
- "lwl $t2, 11(%[src]) \n"
- "lwr $t3, 12(%[src]) \n"
- "lwl $t3, 15(%[src]) \n"
- "lwr $t4, 16(%[src]) \n"
- "lwl $t4, 19(%[src]) \n"
- "lwr $t5, 20(%[src]) \n"
- "lwl $t5, 23(%[src]) \n"
- "lwr $t6, 24(%[src]) \n"
- "lwl $t6, 27(%[src]) \n"
- "lwr $t7, 28(%[src]) \n"
- "lwl $t7, 31(%[src]) \n"
- "addiu %[src], %[src], 32 \n"
-
- "sw $t0, 0(%[dst]) \n"
- "sw $t1, 4(%[dst]) \n"
- "sw $t2, 8(%[dst]) \n"
- "sw $t3, 12(%[dst]) \n"
- "sw $t4, 16(%[dst]) \n"
- "sw $t5, 20(%[dst]) \n"
- "sw $t6, 24(%[dst]) \n"
- "sw $t7, 28(%[dst]) \n"
- "addiu %[dst], %[dst], 32 \n"
-
- "$ua_chk1w: \n"
- "andi %[count], $t8, 0x3 \n"
- // now count is the remainder past 1w chunks
- "beq %[count], $t8, ua_smallCopy \n"
- "subu $a3, $t8, %[count] \n"
- // a3 is count of bytes in 1w chunks
- "addu $a3, %[dst], $a3 \n"
- // now a3 is the dst address past the 1w chunks
-
- // copying in words (4-byte chunks)
- "$ua_wordCopy_loop: \n"
- "lwr $v1, 0(%[src]) \n"
- "lwl $v1, 3(%[src]) \n"
- "addiu %[src], %[src], 4 \n"
- "addiu %[dst], %[dst], 4 \n"
- // note: dst=a1 is word aligned here, see NOTE1
- "bne %[dst], $a3, $ua_wordCopy_loop \n"
- " sw $v1,-4(%[dst]) \n"
-
- // Now less than 4 bytes (value in count) left to copy
- "ua_smallCopy: \n"
- "beqz %[count], leave \n"
- " addu $a3, %[dst], %[count] \n" // a3 = last dst address
- "$ua_smallCopy_loop: \n"
- "lb $v1, 0(%[src]) \n"
- "addiu %[src], %[src], 1 \n"
- "addiu %[dst], %[dst], 1 \n"
- "bne %[dst],$a3,$ua_smallCopy_loop \n"
- " sb $v1, -1(%[dst]) \n"
-
- "j $ra \n"
- " nop \n"
- ".set at \n"
- ".set reorder \n"
- : [dst] "+r" (dst), [src] "+r" (src)
- : [count] "r" (count)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
- "t8", "t9", "a3", "v1", "at"
- );
-}
-#endif // HAS_COPYROW_MIPS
-
-// DSPR2 functions
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
- (__mips_dsp_rev >= 2) && \
- (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
-
-void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width) {
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
- "srl $t4, %[width], 4 \n" // multiples of 16
- "blez $t4, 2f \n"
- " andi %[width], %[width], 0xf \n" // residual
-
- "1: \n"
- "addiu $t4, $t4, -1 \n"
- "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
- "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2
- "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4
- "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6
- "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8
- "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 | U10
- "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 | U12
- "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 | U14
- "addiu %[src_uv], %[src_uv], 32 \n"
- "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
- "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
- "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
- "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
- "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
- "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
- "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
- "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
- "sw $t9, 0(%[dst_v]) \n"
- "sw $t0, 0(%[dst_u]) \n"
- "sw $t1, 4(%[dst_v]) \n"
- "sw $t2, 4(%[dst_u]) \n"
- "sw $t3, 8(%[dst_v]) \n"
- "sw $t5, 8(%[dst_u]) \n"
- "sw $t6, 12(%[dst_v]) \n"
- "sw $t7, 12(%[dst_u]) \n"
- "addiu %[dst_v], %[dst_v], 16 \n"
- "bgtz $t4, 1b \n"
- " addiu %[dst_u], %[dst_u], 16 \n"
-
- "beqz %[width], 3f \n"
- " nop \n"
-
- "2: \n"
- "lbu $t0, 0(%[src_uv]) \n"
- "lbu $t1, 1(%[src_uv]) \n"
- "addiu %[src_uv], %[src_uv], 2 \n"
- "addiu %[width], %[width], -1 \n"
- "sb $t0, 0(%[dst_u]) \n"
- "sb $t1, 0(%[dst_v]) \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "bgtz %[width], 2b \n"
- " addiu %[dst_v], %[dst_v], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_uv] "+r" (src_uv),
- [width] "+r" (width),
- [dst_u] "+r" (dst_u),
- [dst_v] "+r" (dst_v)
- :
- : "t0", "t1", "t2", "t3",
- "t4", "t5", "t6", "t7", "t8", "t9"
- );
-}
-
-void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t4, %[width], 4 \n" // multiples of 16
- "andi $t5, %[width], 0xf \n"
- "blez $t4, 2f \n"
- " addu %[src], %[src], %[width] \n" // src += width
-
- "1: \n"
- "lw $t0, -16(%[src]) \n" // |3|2|1|0|
- "lw $t1, -12(%[src]) \n" // |7|6|5|4|
- "lw $t2, -8(%[src]) \n" // |11|10|9|8|
- "lw $t3, -4(%[src]) \n" // |15|14|13|12|
- "wsbh $t0, $t0 \n" // |2|3|0|1|
- "wsbh $t1, $t1 \n" // |6|7|4|5|
- "wsbh $t2, $t2 \n" // |10|11|8|9|
- "wsbh $t3, $t3 \n" // |14|15|12|13|
- "rotr $t0, $t0, 16 \n" // |0|1|2|3|
- "rotr $t1, $t1, 16 \n" // |4|5|6|7|
- "rotr $t2, $t2, 16 \n" // |8|9|10|11|
- "rotr $t3, $t3, 16 \n" // |12|13|14|15|
- "addiu %[src], %[src], -16 \n"
- "addiu $t4, $t4, -1 \n"
- "sw $t3, 0(%[dst]) \n" // |15|14|13|12|
- "sw $t2, 4(%[dst]) \n" // |11|10|9|8|
- "sw $t1, 8(%[dst]) \n" // |7|6|5|4|
- "sw $t0, 12(%[dst]) \n" // |3|2|1|0|
- "bgtz $t4, 1b \n"
- " addiu %[dst], %[dst], 16 \n"
- "beqz $t5, 3f \n"
- " nop \n"
-
- "2: \n"
- "lbu $t0, -1(%[src]) \n"
- "addiu $t5, $t5, -1 \n"
- "addiu %[src], %[src], -1 \n"
- "sb $t0, 0(%[dst]) \n"
- "bgez $t5, 2b \n"
- " addiu %[dst], %[dst], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src] "+r" (src), [dst] "+r" (dst)
- : [width] "r" (width)
- : "t0", "t1", "t2", "t3", "t4", "t5"
- );
-}
-
-void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width) {
- int x;
- int y;
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
-
- "addu $t4, %[width], %[width] \n"
- "srl %[x], %[width], 4 \n"
- "andi %[y], %[width], 0xf \n"
- "blez %[x], 2f \n"
- " addu %[src_uv], %[src_uv], $t4 \n"
-
- "1: \n"
- "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
- "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
- "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|
- "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12|
- "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16|
- "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20|
- "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24|
- "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28|
-
- "rotr $t0, $t0, 16 \n" // |1|0|3|2|
- "rotr $t1, $t1, 16 \n" // |5|4|7|6|
- "rotr $t2, $t2, 16 \n" // |9|8|11|10|
- "rotr $t3, $t3, 16 \n" // |13|12|15|14|
- "rotr $t4, $t4, 16 \n" // |17|16|19|18|
- "rotr $t6, $t6, 16 \n" // |21|20|23|22|
- "rotr $t7, $t7, 16 \n" // |25|24|27|26|
- "rotr $t8, $t8, 16 \n" // |29|28|31|30|
- "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6|
- "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7|
- "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14|
- "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15|
- "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22|
- "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23|
- "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30|
- "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31|
- "addiu %[src_uv], %[src_uv], -32 \n"
- "addiu %[x], %[x], -1 \n"
- "swr $t4, 0(%[dst_u]) \n"
- "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24|
- "swr $t6, 0(%[dst_v]) \n"
- "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25|
- "swr $t2, 4(%[dst_u]) \n"
- "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16|
- "swr $t3, 4(%[dst_v]) \n"
- "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17|
- "swr $t0, 8(%[dst_u]) \n"
- "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8|
- "swr $t1, 8(%[dst_v]) \n"
- "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9|
- "swr $t9, 12(%[dst_u]) \n"
- "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0|
- "swr $t5, 12(%[dst_v]) \n"
- "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1|
- "addiu %[dst_v], %[dst_v], 16 \n"
- "bgtz %[x], 1b \n"
- " addiu %[dst_u], %[dst_u], 16 \n"
- "beqz %[y], 3f \n"
- " nop \n"
- "b 2f \n"
- " nop \n"
-
- "2: \n"
- "lbu $t0, -2(%[src_uv]) \n"
- "lbu $t1, -1(%[src_uv]) \n"
- "addiu %[src_uv], %[src_uv], -2 \n"
- "addiu %[y], %[y], -1 \n"
- "sb $t0, 0(%[dst_u]) \n"
- "sb $t1, 0(%[dst_v]) \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "bgtz %[y], 2b \n"
- " addiu %[dst_v], %[dst_v], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_uv] "+r" (src_uv),
- [dst_u] "+r" (dst_u),
- [dst_v] "+r" (dst_v),
- [x] "=&r" (x),
- [y] "=&r" (y)
- : [width] "r" (width)
- : "t0", "t1", "t2", "t3", "t4",
- "t5", "t7", "t8", "t9"
- );
-}
-
-// Convert 4 Y and 2 UV values from I422 and arrange the RGB results into
-// t5 = | 0 | B0 | 0 | b0 |
-// t4 = | 0 | B1 | 0 | b1 |
-// t9 = | 0 | G0 | 0 | g0 |
-// t8 = | 0 | G1 | 0 | g1 |
-// t2 = | 0 | R0 | 0 | r0 |
-// t1 = | 0 | R1 | 0 | r1 |
-#define YUVTORGB \
- "lw $t0, 0(%[y_buf]) \n" \
- "lhu $t1, 0(%[u_buf]) \n" \
- "lhu $t2, 0(%[v_buf]) \n" \
- "preceu.ph.qbr $t1, $t1 \n" \
- "preceu.ph.qbr $t2, $t2 \n" \
- "preceu.ph.qbra $t3, $t0 \n" \
- "preceu.ph.qbla $t0, $t0 \n" \
- "subu.ph $t1, $t1, $s5 \n" \
- "subu.ph $t2, $t2, $s5 \n" \
- "subu.ph $t3, $t3, $s4 \n" \
- "subu.ph $t0, $t0, $s4 \n" \
- "mul.ph $t3, $t3, $s0 \n" \
- "mul.ph $t0, $t0, $s0 \n" \
- "shll.ph $t4, $t1, 0x7 \n" \
- "subu.ph $t4, $t4, $t1 \n" \
- "mul.ph $t6, $t1, $s1 \n" \
- "mul.ph $t1, $t2, $s2 \n" \
- "addq_s.ph $t5, $t4, $t3 \n" \
- "addq_s.ph $t4, $t4, $t0 \n" \
- "shra.ph $t5, $t5, 6 \n" \
- "shra.ph $t4, $t4, 6 \n" \
- "addiu %[u_buf], 2 \n" \
- "addiu %[v_buf], 2 \n" \
- "addu.ph $t6, $t6, $t1 \n" \
- "mul.ph $t1, $t2, $s3 \n" \
- "addu.ph $t9, $t6, $t3 \n" \
- "addu.ph $t8, $t6, $t0 \n" \
- "shra.ph $t9, $t9, 6 \n" \
- "shra.ph $t8, $t8, 6 \n" \
- "addu.ph $t2, $t1, $t3 \n" \
- "addu.ph $t1, $t1, $t0 \n" \
- "shra.ph $t2, $t2, 6 \n" \
- "shra.ph $t1, $t1, 6 \n" \
- "subu.ph $t5, $t5, $s5 \n" \
- "subu.ph $t4, $t4, $s5 \n" \
- "subu.ph $t9, $t9, $s5 \n" \
- "subu.ph $t8, $t8, $s5 \n" \
- "subu.ph $t2, $t2, $s5 \n" \
- "subu.ph $t1, $t1, $s5 \n" \
- "shll_s.ph $t5, $t5, 8 \n" \
- "shll_s.ph $t4, $t4, 8 \n" \
- "shll_s.ph $t9, $t9, 8 \n" \
- "shll_s.ph $t8, $t8, 8 \n" \
- "shll_s.ph $t2, $t2, 8 \n" \
- "shll_s.ph $t1, $t1, 8 \n" \
- "shra.ph $t5, $t5, 8 \n" \
- "shra.ph $t4, $t4, 8 \n" \
- "shra.ph $t9, $t9, 8 \n" \
- "shra.ph $t8, $t8, 8 \n" \
- "shra.ph $t2, $t2, 8 \n" \
- "shra.ph $t1, $t1, 8 \n" \
- "addu.ph $t5, $t5, $s5 \n" \
- "addu.ph $t4, $t4, $s5 \n" \
- "addu.ph $t9, $t9, $s5 \n" \
- "addu.ph $t8, $t8, $s5 \n" \
- "addu.ph $t2, $t2, $s5 \n" \
- "addu.ph $t1, $t1, $s5 \n"
-
-// TODO(fbarchard): accept yuv conversion constants.
-void I422ToARGBRow_DSPR2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
- "beqz %[width], 2f \n"
- " repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
- "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
- "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
- "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
- "repl.ph $s4, 16 \n" // |0|16|0|16|
- "repl.ph $s5, 128 \n" // |128|128| // clipping
- "lui $s6, 0xff00 \n"
- "ori $s6, 0xff00 \n" // |ff|00|ff|00|
-
- "1: \n"
- YUVTORGB
-// Arranging into argb format
- "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|
- "precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|
- "addiu %[width], -4 \n"
- "precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0|
- "precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0|
- "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
-
- "addiu %[y_buf], 4 \n"
- "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
- "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
- "or $t1, $t1, $s6 \n" // |ff|R1|ff|R0|
- "or $t2, $t2, $s6 \n" // |ff|r1|ff|r0|
- "precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1|
- "precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1|
- "sll $t9, $t9, 16 \n"
- "sll $t8, $t8, 16 \n"
- "packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0|
- "packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0|
-// Store results.
- "sw $t2, 0(%[rgb_buf]) \n"
- "sw $t0, 4(%[rgb_buf]) \n"
- "sw $t1, 8(%[rgb_buf]) \n"
- "sw $t3, 12(%[rgb_buf]) \n"
- "bnez %[width], 1b \n"
- " addiu %[rgb_buf], 16 \n"
- "2: \n"
- ".set pop \n"
- :[y_buf] "+r" (y_buf),
- [u_buf] "+r" (u_buf),
- [v_buf] "+r" (v_buf),
- [width] "+r" (width),
- [rgb_buf] "+r" (rgb_buf)
- :
- : "t0", "t1", "t2", "t3", "t4", "t5",
- "t6", "t7", "t8", "t9",
- "s0", "s1", "s2", "s3",
- "s4", "s5", "s6"
- );
-}
-
-// Bilinear filter 8x2 -> 8x1
-void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- int y0_fraction = 256 - source_y_fraction;
- const uint8* src_ptr1 = src_ptr + src_stride;
-
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
-
- "replv.ph $t0, %[y0_fraction] \n"
- "replv.ph $t1, %[source_y_fraction] \n"
-
- "1: \n"
- "lw $t2, 0(%[src_ptr]) \n"
- "lw $t3, 0(%[src_ptr1]) \n"
- "lw $t4, 4(%[src_ptr]) \n"
- "lw $t5, 4(%[src_ptr1]) \n"
- "muleu_s.ph.qbl $t6, $t2, $t0 \n"
- "muleu_s.ph.qbr $t7, $t2, $t0 \n"
- "muleu_s.ph.qbl $t8, $t3, $t1 \n"
- "muleu_s.ph.qbr $t9, $t3, $t1 \n"
- "muleu_s.ph.qbl $t2, $t4, $t0 \n"
- "muleu_s.ph.qbr $t3, $t4, $t0 \n"
- "muleu_s.ph.qbl $t4, $t5, $t1 \n"
- "muleu_s.ph.qbr $t5, $t5, $t1 \n"
- "addq.ph $t6, $t6, $t8 \n"
- "addq.ph $t7, $t7, $t9 \n"
- "addq.ph $t2, $t2, $t4 \n"
- "addq.ph $t3, $t3, $t5 \n"
- "shra.ph $t6, $t6, 8 \n"
- "shra.ph $t7, $t7, 8 \n"
- "shra.ph $t2, $t2, 8 \n"
- "shra.ph $t3, $t3, 8 \n"
- "precr.qb.ph $t6, $t6, $t7 \n"
- "precr.qb.ph $t2, $t2, $t3 \n"
- "addiu %[src_ptr], %[src_ptr], 8 \n"
- "addiu %[src_ptr1], %[src_ptr1], 8 \n"
- "addiu %[dst_width], %[dst_width], -8 \n"
- "sw $t6, 0(%[dst_ptr]) \n"
- "sw $t2, 4(%[dst_ptr]) \n"
- "bgtz %[dst_width], 1b \n"
- " addiu %[dst_ptr], %[dst_ptr], 8 \n"
-
- ".set pop \n"
- : [dst_ptr] "+r" (dst_ptr),
- [src_ptr1] "+r" (src_ptr1),
- [src_ptr] "+r" (src_ptr),
- [dst_width] "+r" (dst_width)
- : [source_y_fraction] "r" (source_y_fraction),
- [y0_fraction] "r" (y0_fraction),
- [src_stride] "r" (src_stride)
- : "t0", "t1", "t2", "t3", "t4", "t5",
- "t6", "t7", "t8", "t9"
- );
-}
-#endif // __mips_dsp_rev >= 2
-
-#endif // defined(__mips__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/row_msa.cc b/files/source/row_msa.cc
new file mode 100644
index 00000000..f79de1c7
--- /dev/null
+++ b/files/source/row_msa.cc
@@ -0,0 +1,2977 @@
+/*
+ * Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+
+#include "libyuv/row.h"
+
+// This module is for GCC MSA
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include "libyuv/macros_msa.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define ALPHA_VAL (-1)
+
+// Fill YUV -> RGB conversion constants into vectors
+#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \
+ { \
+ ub = __msa_fill_w(yuvconst->kUVToB[0]); \
+ vr = __msa_fill_w(yuvconst->kUVToR[1]); \
+ ug = __msa_fill_w(yuvconst->kUVToG[0]); \
+ vg = __msa_fill_w(yuvconst->kUVToG[1]); \
+ bb = __msa_fill_w(yuvconst->kUVBiasB[0]); \
+ bg = __msa_fill_w(yuvconst->kUVBiasG[0]); \
+ br = __msa_fill_w(yuvconst->kUVBiasR[0]); \
+ yg = __msa_fill_w(yuvconst->kYToRgb[0]); \
+ }
+
+// Load YUV 422 pixel data
+#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \
+ { \
+ uint64 y_m; \
+ uint32 u_m, v_m; \
+ v4i32 zero_m = {0}; \
+ y_m = LD(psrc_y); \
+ u_m = LW(psrc_u); \
+ v_m = LW(psrc_v); \
+ out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64)y_m); \
+ out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32)u_m); \
+ out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m); \
+ }
+
+// Clip input vector elements between 0 and 255
+#define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \
+ { \
+ v4i32 max_m = __msa_ldi_w(0xFF); \
+ \
+ in0 = __msa_maxi_s_w(in0, 0); \
+ in1 = __msa_maxi_s_w(in1, 0); \
+ in2 = __msa_maxi_s_w(in2, 0); \
+ in3 = __msa_maxi_s_w(in3, 0); \
+ in4 = __msa_maxi_s_w(in4, 0); \
+ in5 = __msa_maxi_s_w(in5, 0); \
+ in0 = __msa_min_s_w(max_m, in0); \
+ in1 = __msa_min_s_w(max_m, in1); \
+ in2 = __msa_min_s_w(max_m, in2); \
+ in3 = __msa_min_s_w(max_m, in3); \
+ in4 = __msa_min_s_w(max_m, in4); \
+ in5 = __msa_min_s_w(max_m, in5); \
+ }
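+
+// For reference, the scalar clamp each 32-bit lane receives above
+// (a sketch, not part of the build):
+//   v = v < 0 ? 0 : (v > 255 ? 255 : v);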
+
+// Convert 8 pixels of YUV 420 to RGB.
+#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \
+ { \
+ v8i16 vec0_m, vec1_m; \
+ v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
+ v4i32 reg5_m, reg6_m, reg7_m; \
+ v16i8 zero_m = {0}; \
+ \
+ vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \
+ vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \
+ reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \
+ reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \
+ reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \
+ reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \
+ reg0_m *= yg; \
+ reg1_m *= yg; \
+ reg2_m *= ubvr; \
+ reg3_m *= ubvr; \
+ reg0_m = __msa_srai_w(reg0_m, 16); \
+ reg1_m = __msa_srai_w(reg1_m, 16); \
+ reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \
+ reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \
+ reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \
+ reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \
+ reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \
+ reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \
+ reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \
+ reg5_m = reg0_m - reg5_m; \
+ reg6_m = reg1_m - reg6_m; \
+ reg2_m = reg0_m - reg2_m; \
+ reg3_m = reg1_m - reg3_m; \
+ reg7_m = reg0_m - reg7_m; \
+ reg4_m = reg1_m - reg4_m; \
+ reg5_m += bb; \
+ reg6_m += bb; \
+ reg7_m += bg; \
+ reg4_m += bg; \
+ reg2_m += br; \
+ reg3_m += br; \
+ reg5_m = __msa_srai_w(reg5_m, 6); \
+ reg6_m = __msa_srai_w(reg6_m, 6); \
+ reg7_m = __msa_srai_w(reg7_m, 6); \
+ reg4_m = __msa_srai_w(reg4_m, 6); \
+ reg2_m = __msa_srai_w(reg2_m, 6); \
+ reg3_m = __msa_srai_w(reg3_m, 6); \
+ CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \
+ out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \
+ out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \
+ out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
+ }
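+
+// Reading the fixed-point math above as scalar code (a sketch; names mirror
+// the macro arguments, clamp() is CLIP_0TO255, and the 0x101 comes from the
+// self-interleave of in_y):
+//   y1 = (y * 0x101 * yg) >> 16;
+//   b  = clamp((y1 - u * ub + bb) >> 6);
+//   g  = clamp((y1 - (u * ug + v * vg) + bg) >> 6);
+//   r  = clamp((y1 - v * vr + br) >> 6);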
+
+// Pack and store 8 ARGB pixels.
+#define STOREARGB(in0, in1, in2, in3, pdst_argb) \
+ { \
+ v8i16 vec0_m, vec1_m; \
+ v16u8 dst0_m, dst1_m; \
+ vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
+ vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
+ dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \
+ dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m); \
+ ST_UB2(dst0_m, dst1_m, pdst_argb, 16); \
+ }
+
+// Takes ARGB input and calculates Y.
+#define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \
+ y_out) \
+ { \
+ v16u8 vec0_m, vec1_m, vec2_m, vec3_m; \
+ v8u16 reg0_m, reg1_m; \
+ \
+ vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0); \
+ vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2); \
+ vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0); \
+ vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2); \
+ reg0_m = __msa_dotp_u_h(vec0_m, const0); \
+ reg1_m = __msa_dotp_u_h(vec1_m, const0); \
+ reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1); \
+ reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1); \
+ reg0_m += const2; \
+ reg1_m += const2; \
+ reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift); \
+ reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift); \
+ y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
+ }
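+
+// Scalar shape of the reduction above (a sketch; which weight lands on which
+// channel depends on the caller's byte order and packed constants):
+//   y = (w0 * c0 + w1 * c1 + w2 * c2 + w3 * c3 + bias) >> shift;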
+
+// Loads the current and next rows of ARGB input and averages them to
+// calculate U and V
+#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3) \
+ { \
+ v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \
+ v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v16u8 vec8_m, vec9_m; \
+ v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \
+ v8u16 reg8_m, reg9_m; \
+ \
+    src0_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 0);                        \
+    src1_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 16);                       \
+    src2_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 32);                       \
+    src3_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 48);                       \
+    src4_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 0);                        \
+    src5_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 16);                       \
+    src6_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 32);                       \
+    src7_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 48);                       \
+ vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \
+ vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \
+ vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \
+ vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \
+ vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \
+ vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \
+ vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \
+ vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \
+ reg0_m = __msa_hadd_u_h(vec0_m, vec0_m); \
+ reg1_m = __msa_hadd_u_h(vec1_m, vec1_m); \
+ reg2_m = __msa_hadd_u_h(vec2_m, vec2_m); \
+ reg3_m = __msa_hadd_u_h(vec3_m, vec3_m); \
+ reg4_m = __msa_hadd_u_h(vec4_m, vec4_m); \
+ reg5_m = __msa_hadd_u_h(vec5_m, vec5_m); \
+ reg6_m = __msa_hadd_u_h(vec6_m, vec6_m); \
+ reg7_m = __msa_hadd_u_h(vec7_m, vec7_m); \
+ reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \
+ reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \
+ reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \
+ reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \
+ reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \
+ reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
+ reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
+ reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
+ reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \
+ reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \
+ reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \
+ reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
+ argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
+ argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
+    src0_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 64);                       \
+    src1_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 80);                       \
+    src2_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 96);                       \
+    src3_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 112);                      \
+    src4_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 64);                       \
+    src5_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 80);                       \
+    src6_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 96);                       \
+    src7_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 112);                      \
+ vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \
+ vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \
+ vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \
+ vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \
+ vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \
+ vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \
+ vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \
+ vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \
+ reg0_m = __msa_hadd_u_h(vec2_m, vec2_m); \
+ reg1_m = __msa_hadd_u_h(vec3_m, vec3_m); \
+ reg2_m = __msa_hadd_u_h(vec4_m, vec4_m); \
+ reg3_m = __msa_hadd_u_h(vec5_m, vec5_m); \
+ reg4_m = __msa_hadd_u_h(vec6_m, vec6_m); \
+ reg5_m = __msa_hadd_u_h(vec7_m, vec7_m); \
+ reg6_m = __msa_hadd_u_h(vec8_m, vec8_m); \
+ reg7_m = __msa_hadd_u_h(vec9_m, vec9_m); \
+ reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \
+ reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \
+ reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \
+ reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \
+ reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \
+ reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
+ reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
+ reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
+ reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \
+ reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \
+ reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \
+ reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
+ argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
+ argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
+ }
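+
+// Net effect of the interleave / horizontal-add sequence above: every byte of
+// argb0..argb3 is a 2x2 box average over two adjacent pixels of the two rows
+// (a sketch, per channel byte c of the current and next row):
+//   out = (cur[c] + cur[c + 4] + next[c] + next[c + 4]) >> 2;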
+
+// Takes ARGB input and calculates U and V.
+#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
+ shf0, shf1, shf2, shf3, v_out, u_out) \
+ { \
+ v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v8u16 reg0_m, reg1_m, reg2_m, reg3_m; \
+ \
+ vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0); \
+ vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2); \
+ vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0); \
+ vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2); \
+ vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0); \
+ vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2); \
+ vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0); \
+ vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2); \
+ reg0_m = __msa_dotp_u_h(vec0_m, const1); \
+ reg1_m = __msa_dotp_u_h(vec1_m, const1); \
+ reg2_m = __msa_dotp_u_h(vec4_m, const1); \
+ reg3_m = __msa_dotp_u_h(vec5_m, const1); \
+ reg0_m += const3; \
+ reg1_m += const3; \
+ reg2_m += const3; \
+ reg3_m += const3; \
+ reg0_m -= __msa_dotp_u_h(vec2_m, const0); \
+ reg1_m -= __msa_dotp_u_h(vec3_m, const0); \
+ reg2_m -= __msa_dotp_u_h(vec6_m, const2); \
+ reg3_m -= __msa_dotp_u_h(vec7_m, const2); \
+ v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m); \
+ u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \
+ }
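+
+// Scalar shape of the chroma reduction above (a sketch; shf0..shf3 gather the
+// channel pairs, const0..const2 hold the byte weights, const3 is the bias):
+//   out = (w0 * c0 + w1 * c1 - w2 * c2 - w3 * c3 + bias) >> 8;
+// The >> 8 falls out of __msa_pckod_b keeping the high byte of each halfword.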
+
+// Load I444 pixel data
+#define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \
+ { \
+ uint64 y_m, u_m, v_m; \
+ v2i64 zero_m = {0}; \
+ y_m = LD(psrc_y); \
+ u_m = LD(psrc_u); \
+ v_m = LD(psrc_v); \
+ out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64)y_m); \
+ out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64)u_m); \
+ out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64)v_m); \
+ }
+
+void MirrorRow_MSA(const uint8* src, uint8* dst, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+ v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+ src += width - 64;
+
+ for (x = 0; x < width; x += 64) {
+ LD_UB4(src, 16, src3, src2, src1, src0);
+ VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
+ VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
+ ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
+ dst += 64;
+ src -= 64;
+ }
+}
+
+void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+ v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
+ src += width * 4 - 64;
+
+ for (x = 0; x < width; x += 16) {
+ LD_UB4(src, 16, src3, src2, src1, src0);
+ VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
+ VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
+ ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
+ dst += 64;
+ src -= 64;
+ }
+}
+
+void I422ToYUY2Row_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2,
+ int width) {
+ int x;
+ v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
+ v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3;
+
+ for (x = 0; x < width; x += 32) {
+ src_u0 = LD_UB(src_u);
+ src_v0 = LD_UB(src_v);
+ LD_UB2(src_y, 16, src_y0, src_y1);
+ ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
+ ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1);
+ ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3);
+ ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16);
+ src_u += 16;
+ src_v += 16;
+ src_y += 32;
+ dst_yuy2 += 64;
+ }
+}
+
+void I422ToUYVYRow_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy,
+ int width) {
+ int x;
+ v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
+ v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3;
+
+ for (x = 0; x < width; x += 32) {
+ src_u0 = LD_UB(src_u);
+ src_v0 = LD_UB(src_v);
+ LD_UB2(src_y, 16, src_y0, src_y1);
+ ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
+ ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1);
+ ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3);
+ ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16);
+ src_u += 16;
+ src_v += 16;
+ src_y += 32;
+ dst_uyvy += 64;
+ }
+}
+
+void I422ToARGBRow_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2;
+ v8i16 vec0, vec1, vec2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 8) {
+ READYUV422(src_y, src_u, src_v, src0, src1, src2);
+ src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec1, vec2);
+ STOREARGB(vec0, vec1, vec2, alpha, rgb_buf);
+ src_y += 8;
+ src_u += 4;
+ src_v += 4;
+ rgb_buf += 32;
+ }
+}
+
+void I422ToRGBARow_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2;
+ v8i16 vec0, vec1, vec2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 8) {
+ READYUV422(src_y, src_u, src_v, src0, src1, src2);
+ src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec1, vec2);
+ STOREARGB(alpha, vec0, vec1, vec2, rgb_buf);
+ src_y += 8;
+ src_u += 4;
+ src_v += 4;
+ rgb_buf += 32;
+ }
+}
+
+void I422AlphaToARGBRow_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ const uint8* src_a,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int64 data_a;
+ v16u8 src0, src1, src2, src3;
+ v8i16 vec0, vec1, vec2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+ v4i32 zero = {0};
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 8) {
+ data_a = LD(src_a);
+ READYUV422(src_y, src_u, src_v, src0, src1, src2);
+ src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+ src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec1, vec2);
+ src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3);
+ STOREARGB(vec0, vec1, vec2, src3, rgb_buf);
+ src_y += 8;
+ src_u += 4;
+ src_v += 4;
+ src_a += 8;
+ rgb_buf += 32;
+ }
+}
+
+void I422ToRGB24Row_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int32 width) {
+ int x;
+ int64 data_u, data_v;
+ v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+ v16u8 reg0, reg1, reg2, reg3;
+ v2i64 zero = {0};
+ v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
+ v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10};
+ v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10,
+ 11, 29, 12, 13, 30, 14, 15, 31};
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0);
+ data_u = LD(src_u);
+ data_v = LD(src_v);
+ src1 = (v16u8)__msa_insert_d(zero, 0, data_u);
+ src2 = (v16u8)__msa_insert_d(zero, 0, data_v);
+ src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+ src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8);
+ src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec1, vec2);
+ YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec3, vec4, vec5);
+ reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
+ reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3);
+ reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2);
+ reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11);
+ dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0);
+ dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1);
+ dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2);
+ ST_UB2(dst0, dst1, rgb_buf, 16);
+ ST_UB(dst2, (rgb_buf + 32));
+ src_y += 16;
+ src_u += 8;
+ src_v += 8;
+ rgb_buf += 48;
+ }
+}
+
+// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
+void I422ToRGB565Row_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, dst0;
+ v8i16 vec0, vec1, vec2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 8) {
+ READYUV422(src_y, src_u, src_v, src0, src1, src2);
+ src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec2, vec1);
+ vec0 = __msa_srai_h(vec0, 3);
+ vec1 = __msa_srai_h(vec1, 3);
+ vec2 = __msa_srai_h(vec2, 2);
+ vec1 = __msa_slli_h(vec1, 11);
+ vec2 = __msa_slli_h(vec2, 5);
+ vec0 |= vec1;
+ dst0 = (v16u8)(vec2 | vec0);
+ ST_UB(dst0, dst_rgb565);
+ src_y += 8;
+ src_u += 4;
+ src_v += 4;
+ dst_rgb565 += 16;
+ }
+}
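+
+// Per-pixel packing performed by the shift/or sequence above (a sketch):
+//   rgb565 = ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3);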
+
+// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
+void I422ToARGB4444Row_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, dst0;
+ v8i16 vec0, vec1, vec2;
+ v8u16 reg0, reg1, reg2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+ v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 8) {
+ READYUV422(src_y, src_u, src_v, src0, src1, src2);
+ src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec1, vec2);
+ reg0 = (v8u16)__msa_srai_h(vec0, 4);
+ reg1 = (v8u16)__msa_srai_h(vec1, 4);
+ reg2 = (v8u16)__msa_srai_h(vec2, 4);
+ reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4);
+ reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8);
+ reg1 |= const_0xF000;
+ reg0 |= reg2;
+ dst0 = (v16u8)(reg1 | reg0);
+ ST_UB(dst0, dst_argb4444);
+ src_y += 8;
+ src_u += 4;
+ src_v += 4;
+ dst_argb4444 += 16;
+ }
+}
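+
+// Per-pixel layout produced above, with alpha forced to 0xF (a sketch):
+//   argb4444 = 0xF000 | ((r >> 4) << 8) | ((g >> 4) << 4) | (b >> 4);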
+
+void I422ToARGB1555Row_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, dst0;
+ v8i16 vec0, vec1, vec2;
+ v8u16 reg0, reg1, reg2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+ v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 8) {
+ READYUV422(src_y, src_u, src_v, src0, src1, src2);
+ src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec1, vec2);
+ reg0 = (v8u16)__msa_srai_h(vec0, 3);
+ reg1 = (v8u16)__msa_srai_h(vec1, 3);
+ reg2 = (v8u16)__msa_srai_h(vec2, 3);
+ reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5);
+ reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10);
+ reg1 |= const_0x8000;
+ reg0 |= reg2;
+ dst0 = (v16u8)(reg1 | reg0);
+ ST_UB(dst0, dst_argb1555);
+ src_y += 8;
+ src_u += 4;
+ src_v += 4;
+ dst_argb1555 += 16;
+ }
+}
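+
+// Per-pixel layout produced above, with alpha forced to 1 (a sketch):
+//   argb1555 = 0x8000 | ((r >> 3) << 10) | ((g >> 3) << 5) | (b >> 3);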
+
+void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1;
+
+ for (x = 0; x < width; x += 32) {
+ LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+ ST_UB2(dst0, dst1, dst_y, 16);
+ src_yuy2 += 64;
+ dst_y += 32;
+ }
+}
+
+void YUY2ToUVRow_MSA(const uint8* src_yuy2,
+ int src_stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_yuy2_next = src_yuy2 + src_stride_yuy2;
+ int x;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 vec0, vec1, dst0, dst1;
+
+ for (x = 0; x < width; x += 32) {
+ LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
+ LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7);
+ src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+ src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
+ src3 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
+ vec0 = __msa_aver_u_b(src0, src2);
+ vec1 = __msa_aver_u_b(src1, src3);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst_u);
+ ST_UB(dst1, dst_v);
+ src_yuy2 += 64;
+ src_yuy2_next += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void YUY2ToUV422Row_MSA(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1;
+
+ for (x = 0; x < width; x += 32) {
+ LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
+ src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ ST_UB(dst0, dst_u);
+ ST_UB(dst1, dst_v);
+ src_yuy2 += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1;
+
+ for (x = 0; x < width; x += 32) {
+ LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
+ dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+ ST_UB2(dst0, dst1, dst_y, 16);
+ src_uyvy += 64;
+ dst_y += 32;
+ }
+}
+
+void UYVYToUVRow_MSA(const uint8* src_uyvy,
+ int src_stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ const uint8* src_uyvy_next = src_uyvy + src_stride_uyvy;
+ int x;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 vec0, vec1, dst0, dst1;
+
+ for (x = 0; x < width; x += 32) {
+ LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
+ LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7);
+ src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+ src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
+ src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
+ vec0 = __msa_aver_u_b(src0, src2);
+ vec1 = __msa_aver_u_b(src1, src3);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst_u);
+ ST_UB(dst1, dst_v);
+ src_uyvy += 64;
+ src_uyvy_next += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void UYVYToUV422Row_MSA(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1;
+
+ for (x = 0; x < width; x += 32) {
+ LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
+ src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ ST_UB(dst0, dst_u);
+ ST_UB(dst1, dst_v);
+ src_uyvy += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void ARGBToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
+ v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
+ v16i8 zero = {0};
+ v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
+ v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
+ v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
+ v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
+ src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
+ vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+ vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+ reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0);
+ reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1);
+ reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2);
+ reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3);
+ reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0);
+ reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1);
+ reg0 *= const_0x19;
+ reg1 *= const_0x19;
+ reg2 *= const_0x81;
+ reg3 *= const_0x81;
+ reg4 *= const_0x42;
+ reg5 *= const_0x42;
+ reg0 += reg2;
+ reg1 += reg3;
+ reg0 += reg4;
+ reg1 += reg5;
+ reg0 += const_0x1080;
+ reg1 += const_0x1080;
+ reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
+ reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+ ST_UB(dst0, dst_y);
+ src_argb0 += 64;
+ dst_y += 16;
+ }
+}
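+
+// The constants above are the usual BT.601 luma weights; per pixel this is
+// equivalent to (a sketch):
+//   y = (25 * b + 129 * g + 66 * r + 0x1080) >> 8;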
+
+void ARGBToUVRow_MSA(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ const uint8* src_argb0_next = src_argb0 + src_stride_argb;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+ v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
+ v16u8 dst0, dst1;
+ v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
+ v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
+ v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
+ v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
+ v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
+ v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+ for (x = 0; x < width; x += 32) {
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
+ src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
+ src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64);
+ src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80);
+ src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96);
+ src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112);
+ vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+ vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
+ vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
+ vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+ vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
+ vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
+ vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+ vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
+ vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+ vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
+ reg0 = __msa_hadd_u_h(vec8, vec8);
+ reg1 = __msa_hadd_u_h(vec9, vec9);
+ reg2 = __msa_hadd_u_h(vec4, vec4);
+ reg3 = __msa_hadd_u_h(vec5, vec5);
+ reg4 = __msa_hadd_u_h(vec0, vec0);
+ reg5 = __msa_hadd_u_h(vec1, vec1);
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16);
+ src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32);
+ src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48);
+ src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64);
+ src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80);
+ src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96);
+ src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112);
+ vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+ vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
+ vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
+ vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+ vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
+ vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
+ vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+ vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
+ vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+ vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
+ reg0 += __msa_hadd_u_h(vec8, vec8);
+ reg1 += __msa_hadd_u_h(vec9, vec9);
+ reg2 += __msa_hadd_u_h(vec4, vec4);
+ reg3 += __msa_hadd_u_h(vec5, vec5);
+ reg4 += __msa_hadd_u_h(vec0, vec0);
+ reg5 += __msa_hadd_u_h(vec1, vec1);
+ reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2);
+ reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2);
+ reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2);
+ reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2);
+ reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2);
+ reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2);
+ reg6 = reg0 * const_0x70;
+ reg7 = reg1 * const_0x70;
+ reg8 = reg2 * const_0x4A;
+ reg9 = reg3 * const_0x4A;
+ reg6 += const_0x8080;
+ reg7 += const_0x8080;
+ reg8 += reg4 * const_0x26;
+ reg9 += reg5 * const_0x26;
+ reg0 *= const_0x12;
+ reg1 *= const_0x12;
+ reg2 *= const_0x5E;
+ reg3 *= const_0x5E;
+ reg4 *= const_0x70;
+ reg5 *= const_0x70;
+ reg2 += reg0;
+ reg3 += reg1;
+ reg4 += const_0x8080;
+ reg5 += const_0x8080;
+ reg6 -= reg8;
+ reg7 -= reg9;
+ reg4 -= reg2;
+ reg5 -= reg3;
+ reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8);
+ reg7 = (v8u16)__msa_srai_h((v8i16)reg7, 8);
+ reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8);
+ reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6);
+ dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
+ ST_UB(dst0, dst_u);
+ ST_UB(dst1, dst_v);
+ src_argb0 += 128;
+ src_argb0_next += 128;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
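+
+// Scalar sketch of the chroma math above, applied to the 2x2-averaged
+// channels (BT.601 weights with the 0x8080 bias):
+//   u = (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
+//   v = (112 * r - 94 * g - 18 * b + 0x8080) >> 8;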
+
+void ARGBToRGB24Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
+ v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20};
+ v16i8 shuffler1 = {5, 6, 8, 9, 10, 12, 13, 14,
+ 16, 17, 18, 20, 21, 22, 24, 25};
+ v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20,
+ 21, 22, 24, 25, 26, 28, 29, 30};
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
+ dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
+ dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
+ dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
+ ST_UB2(dst0, dst1, dst_rgb, 16);
+ ST_UB(dst2, (dst_rgb + 32));
+ src_argb += 64;
+ dst_rgb += 48;
+ }
+}
+
+void ARGBToRAWRow_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
+ v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22};
+ v16i8 shuffler1 = {5, 4, 10, 9, 8, 14, 13, 12,
+ 18, 17, 16, 22, 21, 20, 26, 25};
+ v16i8 shuffler2 = {8, 14, 13, 12, 18, 17, 16, 22,
+ 21, 20, 26, 25, 24, 30, 29, 28};
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
+ dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
+ dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
+ dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
+ ST_UB2(dst0, dst1, dst_rgb, 16);
+ ST_UB(dst2, (dst_rgb + 32));
+ src_argb += 64;
+ dst_rgb += 48;
+ }
+}
+
+void ARGBToRGB565Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+ int x;
+ v16u8 src0, src1, dst0;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v16i8 zero = {0};
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
+ vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3);
+ vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5);
+ vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3);
+ vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3);
+ vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5);
+ vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
+ vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
+ vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1);
+ vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
+ vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2);
+ vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2);
+ vec0 = __msa_binsli_b(vec0, vec1, 2);
+ vec1 = __msa_binsli_b(vec2, vec3, 4);
+ vec4 = __msa_binsli_b(vec4, vec5, 2);
+ vec5 = __msa_binsli_b(vec6, vec7, 4);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
+ vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4);
+ dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0);
+ ST_UB(dst0, dst_rgb);
+ src_argb += 32;
+ dst_rgb += 16;
+ }
+}
+
+void ARGBToARGB1555Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+ int x;
+ v16u8 src0, src1, dst0;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+ v16i8 zero = {0};
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
+ vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2);
+ vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3);
+ vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
+ vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
+ vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1);
+ vec5 = (v16u8)__msa_srai_b((v16i8)src1, 3);
+ vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2);
+ vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3);
+ vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
+ vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1);
+ vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1);
+ vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2);
+ vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2);
+ vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3);
+ vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3);
+ vec0 = __msa_binsli_b(vec0, vec1, 2);
+ vec5 = __msa_binsli_b(vec5, vec6, 2);
+ vec1 = __msa_binsli_b(vec2, vec3, 5);
+ vec6 = __msa_binsli_b(vec7, vec8, 5);
+ vec1 = __msa_binsli_b(vec1, vec4, 0);
+ vec6 = __msa_binsli_b(vec6, vec9, 0);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
+ vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5);
+ dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0);
+ ST_UB(dst0, dst_rgb);
+ src_argb += 32;
+ dst_rgb += 16;
+ }
+}
+
+void ARGBToARGB4444Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+ int x;
+ v16u8 src0, src1;
+ v16u8 vec0, vec1;
+ v16u8 dst0;
+ v16i8 zero = {0};
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4);
+ vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4);
+ src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1);
+ src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1);
+ vec0 = __msa_binsli_b(vec0, src0, 3);
+ vec1 = __msa_binsli_b(vec1, src1, 3);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst_rgb);
+ src_argb += 32;
+ dst_rgb += 16;
+ }
+}
+
+void ARGBToUV444Row_MSA(const uint8* src_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int32 width) {
+ int32 x;
+ v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 vec8, vec9, vec10, vec11;
+ v8u16 const_112 = (v8u16)__msa_ldi_h(112);
+ v8u16 const_74 = (v8u16)__msa_ldi_h(74);
+ v8u16 const_38 = (v8u16)__msa_ldi_h(38);
+ v8u16 const_94 = (v8u16)__msa_ldi_h(94);
+ v8u16 const_18 = (v8u16)__msa_ldi_h(18);
+ v8u16 const_32896 = (v8u16)__msa_fill_h(32896);
+ v16i8 zero = {0};
+
+ for (x = width; x > 0; x -= 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
+ reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+ reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+ src0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+ src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
+ src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0);
+ vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
+ vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
+ vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1);
+ vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1);
+ vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2);
+ vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2);
+ vec10 = vec0 * const_18;
+ vec11 = vec1 * const_18;
+ vec8 = vec2 * const_94;
+ vec9 = vec3 * const_94;
+ vec6 = vec4 * const_112;
+ vec7 = vec5 * const_112;
+ vec0 *= const_112;
+ vec1 *= const_112;
+ vec2 *= const_74;
+ vec3 *= const_74;
+ vec4 *= const_38;
+ vec5 *= const_38;
+ vec8 += vec10;
+ vec9 += vec11;
+ vec6 += const_32896;
+ vec7 += const_32896;
+ vec0 += const_32896;
+ vec1 += const_32896;
+ vec2 += vec4;
+ vec3 += vec5;
+ vec0 -= vec2;
+ vec1 -= vec3;
+ vec6 -= vec8;
+ vec7 -= vec9;
+ vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
+ vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
+ vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8);
+ vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
+ ST_UB(dst0, dst_u);
+ ST_UB(dst1, dst_v);
+ src_argb += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void ARGBMultiplyRow_MSA(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ v16u8 src0, src1, dst0;
+ v8u16 vec0, vec1, vec2, vec3;
+ v4u32 reg0, reg1, reg2, reg3;
+ v8i16 zero = {0};
+
+ for (x = 0; x < width; x += 4) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+ vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+ vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
+ vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
+ reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
+ reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
+ reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
+ reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
+ reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
+ reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
+ reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
+ reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
+ reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16);
+ reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16);
+ reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16);
+ reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst_argb);
+ src_argb0 += 16;
+ src_argb1 += 16;
+ dst_argb += 16;
+ }
+}
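+
+// The self-interleave of src0 widens each channel a to a * 0x101, so per
+// channel this computes approximately (a sketch):
+//   dst = (a * 0x101 * b) >> 16;  // ~= a * b / 255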
+
+void ARGBAddRow_MSA(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1;
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
+ dst0 = __msa_adds_u_b(src0, src2);
+ dst1 = __msa_adds_u_b(src1, src3);
+ ST_UB2(dst0, dst1, dst_argb, 16);
+ src_argb0 += 32;
+ src_argb1 += 32;
+ dst_argb += 32;
+ }
+}
+
+void ARGBSubtractRow_MSA(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1;
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
+ dst0 = __msa_subs_u_b(src0, src2);
+ dst1 = __msa_subs_u_b(src1, src3);
+ ST_UB2(dst0, dst1, dst_argb, 16);
+ src_argb0 += 32;
+ src_argb1 += 32;
+ dst_argb += 32;
+ }
+}
+
+void ARGBAttenuateRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) {
+ int x;
+ v16u8 src0, src1, dst0, dst1;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+ v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ v8i16 zero = {0};
+ v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+ vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+ vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1);
+ vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1);
+ vec4 = (v8u16)__msa_fill_h(vec0[3]);
+ vec5 = (v8u16)__msa_fill_h(vec0[7]);
+ vec6 = (v8u16)__msa_fill_h(vec1[3]);
+ vec7 = (v8u16)__msa_fill_h(vec1[7]);
+ vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
+ vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
+ vec6 = (v8u16)__msa_fill_h(vec2[3]);
+ vec7 = (v8u16)__msa_fill_h(vec2[7]);
+ vec8 = (v8u16)__msa_fill_h(vec3[3]);
+ vec9 = (v8u16)__msa_fill_h(vec3[7]);
+ vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
+ vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
+ reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4);
+ reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4);
+ reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5);
+ reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5);
+ reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6);
+ reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6);
+ reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7);
+ reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7);
+ reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
+ reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
+ reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
+ reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
+ reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
+ reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
+ reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
+ reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
+ reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
+ reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
+ reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
+ reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
+ reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24);
+ reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24);
+ reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24);
+ reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+ vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
+ vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ dst0 = __msa_bmnz_v(dst0, src0, mask);
+ dst1 = __msa_bmnz_v(dst1, src1, mask);
+ ST_UB2(dst0, dst1, dst_argb, 16);
+ src_argb += 32;
+ dst_argb += 32;
+ }
+}
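+
+// Attenuation above multiplies each color channel by its pixel's alpha, both
+// widened to c * 0x101 and a * 0x101 by byte duplication (a sketch):
+//   dst = (c * 0x101 * a * 0x101) >> 24;  // ~= c * a / 255
+// The final __msa_bmnz_v with 'mask' then restores the original alpha bytes.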
+
+void ARGBToRGB565DitherRow_MSA(const uint8* src_argb,
+ uint8* dst_rgb,
+ uint32 dither4,
+ int width) {
+ int x;
+ v16u8 src0, src1, dst0, vec0, vec1;
+ v8i16 vec_d0;
+ v8i16 reg0, reg1, reg2;
+ v16i8 zero = {0};
+ v8i16 max = __msa_ldi_h(0xFF);
+
+ vec_d0 = (v8i16)__msa_fill_w(dither4);
+ vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0);
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0);
+ reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1);
+ reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0);
+ reg0 += vec_d0;
+ reg1 += vec_d0;
+ reg2 += vec_d0;
+ reg0 = __msa_maxi_s_h((v8i16)reg0, 0);
+ reg1 = __msa_maxi_s_h((v8i16)reg1, 0);
+ reg2 = __msa_maxi_s_h((v8i16)reg2, 0);
+ reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0);
+ reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1);
+ reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2);
+ reg0 = __msa_srai_h(reg0, 3);
+ reg2 = __msa_srai_h(reg2, 3);
+ reg1 = __msa_srai_h(reg1, 2);
+ reg2 = __msa_slli_h(reg2, 11);
+ reg1 = __msa_slli_h(reg1, 5);
+ reg0 |= reg1;
+ dst0 = (v16u8)(reg0 | reg2);
+ ST_UB(dst0, dst_rgb);
+ src_argb += 32;
+ dst_rgb += 16;
+ }
+}
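+
+// Each channel gets its per-column dither byte (one byte of dither4 per
+// x & 3) added and clamped before the usual RGB565 packing (a sketch):
+//   b = min(255, max(0, b + dither[x & 3]));  // likewise g and r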
+
+void ARGBShuffleRow_MSA(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width) {
+ int x;
+ v16u8 src0, src1, dst0, dst1;
+ v16i8 vec0;
+ v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
+ int32 val = LW((int32*)shuffler);
+
+ vec0 = (v16i8)__msa_fill_w(val);
+ shuffler_vec += vec0;
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
+ dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0);
+ dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1);
+ ST_UB2(dst0, dst1, dst_argb, 16);
+ src_argb += 32;
+ dst_argb += 32;
+ }
+}
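+
+// 'shuffler' is a 4-byte within-pixel reorder that the code replicates across
+// all four pixels of a vector, so dst[4 * p + i] = src[4 * p + shuffler[i]].
+// E.g. a hypothetical mask {2, 1, 0, 3} would swap the B and R channels.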
+
+void ARGBShadeRow_MSA(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
+ uint32 value) {
+ int x;
+ v16u8 src0, dst0;
+ v8u16 vec0, vec1;
+ v4u32 reg0, reg1, reg2, reg3, rgba_scale;
+ v8i16 zero = {0};
+
+ rgba_scale[0] = value;
+ rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale);
+ rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale);
+
+ for (x = 0; x < width; x += 4) {
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+ vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+ reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
+ reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
+ reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
+ reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
+ reg0 *= rgba_scale;
+ reg1 *= rgba_scale;
+ reg2 *= rgba_scale;
+ reg3 *= rgba_scale;
+ reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
+ reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
+ reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
+ reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst_argb);
+ src_argb += 16;
+ dst_argb += 16;
+ }
+}
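+
+// Same widen-by-0x101 trick as ARGBAttenuateRow above, but with the four
+// scale bytes packed in 'value'; per channel (a sketch):
+//   dst = (c * 0x101 * scale * 0x101) >> 24;  // ~= c * scale / 255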
+
+void ARGBGrayRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) {
+ int x;
+ v16u8 src0, src1, vec0, vec1, dst0, dst1;
+ v8u16 reg0;
+ v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26);
+ v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
+ vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
+ vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
+ reg0 = __msa_dotp_u_h(vec0, const_0x4B0F);
+ reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26);
+ reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0);
+ vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0);
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB2(dst0, dst1, dst_argb, 16);
+ src_argb += 32;
+ dst_argb += 32;
+ }
+}
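+
+// The packed weights (0x0F, 0x4B, 0x26 = 15, 75, 38) give, per pixel
+// (a sketch; the +64 rounding comes from __msa_srari_h):
+//   gray = (15 * b + 75 * g + 38 * r + 64) >> 7;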
+
+void ARGBSepiaRow_MSA(uint8* dst_argb, int width) {
+ int x;
+ v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5;
+ v8u16 reg0, reg1, reg2;
+ v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411);
+ v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23);
+ v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816);
+ v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D);
+ v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218);
+ v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32);
+ v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF);
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16);
+ vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
+ vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
+ vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1);
+ reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411);
+ reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816);
+ reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218);
+ reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23);
+ reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D);
+ reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32);
+ reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7);
+ reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7);
+ reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7);
+ reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF);
+ reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF);
+ vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0);
+ vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1);
+ vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2);
+ vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
+ vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4);
+ ST_UB2(dst0, dst1, dst_argb, 16);
+ dst_argb += 32;
+ }
+}
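+
+// Sepia weights above, written out per pixel (a sketch; b/g/r are the input
+// channels):
+//   sb = (17 * b + 68 * g + 35 * r) >> 7;
+//   sg = min(255, (22 * b + 88 * g + 45 * r) >> 7);
+//   sr = min(255, (24 * b + 98 * g + 50 * r) >> 7);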
+
+void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ v16u8 src0, src1;
+ v8u16 vec0, vec1, vec2, vec3;
+ v16u8 dst0, dst1, dst2, dst3;
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 16);
+ vec0 = (v8u16)__msa_andi_b(src0, 0x0F);
+ vec1 = (v8u16)__msa_andi_b(src1, 0x0F);
+ vec2 = (v8u16)__msa_andi_b(src0, 0xF0);
+ vec3 = (v8u16)__msa_andi_b(src1, 0xF0);
+ vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4);
+ vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4);
+ vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4);
+ vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4);
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
+ dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
+ dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+ src_argb4444 += 32;
+ dst_argb += 64;
+ }
+}
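+
+// Nibble expansion above (a sketch): each 4-bit channel c4 widens to
+//   c8 = (c4 << 4) | c4;
+// before the interleaves rebuild 32-bit ARGB pixels.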
+
+void ARGB1555ToARGBRow_MSA(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ v8u16 src0, src1;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
+ v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6;
+ v16u8 dst0, dst1, dst2, dst3;
+ v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v8u16)__msa_ld_h((v8u16*)src_argb1555, 0);
+ src1 = (v8u16)__msa_ld_h((v8u16*)src_argb1555, 16);
+ vec0 = src0 & const_0x1F;
+ vec1 = src1 & const_0x1F;
+ src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
+ src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
+ vec2 = src0 & const_0x1F;
+ vec3 = src1 & const_0x1F;
+ src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
+ src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
+ vec4 = src0 & const_0x1F;
+ vec5 = src1 & const_0x1F;
+ src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
+ src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
+ reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+ reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3);
+ reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3);
+ reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3);
+ reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2);
+ reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2);
+ reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2);
+ reg3 = -reg3;
+ reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4);
+ reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4);
+ reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5);
+ reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5);
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0);
+ dst2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1);
+ dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+ src_argb1555 += 32;
+ dst_argb += 64;
+ }
+}
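+
+// The 5-bit channels widen by bit replication (a sketch):
+//   c8 = (c5 << 3) | (c5 >> 2);
+// and 'reg3 = -reg3' maps the 1-bit alpha 0/1 to 0x00/0xFF by wraparound.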
+
+void RGB565ToARGBRow_MSA(const uint8* src_rgb565, uint8* dst_argb, int width) {
+ int x;
+ v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
+ v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
+ v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+ v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
+ v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v8u16)__msa_ld_h((v8u16*)src_rgb565, 0);
+ src1 = (v8u16)__msa_ld_h((v8u16*)src_rgb565, 16);
+ vec0 = src0 & const_0x1F;
+ vec1 = src0 & const_0x7E0;
+ vec2 = src0 & const_0xF800;
+ vec3 = src1 & const_0x1F;
+ vec4 = src1 & const_0x7E0;
+ vec5 = src1 & const_0xF800;
+ reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
+ reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
+ reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
+ reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
+ reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
+ reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
+ reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
+ reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
+ reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
+ reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
+ reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
+ reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
+ res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0);
+ res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1);
+ res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3);
+ res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4);
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
+ dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2);
+ dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+ src_rgb565 += 32;
+ dst_argb += 64;
+ }
+}
+
+void RGB24ToARGBRow_MSA(const uint8* src_rgb24, uint8* dst_argb, int width) {
+ int x;
+ v16u8 src0, src1, src2;
+ v16u8 vec0, vec1, vec2;
+ v16u8 dst0, dst1, dst2, dst3;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 32);
+ vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
+ vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
+ vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
+ dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0);
+ dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0);
+ dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1);
+ dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+ src_rgb24 += 48;
+ dst_argb += 64;
+ }
+}
+
+void RAWToARGBRow_MSA(const uint8* src_raw, uint8* dst_argb, int width) {
+ int x;
+ v16u8 src0, src1, src2;
+ v16u8 vec0, vec1, vec2;
+ v16u8 dst0, dst1, dst2, dst3;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19};
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_raw, 32);
+ vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
+ vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
+ vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
+ dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0);
+ dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0);
+ dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1);
+ dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+ src_raw += 48;
+ dst_argb += 64;
+ }
+}
+
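+// ARGB1555 to luma: widens the 5-bit channels to 8 bits, then applies the
+// BT.601 weights: y = (25 * b + 129 * g + 66 * r + 0x1080) >> 8.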
+void ARGB1555ToYRow_MSA(const uint8* src_argb1555, uint8* dst_y, int width) {
+ int x;
+ v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
+ v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
+ v16u8 dst0;
+ v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
+ v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
+ v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
+ v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+ v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v8u16)__msa_ld_b((v8i16*)src_argb1555, 0);
+ src1 = (v8u16)__msa_ld_b((v8i16*)src_argb1555, 16);
+ vec0 = src0 & const_0x1F;
+ vec1 = src1 & const_0x1F;
+ src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+ src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+ vec2 = src0 & const_0x1F;
+ vec3 = src1 & const_0x1F;
+ src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+ src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+ vec4 = src0 & const_0x1F;
+ vec5 = src1 & const_0x1F;
+ reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
+ reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3);
+ reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2);
+ reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2);
+ reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3);
+ reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
+ reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2);
+ reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2);
+ reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3);
+ reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3);
+ reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2);
+ reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2);
+ reg0 *= const_0x19;
+ reg1 *= const_0x19;
+ reg2 *= const_0x81;
+ reg3 *= const_0x81;
+ reg4 *= const_0x42;
+ reg5 *= const_0x42;
+ reg0 += reg2;
+ reg1 += reg3;
+ reg0 += reg4;
+ reg1 += reg5;
+ reg0 += const_0x1080;
+ reg1 += const_0x1080;
+ reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
+ reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+ ST_UB(dst0, dst_y);
+ src_argb1555 += 32;
+ dst_y += 16;
+ }
+}
+
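+// RGB565 luma via 32-bit dot products: each word of const_0x810019 packs the
+// B/G weights (0x19, 0x81) and each word of const_0x010042 packs the R weight
+// (0x42) with a multiplier of 1 for the 0x1080 rounding bias.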
+void RGB565ToYRow_MSA(const uint8* src_rgb565, uint8* dst_y, int width) {
+ int x;
+ v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
+ v4u32 res0, res1, res2, res3;
+ v16u8 dst0;
+ v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019);
+ v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042);
+ v8i16 const_0x1080 = __msa_fill_h(0x1080);
+ v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+ v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
+ v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v8u16)__msa_ld_b((v8i16*)src_rgb565, 0);
+ src1 = (v8u16)__msa_ld_b((v8i16*)src_rgb565, 16);
+ vec0 = src0 & const_0x1F;
+ vec1 = src0 & const_0x7E0;
+ vec2 = src0 & const_0xF800;
+ vec3 = src1 & const_0x1F;
+ vec4 = src1 & const_0x7E0;
+ vec5 = src1 & const_0xF800;
+ reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
+ reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
+ reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
+ reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
+ reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
+ reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
+ reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
+ reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
+ reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
+ reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
+ reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
+ reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
+ vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0);
+ vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0);
+ vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3);
+ vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3);
+ vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2);
+ vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2);
+ vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5);
+ vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5);
+ res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019);
+ res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019);
+ res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019);
+ res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019);
+ res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042);
+ res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042);
+ res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042);
+ res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042);
+ res0 = (v4u32)__msa_srai_w((v4i32)res0, 8);
+ res1 = (v4u32)__msa_srai_w((v4i32)res1, 8);
+ res2 = (v4u32)__msa_srai_w((v4i32)res2, 8);
+ res3 = (v4u32)__msa_srai_w((v4i32)res3, 8);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst_y);
+ src_rgb565 += 32;
+ dst_y += 16;
+ }
+}
+
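+// RGB24 luma via byte dot products: const_0x8119 packs the B/G weights
+// (0x19, 0x81) per halfword and const_0x42 supplies the R weight, followed
+// by the usual + 0x1080 and >> 8.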
+void RGB24ToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+ int x;
+ v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119);
+ v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42);
+ v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+ v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
+ v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18,
+ 18, 19, 20, 21, 21, 22, 23, 24};
+ v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20};
+ v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16};
+ v16i8 zero = {0};
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
+ reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
+ reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+ reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
+ reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+ vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
+ vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
+ vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8119);
+ vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119);
+ vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42);
+ vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42);
+ vec0 += const_0x1080;
+ vec1 += const_0x1080;
+ vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
+ vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst_y);
+ src_argb0 += 48;
+ dst_y += 16;
+ }
+}
+
+void RAWToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+ int x;
+ v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142);
+ v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19);
+ v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+ v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
+ v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18,
+ 18, 19, 20, 21, 21, 22, 23, 24};
+ v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20};
+ v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16};
+ v16i8 zero = {0};
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
+ reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
+ reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+ reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
+ reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+ vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
+ vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
+ vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142);
+ vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142);
+ vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19);
+ vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19);
+ vec0 += const_0x1080;
+ vec1 += const_0x1080;
+ vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
+ vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst_y);
+ src_argb0 += 48;
+ dst_y += 16;
+ }
+}
+
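+// Chroma for ARGB1555: averages each 2x2 block, then applies the BT.601
+// weights u = (112 * b - 74 * g - 38 * r + 0x8080) >> 8 and
+// v = (112 * r - 94 * g - 18 * b + 0x8080) >> 8.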
+void ARGB1555ToUVRow_MSA(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ const uint16* s = (const uint16*)src_argb1555;
+ const uint16* t = (const uint16*)(src_argb1555 + src_stride_argb1555);
+  int64 res0, res1;
+ v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
+ v16u8 dst0;
+ v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
+ v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
+ v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
+ v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
+ v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
+ v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v8u16)__msa_ld_b((v8i16*)s, 0);
+ src1 = (v8u16)__msa_ld_b((v8i16*)s, 16);
+ src2 = (v8u16)__msa_ld_b((v8i16*)t, 0);
+ src3 = (v8u16)__msa_ld_b((v8i16*)t, 16);
+ vec0 = src0 & const_0x1F;
+ vec1 = src1 & const_0x1F;
+ vec0 += src2 & const_0x1F;
+ vec1 += src3 & const_0x1F;
+ vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+ src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+ src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
+ src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
+ vec2 = src0 & const_0x1F;
+ vec3 = src1 & const_0x1F;
+ vec2 += src2 & const_0x1F;
+ vec3 += src3 & const_0x1F;
+ vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+ src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+ src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
+ src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
+ vec4 = src0 & const_0x1F;
+ vec5 = src1 & const_0x1F;
+ vec4 += src2 & const_0x1F;
+ vec5 += src3 & const_0x1F;
+ vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+ vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+ vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+ vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
+ vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
+ vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
+ vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
+ vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
+ vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1);
+ vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6);
+ reg0 = vec6 * const_0x70;
+ reg1 = vec0 * const_0x4A;
+ reg2 = vec2 * const_0x70;
+ reg3 = vec0 * const_0x5E;
+ reg0 += const_0x8080;
+ reg1 += vec2 * const_0x26;
+ reg2 += const_0x8080;
+ reg3 += vec6 * const_0x12;
+ reg0 -= reg1;
+ reg2 -= reg3;
+ reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
+ reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
+ res0 = __msa_copy_u_d((v2i64)dst0, 0);
+ res1 = __msa_copy_u_d((v2i64)dst0, 1);
+ SD(res0, dst_u);
+ SD(res1, dst_v);
+ s += 16;
+ t += 16;
+ dst_u += 8;
+ dst_v += 8;
+ }
+}
+
+void RGB565ToUVRow_MSA(const uint8* src_rgb565,
+ int src_stride_rgb565,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ const uint16* s = (const uint16*)src_rgb565;
+ const uint16* t = (const uint16*)(src_rgb565 + src_stride_rgb565);
+  int64 res0, res1;
+ v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
+ v16u8 dst0;
+ v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
+ v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
+ v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
+ v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
+ v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+ v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v8u16)__msa_ld_b((v8i16*)s, 0);
+ src1 = (v8u16)__msa_ld_b((v8i16*)s, 16);
+ src2 = (v8u16)__msa_ld_b((v8i16*)t, 0);
+ src3 = (v8u16)__msa_ld_b((v8i16*)t, 16);
+ vec0 = src0 & const_0x1F;
+ vec1 = src1 & const_0x1F;
+ vec0 += src2 & const_0x1F;
+ vec1 += src3 & const_0x1F;
+ vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+ src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+ src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
+ src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
+ vec2 = src0 & const_0x3F;
+ vec3 = src1 & const_0x3F;
+ vec2 += src2 & const_0x3F;
+ vec3 += src3 & const_0x3F;
+ vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ src0 = (v8u16)__msa_srai_h((v8i16)src0, 6);
+ src1 = (v8u16)__msa_srai_h((v8i16)src1, 6);
+ src2 = (v8u16)__msa_srai_h((v8i16)src2, 6);
+ src3 = (v8u16)__msa_srai_h((v8i16)src3, 6);
+ vec4 = src0 & const_0x1F;
+ vec5 = src1 & const_0x1F;
+ vec4 += src2 & const_0x1F;
+ vec5 += src3 & const_0x1F;
+ vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+ vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+ vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+ vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+ vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
+ vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
+ vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
+ vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
+ reg0 = vec3 * const_0x70;
+ reg1 = vec1 * const_0x4A;
+ reg2 = vec4 * const_0x70;
+ reg3 = vec1 * const_0x5E;
+    reg0 += const_0x8080;
+    reg1 += vec4 * const_0x26;
+    reg2 += const_0x8080;
+ reg3 += vec3 * const_0x12;
+ reg0 -= reg1;
+ reg2 -= reg3;
+ reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
+ reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
+ res0 = __msa_copy_u_d((v2i64)dst0, 0);
+ res1 = __msa_copy_u_d((v2i64)dst0, 1);
+ SD(res0, dst_u);
+ SD(res1, dst_v);
+ s += 16;
+ t += 16;
+ dst_u += 8;
+ dst_v += 8;
+ }
+}
+
+void RGB24ToUVRow_MSA(const uint8* src_rgb0,
+ int src_stride_rgb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ const uint8* s = src_rgb0;
+ const uint8* t = src_rgb0 + src_stride_rgb;
+ int64 res0, res1;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8i16 reg0, reg1, reg2, reg3;
+ v16u8 dst0;
+ v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
+ v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
+ v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
+ v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
+ v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
+ v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
+ v16i8 zero = {0};
+
+ for (x = 0; x < width; x += 16) {
+ inp0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ inp1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+ inp2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+ inp3 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+ inp4 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+ inp5 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+ src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
+ src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
+ src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
+ src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8);
+ src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4);
+ src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4);
+ src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0);
+ src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1);
+ src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2);
+ src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3);
+ src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3);
+ src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5);
+ src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6);
+ src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7);
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0);
+ vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0);
+ vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1);
+ vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1);
+ vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2);
+ vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2);
+ vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3);
+ vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3);
+ vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+ vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+ vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+ vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
+ vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
+ vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
+ vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
+ vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
+ reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);
+ reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);
+ reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
+ reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
+ reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);
+ reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
+ reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
+ reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
+ reg0 = __msa_srai_h((v8i16)reg0, 2);
+ reg1 = __msa_srai_h((v8i16)reg1, 2);
+ reg2 = __msa_srai_h((v8i16)reg2, 2);
+ reg3 = __msa_srai_h((v8i16)reg3, 2);
+ vec4 = (v8u16)__msa_pckev_h(reg1, reg0);
+ vec5 = (v8u16)__msa_pckev_h(reg3, reg2);
+ vec6 = (v8u16)__msa_pckod_h(reg1, reg0);
+ vec7 = (v8u16)__msa_pckod_h(reg3, reg2);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
+ vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4);
+ vec3 = vec0 * const_0x70;
+ vec4 = vec1 * const_0x4A;
+ vec5 = vec2 * const_0x26;
+ vec2 *= const_0x70;
+ vec1 *= const_0x5E;
+ vec0 *= const_0x12;
+ reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4);
+ reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5);
+ reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1);
+ reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0);
+ reg0 += reg1;
+ reg2 += reg3;
+ reg0 = __msa_srai_h(reg0, 8);
+ reg2 = __msa_srai_h(reg2, 8);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
+ res0 = __msa_copy_u_d((v2i64)dst0, 0);
+ res1 = __msa_copy_u_d((v2i64)dst0, 1);
+ SD(res0, dst_u);
+ SD(res1, dst_v);
+ t += 48;
+ s += 48;
+ dst_u += 8;
+ dst_v += 8;
+ }
+}
+
+void RAWToUVRow_MSA(const uint8* src_rgb0,
+ int src_stride_rgb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ const uint8* s = src_rgb0;
+ const uint8* t = src_rgb0 + src_stride_rgb;
+ int64 res0, res1;
+ v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8i16 reg0, reg1, reg2, reg3;
+ v16u8 dst0;
+ v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
+ v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
+ v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
+ v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
+ v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
+ v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
+ v16i8 zero = {0};
+
+ for (x = 0; x < width; x += 16) {
+ inp0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ inp1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+ inp2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+ inp3 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+ inp4 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+ inp5 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+ src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
+ src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
+ src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
+ src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8);
+ src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4);
+ src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4);
+ src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0);
+ src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1);
+ src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2);
+ src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3);
+ src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3);
+ src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5);
+ src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6);
+ src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7);
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0);
+ vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0);
+ vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1);
+ vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1);
+ vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2);
+ vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2);
+ vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3);
+ vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3);
+ vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+ vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+ vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+ vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
+ vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
+ vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
+ vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
+ vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
+ reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);
+ reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);
+ reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
+ reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
+ reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);
+ reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
+ reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
+ reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
+ reg0 = __msa_srai_h(reg0, 2);
+ reg1 = __msa_srai_h(reg1, 2);
+ reg2 = __msa_srai_h(reg2, 2);
+ reg3 = __msa_srai_h(reg3, 2);
+ vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+ vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+ vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
+ vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
+ vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
+ vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
+ vec3 = vec0 * const_0x70;
+ vec4 = vec1 * const_0x4A;
+ vec5 = vec2 * const_0x26;
+ vec2 *= const_0x70;
+ vec1 *= const_0x5E;
+ vec0 *= const_0x12;
+ reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4);
+ reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5);
+ reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1);
+ reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0);
+ reg0 += reg1;
+ reg2 += reg3;
+ reg0 = __msa_srai_h(reg0, 8);
+ reg2 = __msa_srai_h(reg2, 8);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
+ res0 = __msa_copy_u_d((v2i64)dst0, 0);
+ res1 = __msa_copy_u_d((v2i64)dst0, 1);
+ SD(res0, dst_u);
+ SD(res1, dst_v);
+ t += 48;
+ s += 48;
+ dst_u += 8;
+ dst_v += 8;
+ }
+}
+
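+// NV12 keeps U and V interleaved, so each iteration loads 8 Y bytes and
+// 8 UV bytes and feeds them to the shared YUVTORGB macro.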
+void NV12ToARGBRow_MSA(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ uint64 val0, val1;
+ v16u8 src0, src1, res0, res1, dst0, dst1;
+ v8i16 vec0, vec1, vec2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+ v16u8 zero = {0};
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 8) {
+ val0 = LD(src_y);
+ val1 = LD(src_uv);
+ src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+ src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec1, vec2);
+ res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
+ res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
+ ST_UB2(dst0, dst1, rgb_buf, 16);
+ src_y += 8;
+ src_uv += 8;
+ rgb_buf += 32;
+ }
+}
+
+void NV12ToRGB565Row_MSA(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ uint64 val0, val1;
+ v16u8 src0, src1, dst0;
+ v8i16 vec0, vec1, vec2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+ v16u8 zero = {0};
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 8) {
+ val0 = LD(src_y);
+ val1 = LD(src_uv);
+ src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+ src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec1, vec2);
+ vec0 = vec0 >> 3;
+ vec1 = (vec1 >> 2) << 5;
+ vec2 = (vec2 >> 3) << 11;
+ dst0 = (v16u8)(vec0 | vec1 | vec2);
+ ST_UB(dst0, rgb_buf);
+ src_y += 8;
+ src_uv += 8;
+ rgb_buf += 16;
+ }
+}
+
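+// NV21 stores chroma in VU order; the shuffler swaps each byte pair back to
+// UV so the NV12 path can be reused.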
+void NV21ToARGBRow_MSA(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ uint64 val0, val1;
+ v16u8 src0, src1, res0, res1, dst0, dst1;
+ v8i16 vec0, vec1, vec2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v16u8 zero = {0};
+ v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 8) {
+ val0 = LD(src_y);
+ val1 = LD(src_vu);
+ src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+ src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+ src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec1, vec2);
+ res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
+ res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
+ ST_UB2(dst0, dst1, rgb_buf, 16);
+ src_y += 8;
+ src_vu += 8;
+ rgb_buf += 32;
+ }
+}
+
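+// Sobel output is the saturating sum of the x and y gradients, replicated
+// into B, G and R with alpha forced to 255.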
+void SobelRow_MSA(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3;
+ v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16};
+ v16i8 const_0x4 = __msa_ldi_b(0x4);
+ v16i8 mask1 = mask0 + const_0x4;
+ v16i8 mask2 = mask1 + const_0x4;
+ v16i8 mask3 = mask2 + const_0x4;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+ vec0 = __msa_adds_u_b(src0, src1);
+ dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0);
+ dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0);
+ dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0);
+ dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+ src_sobelx += 16;
+ src_sobely += 16;
+ dst_argb += 64;
+ }
+}
+
+void SobelToPlaneRow_MSA(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1;
+
+ for (x = 0; x < width; x += 32) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 16);
+ dst0 = __msa_adds_u_b(src0, src2);
+ dst1 = __msa_adds_u_b(src1, src3);
+ ST_UB2(dst0, dst1, dst_y, 16);
+ src_sobelx += 32;
+ src_sobely += 32;
+ dst_y += 32;
+ }
+}
+
+void SobelXYRow_MSA(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
+ int x;
+ v16u8 src0, src1, vec0, vec1, vec2;
+ v16u8 reg0, reg1, dst0, dst1, dst2, dst3;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+ vec0 = __msa_adds_u_b(src0, src1);
+ vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1);
+ vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1);
+ reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0);
+ reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0);
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1);
+ dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2);
+ dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+ src_sobelx += 16;
+ src_sobely += 16;
+ dst_argb += 64;
+ }
+}
+
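+// JPEG (full range) luma: y = (15 * b + 75 * g + 38 * r + 64) >> 7, with the
+// B/G weights packed into const_0x4B0F.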
+void ARGBToYJRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0;
+ v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
+ v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26);
+ v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
+ ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7,
+ dst0);
+ ST_UB(dst0, dst_y);
+ src_argb0 += 64;
+ dst_y += 16;
+ }
+}
+
+void BGRAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0;
+ v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200);
+ v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981);
+ v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
+ ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8,
+ dst0);
+ ST_UB(dst0, dst_y);
+ src_argb0 += 64;
+ dst_y += 16;
+ }
+}
+
+void ABGRToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0;
+ v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142);
+ v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19);
+ v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
+ ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8,
+ dst0);
+ ST_UB(dst0, dst_y);
+ src_argb0 += 64;
+ dst_y += 16;
+ }
+}
+
+void RGBAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0;
+ v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900);
+ v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281);
+ v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
+ ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8,
+ dst0);
+ ST_UB(dst0, dst_y);
+ src_argb0 += 64;
+ dst_y += 16;
+ }
+}
+
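+// JPEG (full range) chroma on 2x2 averaged pixels:
+// u = (127 * b - 84 * g - 43 * r + 0x8080) >> 8
+// v = (127 * r - 107 * g - 20 * b + 0x8080) >> 8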
+void ARGBToUVJRow_MSA(const uint8* src_rgb0,
+ int src_stride_rgb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ const uint8* s = src_rgb0;
+ const uint8* t = src_rgb0 + src_stride_rgb;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 vec0, vec1, vec2, vec3;
+ v16u8 dst0, dst1;
+ v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+ v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
+ 18, 19, 22, 23, 26, 27, 30, 31};
+ v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
+ v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
+ v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F);
+ v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14);
+ v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54);
+ v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+ for (x = 0; x < width; x += 32) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
+ src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+ src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+ src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+ src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
+ src0 = __msa_aver_u_b(src0, src4);
+ src1 = __msa_aver_u_b(src1, src5);
+ src2 = __msa_aver_u_b(src2, src6);
+ src3 = __msa_aver_u_b(src3, src7);
+ src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
+ src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
+ src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
+ src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
+ vec0 = __msa_aver_u_b(src4, src6);
+ vec1 = __msa_aver_u_b(src5, src7);
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 64);
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 80);
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 96);
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 112);
+ src4 = (v16u8)__msa_ld_b((v16i8*)t, 64);
+ src5 = (v16u8)__msa_ld_b((v16i8*)t, 80);
+ src6 = (v16u8)__msa_ld_b((v16i8*)t, 96);
+ src7 = (v16u8)__msa_ld_b((v16i8*)t, 112);
+ src0 = __msa_aver_u_b(src0, src4);
+ src1 = __msa_aver_u_b(src1, src5);
+ src2 = __msa_aver_u_b(src2, src6);
+ src3 = __msa_aver_u_b(src3, src7);
+ src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
+ src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
+ src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
+ src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
+ vec2 = __msa_aver_u_b(src4, src6);
+ vec3 = __msa_aver_u_b(src5, src7);
+ ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54,
+ const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
+ dst1);
+ ST_UB(dst0, dst_v);
+ ST_UB(dst1, dst_u);
+ s += 128;
+ t += 128;
+ dst_v += 16;
+ dst_u += 16;
+ }
+}
+
+void BGRAToUVRow_MSA(const uint8* src_rgb0,
+ int src_stride_rgb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ const uint8* s = src_rgb0;
+ const uint8* t = src_rgb0 + src_stride_rgb;
+ v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
+ v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+ v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
+ 18, 19, 22, 23, 26, 27, 30, 31};
+ v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
+ v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
+ v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
+ v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
+ v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A);
+ v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+ for (x = 0; x < width; x += 32) {
+ READ_ARGB(s, t, vec0, vec1, vec2, vec3);
+ ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
+ const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
+ dst1);
+ ST_UB(dst0, dst_v);
+ ST_UB(dst1, dst_u);
+ s += 128;
+ t += 128;
+ dst_v += 16;
+ dst_u += 16;
+ }
+}
+
+void ABGRToUVRow_MSA(const uint8* src_rgb0,
+ int src_stride_rgb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ const uint8* s = src_rgb0;
+ const uint8* t = src_rgb0 + src_stride_rgb;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1;
+ v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+ v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
+ 18, 19, 22, 23, 26, 27, 30, 31};
+ v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
+ v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
+ v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26);
+ v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070);
+ v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
+ v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+ for (x = 0; x < width; x += 32) {
+ READ_ARGB(s, t, src0, src1, src2, src3);
+ ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E,
+ const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
+ dst1);
+ ST_UB(dst0, dst_u);
+ ST_UB(dst1, dst_v);
+ s += 128;
+ t += 128;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void RGBAToUVRow_MSA(const uint8* src_rgb0,
+ int src_stride_rgb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
+ int x;
+ const uint8* s = src_rgb0;
+ const uint8* t = src_rgb0 + src_stride_rgb;
+ v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
+ v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+ v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
+ 18, 19, 22, 23, 26, 27, 30, 31};
+ v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
+ v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
+  v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A);
+  v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
+  v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
+ v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+ for (x = 0; x < width; x += 32) {
+ READ_ARGB(s, t, vec0, vec1, vec2, vec3);
+    ARGBTOUV(vec0, vec1, vec2, vec3, const_0x264A, const_0x7000, const_0x125E,
+ const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
+ dst1);
+ ST_UB(dst0, dst_u);
+ ST_UB(dst1, dst_v);
+ s += 128;
+ t += 128;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
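+// I444 has full-resolution chroma, so the YUV-to-RGB matrix is applied
+// inline in 32-bit precision rather than through the subsampling YUVTORGB
+// macro.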
+void I444ToARGBRow_MSA(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, dst0, dst1;
+ v8u16 vec0, vec1, vec2;
+ v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v8i16 zero = {0};
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+
+ for (x = 0; x < width; x += 8) {
+ READI444(src_y, src_u, src_v, src0, src1, src2);
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+ reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
+ reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
+ reg0 *= vec_yg;
+ reg1 *= vec_yg;
+ reg0 = __msa_srai_w(reg0, 16);
+ reg1 = __msa_srai_w(reg1, 16);
+ reg4 = reg0 + vec_br;
+ reg5 = reg1 + vec_br;
+ reg2 = reg0 + vec_bg;
+ reg3 = reg1 + vec_bg;
+ reg0 += vec_bb;
+ reg1 += vec_bb;
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
+ vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2);
+ reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
+ reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
+ reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
+ reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
+ reg0 -= reg6 * vec_ub;
+ reg1 -= reg7 * vec_ub;
+ reg2 -= reg6 * vec_ug;
+ reg3 -= reg7 * vec_ug;
+ reg4 -= reg8 * vec_vr;
+ reg5 -= reg9 * vec_vr;
+ reg2 -= reg8 * vec_vg;
+ reg3 -= reg9 * vec_vg;
+ reg0 = __msa_srai_w(reg0, 6);
+ reg1 = __msa_srai_w(reg1, 6);
+ reg2 = __msa_srai_w(reg2, 6);
+ reg3 = __msa_srai_w(reg3, 6);
+ reg4 = __msa_srai_w(reg4, 6);
+ reg5 = __msa_srai_w(reg5, 6);
+ CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+ vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
+ vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
+ vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2);
+ dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0);
+ dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0);
+ ST_UB2(dst0, dst1, rgb_buf, 16);
+ src_y += 8;
+ src_u += 8;
+ src_v += 8;
+ rgb_buf += 32;
+ }
+}
+
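+// I400 (Y only): gray = clamp((((y * 0x0101 * 0x4A35) >> 16) - 1160) >> 6),
+// i.e. the BT.601 luma scale, replicated to B/G/R with alpha = 255.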
+void I400ToARGBRow_MSA(const uint8* src_y, uint8* rgb_buf, int width) {
+ int x;
+ v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3;
+ v8i16 vec0, vec1;
+ v4i32 reg0, reg1, reg2, reg3;
+ v4i32 vec_yg = __msa_fill_w(0x4A35);
+ v8i16 vec_ygb = __msa_fill_h(0xFB78);
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v8i16 max = __msa_ldi_h(0xFF);
+ v8i16 zero = {0};
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_y, 0);
+ vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+ vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+ reg0 = (v4i32)__msa_ilvr_h(zero, vec0);
+ reg1 = (v4i32)__msa_ilvl_h(zero, vec0);
+ reg2 = (v4i32)__msa_ilvr_h(zero, vec1);
+ reg3 = (v4i32)__msa_ilvl_h(zero, vec1);
+ reg0 *= vec_yg;
+ reg1 *= vec_yg;
+ reg2 *= vec_yg;
+ reg3 *= vec_yg;
+ reg0 = __msa_srai_w(reg0, 16);
+ reg1 = __msa_srai_w(reg1, 16);
+ reg2 = __msa_srai_w(reg2, 16);
+ reg3 = __msa_srai_w(reg3, 16);
+ vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+ vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+ vec0 += vec_ygb;
+ vec1 += vec_ygb;
+ vec0 = __msa_srai_h(vec0, 6);
+ vec1 = __msa_srai_h(vec1, 6);
+ vec0 = __msa_maxi_s_h(vec0, 0);
+ vec1 = __msa_maxi_s_h(vec1, 0);
+ vec0 = __msa_min_s_h(max, vec0);
+ vec1 = __msa_min_s_h(max, vec1);
+ res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0);
+ res2 = (v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0);
+ res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0);
+ res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0);
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1);
+ dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2);
+ dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2);
+ ST_UB4(dst0, dst1, dst2, dst3, rgb_buf, 16);
+ src_y += 16;
+ rgb_buf += 64;
+ }
+}
+
+void J400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width) {
+ int x;
+ v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_y, 0);
+ vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+ vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+ vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0);
+ vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0);
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
+ dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
+ dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+ src_y += 16;
+ dst_argb += 64;
+ }
+}
+
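+// YUY2 packs bytes as Y0 U Y1 V; pckev/pckod split them into a Y vector and
+// an interleaved UV vector before the shared YUVTORGB step.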
+void YUY2ToARGBRow_MSA(const uint8* src_yuy2,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2;
+ v8i16 vec0, vec1, vec2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_yuy2, 0);
+ src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
+ src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
+ YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec1, vec2);
+ STOREARGB(vec0, vec1, vec2, alpha, rgb_buf);
+ src_yuy2 += 16;
+ rgb_buf += 32;
+ }
+}
+
+void UYVYToARGBRow_MSA(const uint8* src_uyvy,
+ uint8* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2;
+ v8i16 vec0, vec1, vec2;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ubvr, vec_ugvg;
+ v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+ vec_br, vec_yg);
+ vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+ vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_uyvy, 0);
+ src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
+ src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
+ YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+ vec0, vec1, vec2);
+ STOREARGB(vec0, vec1, vec2, alpha, rgb_buf);
+ src_uyvy += 16;
+ rgb_buf += 32;
+ }
+}
+
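+// Bilinear row blend: dst = (src * (256 - f) + src_below * f + 128) >> 8,
+// with memcpy and vector-average fast paths for f == 0 and f == 128.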
+void InterpolateRow_MSA(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int32 source_y_fraction) {
+ int32 y1_fraction = source_y_fraction;
+ int32 y0_fraction = 256 - y1_fraction;
+ uint16 y_fractions;
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1;
+ v8u16 vec0, vec1, vec2, vec3, y_frac;
+
+ if (0 == y1_fraction) {
+ memcpy(dst_ptr, src_ptr, width);
+ return;
+ }
+
+ if (128 == y1_fraction) {
+ for (x = 0; x < width; x += 32) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+ src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+ dst0 = __msa_aver_u_b(src0, src2);
+ dst1 = __msa_aver_u_b(src1, src3);
+ ST_UB2(dst0, dst1, dst_ptr, 16);
+ s += 32;
+ t += 32;
+ dst_ptr += 32;
+ }
+ return;
+ }
+
+ y_fractions = (uint16)(y0_fraction + (y1_fraction << 8));
+ y_frac = (v8u16)__msa_fill_h(y_fractions);
+
+ for (x = 0; x < width; x += 32) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+ src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+ vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+ vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+ vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+ vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac);
+ vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac);
+ vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac);
+ vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac);
+ vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8);
+ vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8);
+ vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8);
+ vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ ST_UB2(dst0, dst1, dst_ptr, 16);
+ s += 32;
+ t += 32;
+ dst_ptr += 32;
+ }
+}
+
+void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int width) {
+ int x;
+ v16u8 dst0 = (v16u8)__msa_fill_w(v32);
+
+ for (x = 0; x < width; x += 4) {
+ ST_UB(dst0, dst_argb);
+ dst_argb += 16;
+ }
+}
+
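+// RAW (RGB byte order) to RGB24 swaps the first and third byte of every
+// 3-byte pixel, using shuffles that span adjacent vectors.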
+void RAWToRGB24Row_MSA(const uint8* src_raw, uint8* dst_rgb24, int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
+ v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17};
+ v16i8 shuffler1 = {8, 7, 12, 11, 10, 15, 14, 13,
+ 18, 17, 16, 21, 20, 19, 24, 23};
+ v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25,
+ 24, 23, 28, 27, 26, 31, 30, 29};
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_raw, 32);
+ src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8);
+ src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
+ dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
+ dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3);
+ dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1);
+ ST_UB2(dst0, dst1, dst_rgb24, 16);
+ ST_UB(dst2, (dst_rgb24 + 32));
+ src_raw += 48;
+ dst_rgb24 += 48;
+ }
+}
+
+void MergeUVRow_MSA(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
+ int width) {
+ int x;
+ v16u8 src0, src1, dst0, dst1;
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_u, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_v, 0);
+ dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0);
+ dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0);
+ ST_UB2(dst0, dst1, dst_uv, 16);
+ src_u += 16;
+ src_v += 16;
+ dst_uv += 32;
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
diff --git a/files/source/row_neon.cc b/files/source/row_neon.cc
index 909df060..bed14e07 100644
--- a/files/source/row_neon.cc
+++ b/files/source/row_neon.cc
@@ -10,6 +10,8 @@
#include "libyuv/row.h"
+#include <stdio.h>
+
#ifdef __cplusplus
namespace libyuv {
extern "C" {
@@ -20,29 +22,18 @@ extern "C" {
!defined(__aarch64__)
// Read 8 Y, 4 U and 4 V from 422
-#define READYUV422 \
- MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
+#define READYUV422 \
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
MEMACCESS(1) \
"vld1.32 {d2[0]}, [%1]! \n" \
MEMACCESS(2) \
"vld1.32 {d2[1]}, [%2]! \n"
-// Read 8 Y, 2 U and 2 V from 422
-#define READYUV411 \
- MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
- MEMACCESS(1) \
- "vld1.16 {d2[0]}, [%1]! \n" \
- MEMACCESS(2) \
- "vld1.16 {d2[1]}, [%2]! \n" \
- "vmov.u8 d3, d2 \n" \
- "vzip.u8 d2, d3 \n"
-
// Read 8 Y, 8 U and 8 V from 444
-#define READYUV444 \
- MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
+#define READYUV444 \
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
MEMACCESS(1) \
"vld1.8 {d2}, [%1]! \n" \
MEMACCESS(2) \
@@ -51,15 +42,15 @@ extern "C" {
"vrshrn.u16 d2, q1, #1 \n"
// Read 8 Y, and set 4 U and 4 V to 128
-#define READYUV400 \
- MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
- "vmov.u8 d2, #128 \n"
+#define READYUV400 \
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vmov.u8 d2, #128 \n"
// Read 8 Y and 4 UV from NV12
-#define READNV12 \
- MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
+#define READNV12 \
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
MEMACCESS(1) \
"vld1.8 {d2}, [%1]! \n" \
"vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
@@ -67,9 +58,9 @@ extern "C" {
"vtrn.u32 d2, d3 \n"
// Read 8 Y and 4 VU from NV21
-#define READNV21 \
- MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
+#define READNV21 \
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
MEMACCESS(1) \
"vld1.8 {d2}, [%1]! \n" \
"vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
@@ -77,25 +68,25 @@ extern "C" {
"vtrn.u32 d2, d3 \n"
// Read 8 YUY2
-#define READYUY2 \
- MEMACCESS(0) \
- "vld2.8 {d0, d2}, [%0]! \n" \
- "vmov.u8 d3, d2 \n" \
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
+#define READYUY2 \
+ MEMACCESS(0) \
+ "vld2.8 {d0, d2}, [%0]! \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
// Read 8 UYVY
-#define READUYVY \
- MEMACCESS(0) \
- "vld2.8 {d2, d3}, [%0]! \n" \
- "vmov.u8 d0, d3 \n" \
- "vmov.u8 d3, d2 \n" \
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
-
-#define YUVTORGB_SETUP \
- MEMACCESS([kUVToRB]) \
- "vld1.8 {d24}, [%[kUVToRB]] \n" \
+#define READUYVY \
+ MEMACCESS(0) \
+ "vld2.8 {d2, d3}, [%0]! \n" \
+ "vmov.u8 d0, d3 \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+#define YUVTORGB_SETUP \
+ MEMACCESS([kUVToRB]) \
+ "vld1.8 {d24}, [%[kUVToRB]] \n" \
MEMACCESS([kUVToG]) \
"vld1.8 {d25}, [%[kUVToG]] \n" \
MEMACCESS([kUVBiasBGR]) \
@@ -107,32 +98,32 @@ extern "C" {
MEMACCESS([kYToRgb]) \
"vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"
-#define YUVTORGB \
- "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */\
- "vmull.u8 q9, d2, d25 \n" /* u/v G component */\
- "vmovl.u8 q0, d0 \n" /* Y */\
- "vmovl.s16 q10, d1 \n" \
- "vmovl.s16 q0, d0 \n" \
- "vmul.s32 q10, q10, q15 \n" \
- "vmul.s32 q0, q0, q15 \n" \
- "vqshrun.s32 d0, q0, #16 \n" \
- "vqshrun.s32 d1, q10, #16 \n" /* Y */\
- "vadd.s16 d18, d19 \n" \
- "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */\
- "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */\
- "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/\
- "vaddw.u16 q1, q1, d16 \n" \
- "vaddw.u16 q10, q10, d17 \n" \
- "vaddw.u16 q3, q3, d18 \n" \
- "vqadd.s16 q8, q0, q13 \n" /* B */ \
- "vqadd.s16 q9, q0, q14 \n" /* R */ \
- "vqadd.s16 q0, q0, q4 \n" /* G */ \
- "vqadd.s16 q8, q8, q1 \n" /* B */ \
- "vqadd.s16 q9, q9, q10 \n" /* R */ \
- "vqsub.s16 q0, q0, q3 \n" /* G */ \
- "vqshrun.s16 d20, q8, #6 \n" /* B */ \
- "vqshrun.s16 d22, q9, #6 \n" /* R */ \
- "vqshrun.s16 d21, q0, #6 \n" /* G */
+#define YUVTORGB \
+ "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */ \
+ "vmull.u8 q9, d2, d25 \n" /* u/v G component */ \
+ "vmovl.u8 q0, d0 \n" /* Y */ \
+ "vmovl.s16 q10, d1 \n" \
+ "vmovl.s16 q0, d0 \n" \
+ "vmul.s32 q10, q10, q15 \n" \
+ "vmul.s32 q0, q0, q15 \n" \
+ "vqshrun.s32 d0, q0, #16 \n" \
+ "vqshrun.s32 d1, q10, #16 \n" /* Y */ \
+ "vadd.s16 d18, d19 \n" \
+ "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */ \
+ "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */ \
+ "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/ \
+ "vaddw.u16 q1, q1, d16 \n" \
+ "vaddw.u16 q10, q10, d17 \n" \
+ "vaddw.u16 q3, q3, d18 \n" \
+ "vqadd.s16 q8, q0, q13 \n" /* B */ \
+ "vqadd.s16 q9, q0, q14 \n" /* R */ \
+ "vqadd.s16 q0, q0, q4 \n" /* G */ \
+ "vqadd.s16 q8, q8, q1 \n" /* B */ \
+ "vqadd.s16 q9, q9, q10 \n" /* R */ \
+ "vqsub.s16 q0, q0, q3 \n" /* G */ \
+ "vqshrun.s16 d20, q8, #6 \n" /* B */ \
+ "vqshrun.s16 d22, q9, #6 \n" /* R */ \
+ "vqshrun.s16 d21, q0, #6 \n" /* G */
void I444ToARGBRow_NEON(const uint8* src_y,
const uint8* src_u,
@@ -227,36 +218,6 @@ void I422AlphaToARGBRow_NEON(const uint8* src_y,
);
}
-void I411ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n"
- READYUV411
- YUVTORGB
- "subs %4, %4, #8 \n"
- MEMACCESS(3)
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
void I422ToRGBARow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -316,12 +277,12 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
);
}
-#define ARGBTORGB565 \
- "vshll.u8 q0, d22, #8 \n" /* R */ \
- "vshll.u8 q8, d21, #8 \n" /* G */ \
- "vshll.u8 q9, d20, #8 \n" /* B */ \
- "vsri.16 q0, q8, #5 \n" /* RG */ \
- "vsri.16 q0, q9, #11 \n" /* RGB */
+#define ARGBTORGB565 \
+ "vshll.u8 q0, d22, #8 \n" /* R */ \
+ "vshll.u8 q8, d21, #8 \n" /* G */ \
+ "vshll.u8 q9, d20, #8 \n" /* B */ \
+ "vsri.16 q0, q8, #5 \n" /* RG */ \
+ "vsri.16 q0, q9, #11 \n" /* RGB */
void I422ToRGB565Row_NEON(const uint8* src_y,
const uint8* src_u,
@@ -353,14 +314,14 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
);
}
-#define ARGBTOARGB1555 \
- "vshll.u8 q0, d23, #8 \n" /* A */ \
- "vshll.u8 q8, d22, #8 \n" /* R */ \
- "vshll.u8 q9, d21, #8 \n" /* G */ \
- "vshll.u8 q10, d20, #8 \n" /* B */ \
- "vsri.16 q0, q8, #1 \n" /* AR */ \
- "vsri.16 q0, q9, #6 \n" /* ARG */ \
- "vsri.16 q0, q10, #11 \n" /* ARGB */
+#define ARGBTOARGB1555 \
+ "vshll.u8 q0, d23, #8 \n" /* A */ \
+ "vshll.u8 q8, d22, #8 \n" /* R */ \
+ "vshll.u8 q9, d21, #8 \n" /* G */ \
+ "vshll.u8 q10, d20, #8 \n" /* B */ \
+ "vsri.16 q0, q8, #1 \n" /* AR */ \
+ "vsri.16 q0, q9, #6 \n" /* ARG */ \
+ "vsri.16 q0, q10, #11 \n" /* ARGB */
void I422ToARGB1555Row_NEON(const uint8* src_y,
const uint8* src_u,
@@ -393,14 +354,14 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
);
}
-#define ARGBTOARGB4444 \
- "vshr.u8 d20, d20, #4 \n" /* B */ \
- "vbic.32 d21, d21, d4 \n" /* G */ \
- "vshr.u8 d22, d22, #4 \n" /* R */ \
- "vbic.32 d23, d23, d4 \n" /* A */ \
- "vorr d0, d20, d21 \n" /* BG */ \
- "vorr d1, d22, d23 \n" /* RA */ \
- "vzip.u8 d0, d1 \n" /* BGRA */
+#define ARGBTOARGB4444 \
+ "vshr.u8 d20, d20, #4 \n" /* B */ \
+ "vbic.32 d21, d21, d4 \n" /* G */ \
+ "vshr.u8 d22, d22, #4 \n" /* R */ \
+ "vbic.32 d23, d23, d4 \n" /* A */ \
+ "vorr d0, d20, d21 \n" /* BG */ \
+ "vorr d1, d22, d23 \n" /* RA */ \
+ "vzip.u8 d0, d1 \n" /* BGRA */
void I422ToARGB4444Row_NEON(const uint8* src_y,
const uint8* src_u,
@@ -434,9 +395,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
);
}
-void I400ToARGBRow_NEON(const uint8* src_y,
- uint8* dst_argb,
- int width) {
+void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
asm volatile (
YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
@@ -459,9 +418,7 @@ void I400ToARGBRow_NEON(const uint8* src_y,
);
}
-void J400ToARGBRow_NEON(const uint8* src_y,
- uint8* dst_argb,
- int width) {
+void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
asm volatile (
"vmov.u8 d23, #255 \n"
"1: \n"
@@ -618,7 +575,9 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
}
 // Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.

-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_NEON(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
int width) {
asm volatile (
"1: \n"
@@ -640,7 +599,9 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
}
// Reads 16 U's and V's and writes out 16 pairs of UV.
-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_NEON(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
int width) {
asm volatile (
"1: \n"
@@ -737,7 +698,9 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
);
}
-void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void MirrorUVRow_NEON(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
int width) {
asm volatile (
// Start at end of source row.
@@ -844,17 +807,17 @@ void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
);
}
-#define RGB565TOARGB \
- "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
- "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
- "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
- "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
- "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
- "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
- "vorr.u8 d0, d0, d4 \n" /* B */ \
- "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
- "vorr.u8 d2, d1, d5 \n" /* R */ \
- "vorr.u8 d1, d4, d6 \n" /* G */
+#define RGB565TOARGB \
+ "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
+ "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
+ "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
+ "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
+ "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
+ "vorr.u8 d0, d0, d4 \n" /* B */ \
+ "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
+ "vorr.u8 d2, d1, d5 \n" /* R */ \
+ "vorr.u8 d1, d4, d6 \n" /* G */
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
asm volatile (
@@ -875,34 +838,35 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
);
}
-#define ARGB1555TOARGB \
- "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
- "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
- "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
- "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
- "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
- "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
- "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
- "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
- "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
- "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
- "vorr.u8 q1, q1, q3 \n" /* R,A */ \
- "vorr.u8 q0, q0, q2 \n" /* B,G */ \
+#define ARGB1555TOARGB \
+ "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
+ "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
+ "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
+ "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
+ "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
+ "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
+ "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
+ "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
+ "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
+ "vorr.u8 q1, q1, q3 \n" /* R,A */ \
+ "vorr.u8 q0, q0, q2 \n" /* B,G */
 // RGB555TOARGB is the same as ARGB1555TOARGB but ignores alpha.
-#define RGB555TOARGB \
- "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
- "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
- "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
- "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
- "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
- "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
- "vorr.u8 d0, d0, d4 \n" /* B */ \
- "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
- "vorr.u8 d2, d1, d5 \n" /* R */ \
- "vorr.u8 d1, d4, d6 \n" /* G */
-
-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+#define RGB555TOARGB \
+ "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
+ "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
+ "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
+ "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
+ "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
+ "vorr.u8 d0, d0, d4 \n" /* B */ \
+ "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
+ "vorr.u8 d2, d1, d5 \n" /* R */ \
+ "vorr.u8 d1, d4, d6 \n" /* G */
+
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
+ uint8* dst_argb,
int width) {
asm volatile (
"vmov.u8 d3, #255 \n" // Alpha
@@ -922,17 +886,18 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
);
}
-#define ARGB4444TOARGB \
- "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
- "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
- "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
- "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
- "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
- "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
- "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
- "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
+#define ARGB4444TOARGB \
+ "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
+ "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
+ "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
+ "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
+ "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
+ "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
+ "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
+ "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
+ uint8* dst_argb,
int width) {
asm volatile (
"vmov.u8 d3, #255 \n" // Alpha
@@ -1021,7 +986,9 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
);
}
-void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
int width) {
asm volatile (
"1: \n"
@@ -1042,7 +1009,9 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
);
}
-void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+void UYVYToUV422Row_NEON(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
int width) {
asm volatile (
"1: \n"
@@ -1063,8 +1032,11 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
);
}
-void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+void YUY2ToUVRow_NEON(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"add %1, %0, %1 \n" // stride + src_yuy2
"1: \n"
@@ -1090,8 +1062,11 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
);
}
-void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+void UYVYToUVRow_NEON(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"add %1, %0, %1 \n" // stride + src_uyvy
"1: \n"
@@ -1118,8 +1093,10 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width) {
+void ARGBShuffleRow_NEON(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width) {
asm volatile (
MEMACCESS(3)
"vld1.8 {q2}, [%3] \n" // shuffler
@@ -1143,7 +1120,8 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
void I422ToYUY2Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* dst_yuy2, int width) {
+ uint8* dst_yuy2,
+ int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -1169,7 +1147,8 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
void I422ToUYVYRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* dst_uyvy, int width) {
+ uint8* dst_uyvy,
+ int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -1210,8 +1189,10 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
);
}
-void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width) {
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width) {
asm volatile (
"vdup.32 d2, %2 \n" // dither4
"1: \n"
@@ -1233,7 +1214,8 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
);
}
-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+void ARGBToARGB1555Row_NEON(const uint8* src_argb,
+ uint8* dst_argb1555,
int width) {
asm volatile (
"1: \n"
@@ -1252,7 +1234,8 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
);
}
-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+void ARGBToARGB4444Row_NEON(const uint8* src_argb,
+ uint8* dst_argb4444,
int width) {
asm volatile (
"vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
@@ -1341,7 +1324,9 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
}
// 8x1 pixels.
-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void ARGBToUV444Row_NEON(const uint8* src_argb,
+ uint8* dst_u,
+ uint8* dst_v,
int width) {
asm volatile (
"vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient
@@ -1381,85 +1366,31 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
);
}
-// 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32.
-void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int width) {
- asm volatile (
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(0)
- "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels.
- "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts.
- "vpadd.u16 d1, d8, d9 \n" // B
- "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts.
- "vpadd.u16 d3, d10, d11 \n" // G
- "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts.
- "vpadd.u16 d5, d12, d13 \n" // R
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %3, %3, #32 \n" // 32 processed per loop.
- "vmul.s16 q8, q0, q10 \n" // B
- "vmls.s16 q8, q1, q11 \n" // G
- "vmls.s16 q8, q2, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q2, q10 \n" // R
- "vmls.s16 q9, q1, q14 \n" // G
- "vmls.s16 q9, q0, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
- MEMACCESS(2)
- "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
-#define RGBTOUV(QB, QG, QR) \
- "vmul.s16 q8, " #QB ", q10 \n" /* B */ \
- "vmls.s16 q8, " #QG ", q11 \n" /* G */ \
- "vmls.s16 q8, " #QR ", q12 \n" /* R */ \
- "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
- "vmul.s16 q9, " #QR ", q10 \n" /* R */ \
- "vmls.s16 q9, " #QG ", q14 \n" /* G */ \
- "vmls.s16 q9, " #QB ", q13 \n" /* B */ \
- "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
- "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
- "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
+#define RGBTOUV(QB, QG, QR) \
+ "vmul.s16 q8, " #QB \
+ ", q10 \n" /* B */ \
+ "vmls.s16 q8, " #QG \
+ ", q11 \n" /* G */ \
+ "vmls.s16 q8, " #QR \
+ ", q12 \n" /* R */ \
+ "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
+ "vmul.s16 q9, " #QR \
+ ", q10 \n" /* R */ \
+ "vmls.s16 q9, " #QG \
+ ", q14 \n" /* G */ \
+ "vmls.s16 q9, " #QB \
+ ", q13 \n" /* B */ \
+ "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
+ "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
+ "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
-void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUVRow_NEON(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@@ -1507,8 +1438,11 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
}
 // TODO(fbarchard): Make the subsampling match the C code.
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUVJRow_NEON(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
"vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
@@ -1555,8 +1489,11 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
);
}
-void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width) {
+void BGRAToUVRow_NEON(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_bgra
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@@ -1603,8 +1540,11 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
);
}
-void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width) {
+void ABGRToUVRow_NEON(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_abgr
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@@ -1651,8 +1591,11 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
);
}
-void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width) {
+void RGBAToUVRow_NEON(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_rgba
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@@ -1699,8 +1642,11 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
);
}
-void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
- uint8* dst_u, uint8* dst_v, int width) {
+void RGB24ToUVRow_NEON(const uint8* src_rgb24,
+ int src_stride_rgb24,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_rgb24
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@@ -1747,8 +1693,11 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
);
}
-void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
- uint8* dst_u, uint8* dst_v, int width) {
+void RAWToUVRow_NEON(const uint8* src_raw,
+ int src_stride_raw,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_raw
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@@ -1796,8 +1745,11 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
- uint8* dst_u, uint8* dst_v, int width) {
+void RGB565ToUVRow_NEON(const uint8* src_rgb565,
+ int src_stride_rgb565,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@@ -1865,8 +1817,11 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
- uint8* dst_u, uint8* dst_v, int width) {
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@@ -1934,8 +1889,11 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
- uint8* dst_u, uint8* dst_v, int width) {
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
+ int src_stride_argb4444,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@@ -2215,8 +2173,10 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_NEON(uint8* dst_ptr,
- const uint8* src_ptr, ptrdiff_t src_stride,
- int dst_width, int source_y_fraction) {
+ const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
int y1_fraction = source_y_fraction;
asm volatile (
"cmp %4, #0 \n"
@@ -2280,8 +2240,10 @@ void InterpolateRow_NEON(uint8* dst_ptr,
}
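InterpolateRow_NEON blends the same column of two source rows with 8-bit fixed-point weights, source_y_fraction/256 being the second row's weight, with fast paths for special fractions (the cmp #0 above guards one of them). A scalar sketch of the general case (hypothetical helper; the NEON path uses vmull/vmlal and a rounding narrow):

    #include <stdint.h>
    static void InterpolateRowSketch(uint8_t* dst, const uint8_t* src0,
                                     const uint8_t* src1, int width,
                                     int source_y_fraction) {
      int y1 = source_y_fraction;  /* weight of the second row, out of 256 */
      int y0 = 256 - y1;           /* weight of the first row */
      for (int i = 0; i < width; ++i) {
        dst[i] = (uint8_t)((src0[i] * y0 + src1[i] * y1 + 128) >> 8);
      }
    }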
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
-void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+void ARGBBlendRow_NEON(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
asm volatile (
"subs %3, #8 \n"
"blt 89f \n"
@@ -2371,8 +2333,11 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
// Quantize 8 ARGB pixels (32 bytes).
// dst = (dst * scale >> 16) * interval_size + interval_offset;
-void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
- int interval_offset, int width) {
+void ARGBQuantizeRow_NEON(uint8* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
asm volatile (
"vdup.u16 q8, %2 \n"
"vshr.u16 q8, q8, #1 \n" // scale >>= 1
@@ -2414,7 +2379,9 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
// Shade 8 pixels at a time by specified value.
 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
 // Rounding in vqrdmulh adds +1 to the high half if the high bit of the low s16 is set.
-void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+void ARGBShadeRow_NEON(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
uint32 value) {
asm volatile (
"vdup.u32 q0, %3 \n" // duplicate scale value.
@@ -2523,8 +2490,10 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
 // Transform 8 ARGB pixels (32 bytes) with color matrix.
// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
// needs to saturate. Consider doing a non-saturating version.
-void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
- const int8* matrix_argb, int width) {
+void ARGBColorMatrixRow_NEON(const uint8* src_argb,
+ uint8* dst_argb,
+ const int8* matrix_argb,
+ int width) {
asm volatile (
MEMACCESS(3)
"vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
@@ -2584,8 +2553,10 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
}
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+void ARGBMultiplyRow_NEON(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
asm volatile (
// 8 pixel loop.
"1: \n"
@@ -2616,8 +2587,10 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
}
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+void ARGBAddRow_NEON(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
asm volatile (
// 8 pixel loop.
"1: \n"
@@ -2642,8 +2615,10 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
}
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+void ARGBSubtractRow_NEON(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
asm volatile (
// 8 pixel loop.
"1: \n"
@@ -2672,8 +2647,10 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
// R = Sobel
// G = Sobel
// B = Sobel
-void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
+void SobelRow_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
asm volatile (
"vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
@@ -2699,8 +2676,10 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
}
// Adds Sobel X and Sobel Y and stores Sobel into plane.
-void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_y, int width) {
+void SobelToPlaneRow_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width) {
asm volatile (
// 16 pixel loop.
"1: \n"
@@ -2727,8 +2706,10 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
// R = Sobel X
// G = Sobel
// B = Sobel Y
-void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
+void SobelXYRow_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
asm volatile (
"vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
@@ -2755,8 +2736,11 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
// -1 0 1
// -2 0 2
// -1 0 1
-void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
- const uint8* src_y2, uint8* dst_sobelx, int width) {
+void SobelXRow_NEON(const uint8* src_y0,
+ const uint8* src_y1,
+ const uint8* src_y2,
+ uint8* dst_sobelx,
+ int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -2798,8 +2782,10 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
// -1 -2 -1
// 0 0 0
// 1 2 1
-void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
- uint8* dst_sobely, int width) {
+void SobelYRow_NEON(const uint8* src_y0,
+ const uint8* src_y1,
+ uint8* dst_sobely,
+ int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -2835,7 +2821,63 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
: "cc", "memory", "q0", "q1" // Clobber List
);
}
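SobelXRow applies the 3x3 kernel shown above one output byte at a time: column differences two pixels apart from three source rows, the middle row weighted twice, then an absolute value clamped to 255 (SobelYRow is the transposed kernel over two rows). A hedged scalar sketch:

    #include <stdint.h>
    static void SobelXRowSketch(const uint8_t* y0, const uint8_t* y1,
                                const uint8_t* y2, uint8_t* dst, int width) {
      for (int i = 0; i < width; ++i) {
        int a = y0[i] - y0[i + 2];      /* top row:    -1 0 1 */
        int b = y1[i] - y1[i + 2];      /* middle row: -2 0 2 */
        int c = y2[i] - y2[i + 2];      /* bottom row: -1 0 1 */
        int sobel = a + 2 * b + c;
        if (sobel < 0) sobel = -sobel;  /* absolute value */
        dst[i] = (uint8_t)(sobel > 255 ? 255 : sobel);
      }
    }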
-#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
+ asm volatile (
+ "vdup.32 q0, %3 \n"
+
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q1}, [%0]! \n" // load 8 shorts
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u16 q2, d2 \n" // 8 int's
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, q0 \n" // adjust exponent
+ "vmul.f32 q3, q3, q0 \n"
+ "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
+ "vqshrn.u32 d3, q3, #13 \n"
+ MEMACCESS(1)
+ "vst1.8 {q1}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(1.9259299444e-34f) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+}
+
+// TODO(fbarchard): multiply by element.
+void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
+ asm volatile (
+ "vdup.32 q0, %3 \n"
+
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q1}, [%0]! \n" // load 8 shorts
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u16 q2, d2 \n" // 8 int's
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, q0 \n" // adjust exponent
+ "vmul.f32 q3, q3, q0 \n"
+ "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
+ "vqshrn.u32 d3, q3, #13 \n"
+ MEMACCESS(1)
+ "vst1.8 {q1}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(scale * 1.9259299444e-34f) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+}
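Both HalfFloat rows rely on the constant 1.9259299444e-34f, which is 2^-112: multiplying a float32 by it rebiases the exponent from the float32 bias (127) down to the float16 bias (15), so the half-float bit pattern can be read straight out of the float32 bits shifted right by 13, the difference in mantissa widths (23 - 10). vqshrn.u32 #13 performs that shift with unsigned saturation. A scalar sketch of the trick (hypothetical helper, assuming the scaled values stay in normal half-float range):

    #include <stdint.h>
    #include <string.h>
    static uint16_t UintToHalfSketch(uint16_t v, float scale) {
      float f = (float)v * (scale * 1.9259299444e-34f); /* value * 2^-112 */
      uint32_t bits;
      memcpy(&bits, &f, sizeof(bits));  /* reinterpret float32 as bits */
      return (uint16_t)(bits >> 13);    /* sign | exponent | top 10 mantissa */
    }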
+
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
#ifdef __cplusplus
} // extern "C"
diff --git a/files/source/row_neon64.cc b/files/source/row_neon64.cc
index 6375d4f5..ebd685e4 100644
--- a/files/source/row_neon64.cc
+++ b/files/source/row_neon64.cc
@@ -19,28 +19,18 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
// Read 8 Y, 4 U and 4 V from 422
-#define READYUV422 \
- MEMACCESS(0) \
- "ld1 {v0.8b}, [%0], #8 \n" \
+#define READYUV422 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
MEMACCESS(1) \
"ld1 {v1.s}[0], [%1], #4 \n" \
MEMACCESS(2) \
"ld1 {v1.s}[1], [%2], #4 \n"
-// Read 8 Y, 2 U and 2 V from 422
-#define READYUV411 \
- MEMACCESS(0) \
- "ld1 {v0.8b}, [%0], #8 \n" \
- MEMACCESS(1) \
- "ld1 {v2.h}[0], [%1], #2 \n" \
- MEMACCESS(2) \
- "ld1 {v2.h}[1], [%2], #2 \n" \
- "zip1 v1.8b, v2.8b, v2.8b \n"
-
// Read 8 Y, 8 U and 8 V from 444
-#define READYUV444 \
- MEMACCESS(0) \
- "ld1 {v0.8b}, [%0], #8 \n" \
+#define READYUV444 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
MEMACCESS(1) \
"ld1 {v1.d}[0], [%1], #8 \n" \
MEMACCESS(2) \
@@ -49,15 +39,15 @@ extern "C" {
"rshrn v1.8b, v1.8h, #1 \n"
// Read 8 Y, and set 4 U and 4 V to 128
-#define READYUV400 \
- MEMACCESS(0) \
- "ld1 {v0.8b}, [%0], #8 \n" \
- "movi v1.8b , #128 \n"
+#define READYUV400 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ "movi v1.8b , #128 \n"
// Read 8 Y and 4 UV from NV12
-#define READNV12 \
- MEMACCESS(0) \
- "ld1 {v0.8b}, [%0], #8 \n" \
+#define READNV12 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
MEMACCESS(1) \
"ld1 {v2.8b}, [%1], #8 \n" \
"uzp1 v1.8b, v2.8b, v2.8b \n" \
@@ -65,9 +55,9 @@ extern "C" {
"ins v1.s[1], v3.s[0] \n"
// Read 8 Y and 4 VU from NV21
-#define READNV21 \
- MEMACCESS(0) \
- "ld1 {v0.8b}, [%0], #8 \n" \
+#define READNV21 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
MEMACCESS(1) \
"ld1 {v2.8b}, [%1], #8 \n" \
"uzp1 v3.8b, v2.8b, v2.8b \n" \
@@ -75,57 +65,65 @@ extern "C" {
"ins v1.s[1], v3.s[0] \n"
// Read 8 YUY2
-#define READYUY2 \
- MEMACCESS(0) \
- "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
- "uzp2 v3.8b, v1.8b, v1.8b \n" \
- "uzp1 v1.8b, v1.8b, v1.8b \n" \
- "ins v1.s[1], v3.s[0] \n"
+#define READYUY2 \
+ MEMACCESS(0) \
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
+ "uzp2 v3.8b, v1.8b, v1.8b \n" \
+ "uzp1 v1.8b, v1.8b, v1.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
// Read 8 UYVY
-#define READUYVY \
- MEMACCESS(0) \
- "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
- "orr v0.8b, v3.8b, v3.8b \n" \
- "uzp1 v1.8b, v2.8b, v2.8b \n" \
- "uzp2 v3.8b, v2.8b, v2.8b \n" \
- "ins v1.s[1], v3.s[0] \n"
-
-#define YUVTORGB_SETUP \
- "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \
- "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \
- "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \
- "ld1r {v31.4s}, [%[kYToRgb]] \n" \
- "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
- "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
-
-#define YUVTORGB(vR, vG, vB) \
- "uxtl v0.8h, v0.8b \n" /* Extract Y */ \
- "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
- "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
- "ushll v0.4s, v0.4h, #0 \n" \
- "mul v3.4s, v3.4s, v31.4s \n" \
- "mul v0.4s, v0.4s, v31.4s \n" \
- "sqshrun v0.4h, v0.4s, #16 \n" \
- "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \
- "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
- "mov v2.d[0], v1.d[1] \n" /* Extract V */ \
- "uxtl v2.8h, v2.8b \n" \
- "uxtl v1.8h, v1.8b \n" /* Extract U */ \
- "mul v3.8h, v1.8h, v27.8h \n" \
- "mul v5.8h, v1.8h, v29.8h \n" \
- "mul v6.8h, v2.8h, v30.8h \n" \
- "mul v7.8h, v2.8h, v28.8h \n" \
- "sqadd v6.8h, v6.8h, v5.8h \n" \
- "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \
- "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \
- "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \
- "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \
- "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \
- "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \
- "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \
- "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \
- "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \
+#define READUYVY \
+ MEMACCESS(0) \
+ "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
+ "orr v0.8b, v3.8b, v3.8b \n" \
+ "uzp1 v1.8b, v2.8b, v2.8b \n" \
+ "uzp2 v3.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
+#define YUVTORGB_SETUP \
+ "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \
+ "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \
+ "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \
+ "ld1r {v31.4s}, [%[kYToRgb]] \n" \
+ "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
+ "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
+
+#define YUVTORGB(vR, vG, vB) \
+ "uxtl v0.8h, v0.8b \n" /* Extract Y */ \
+ "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
+ "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
+ "ushll v0.4s, v0.4h, #0 \n" \
+ "mul v3.4s, v3.4s, v31.4s \n" \
+ "mul v0.4s, v0.4s, v31.4s \n" \
+ "sqshrun v0.4h, v0.4s, #16 \n" \
+ "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \
+ "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
+ "mov v2.d[0], v1.d[1] \n" /* Extract V */ \
+ "uxtl v2.8h, v2.8b \n" \
+ "uxtl v1.8h, v1.8b \n" /* Extract U */ \
+ "mul v3.8h, v1.8h, v27.8h \n" \
+ "mul v5.8h, v1.8h, v29.8h \n" \
+ "mul v6.8h, v2.8h, v30.8h \n" \
+ "mul v7.8h, v2.8h, v28.8h \n" \
+ "sqadd v6.8h, v6.8h, v5.8h \n" \
+ "sqadd " #vB \
+ ".8h, v24.8h, v0.8h \n" /* B */ \
+ "sqadd " #vG \
+ ".8h, v25.8h, v0.8h \n" /* G */ \
+ "sqadd " #vR \
+ ".8h, v26.8h, v0.8h \n" /* R */ \
+ "sqadd " #vB ".8h, " #vB \
+ ".8h, v3.8h \n" /* B */ \
+ "sqsub " #vG ".8h, " #vG \
+ ".8h, v6.8h \n" /* G */ \
+ "sqadd " #vR ".8h, " #vR \
+ ".8h, v7.8h \n" /* R */ \
+ "sqshrun " #vB ".8b, " #vB \
+ ".8h, #6 \n" /* B */ \
+ "sqshrun " #vG ".8b, " #vG \
+ ".8h, #6 \n" /* G */ \
+ "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */
void I444ToARGBRow_NEON(const uint8* src_y,
const uint8* src_u,
@@ -220,36 +218,6 @@ void I422AlphaToARGBRow_NEON(const uint8* src_y,
);
}
-void I411ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n" /* A */
- "1: \n"
- READYUV411
- YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- MEMACCESS(3)
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
void I422ToRGBARow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -309,12 +277,12 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
);
}
-#define ARGBTORGB565 \
- "shll v0.8h, v22.8b, #8 \n" /* R */ \
- "shll v21.8h, v21.8b, #8 \n" /* G */ \
- "shll v20.8h, v20.8b, #8 \n" /* B */ \
- "sri v0.8h, v21.8h, #5 \n" /* RG */ \
- "sri v0.8h, v20.8h, #11 \n" /* RGB */
+#define ARGBTORGB565 \
+ "shll v0.8h, v22.8b, #8 \n" /* R */ \
+ "shll v21.8h, v21.8b, #8 \n" /* G */ \
+ "shll v20.8h, v20.8b, #8 \n" /* B */ \
+ "sri v0.8h, v21.8h, #5 \n" /* RG */ \
+ "sri v0.8h, v20.8h, #11 \n" /* RGB */
void I422ToRGB565Row_NEON(const uint8* src_y,
const uint8* src_u,
@@ -346,14 +314,14 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
);
}
-#define ARGBTOARGB1555 \
- "shll v0.8h, v23.8b, #8 \n" /* A */ \
- "shll v22.8h, v22.8b, #8 \n" /* R */ \
- "shll v21.8h, v21.8b, #8 \n" /* G */ \
- "shll v20.8h, v20.8b, #8 \n" /* B */ \
- "sri v0.8h, v22.8h, #1 \n" /* AR */ \
- "sri v0.8h, v21.8h, #6 \n" /* ARG */ \
- "sri v0.8h, v20.8h, #11 \n" /* ARGB */
+#define ARGBTOARGB1555 \
+ "shll v0.8h, v23.8b, #8 \n" /* A */ \
+ "shll v22.8h, v22.8b, #8 \n" /* R */ \
+ "shll v21.8h, v21.8b, #8 \n" /* G */ \
+ "shll v20.8h, v20.8b, #8 \n" /* B */ \
+ "sri v0.8h, v22.8h, #1 \n" /* AR */ \
+ "sri v0.8h, v21.8h, #6 \n" /* ARG */ \
+ "sri v0.8h, v20.8h, #11 \n" /* ARGB */
void I422ToARGB1555Row_NEON(const uint8* src_y,
const uint8* src_u,
@@ -386,15 +354,15 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
);
}
-#define ARGBTOARGB4444 \
- /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
- "ushr v20.8b, v20.8b, #4 \n" /* B */ \
- "bic v21.8b, v21.8b, v4.8b \n" /* G */ \
- "ushr v22.8b, v22.8b, #4 \n" /* R */ \
- "bic v23.8b, v23.8b, v4.8b \n" /* A */ \
- "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
- "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
- "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
+#define ARGBTOARGB4444 \
+ /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
+ "ushr v20.8b, v20.8b, #4 \n" /* B */ \
+ "bic v21.8b, v21.8b, v4.8b \n" /* G */ \
+ "ushr v22.8b, v22.8b, #4 \n" /* R */ \
+ "bic v23.8b, v23.8b, v4.8b \n" /* A */ \
+ "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
+ "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
+ "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
void I422ToARGB4444Row_NEON(const uint8* src_y,
const uint8* src_u,
@@ -428,9 +396,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
);
}
-void I400ToARGBRow_NEON(const uint8* src_y,
- uint8* dst_argb,
- int width) {
+void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
asm volatile (
YUVTORGB_SETUP
"movi v23.8b, #255 \n"
@@ -453,9 +419,7 @@ void I400ToARGBRow_NEON(const uint8* src_y,
);
}
-void J400ToARGBRow_NEON(const uint8* src_y,
- uint8* dst_argb,
- int width) {
+void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
asm volatile (
"movi v23.8b, #255 \n"
"1: \n"
@@ -612,7 +576,9 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
}
 // Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_NEON(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
int width) {
asm volatile (
"1: \n"
@@ -634,7 +600,9 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
}
// Reads 16 U's and V's and writes out 16 pairs of UV.
-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_NEON(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
int width) {
asm volatile (
"1: \n"
@@ -728,7 +696,9 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
);
}
-void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void MirrorUVRow_NEON(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
int width) {
asm volatile (
// Start at end of source row.
@@ -834,18 +804,18 @@ void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
);
}
-#define RGB565TOARGB \
- "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
- "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
- "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
- "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
- "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
- "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
- "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
- "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
- "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
- "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
- "dup v2.2D, v0.D[1] \n" /* R */
+#define RGB565TOARGB \
+ "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
+ "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
+ "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
+ "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
+ "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
+ "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
+ "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
+ "dup v2.2D, v0.D[1] \n" /* R */
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
asm volatile (
@@ -866,44 +836,45 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
);
}
-#define ARGB1555TOARGB \
- "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
- "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
- "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
- \
- "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
- "xtn2 v3.16b, v2.8h \n" \
- \
- "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
- "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
- \
- "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
- "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
- "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
- \
- "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
- "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
- "dup v1.2D, v0.D[1] \n" \
- "dup v3.2D, v2.D[1] \n"
+#define ARGB1555TOARGB \
+ "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
+ "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
+ "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
+ \
+ "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
+ "xtn2 v3.16b, v2.8h \n" \
+ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
+ \
+ "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
+ "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
+ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
+ "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
+ "dup v1.2D, v0.D[1] \n" \
+ "dup v3.2D, v2.D[1] \n"
 // RGB555TOARGB is the same as ARGB1555TOARGB but ignores alpha.
-#define RGB555TOARGB \
- "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
- "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
- "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
- \
- "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
- "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
- \
- "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
- "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
- "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
- \
- "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
- "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
- "dup v1.2D, v0.D[1] \n" /* G */ \
-
-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+#define RGB555TOARGB \
+ "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
+ "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
+ "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
+ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
+ \
+ "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
+ "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
+ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
+ "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
+ "dup v1.2D, v0.D[1] \n" /* G */
+
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
+ uint8* dst_argb,
int width) {
asm volatile (
"movi v3.8b, #255 \n" // Alpha
@@ -923,19 +894,20 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
);
}
-#define ARGB4444TOARGB \
- "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
- "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
- "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
- "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
- "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
- "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
- "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
- "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
- "dup v0.2D, v2.D[1] \n" \
- "dup v1.2D, v3.D[1] \n"
+#define ARGB4444TOARGB \
+ "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
+ "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
+ "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
+ "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
+ "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
+ "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
+ "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
+ "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
+ "dup v0.2D, v2.D[1] \n" \
+ "dup v1.2D, v3.D[1] \n"
-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
+ uint8* dst_argb,
int width) {
asm volatile (
"1: \n"
@@ -1024,7 +996,9 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
);
}
-void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
int width) {
asm volatile (
"1: \n"
@@ -1045,7 +1019,9 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
);
}
-void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+void UYVYToUV422Row_NEON(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
int width) {
asm volatile (
"1: \n"
@@ -1066,8 +1042,11 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
);
}
-void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+void YUY2ToUVRow_NEON(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
asm volatile (
"1: \n"
@@ -1094,8 +1073,11 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
);
}
-void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+void UYVYToUVRow_NEON(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
const uint8* src_uyvyb = src_uyvy + stride_uyvy;
asm volatile (
"1: \n"
@@ -1123,8 +1105,10 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width) {
+void ARGBShuffleRow_NEON(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width) {
asm volatile (
MEMACCESS(3)
"ld1 {v2.16b}, [%3] \n" // shuffler
@@ -1147,7 +1131,8 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
void I422ToYUY2Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* dst_yuy2, int width) {
+ uint8* dst_yuy2,
+ int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -1174,7 +1159,8 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
void I422ToUYVYRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* dst_uyvy, int width) {
+ uint8* dst_uyvy,
+ int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -1216,8 +1202,10 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
);
}
-void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width) {
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width) {
asm volatile (
"dup v1.4s, %w2 \n" // dither4
"1: \n"
@@ -1239,7 +1227,8 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
);
}
-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+void ARGBToARGB1555Row_NEON(const uint8* src_argb,
+ uint8* dst_argb1555,
int width) {
asm volatile (
"1: \n"
@@ -1258,7 +1247,8 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
);
}
-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+void ARGBToARGB4444Row_NEON(const uint8* src_argb,
+ uint8* dst_argb4444,
int width) {
asm volatile (
"movi v4.16b, #0x0f \n" // bits to clear with vbic.
@@ -1346,7 +1336,9 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
}
// 8x1 pixels.
-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void ARGBToUV444Row_NEON(const uint8* src_argb,
+ uint8* dst_u,
+ uint8* dst_v,
int width) {
asm volatile (
"movi v24.8b, #112 \n" // UB / VR 0.875 coefficient
@@ -1387,83 +1379,41 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
);
}
-#define RGBTOUV_SETUP_REG \
- "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
- "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
- "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
- "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
- "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
- "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
-
-// 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32.
-void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int width) {
- asm volatile (
- RGBTOUV_SETUP_REG
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(0)
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16.
- "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
-
- "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts.
- "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts.
- "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w3, %w3, #32 \n" // 32 processed per loop.
- "mul v3.8h, v0.8h, v20.8h \n" // B
- "mls v3.8h, v1.8h, v21.8h \n" // G
- "mls v3.8h, v2.8h, v22.8h \n" // R
- "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
- "mul v4.8h, v2.8h, v20.8h \n" // R
- "mls v4.8h, v1.8h, v24.8h \n" // G
- "mls v4.8h, v0.8h, v23.8h \n" // B
- "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
- MEMACCESS(2)
- "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
-}
+#define RGBTOUV_SETUP_REG \
+ "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
+ "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
+ "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
+ "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
+ "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
+ "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
-#define RGBTOUV(QB, QG, QR) \
- "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
- "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
- "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
- "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
- "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
- "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
- "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
- "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
- "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
- "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
+#define RGBTOUV(QB, QG, QR) \
+ "mul v3.8h, " #QB \
+ ",v20.8h \n" /* B */ \
+ "mul v4.8h, " #QR \
+ ",v20.8h \n" /* R */ \
+ "mls v3.8h, " #QG \
+ ",v21.8h \n" /* G */ \
+ "mls v4.8h, " #QG \
+ ",v24.8h \n" /* G */ \
+ "mls v3.8h, " #QR \
+ ",v22.8h \n" /* R */ \
+ "mls v4.8h, " #QB \
+ ",v23.8h \n" /* B */ \
+ "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
+ "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
+ "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
+ "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
// TODO(fbarchard): consider ptrdiff_t for all strides.
-void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUVRow_NEON(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
const uint8* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1503,8 +1453,11 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
}
 // TODO(fbarchard): Make the subsampling match the C code.
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUVJRow_NEON(const uint8* src_argb,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
const uint8* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
"movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
@@ -1547,8 +1500,11 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
);
}
-void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width) {
+void BGRAToUVRow_NEON(const uint8* src_bgra,
+ int src_stride_bgra,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1586,8 +1542,11 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
);
}
-void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width) {
+void ABGRToUVRow_NEON(const uint8* src_abgr,
+ int src_stride_abgr,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1625,8 +1584,11 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
);
}
-void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width) {
+void RGBAToUVRow_NEON(const uint8* src_rgba,
+ int src_stride_rgba,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1664,8 +1626,11 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
);
}
-void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
- uint8* dst_u, uint8* dst_v, int width) {
+void RGB24ToUVRow_NEON(const uint8* src_rgb24,
+ int src_stride_rgb24,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1703,8 +1668,11 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
);
}
-void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
- uint8* dst_u, uint8* dst_v, int width) {
+void RAWToUVRow_NEON(const uint8* src_raw,
+ int src_stride_raw,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
const uint8* src_raw_1 = src_raw + src_stride_raw;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1743,8 +1711,11 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
- uint8* dst_u, uint8* dst_v, int width) {
+void RGB565ToUVRow_NEON(const uint8* src_rgb565,
+ int src_stride_rgb565,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
asm volatile (
"movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2
@@ -1817,8 +1788,11 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
- uint8* dst_u, uint8* dst_v, int width) {
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1886,8 +1860,11 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
- uint8* dst_u, uint8* dst_v, int width) {
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
+ int src_stride_argb4444,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
asm volatile (
RGBTOUV_SETUP_REG
@@ -2169,8 +2146,10 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_NEON(uint8* dst_ptr,
- const uint8* src_ptr, ptrdiff_t src_stride,
- int dst_width, int source_y_fraction) {
+ const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
const uint8* src_ptr1 = src_ptr + src_stride;
@@ -2235,8 +2214,10 @@ void InterpolateRow_NEON(uint8* dst_ptr,
}
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
-void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+void ARGBBlendRow_NEON(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
asm volatile (
"subs %w3, %w3, #8 \n"
"b.lt 89f \n"
@@ -2331,8 +2312,11 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
// Quantize 8 ARGB pixels (32 bytes).
// dst = (dst * scale >> 16) * interval_size + interval_offset;
-void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
- int interval_offset, int width) {
+void ARGBQuantizeRow_NEON(uint8* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
asm volatile (
"dup v4.8h, %w2 \n"
"ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
@@ -2374,7 +2358,9 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
// Shade 8 pixels at a time by specified value.
 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
 // Rounding in vqrdmulh adds +1 to the high half if the high bit of the low s16 is set.
-void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+void ARGBShadeRow_NEON(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
uint32 value) {
asm volatile (
"dup v0.4s, %w3 \n" // duplicate scale value.
@@ -2484,8 +2470,10 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
// needs to saturate. Consider doing a non-saturating version.
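
The matrix is 16 signed bytes, one 4-coefficient row per output channel, with 6 fractional bits (so 64 is an identity weight of 1.0). A scalar sketch of the saturating transform (helper names ours):

#include <stdint.h>

static int clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

// dst[c] = clamp((b*m[4c] + g*m[4c+1] + r*m[4c+2] + a*m[4c+3]) >> 6)
static void color_matrix_pixel(const uint8_t* src, uint8_t* dst,
                               const int8_t m[16]) {
  int b = src[0], g = src[1], r = src[2], a = src[3];
  for (int c = 0; c < 4; ++c) {
    int v = (b * m[4 * c + 0] + g * m[4 * c + 1] + r * m[4 * c + 2] +
             a * m[4 * c + 3]) >> 6;
    dst[c] = (uint8_t)clamp255(v);
  }
}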
-void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
- const int8* matrix_argb, int width) {
+void ARGBColorMatrixRow_NEON(const uint8* src_argb,
+ uint8* dst_argb,
+ const int8* matrix_argb,
+ int width) {
asm volatile (
MEMACCESS(3)
"ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
@@ -2546,8 +2534,10 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+void ARGBMultiplyRow_NEON(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
asm volatile (
// 8 pixel loop.
"1: \n"
@@ -2578,8 +2568,10 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
}
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+void ARGBAddRow_NEON(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
asm volatile (
// 8 pixel loop.
"1: \n"
@@ -2606,8 +2598,10 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
}
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+void ARGBSubtractRow_NEON(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
asm volatile (
// 8 pixel loop.
"1: \n"
@@ -2638,8 +2632,10 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
// R = Sobel
// G = Sobel
// B = Sobel
-void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
+void SobelRow_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
asm volatile (
"movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
@@ -2665,8 +2661,10 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
}
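
SobelRow above and the SobelToPlaneRow/SobelXYRow variants below all start from the same saturating sum of the two gradient magnitudes; they differ only in output layout. A scalar sketch for the ARGB case (helper name ours):

#include <stdint.h>

// s = min(sobelx + sobely, 255); SobelRow stores (s, s, s, 255) as BGRA,
// SobelToPlaneRow stores just s, SobelXYRow stores (sobely, s, sobelx, 255).
static void sobel_pixel(uint8_t sx, uint8_t sy, uint8_t bgra[4]) {
  int s = sx + sy;
  uint8_t m = (uint8_t)(s > 255 ? 255 : s);
  bgra[0] = m;    // B
  bgra[1] = m;    // G
  bgra[2] = m;    // R
  bgra[3] = 255;  // A
}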
// Adds Sobel X and Sobel Y and stores Sobel into plane.
-void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_y, int width) {
+void SobelToPlaneRow_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width) {
asm volatile (
// 16 pixel loop.
"1: \n"
@@ -2693,8 +2691,10 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
// R = Sobel X
// G = Sobel
// B = Sobel Y
-void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
+void SobelXYRow_NEON(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
asm volatile (
"movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
@@ -2721,8 +2721,11 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
// -1 0 1
// -2 0 2
// -1 0 1
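
A scalar rendering of this kernel (helper name ours): the column taps sit at offsets 0 and +2 around the output pixel, and the absolute sum is clamped to a byte. SobelYRow below applies the transposed kernel and needs only two input rows because its middle row of weights is zero.

#include <stdint.h>
#include <stdlib.h>

static void sobel_x_row(const uint8_t* y0, const uint8_t* y1,
                        const uint8_t* y2, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    int a = y0[i] - y0[i + 2];  // top row:    -1 0 1
    int b = y1[i] - y1[i + 2];  // middle row: -2 0 2
    int c = y2[i] - y2[i + 2];  // bottom row: -1 0 1
    int s = abs(a + 2 * b + c);
    dst[i] = (uint8_t)(s > 255 ? 255 : s);
  }
}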
-void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
- const uint8* src_y2, uint8* dst_sobelx, int width) {
+void SobelXRow_NEON(const uint8* src_y0,
+ const uint8* src_y1,
+ const uint8* src_y2,
+ uint8* dst_sobelx,
+ int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -2764,8 +2767,10 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
// -1 -2 -1
// 0 0 0
// 1 2 1
-void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
- uint8* dst_sobely, int width) {
+void SobelYRow_NEON(const uint8* src_y0,
+ const uint8* src_y1,
+ uint8* dst_sobely,
+ int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -2801,6 +2806,56 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
+
+// Caveat - rounds float to half float, whereas the scaling version truncates.
+void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 int's
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fcvtn v1.4h, v2.4s \n" // 8 half floats
+ "fcvtn2 v1.8h, v3.4s \n"
+ MEMACCESS(1)
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3"
+ );
+}
+
+void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 int's
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
+ "fmul v3.4s, v3.4s, %3.s[0] \n"
+ "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
+ "uqshrn2 v1.8h, v3.4s, #13 \n"
+ MEMACCESS(1)
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale * 1.9259299444e-34f) // %3
+ : "cc", "memory", "v1", "v2", "v3"
+ );
+}
+
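
The constant 1.9259299444e-34f folded into %3 is 2^-112, i.e. 2^(15-127): multiplying by it rebiasses the exponent from float32's bias of 127 to float16's bias of 15, so the half-float bit pattern ends up 13 bits up in the float32 word (23 - 10 mantissa bits), and the uqshrn shift-right-by-13 with unsigned saturation extracts it. The mantissa is truncated, which is the caveat noted on HalfFloat1Row above, whose fcvtn path rounds instead. A scalar sketch of the trick, assuming a non-negative input already multiplied by the caller's scale and within half-float range (helper name ours):

#include <stdint.h>
#include <string.h>

static uint16_t float_to_half_bits(float f) {
  float biased = f * 1.9259299444e-34f;  // * 2^-112 rebiasses the exponent
  uint32_t bits;
  memcpy(&bits, &biased, sizeof(bits));  // grab the float32 bit pattern
  return (uint16_t)(bits >> 13);         // realign mantissa: 23 - 10 = 13
}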
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
diff --git a/files/source/row_win.cc b/files/source/row_win.cc
index 2a3da896..202f2b8d 100644
--- a/files/source/row_win.cc
+++ b/files/source/row_win.cc
@@ -28,61 +28,60 @@ extern "C" {
#if defined(_M_X64)
// Read 4 UV from 422, upsample to 8 UV.
-#define READYUV422 \
- xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
- xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
- u_buf += 4; \
- xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
- xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
- y_buf += 8;
+#define READYUV422 \
+ xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
+ xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
+ u_buf += 4; \
+ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
+ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
+ y_buf += 8;
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
-#define READYUVA422 \
- xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
- xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
- u_buf += 4; \
- xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
- xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
- y_buf += 8; \
- xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
- a_buf += 8;
+#define READYUVA422 \
+ xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
+ xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
+ u_buf += 4; \
+ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
+ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
+ y_buf += 8; \
+ xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
+ a_buf += 8;
// Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB(yuvconstants) \
- xmm1 = _mm_loadu_si128(&xmm0); \
- xmm2 = _mm_loadu_si128(&xmm0); \
- xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
- xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
- xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
- xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \
- xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \
- xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \
- xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
- xmm0 = _mm_adds_epi16(xmm0, xmm4); \
- xmm1 = _mm_adds_epi16(xmm1, xmm4); \
- xmm2 = _mm_adds_epi16(xmm2, xmm4); \
- xmm0 = _mm_srai_epi16(xmm0, 6); \
- xmm1 = _mm_srai_epi16(xmm1, 6); \
- xmm2 = _mm_srai_epi16(xmm2, 6); \
- xmm0 = _mm_packus_epi16(xmm0, xmm0); \
- xmm1 = _mm_packus_epi16(xmm1, xmm1); \
- xmm2 = _mm_packus_epi16(xmm2, xmm2);
+#define YUVTORGB(yuvconstants) \
+ xmm1 = _mm_loadu_si128(&xmm0); \
+ xmm2 = _mm_loadu_si128(&xmm0); \
+ xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
+ xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
+ xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
+ xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \
+ xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \
+ xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \
+ xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
+ xmm0 = _mm_adds_epi16(xmm0, xmm4); \
+ xmm1 = _mm_adds_epi16(xmm1, xmm4); \
+ xmm2 = _mm_adds_epi16(xmm2, xmm4); \
+ xmm0 = _mm_srai_epi16(xmm0, 6); \
+ xmm1 = _mm_srai_epi16(xmm1, 6); \
+ xmm2 = _mm_srai_epi16(xmm2, 6); \
+ xmm0 = _mm_packus_epi16(xmm0, xmm0); \
+ xmm1 = _mm_packus_epi16(xmm1, xmm1); \
+ xmm2 = _mm_packus_epi16(xmm2, xmm2);
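
The macro is a fixed-point BT.601-style matrix multiply: UV products are subtracted from per-channel bias constants, the Y term is added after a multiply-high gain, and the sums are shifted down 6 bits and packed with saturation. For reference, the textbook 8.8 fixed-point form below computes the same transform with a different constant layout (helper names ours; the kUVTo*/kUVBias*/kYToRgb tables hold the libyuv-specific encoding):

#include <stdint.h>

static uint8_t clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// Studio-swing BT.601 with 8 fractional bits.
static void yuv_to_bgra(uint8_t y, uint8_t u, uint8_t v, uint8_t bgra[4]) {
  int c = y - 16, d = u - 128, e = v - 128;
  bgra[0] = clamp255((298 * c + 516 * d + 128) >> 8);            // B
  bgra[1] = clamp255((298 * c - 100 * d - 208 * e + 128) >> 8);  // G
  bgra[2] = clamp255((298 * c + 409 * e + 128) >> 8);            // R
  bgra[3] = 255;                                                 // A
}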
// Store 8 ARGB values.
-#define STOREARGB \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \
- xmm1 = _mm_loadu_si128(&xmm0); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \
- xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \
- _mm_storeu_si128((__m128i *)dst_argb, xmm0); \
- _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); \
- dst_argb += 32;
-
+#define STOREARGB \
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
+ xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \
+ xmm1 = _mm_loadu_si128(&xmm0); \
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \
+ xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \
+ _mm_storeu_si128((__m128i*)dst_argb, xmm0); \
+ _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
+ dst_argb += 32;
#if defined(HAS_I422TOARGBROW_SSSE3)
void I422ToARGBRow_SSSE3(const uint8* y_buf,
@@ -127,175 +126,143 @@ void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
#ifdef HAS_ARGBTOYROW_SSSE3
// Constants for ARGB.
-static const vec8 kARGBToY = {
- 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
+static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
+ 13, 65, 33, 0, 13, 65, 33, 0};
// JPeg full range.
-static const vec8 kARGBToYJ = {
- 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
-};
+static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
+ 15, 75, 38, 0, 15, 75, 38, 0};
-static const vec8 kARGBToU = {
- 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
-};
+static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
+ 112, -74, -38, 0, 112, -74, -38, 0};
-static const vec8 kARGBToUJ = {
- 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
-};
+static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
+ 127, -84, -43, 0, 127, -84, -43, 0};
static const vec8 kARGBToV = {
- -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+ -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};
-static const vec8 kARGBToVJ = {
- -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
-};
+static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
+ -20, -107, 127, 0, -20, -107, 127, 0};
// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
- 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
- 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
-};
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
// Constants for BGRA.
-static const vec8 kBGRAToY = {
- 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
-};
+static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
+ 0, 33, 65, 13, 0, 33, 65, 13};
-static const vec8 kBGRAToU = {
- 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
-};
+static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
+ 0, -38, -74, 112, 0, -38, -74, 112};
-static const vec8 kBGRAToV = {
- 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
-};
+static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
+ 0, 112, -94, -18, 0, 112, -94, -18};
// Constants for ABGR.
-static const vec8 kABGRToY = {
- 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
-};
+static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
+ 33, 65, 13, 0, 33, 65, 13, 0};
-static const vec8 kABGRToU = {
- -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
-};
+static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
+ -38, -74, 112, 0, -38, -74, 112, 0};
-static const vec8 kABGRToV = {
- 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
-};
+static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
+ 112, -94, -18, 0, 112, -94, -18, 0};
// Constants for RGBA.
-static const vec8 kRGBAToY = {
- 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
-};
+static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
+ 0, 13, 65, 33, 0, 13, 65, 33};
-static const vec8 kRGBAToU = {
- 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
-};
+static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
+ 0, 112, -74, -38, 0, 112, -74, -38};
-static const vec8 kRGBAToV = {
- 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
-};
+static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
+ 0, -18, -94, 112, 0, -18, -94, 112};
-static const uvec8 kAddY16 = {
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
+static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+ 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
// 7 bit fixed point 0.5.
-static const vec16 kAddYJ64 = {
- 64, 64, 64, 64, 64, 64, 64, 64
-};
+static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
-static const uvec8 kAddUV128 = {
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-static const uvec16 kAddUVJ128 = {
- 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
-};
+static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+ 0x8080u, 0x8080u, 0x8080u, 0x8080u};
// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
- 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
-};
+ 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
// Shuffle table for converting RAW to ARGB.
-static const uvec8 kShuffleMaskRAWToARGB = {
- 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
-};
+static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
+ 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
- 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+ 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
// Shuffle table for converting RAW to RGB24. Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
- 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+ 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
// Shuffle table for converting RAW to RGB24. Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
- 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+ 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
- 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
-};
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
- 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
-};
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
- 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
-};
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
// YUY2 shuf 16 Y to 32 Y.
-static const lvec8 kShuffleYUY2Y = {
- 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
- 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
-};
+static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
+ 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
+ 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
// YUY2 shuf 8 UV to 16 UV.
-static const lvec8 kShuffleYUY2UV = {
- 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
- 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
-};
+static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
+ 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
+ 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
// UYVY shuf 16 Y to 32 Y.
-static const lvec8 kShuffleUYVYY = {
- 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
- 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
-};
+static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
+ 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
+ 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
// UYVY shuf 8 UV to 16 UV.
-static const lvec8 kShuffleUYVYUV = {
- 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
- 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
-};
+static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
+ 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
+ 4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
- 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
- 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+ 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+ 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
// Duplicates gray value 3 times and fills in alpha opaque.
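
J400 is a gray plane; each byte fans out to one opaque gray ARGB pixel. In scalar terms (helper name ours):

#include <stdint.h>

// Little-endian 0xAARRGGBB word with R = G = B = y and A = 0xff.
static uint32_t j400_to_argb(uint8_t y) {
  return 0xff000000u | ((uint32_t)y * 0x010101u);
}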
-__declspec(naked)
-void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
+__declspec(naked) void J400ToARGBRow_SSE2(const uint8* src_y,
+ uint8* dst_argb,
+ int width) {
__asm {
- mov eax, [esp + 4] // src_y
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ mov eax, [esp + 4] // src_y
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24
convertloop:
@@ -318,13 +285,14 @@ void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
#ifdef HAS_J400TOARGBROW_AVX2
// Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked)
-void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) {
- __asm {
- mov eax, [esp + 4] // src_y
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
+__declspec(naked) void J400ToARGBRow_AVX2(const uint8* src_y,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_y
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
vpslld ymm5, ymm5, 24
convertloop:
@@ -348,13 +316,14 @@ void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) {
}
#endif // HAS_J400TOARGBROW_AVX2
-__declspec(naked)
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
+__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24,
+ uint8* dst_argb,
+ int width) {
__asm {
- mov eax, [esp + 4] // src_rgb24
- mov edx, [esp + 8] // dst_argb
+ mov eax, [esp + 4] // src_rgb24
+ mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
- pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24
movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
@@ -364,17 +333,17 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
movdqu xmm3, [eax + 32]
lea eax, [eax + 48]
movdqa xmm2, xmm3
- palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
+ palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
pshufb xmm2, xmm4
por xmm2, xmm5
- palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
+ palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
pshufb xmm0, xmm4
movdqu [edx + 32], xmm2
por xmm0, xmm5
pshufb xmm1, xmm4
movdqu [edx], xmm0
por xmm1, xmm5
- palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
+ palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm4
movdqu [edx + 16], xmm1
por xmm3, xmm5
@@ -386,14 +355,14 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
}
}
-__declspec(naked)
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
- int width) {
+__declspec(naked) void RAWToARGBRow_SSSE3(const uint8* src_raw,
+ uint8* dst_argb,
+ int width) {
__asm {
- mov eax, [esp + 4] // src_raw
- mov edx, [esp + 8] // dst_argb
+ mov eax, [esp + 4] // src_raw
+ mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
- pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24
movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB
@@ -403,17 +372,17 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
movdqu xmm3, [eax + 32]
lea eax, [eax + 48]
movdqa xmm2, xmm3
- palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
+ palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
pshufb xmm2, xmm4
por xmm2, xmm5
- palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
+ palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
pshufb xmm0, xmm4
movdqu [edx + 32], xmm2
por xmm0, xmm5
pshufb xmm1, xmm4
movdqu [edx], xmm0
por xmm1, xmm5
- palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
+ palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm4
movdqu [edx + 16], xmm1
por xmm3, xmm5
@@ -425,11 +394,12 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
}
}
-__declspec(naked)
-void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
+__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8* src_raw,
+ uint8* dst_rgb24,
+ int width) {
__asm {
- mov eax, [esp + 4] // src_raw
- mov edx, [esp + 8] // dst_rgb24
+ mov eax, [esp + 4] // src_raw
+ mov edx, [esp + 8] // dst_rgb24
mov ecx, [esp + 12] // width
movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
@@ -460,9 +430,9 @@ void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
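
The multiplier trick spelled out: a 5-bit field placed at the top of a 16-bit lane and multiplied by 0x0108 (256 + 8) with pmulhuw yields the standard (v << 3) | (v >> 2) widening to 8 bits; the 6-bit green field uses 0x2080 ((256 + 4) << 5) to get (v << 2) | (v >> 4). Scalar equivalents (names ours):

#include <stdint.h>

// ((v << 11) * 0x0108) >> 16 == (v << 3) | (v >> 2) for v in [0, 31]
static uint8_t expand5(uint8_t v) { return (uint8_t)((v << 3) | (v >> 2)); }

// ((v << 5) * 0x2080) >> 16 == (v << 2) | (v >> 4) for v in [0, 63]
static uint8_t expand6(uint8_t v) { return (uint8_t)((v << 2) | (v >> 4)); }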
-__declspec(naked)
-void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
- int width) {
+__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8* src_rgb565,
+ uint8* dst_argb,
+ int width) {
__asm {
mov eax, 0x01080108 // generate multiplier to repeat 5 bits
movd xmm5, eax
@@ -470,33 +440,33 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
movd xmm6, eax
pshufd xmm6, xmm6, 0
- pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
+ pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
psllw xmm3, 11
- pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
+ pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
psllw xmm4, 10
psrlw xmm4, 5
- pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
+ pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
psllw xmm7, 8
- mov eax, [esp + 4] // src_rgb565
- mov edx, [esp + 8] // dst_argb
+ mov eax, [esp + 4] // src_rgb565
+ mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
sub edx, eax
convertloop:
- movdqu xmm0, [eax] // fetch 8 pixels of bgr565
+ movdqu xmm0, [eax] // fetch 8 pixels of bgr565
movdqa xmm1, xmm0
movdqa xmm2, xmm0
- pand xmm1, xmm3 // R in upper 5 bits
- psllw xmm2, 11 // B in upper 5 bits
- pmulhuw xmm1, xmm5 // * (256 + 8)
- pmulhuw xmm2, xmm5 // * (256 + 8)
+ pand xmm1, xmm3 // R in upper 5 bits
+ psllw xmm2, 11 // B in upper 5 bits
+ pmulhuw xmm1, xmm5 // * (256 + 8)
+ pmulhuw xmm2, xmm5 // * (256 + 8)
psllw xmm1, 8
- por xmm1, xmm2 // RB
- pand xmm0, xmm4 // G in middle 6 bits
- pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
- por xmm0, xmm7 // AG
+ por xmm1, xmm2 // RB
+ pand xmm0, xmm4 // G in middle 6 bits
+ pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
+ por xmm0, xmm7 // AG
movdqa xmm2, xmm1
punpcklbw xmm1, xmm0
punpckhbw xmm2, xmm0
@@ -516,9 +486,9 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
-__declspec(naked)
-void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
- int width) {
+__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8* src_rgb565,
+ uint8* dst_argb,
+ int width) {
__asm {
mov eax, 0x01080108 // generate multiplier to repeat 5 bits
vmovd xmm5, eax
@@ -526,32 +496,32 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
vmovd xmm6, eax
vbroadcastss ymm6, xmm6
- vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
vpsllw ymm3, ymm3, 11
- vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
vpsllw ymm4, ymm4, 10
vpsrlw ymm4, ymm4, 5
- vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
vpsllw ymm7, ymm7, 8
- mov eax, [esp + 4] // src_rgb565
- mov edx, [esp + 8] // dst_argb
+ mov eax, [esp + 4] // src_rgb565
+ mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
sub edx, eax
convertloop:
- vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565
- vpand ymm1, ymm0, ymm3 // R in upper 5 bits
- vpsllw ymm2, ymm0, 11 // B in upper 5 bits
- vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
- vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
+ vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565
+ vpand ymm1, ymm0, ymm3 // R in upper 5 bits
+ vpsllw ymm2, ymm0, 11 // B in upper 5 bits
+ vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
+ vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
vpsllw ymm1, ymm1, 8
- vpor ymm1, ymm1, ymm2 // RB
- vpand ymm0, ymm0, ymm4 // G in middle 6 bits
- vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4)
- vpor ymm0, ymm0, ymm7 // AG
- vpermq ymm0, ymm0, 0xd8 // mutate for unpack
+ vpor ymm1, ymm1, ymm2 // RB
+ vpand ymm0, ymm0, ymm4 // G in middle 6 bits
+ vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4)
+ vpor ymm0, ymm0, ymm7 // AG
+ vpermq ymm0, ymm0, 0xd8 // mutate for unpack
vpermq ymm1, ymm1, 0xd8
vpunpckhbw ymm2, ymm1, ymm0
vpunpcklbw ymm1, ymm1, ymm0
@@ -567,9 +537,9 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
#endif // HAS_RGB565TOARGBROW_AVX2
#ifdef HAS_ARGB1555TOARGBROW_AVX2
-__declspec(naked)
-void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
- int width) {
+__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width) {
__asm {
mov eax, 0x01080108 // generate multiplier to repeat 5 bits
vmovd xmm5, eax
@@ -577,33 +547,33 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
vmovd xmm6, eax
vbroadcastss ymm6, xmm6
- vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
vpsllw ymm3, ymm3, 11
- vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
- vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
+ vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
vpsllw ymm7, ymm7, 8
- mov eax, [esp + 4] // src_argb1555
- mov edx, [esp + 8] // dst_argb
+ mov eax, [esp + 4] // src_argb1555
+ mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
sub edx, eax
convertloop:
- vmovdqu ymm0, [eax] // fetch 16 pixels of 1555
- vpsllw ymm1, ymm0, 1 // R in upper 5 bits
- vpsllw ymm2, ymm0, 11 // B in upper 5 bits
+ vmovdqu ymm0, [eax] // fetch 16 pixels of 1555
+ vpsllw ymm1, ymm0, 1 // R in upper 5 bits
+ vpsllw ymm2, ymm0, 11 // B in upper 5 bits
vpand ymm1, ymm1, ymm3
- vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
- vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
+ vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
+ vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
vpsllw ymm1, ymm1, 8
- vpor ymm1, ymm1, ymm2 // RB
- vpsraw ymm2, ymm0, 8 // A
- vpand ymm0, ymm0, ymm4 // G in middle 5 bits
- vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8)
+ vpor ymm1, ymm1, ymm2 // RB
+ vpsraw ymm2, ymm0, 8 // A
+ vpand ymm0, ymm0, ymm4 // G in middle 5 bits
+ vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8)
vpand ymm2, ymm2, ymm7
- vpor ymm0, ymm0, ymm2 // AG
- vpermq ymm0, ymm0, 0xd8 // mutate for unpack
+ vpor ymm0, ymm0, ymm2 // AG
+ vpermq ymm0, ymm0, 0xd8 // mutate for unpack
vpermq ymm1, ymm1, 0xd8
vpunpckhbw ymm2, ymm1, ymm0
vpunpcklbw ymm1, ymm1, ymm0
@@ -619,29 +589,29 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
#endif // HAS_ARGB1555TOARGBROW_AVX2
#ifdef HAS_ARGB4444TOARGBROW_AVX2
-__declspec(naked)
-void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
- int width) {
+__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width) {
__asm {
mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
vmovd xmm4, eax
vbroadcastss ymm4, xmm4
- vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
- mov eax, [esp + 4] // src_argb4444
- mov edx, [esp + 8] // dst_argb
+ vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
+ mov eax, [esp + 4] // src_argb4444
+ mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
sub edx, eax
convertloop:
- vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444
- vpand ymm2, ymm0, ymm5 // mask high nibbles
- vpand ymm0, ymm0, ymm4 // mask low nibbles
+ vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444
+ vpand ymm2, ymm0, ymm5 // mask high nibbles
+ vpand ymm0, ymm0, ymm4 // mask low nibbles
vpsrlw ymm3, ymm2, 4
vpsllw ymm1, ymm0, 4
vpor ymm2, ymm2, ymm3
vpor ymm0, ymm0, ymm1
- vpermq ymm0, ymm0, 0xd8 // mutate for unpack
+ vpermq ymm0, ymm0, 0xd8 // mutate for unpack
vpermq ymm2, ymm2, 0xd8
vpunpckhbw ymm1, ymm0, ymm2
vpunpcklbw ymm0, ymm0, ymm2
@@ -657,9 +627,9 @@ void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
#endif // HAS_ARGB4444TOARGBROW_AVX2
// 24 instructions
-__declspec(naked)
-void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
- int width) {
+__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555,
+ uint8* dst_argb,
+ int width) {
__asm {
mov eax, 0x01080108 // generate multiplier to repeat 5 bits
movd xmm5, eax
@@ -667,36 +637,36 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
movd xmm6, eax
pshufd xmm6, xmm6, 0
- pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
+ pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
psllw xmm3, 11
- movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
+ movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
psrlw xmm4, 6
- pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
+ pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
psllw xmm7, 8
- mov eax, [esp + 4] // src_argb1555
- mov edx, [esp + 8] // dst_argb
+ mov eax, [esp + 4] // src_argb1555
+ mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
sub edx, eax
convertloop:
- movdqu xmm0, [eax] // fetch 8 pixels of 1555
+ movdqu xmm0, [eax] // fetch 8 pixels of 1555
movdqa xmm1, xmm0
movdqa xmm2, xmm0
- psllw xmm1, 1 // R in upper 5 bits
- psllw xmm2, 11 // B in upper 5 bits
+ psllw xmm1, 1 // R in upper 5 bits
+ psllw xmm2, 11 // B in upper 5 bits
pand xmm1, xmm3
- pmulhuw xmm2, xmm5 // * (256 + 8)
- pmulhuw xmm1, xmm5 // * (256 + 8)
+ pmulhuw xmm2, xmm5 // * (256 + 8)
+ pmulhuw xmm1, xmm5 // * (256 + 8)
psllw xmm1, 8
- por xmm1, xmm2 // RB
+ por xmm1, xmm2 // RB
movdqa xmm2, xmm0
- pand xmm0, xmm4 // G in middle 5 bits
- psraw xmm2, 8 // A
- pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
+ pand xmm0, xmm4 // G in middle 5 bits
+ psraw xmm2, 8 // A
+ pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
pand xmm2, xmm7
- por xmm0, xmm2 // AG
+ por xmm0, xmm2 // AG
movdqa xmm2, xmm1
punpcklbw xmm1, xmm0
punpckhbw xmm2, xmm0
@@ -710,26 +680,26 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
}
// 18 instructions.
-__declspec(naked)
-void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
- int width) {
+__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444,
+ uint8* dst_argb,
+ int width) {
__asm {
mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
movd xmm4, eax
pshufd xmm4, xmm4, 0
- movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
+ movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
pslld xmm5, 4
- mov eax, [esp + 4] // src_argb4444
- mov edx, [esp + 8] // dst_argb
+ mov eax, [esp + 4] // src_argb4444
+ mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
sub edx, eax
convertloop:
- movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
+ movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
movdqa xmm2, xmm0
- pand xmm0, xmm4 // mask low nibbles
- pand xmm2, xmm5 // mask high nibbles
+ pand xmm0, xmm4 // mask low nibbles
+ pand xmm2, xmm5 // mask high nibbles
movdqa xmm1, xmm0
movdqa xmm3, xmm2
psllw xmm1, 4
@@ -748,37 +718,38 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
}
}
-__declspec(naked)
-void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width) {
__asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
mov ecx, [esp + 12] // width
movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
convertloop:
- movdqu xmm0, [eax] // fetch 16 pixels of argb
+ movdqu xmm0, [eax] // fetch 16 pixels of argb
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
lea eax, [eax + 64]
- pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
+ pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
pshufb xmm1, xmm6
pshufb xmm2, xmm6
pshufb xmm3, xmm6
- movdqa xmm4, xmm1 // 4 bytes from 1 for 0
- psrldq xmm1, 4 // 8 bytes from 1
- pslldq xmm4, 12 // 4 bytes from 1 for 0
- movdqa xmm5, xmm2 // 8 bytes from 2 for 1
- por xmm0, xmm4 // 4 bytes from 1 for 0
- pslldq xmm5, 8 // 8 bytes from 2 for 1
+ movdqa xmm4, xmm1 // 4 bytes from 1 for 0
+ psrldq xmm1, 4 // 8 bytes from 1
+ pslldq xmm4, 12 // 4 bytes from 1 for 0
+ movdqa xmm5, xmm2 // 8 bytes from 2 for 1
+ por xmm0, xmm4 // 4 bytes from 1 for 0
+ pslldq xmm5, 8 // 8 bytes from 2 for 1
movdqu [edx], xmm0 // store 0
- por xmm1, xmm5 // 8 bytes from 2 for 1
- psrldq xmm2, 8 // 4 bytes from 2
- pslldq xmm3, 4 // 12 bytes from 3 for 2
- por xmm2, xmm3 // 12 bytes from 3 for 2
- movdqu [edx + 16], xmm1 // store 1
- movdqu [edx + 32], xmm2 // store 2
+ por xmm1, xmm5 // 8 bytes from 2 for 1
+ psrldq xmm2, 8 // 4 bytes from 2
+ pslldq xmm3, 4 // 12 bytes from 3 for 2
+ por xmm2, xmm3 // 12 bytes from 3 for 2
+ movdqu [edx + 16], xmm1 // store 1
+ movdqu [edx + 32], xmm2 // store 2
lea edx, [edx + 48]
sub ecx, 16
jg convertloop
@@ -786,37 +757,38 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
}
}
-__declspec(naked)
-void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width) {
__asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
mov ecx, [esp + 12] // width
movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW
convertloop:
- movdqu xmm0, [eax] // fetch 16 pixels of argb
+ movdqu xmm0, [eax] // fetch 16 pixels of argb
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
lea eax, [eax + 64]
- pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
+ pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
pshufb xmm1, xmm6
pshufb xmm2, xmm6
pshufb xmm3, xmm6
- movdqa xmm4, xmm1 // 4 bytes from 1 for 0
- psrldq xmm1, 4 // 8 bytes from 1
- pslldq xmm4, 12 // 4 bytes from 1 for 0
- movdqa xmm5, xmm2 // 8 bytes from 2 for 1
- por xmm0, xmm4 // 4 bytes from 1 for 0
- pslldq xmm5, 8 // 8 bytes from 2 for 1
+ movdqa xmm4, xmm1 // 4 bytes from 1 for 0
+ psrldq xmm1, 4 // 8 bytes from 1
+ pslldq xmm4, 12 // 4 bytes from 1 for 0
+ movdqa xmm5, xmm2 // 8 bytes from 2 for 1
+ por xmm0, xmm4 // 4 bytes from 1 for 0
+ pslldq xmm5, 8 // 8 bytes from 2 for 1
movdqu [edx], xmm0 // store 0
- por xmm1, xmm5 // 8 bytes from 2 for 1
- psrldq xmm2, 8 // 4 bytes from 2
- pslldq xmm3, 4 // 12 bytes from 3 for 2
- por xmm2, xmm3 // 12 bytes from 3 for 2
- movdqu [edx + 16], xmm1 // store 1
- movdqu [edx + 32], xmm2 // store 2
+ por xmm1, xmm5 // 8 bytes from 2 for 1
+ psrldq xmm2, 8 // 4 bytes from 2
+ pslldq xmm3, 4 // 12 bytes from 3 for 2
+ por xmm2, xmm3 // 12 bytes from 3 for 2
+ movdqu [edx + 16], xmm1 // store 1
+ movdqu [edx + 32], xmm2 // store 2
lea edx, [edx + 48]
sub ecx, 16
jg convertloop
@@ -824,33 +796,34 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
}
}
-__declspec(naked)
-void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width) {
__asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
mov ecx, [esp + 12] // width
- pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
+ pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
psrld xmm3, 27
- pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
+ pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
psrld xmm4, 26
pslld xmm4, 5
- pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
+ pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
pslld xmm5, 11
convertloop:
- movdqu xmm0, [eax] // fetch 4 pixels of argb
- movdqa xmm1, xmm0 // B
- movdqa xmm2, xmm0 // G
- pslld xmm0, 8 // R
- psrld xmm1, 3 // B
- psrld xmm2, 5 // G
- psrad xmm0, 16 // R
- pand xmm1, xmm3 // B
- pand xmm2, xmm4 // G
- pand xmm0, xmm5 // R
- por xmm1, xmm2 // BG
- por xmm0, xmm1 // BGR
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0 // B
+ movdqa xmm2, xmm0 // G
+ pslld xmm0, 8 // R
+ psrld xmm1, 3 // B
+ psrld xmm2, 5 // G
+ psrad xmm0, 16 // R
+ pand xmm1, xmm3 // B
+ pand xmm2, xmm4 // G
+ pand xmm0, xmm5 // R
+ por xmm1, xmm2 // BG
+ por xmm0, xmm1 // BGR
packssdw xmm0, xmm0
lea eax, [eax + 16]
movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
@@ -861,41 +834,42 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
}
}
-__declspec(naked)
-void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width) {
+__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width) {
__asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- movd xmm6, [esp + 12] // dither4
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ movd xmm6, [esp + 12] // dither4
mov ecx, [esp + 16] // width
- punpcklbw xmm6, xmm6 // make dither 16 bytes
+ punpcklbw xmm6, xmm6 // make dither 16 bytes
movdqa xmm7, xmm6
punpcklwd xmm6, xmm6
punpckhwd xmm7, xmm7
- pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
+ pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
psrld xmm3, 27
- pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
+ pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
psrld xmm4, 26
pslld xmm4, 5
- pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
+ pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
pslld xmm5, 11
convertloop:
- movdqu xmm0, [eax] // fetch 4 pixels of argb
- paddusb xmm0, xmm6 // add dither
- movdqa xmm1, xmm0 // B
- movdqa xmm2, xmm0 // G
- pslld xmm0, 8 // R
- psrld xmm1, 3 // B
- psrld xmm2, 5 // G
- psrad xmm0, 16 // R
- pand xmm1, xmm3 // B
- pand xmm2, xmm4 // G
- pand xmm0, xmm5 // R
- por xmm1, xmm2 // BG
- por xmm0, xmm1 // BGR
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
+ paddusb xmm0, xmm6 // add dither
+ movdqa xmm1, xmm0 // B
+ movdqa xmm2, xmm0 // G
+ pslld xmm0, 8 // R
+ psrld xmm1, 3 // B
+ psrld xmm2, 5 // G
+ psrad xmm0, 16 // R
+ pand xmm1, xmm3 // B
+ pand xmm2, xmm4 // G
+ pand xmm0, xmm5 // R
+ por xmm1, xmm2 // BG
+ por xmm0, xmm1 // BGR
packssdw xmm0, xmm0
lea eax, [eax + 16]
movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
@@ -907,39 +881,40 @@ void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
}
#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
-__declspec(naked)
-void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width) {
+__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb,
+ uint8* dst_rgb,
+ const uint32 dither4,
+ int width) {
__asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
vbroadcastss xmm6, [esp + 12] // dither4
- mov ecx, [esp + 16] // width
- vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes
+ mov ecx, [esp + 16] // width
+ vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes
vpermq ymm6, ymm6, 0xd8
vpunpcklwd ymm6, ymm6, ymm6
- vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
vpsrld ymm3, ymm3, 27
- vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
vpsrld ymm4, ymm4, 26
vpslld ymm4, ymm4, 5
- vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
+ vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
convertloop:
- vmovdqu ymm0, [eax] // fetch 8 pixels of argb
- vpaddusb ymm0, ymm0, ymm6 // add dither
- vpsrld ymm2, ymm0, 5 // G
- vpsrld ymm1, ymm0, 3 // B
- vpsrld ymm0, ymm0, 8 // R
- vpand ymm2, ymm2, ymm4 // G
- vpand ymm1, ymm1, ymm3 // B
- vpand ymm0, ymm0, ymm5 // R
- vpor ymm1, ymm1, ymm2 // BG
- vpor ymm0, ymm0, ymm1 // BGR
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpaddusb ymm0, ymm0, ymm6 // add dither
+ vpsrld ymm2, ymm0, 5 // G
+ vpsrld ymm1, ymm0, 3 // B
+ vpsrld ymm0, ymm0, 8 // R
+ vpand ymm2, ymm2, ymm4 // G
+ vpand ymm1, ymm1, ymm3 // B
+ vpand ymm0, ymm0, ymm5 // R
+ vpor ymm1, ymm1, ymm2 // BG
+ vpor ymm0, ymm0, ymm1 // BGR
vpackusdw ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 0xd8
lea eax, [eax + 32]
- vmovdqu [edx], xmm0 // store 8 pixels of RGB565
+ vmovdqu [edx], xmm0 // store 8 pixels of RGB565
lea edx, [edx + 16]
sub ecx, 8
jg convertloop
@@ -950,37 +925,38 @@ void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
#endif // HAS_ARGBTORGB565DITHERROW_AVX2
// TODO(fbarchard): Improve sign extension/packing.
-__declspec(naked)
-void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width) {
__asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
mov ecx, [esp + 12] // width
- pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
+ pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
psrld xmm4, 27
- movdqa xmm5, xmm4 // generate mask 0x000003e0
+ movdqa xmm5, xmm4 // generate mask 0x000003e0
pslld xmm5, 5
- movdqa xmm6, xmm4 // generate mask 0x00007c00
+ movdqa xmm6, xmm4 // generate mask 0x00007c00
pslld xmm6, 10
- pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
+ pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
pslld xmm7, 15
convertloop:
- movdqu xmm0, [eax] // fetch 4 pixels of argb
- movdqa xmm1, xmm0 // B
- movdqa xmm2, xmm0 // G
- movdqa xmm3, xmm0 // R
- psrad xmm0, 16 // A
- psrld xmm1, 3 // B
- psrld xmm2, 6 // G
- psrld xmm3, 9 // R
- pand xmm0, xmm7 // A
- pand xmm1, xmm4 // B
- pand xmm2, xmm5 // G
- pand xmm3, xmm6 // R
- por xmm0, xmm1 // BA
- por xmm2, xmm3 // GR
- por xmm0, xmm2 // BGRA
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0 // B
+ movdqa xmm2, xmm0 // G
+ movdqa xmm3, xmm0 // R
+ psrad xmm0, 16 // A
+ psrld xmm1, 3 // B
+ psrld xmm2, 6 // G
+ psrld xmm3, 9 // R
+ pand xmm0, xmm7 // A
+ pand xmm1, xmm4 // B
+ pand xmm2, xmm5 // G
+ pand xmm3, xmm6 // R
+ por xmm0, xmm1 // BA
+ por xmm2, xmm3 // GR
+ por xmm0, xmm2 // BGRA
packssdw xmm0, xmm0
lea eax, [eax + 16]
movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
@@ -991,22 +967,23 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
}
}
-__declspec(naked)
-void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width) {
__asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
mov ecx, [esp + 12] // width
- pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
+ pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
psllw xmm4, 12
- movdqa xmm3, xmm4 // generate mask 0x00f000f0
+ movdqa xmm3, xmm4 // generate mask 0x00f000f0
psrlw xmm3, 8
convertloop:
- movdqu xmm0, [eax] // fetch 4 pixels of argb
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0
- pand xmm0, xmm3 // low nibble
- pand xmm1, xmm4 // high nibble
+ pand xmm0, xmm3 // low nibble
+ pand xmm1, xmm4 // high nibble
psrld xmm0, 4
psrld xmm1, 8
por xmm0, xmm1
@@ -1021,33 +998,34 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
}
#ifdef HAS_ARGBTORGB565ROW_AVX2
-__declspec(naked)
-void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width) {
__asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // width
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
vpsrld ymm3, ymm3, 27
- vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
vpsrld ymm4, ymm4, 26
vpslld ymm4, ymm4, 5
- vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
+ vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
convertloop:
- vmovdqu ymm0, [eax] // fetch 8 pixels of argb
- vpsrld ymm2, ymm0, 5 // G
- vpsrld ymm1, ymm0, 3 // B
- vpsrld ymm0, ymm0, 8 // R
- vpand ymm2, ymm2, ymm4 // G
- vpand ymm1, ymm1, ymm3 // B
- vpand ymm0, ymm0, ymm5 // R
- vpor ymm1, ymm1, ymm2 // BG
- vpor ymm0, ymm0, ymm1 // BGR
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpsrld ymm2, ymm0, 5 // G
+ vpsrld ymm1, ymm0, 3 // B
+ vpsrld ymm0, ymm0, 8 // R
+ vpand ymm2, ymm2, ymm4 // G
+ vpand ymm1, ymm1, ymm3 // B
+ vpand ymm0, ymm0, ymm5 // R
+ vpor ymm1, ymm1, ymm2 // BG
+ vpor ymm0, ymm0, ymm1 // BGR
vpackusdw ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 0xd8
lea eax, [eax + 32]
- vmovdqu [edx], xmm0 // store 8 pixels of RGB565
+ vmovdqu [edx], xmm0 // store 8 pixels of RGB565
lea edx, [edx + 16]
sub ecx, 8
jg convertloop
@@ -1058,36 +1036,37 @@ void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
#endif // HAS_ARGBTORGB565ROW_AVX2
#ifdef HAS_ARGBTOARGB1555ROW_AVX2
-__declspec(naked)
-void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width) {
__asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // width
vpcmpeqb ymm4, ymm4, ymm4
- vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
- vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
- vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
- vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
+ vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
+ vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
+ vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
vpslld ymm7, ymm7, 15
convertloop:
- vmovdqu ymm0, [eax] // fetch 8 pixels of argb
- vpsrld ymm3, ymm0, 9 // R
- vpsrld ymm2, ymm0, 6 // G
- vpsrld ymm1, ymm0, 3 // B
- vpsrad ymm0, ymm0, 16 // A
- vpand ymm3, ymm3, ymm6 // R
- vpand ymm2, ymm2, ymm5 // G
- vpand ymm1, ymm1, ymm4 // B
- vpand ymm0, ymm0, ymm7 // A
- vpor ymm0, ymm0, ymm1 // BA
- vpor ymm2, ymm2, ymm3 // GR
- vpor ymm0, ymm0, ymm2 // BGRA
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpsrld ymm3, ymm0, 9 // R
+ vpsrld ymm2, ymm0, 6 // G
+ vpsrld ymm1, ymm0, 3 // B
+ vpsrad ymm0, ymm0, 16 // A
+ vpand ymm3, ymm3, ymm6 // R
+ vpand ymm2, ymm2, ymm5 // G
+ vpand ymm1, ymm1, ymm4 // B
+ vpand ymm0, ymm0, ymm7 // A
+ vpor ymm0, ymm0, ymm1 // BA
+ vpor ymm2, ymm2, ymm3 // GR
+ vpor ymm0, ymm0, ymm2 // BGRA
vpackssdw ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 0xd8
lea eax, [eax + 32]
- vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555
+ vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555
lea edx, [edx + 16]
sub ecx, 8
jg convertloop
@@ -1098,27 +1077,28 @@ void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
#endif // HAS_ARGBTOARGB1555ROW_AVX2
#ifdef HAS_ARGBTOARGB4444ROW_AVX2
-__declspec(naked)
-void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8* src_argb,
+ uint8* dst_rgb,
+ int width) {
__asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
mov ecx, [esp + 12] // width
- vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
vpsllw ymm4, ymm4, 12
- vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
+ vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
convertloop:
- vmovdqu ymm0, [eax] // fetch 8 pixels of argb
- vpand ymm1, ymm0, ymm4 // high nibble
- vpand ymm0, ymm0, ymm3 // low nibble
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpand ymm1, ymm0, ymm4 // high nibble
+ vpand ymm0, ymm0, ymm3 // low nibble
vpsrld ymm1, ymm1, 8
vpsrld ymm0, ymm0, 4
vpor ymm0, ymm0, ymm1
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 0xd8
lea eax, [eax + 32]
- vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
+ vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
lea edx, [edx + 16]
sub ecx, 8
jg convertloop
@@ -1129,12 +1109,13 @@ void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
#endif // HAS_ARGBTOARGB4444ROW_AVX2
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-__declspec(naked)
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void ARGBToYRow_SSSE3(const uint8* src_argb,
+ uint8* dst_y,
+ int width) {
__asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
movdqa xmm4, xmmword ptr kARGBToY
movdqa xmm5, xmmword ptr kAddY16
@@ -1164,12 +1145,13 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
-__declspec(naked)
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void ARGBToYJRow_SSSE3(const uint8* src_argb,
+ uint8* dst_y,
+ int width) {
__asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
movdqa xmm4, xmmword ptr kARGBToYJ
movdqa xmm5, xmmword ptr kAddYJ64
@@ -1200,17 +1182,16 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
#ifdef HAS_ARGBTOYROW_AVX2
// vpermd for vphaddw + vpackuswb vpermd.
-static const lvec32 kPermdARGBToY_AVX = {
- 0, 4, 1, 5, 2, 6, 3, 7
-};
+static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked)
-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void ARGBToYRow_AVX2(const uint8* src_argb,
+ uint8* dst_y,
+ int width) {
__asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
vbroadcastf128 ymm4, xmmword ptr kARGBToY
vbroadcastf128 ymm5, xmmword ptr kAddY16
vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
@@ -1244,12 +1225,13 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked)
-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void ARGBToYJRow_AVX2(const uint8* src_argb,
+ uint8* dst_y,
+ int width) {
__asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
vbroadcastf128 ymm5, xmmword ptr kAddYJ64
vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
@@ -1283,12 +1265,13 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
}
#endif // HAS_ARGBTOYJROW_AVX2
-__declspec(naked)
-void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void BGRAToYRow_SSSE3(const uint8* src_argb,
+ uint8* dst_y,
+ int width) {
__asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
movdqa xmm4, xmmword ptr kBGRAToY
movdqa xmm5, xmmword ptr kAddY16
@@ -1316,12 +1299,13 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
}
}
-__declspec(naked)
-void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void ABGRToYRow_SSSE3(const uint8* src_argb,
+ uint8* dst_y,
+ int width) {
__asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
movdqa xmm4, xmmword ptr kABGRToY
movdqa xmm5, xmmword ptr kAddY16
@@ -1349,12 +1333,13 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
}
}
-__declspec(naked)
-void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void RGBAToYRow_SSSE3(const uint8* src_argb,
+ uint8* dst_y,
+ int width) {
__asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
movdqa xmm4, xmmword ptr kRGBAToY
movdqa xmm5, xmmword ptr kAddY16
@@ -1382,24 +1367,26 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
}
}
-__declspec(naked)
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
movdqa xmm5, xmmword ptr kAddUV128
movdqa xmm6, xmmword ptr kARGBToV
movdqa xmm7, xmmword ptr kARGBToU
- sub edi, edx // stride from u to v
+ sub edi, edx // stride from u to v
convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
@@ -1437,11 +1424,11 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
psraw xmm0, 8
psraw xmm1, 8
packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
+ paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
jg convertloop
@@ -1452,24 +1439,26 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
}
}
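
// Scalar sketch of the same three steps ARGBToUVRow_SSSE3 performs:
// average each 2x2 ARGB block, weight it with the BT.601 chroma matrix,
// and bias by +128 (helper name hypothetical; the weights correspond to
// kARGBToU/kARGBToV and the bias to kAddUV128; width assumed even):
static void ARGBToUVRow_Sketch(const uint8* src_argb, int src_stride_argb,
                               uint8* dst_u, uint8* dst_v, int width) {
  const uint8* row1 = src_argb + src_stride_argb;
  for (int x = 0; x < width; x += 2) {
    // Step 1: subsample - average a 2x2 block per channel.
    int b = (src_argb[0] + src_argb[4] + row1[0] + row1[4]) >> 2;
    int g = (src_argb[1] + src_argb[5] + row1[1] + row1[5]) >> 2;
    int r = (src_argb[2] + src_argb[6] + row1[2] + row1[6]) >> 2;
    // Step 2: chroma matrix; 0x8080 = (128 << 8) + 128, bias plus rounding.
    *dst_u++ = (uint8)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
    *dst_v++ = (uint8)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
    src_argb += 8;
    row1 += 8;
  }
}
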
-__declspec(naked)
-void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
movdqa xmm5, xmmword ptr kAddUVJ128
movdqa xmm6, xmmword ptr kARGBToVJ
movdqa xmm7, xmmword ptr kARGBToUJ
- sub edi, edx // stride from u to v
+ sub edi, edx // stride from u to v
convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
@@ -1511,8 +1500,8 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
packsswb xmm0, xmm1
// step 3 - store 8 U and 8 V values
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
jg convertloop
@@ -1524,24 +1513,26 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
}
#ifdef HAS_ARGBTOUVROW_AVX2
-__declspec(naked)
-void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
vbroadcastf128 ymm5, xmmword ptr kAddUV128
vbroadcastf128 ymm6, xmmword ptr kARGBToV
vbroadcastf128 ymm7, xmmword ptr kARGBToU
- sub edi, edx // stride from u to v
+ sub edi, edx // stride from u to v
convertloop:
- /* step 1 - subsample 32x2 argb pixels to 16x1 */
+ /* step 1 - subsample 32x2 argb pixels to 16x1 */
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
vmovdqu ymm2, [eax + 64]
@@ -1575,8 +1566,8 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
vpaddb ymm0, ymm0, ymm5 // -> unsigned
// step 3 - store 16 U and 16 V values
- vextractf128 [edx], ymm0, 0 // U
- vextractf128 [edx + edi], ymm0, 1 // V
+ vextractf128 [edx], ymm0, 0 // U
+ vextractf128 [edx + edi], ymm0, 1 // V
lea edx, [edx + 16]
sub ecx, 32
jg convertloop
@@ -1590,24 +1581,26 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
#endif // HAS_ARGBTOUVROW_AVX2
#ifdef HAS_ARGBTOUVJROW_AVX2
-__declspec(naked)
-void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
vbroadcastf128 ymm5, xmmword ptr kAddUV128
vbroadcastf128 ymm6, xmmword ptr kARGBToV
vbroadcastf128 ymm7, xmmword ptr kARGBToU
- sub edi, edx // stride from u to v
+ sub edi, edx // stride from u to v
convertloop:
- /* step 1 - subsample 32x2 argb pixels to 16x1 */
+ /* step 1 - subsample 32x2 argb pixels to 16x1 */
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
vmovdqu ymm2, [eax + 64]
@@ -1642,8 +1635,8 @@ void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
// step 3 - store 16 U and 16 V values
- vextractf128 [edx], ymm0, 0 // U
- vextractf128 [edx + edi], ymm0, 1 // V
+ vextractf128 [edx], ymm0, 0 // U
+ vextractf128 [edx + edi], ymm0, 1 // V
lea edx, [edx + 16]
sub ecx, 32
jg convertloop
@@ -1656,23 +1649,24 @@ void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
}
#endif // HAS_ARGBTOUVJROW_AVX2
-__declspec(naked)
-void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
- uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
__asm {
push edi
- mov eax, [esp + 4 + 4] // src_argb
- mov edx, [esp + 4 + 8] // dst_u
+ mov eax, [esp + 4 + 4] // src_argb
+ mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // width
movdqa xmm5, xmmword ptr kAddUV128
movdqa xmm6, xmmword ptr kARGBToV
movdqa xmm7, xmmword ptr kARGBToU
- sub edi, edx // stride from u to v
+ sub edi, edx // stride from u to v
convertloop:
- /* convert to U and V */
- movdqu xmm0, [eax] // U
+ /* convert to U and V */
+ movdqu xmm0, [eax] // U
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
@@ -1688,7 +1682,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
paddb xmm0, xmm5
movdqu [edx], xmm0
- movdqu xmm0, [eax] // V
+ movdqu xmm0, [eax] // V
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
@@ -1713,24 +1707,26 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
}
}
-__declspec(naked)
-void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
movdqa xmm5, xmmword ptr kAddUV128
movdqa xmm6, xmmword ptr kBGRAToV
movdqa xmm7, xmmword ptr kBGRAToU
- sub edi, edx // stride from u to v
+ sub edi, edx // stride from u to v
convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
@@ -1768,11 +1764,11 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
psraw xmm0, 8
psraw xmm1, 8
packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
+ paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
jg convertloop
@@ -1783,24 +1779,26 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
}
}
-__declspec(naked)
-void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
movdqa xmm5, xmmword ptr kAddUV128
movdqa xmm6, xmmword ptr kABGRToV
movdqa xmm7, xmmword ptr kABGRToU
- sub edi, edx // stride from u to v
+ sub edi, edx // stride from u to v
convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
@@ -1838,11 +1836,11 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
psraw xmm0, 8
psraw xmm1, 8
packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
+ paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
jg convertloop
@@ -1853,24 +1851,26 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
}
}
-__declspec(naked)
-void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0,
+ int src_stride_argb,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
movdqa xmm5, xmmword ptr kAddUV128
movdqa xmm6, xmmword ptr kRGBAToV
movdqa xmm7, xmmword ptr kRGBAToU
- sub edi, edx // stride from u to v
+ sub edi, edx // stride from u to v
convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
@@ -1908,11 +1908,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
psraw xmm0, 8
psraw xmm1, 8
packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
+ paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
jg convertloop
@@ -1925,109 +1925,95 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
#endif // HAS_ARGBTOYROW_SSSE3
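
// Every AVX2 YUV-to-RGB row converter below is assembled from the same
// three macro stages, so one convertloop iteration is, in outline (a
// sketch, not a compilable function):
//   READxxx_AVX2              - load 16 Y (plus alpha, if any) and chroma,
//                               upsampling chroma to one UV per pixel
//   YUVTORGB_AVX2(constants)  - 6-bit fixed-point matrix multiply and bias
//   STORExxx_AVX2             - interleave B,G,R(,A) and store 16 pixels
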
// Read 16 UV from 444
-#define READYUV444_AVX2 __asm { \
- __asm vmovdqu xmm0, [esi] /* U */ \
- __asm vmovdqu xmm1, [esi + edi] /* V */ \
+#define READYUV444_AVX2 \
+ __asm { \
+ __asm vmovdqu xmm0, [esi] /* U */ \
+ __asm vmovdqu xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 16] \
__asm vpermq ymm0, ymm0, 0xd8 \
__asm vpermq ymm1, ymm1, 0xd8 \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16] \
- }
+ __asm lea eax, [eax + 16]}
// Read 8 UV from 422, upsample to 16 UV.
-#define READYUV422_AVX2 __asm { \
- __asm vmovq xmm0, qword ptr [esi] /* U */ \
- __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
+#define READYUV422_AVX2 \
+ __asm { \
+ __asm vmovq xmm0, qword ptr [esi] /* U */ \
+ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
__asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16] \
- }
+ __asm lea eax, [eax + 16]}
// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
-#define READYUVA422_AVX2 __asm { \
- __asm vmovq xmm0, qword ptr [esi] /* U */ \
- __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
+#define READYUVA422_AVX2 \
+ __asm { \
+ __asm vmovq xmm0, qword ptr [esi] /* U */ \
+ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
__asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
__asm lea eax, [eax + 16] \
- __asm vmovdqu xmm5, [ebp] /* A */ \
+ __asm vmovdqu xmm5, [ebp] /* A */ \
__asm vpermq ymm5, ymm5, 0xd8 \
- __asm lea ebp, [ebp + 16] \
- }
-
-// Read 4 UV from 411, upsample to 16 UV.
-#define READYUV411_AVX2 __asm { \
- __asm vmovd xmm0, dword ptr [esi] /* U */ \
- __asm vmovd xmm1, dword ptr [esi + edi] /* V */ \
- __asm lea esi, [esi + 4] \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16] \
- }
+ __asm lea ebp, [ebp + 16]}
// Read 8 UV from NV12, upsample to 16 UV.
-#define READNV12_AVX2 __asm { \
- __asm vmovdqu xmm0, [esi] /* UV */ \
+#define READNV12_AVX2 \
+ __asm { \
+ __asm vmovdqu xmm0, [esi] /* UV */ \
__asm lea esi, [esi + 16] \
__asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16] \
- }
+ __asm lea eax, [eax + 16]}
// Read 8 UV from NV21, upsample to 16 UV.
-#define READNV21_AVX2 __asm { \
- __asm vmovdqu xmm0, [esi] /* UV */ \
+#define READNV21_AVX2 \
+ __asm { \
+ __asm vmovdqu xmm0, [esi] /* UV */ \
__asm lea esi, [esi + 16] \
__asm vpermq ymm0, ymm0, 0xd8 \
__asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16] \
- }
+ __asm lea eax, [eax + 16]}
// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
-#define READYUY2_AVX2 __asm { \
- __asm vmovdqu ymm4, [eax] /* YUY2 */ \
+#define READYUY2_AVX2 \
+ __asm { \
+ __asm vmovdqu ymm4, [eax] /* YUY2 */ \
__asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
- __asm vmovdqu ymm0, [eax] /* UV */ \
+ __asm vmovdqu ymm0, [eax] /* UV */ \
__asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
- __asm lea eax, [eax + 32] \
- }
+ __asm lea eax, [eax + 32]}
// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
-#define READUYVY_AVX2 __asm { \
- __asm vmovdqu ymm4, [eax] /* UYVY */ \
+#define READUYVY_AVX2 \
+ __asm { \
+ __asm vmovdqu ymm4, [eax] /* UYVY */ \
__asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
- __asm vmovdqu ymm0, [eax] /* UV */ \
+ __asm vmovdqu ymm0, [eax] /* UV */ \
__asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
- __asm lea eax, [eax + 32] \
- }
+ __asm lea eax, [eax + 32]}
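
// YUY2 packs two pixels as Y0,U,Y1,V and UYVY as U,Y0,V,Y1; the kShuffle*
// tables above simply gather those bytes back into separate Y and UV
// registers. Scalar sketch for one YUY2 pixel pair (helper name ours):
static void YUY2ToPixelPair_Sketch(const uint8* yuy2,
                                   uint8* y0, uint8* y1, uint8* u, uint8* v) {
  *y0 = yuy2[0];
  *u = yuy2[1];  // one U shared by both pixels
  *y1 = yuy2[2];
  *v = yuy2[3];  // one V shared by both pixels
}
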
// Convert 16 pixels: 16 UV and 16 Y.
-#define YUVTORGB_AVX2(YuvConstants) __asm { \
+#define YUVTORGB_AVX2(YuvConstants) \
+ __asm { \
__asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
__asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
__asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
@@ -2036,68 +2022,67 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
__asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
__asm vpsubw ymm1, ymm3, ymm1 \
__asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
- __asm vpsubw ymm0, ymm3, ymm0 \
- /* Step 2: Find Y contribution to 16 R,G,B values */ \
+ __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \
__asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
- __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
- __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
- __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
+ __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
+ __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
+ __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
__asm vpsraw ymm0, ymm0, 6 \
__asm vpsraw ymm1, ymm1, 6 \
__asm vpsraw ymm2, ymm2, 6 \
- __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
- __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
- __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
+ __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
+ __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
+ __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
}
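
// YUVTORGB_AVX2 is a 6-bit fixed-point matrix multiply. Per channel it
// computes (scalar sketch; the coefficients and biases come from the
// YuvConstants tables, so their values vary with the colorspace):
//   y_scaled = (y * 0x0101 * yg) >> 16            // vpmulhuw with KYTORGB
//   b = Clamp6_Sketch(bias_b - u * ub + y_scaled)
//   g = Clamp6_Sketch(bias_g - (u * ug + v * vg) + y_scaled)
//   r = Clamp6_Sketch(bias_r - v * vr + y_scaled)
static uint8 Clamp6_Sketch(int v) {
  v >>= 6;                                          // vpsraw ymm, 6
  return (uint8)(v < 0 ? 0 : v > 255 ? 255 : v);    // vpackuswb saturates
}
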
// Store 16 ARGB values.
-#define STOREARGB_AVX2 __asm { \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
+#define STOREARGB_AVX2 \
+ __asm { \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
__asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
+ __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
__asm vpermq ymm2, ymm2, 0xd8 \
- __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
- __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
+ __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
+ __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
__asm vmovdqu 0[edx], ymm1 \
__asm vmovdqu 32[edx], ymm0 \
- __asm lea edx, [edx + 64] \
- }
+ __asm lea edx, [edx + 64]}
// Store 16 RGBA values.
-#define STORERGBA_AVX2 __asm { \
- __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
+#define STORERGBA_AVX2 \
+ __asm { \
+ __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
__asm vpermq ymm1, ymm1, 0xd8 \
- __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
+ __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
__asm vpermq ymm2, ymm2, 0xd8 \
- __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
- __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
+ __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
+ __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
__asm vmovdqu [edx], ymm0 \
__asm vmovdqu [edx + 32], ymm1 \
- __asm lea edx, [edx + 64] \
- }
+ __asm lea edx, [edx + 64]}
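
// The STORE* macros differ only in output byte order. For a single pixel,
// STOREARGB_AVX2 is equivalent to (scalar sketch; libyuv ARGB is
// little-endian, so memory order is B,G,R,A):
static void StoreARGBPixel_Sketch(uint8* dst, uint8 b, uint8 g, uint8 r, uint8 a) {
  dst[0] = b;
  dst[1] = g;
  dst[2] = r;
  dst[3] = a;  // alpha comes from ymm5: 0xff, or the a_buf plane
}
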
#ifdef HAS_I422TOARGBROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void I422ToARGBRow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
+__declspec(naked) void I422ToARGBRow_AVX2(
+ const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
__asm {
push esi
push edi
push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // argb
mov ebx, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width
sub edi, esi
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
convertloop:
READYUV422_AVX2
@@ -2119,21 +2104,21 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
#ifdef HAS_I422ALPHATOARGBROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
-__declspec(naked)
-void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
+__declspec(naked) void I422AlphaToARGBRow_AVX2(
+ const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
__asm {
push esi
push edi
push ebx
push ebp
- mov eax, [esp + 16 + 4] // Y
- mov esi, [esp + 16 + 8] // U
+ mov eax, [esp + 16 + 4] // Y
+ mov esi, [esp + 16 + 8] // U
mov edi, [esp + 16 + 12] // V
mov ebp, [esp + 16 + 16] // A
mov edx, [esp + 16 + 20] // argb
@@ -2162,25 +2147,25 @@ void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
#ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void I444ToARGBRow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
+__declspec(naked) void I444ToARGBRow_AVX2(
+ const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
__asm {
push esi
push edi
push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // argb
mov ebx, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width
sub edi, esi
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
convertloop:
READYUV444_AVX2
YUVTORGB_AVX2(ebx)
@@ -2198,64 +2183,24 @@ void I444ToARGBRow_AVX2(const uint8* y_buf,
}
#endif // HAS_I444TOARGBROW_AVX2
-#ifdef HAS_I411TOARGBROW_AVX2
-// 16 pixels
-// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void I411ToARGBRow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // abgr
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
-
- convertloop:
- READYUV411_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
-
- sub ecx, 16
- jg convertloop
-
- pop ebx
- pop edi
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_I411TOARGBROW_AVX2
-
#ifdef HAS_NV12TOARGBROW_AVX2
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void NV12ToARGBRow_AVX2(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
+__declspec(naked) void NV12ToARGBRow_AVX2(
+ const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
__asm {
push esi
push ebx
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // UV
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // UV
mov edx, [esp + 8 + 12] // argb
mov ebx, [esp + 8 + 16] // yuvconstants
mov ecx, [esp + 8 + 20] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
convertloop:
READNV12_AVX2
@@ -2276,21 +2221,21 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf,
#ifdef HAS_NV21TOARGBROW_AVX2
// 16 pixels.
// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void NV21ToARGBRow_AVX2(const uint8* y_buf,
- const uint8* vu_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
+__declspec(naked) void NV21ToARGBRow_AVX2(
+ const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
__asm {
push esi
push ebx
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // VU
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // VU
mov edx, [esp + 8 + 12] // argb
mov ebx, [esp + 8 + 16] // yuvconstants
mov ecx, [esp + 8 + 20] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
convertloop:
READNV21_AVX2
@@ -2311,18 +2256,18 @@ void NV21ToARGBRow_AVX2(const uint8* y_buf,
#ifdef HAS_YUY2TOARGBROW_AVX2
// 16 pixels.
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-__declspec(naked)
-void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
+__declspec(naked) void YUY2ToARGBRow_AVX2(
+ const uint8* src_yuy2,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
__asm {
push ebx
- mov eax, [esp + 4 + 4] // yuy2
- mov edx, [esp + 4 + 8] // argb
+ mov eax, [esp + 4 + 4] // yuy2
+ mov edx, [esp + 4 + 8] // argb
mov ebx, [esp + 4 + 12] // yuvconstants
mov ecx, [esp + 4 + 16] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
convertloop:
READYUY2_AVX2
@@ -2342,18 +2287,18 @@ void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
#ifdef HAS_UYVYTOARGBROW_AVX2
// 16 pixels.
// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-__declspec(naked)
-void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
+__declspec(naked) void UYVYToARGBRow_AVX2(
+ const uint8* src_uyvy,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
__asm {
push ebx
- mov eax, [esp + 4 + 4] // uyvy
- mov edx, [esp + 4 + 8] // argb
+ mov eax, [esp + 4 + 4] // uyvy
+ mov edx, [esp + 4 + 8] // argb
mov ebx, [esp + 4 + 12] // yuvconstants
mov ecx, [esp + 4 + 16] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
convertloop:
READUYVY_AVX2
@@ -2373,25 +2318,25 @@ void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
#ifdef HAS_I422TORGBAROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
-__declspec(naked)
-void I422ToRGBARow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
+__declspec(naked) void I422ToRGBARow_AVX2(
+ const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
__asm {
push esi
push edi
push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // abgr
mov ebx, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width
sub edi, esi
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
convertloop:
READYUV422_AVX2
@@ -2415,100 +2360,83 @@ void I422ToRGBARow_AVX2(const uint8* y_buf,
// Allows a conversion with half size scaling.
// Read 8 UV from 444.
-#define READYUV444 __asm { \
+#define READYUV444 \
+ __asm { \
__asm movq xmm0, qword ptr [esi] /* U */ \
__asm movq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8] \
- }
+ __asm lea eax, [eax + 8]}
// Read 4 UV from 422, upsample to 8 UV.
-#define READYUV422 __asm { \
- __asm movd xmm0, [esi] /* U */ \
- __asm movd xmm1, [esi + edi] /* V */ \
+#define READYUV422 \
+ __asm { \
+ __asm movd xmm0, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 4] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8] \
- }
+ __asm lea eax, [eax + 8]}
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
-#define READYUVA422 __asm { \
- __asm movd xmm0, [esi] /* U */ \
- __asm movd xmm1, [esi + edi] /* V */ \
+#define READYUVA422 \
+ __asm { \
+ __asm movd xmm0, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 4] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
- __asm movq xmm4, qword ptr [eax] /* Y */ \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm movq xmm4, qword ptr [eax] /* Y */ \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8] \
- __asm movq xmm5, qword ptr [ebp] /* A */ \
- __asm lea ebp, [ebp + 8] \
- }
-
-// Read 2 UV from 411, upsample to 8 UV.
-// drmemory fails with memory fault if pinsrw used. libyuv bug: 525
-// __asm pinsrw xmm0, [esi], 0 /* U */
-// __asm pinsrw xmm1, [esi + edi], 0 /* V */
-#define READYUV411_EBX __asm { \
- __asm movzx ebx, word ptr [esi] /* U */ \
- __asm movd xmm0, ebx \
- __asm movzx ebx, word ptr [esi + edi] /* V */ \
- __asm movd xmm1, ebx \
- __asm lea esi, [esi + 2] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
- __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \
- __asm movq xmm4, qword ptr [eax] \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8] \
- }
+ __asm movq xmm5, qword ptr [ebp] /* A */ \
+ __asm lea ebp, [ebp + 8]}
// Read 4 UV from NV12, upsample to 8 UV.
-#define READNV12 __asm { \
+#define READNV12 \
+ __asm { \
__asm movq xmm0, qword ptr [esi] /* UV */ \
__asm lea esi, [esi + 8] \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8] \
- }
+ __asm lea eax, [eax + 8]}
// Read 4 VU from NV21, upsample to 8 UV.
-#define READNV21 __asm { \
+#define READNV21 \
+ __asm { \
__asm movq xmm0, qword ptr [esi] /* UV */ \
__asm lea esi, [esi + 8] \
__asm pshufb xmm0, xmmword ptr kShuffleNV21 \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8] \
- }
+ __asm lea eax, [eax + 8]}
// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
-#define READYUY2 __asm { \
- __asm movdqu xmm4, [eax] /* YUY2 */ \
+#define READYUY2 \
+ __asm { \
+ __asm movdqu xmm4, [eax] /* YUY2 */ \
__asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
- __asm movdqu xmm0, [eax] /* UV */ \
+ __asm movdqu xmm0, [eax] /* UV */ \
__asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \
- __asm lea eax, [eax + 16] \
- }
+ __asm lea eax, [eax + 16]}
// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
-#define READUYVY __asm { \
- __asm movdqu xmm4, [eax] /* UYVY */ \
+#define READUYVY \
+ __asm { \
+ __asm movdqu xmm4, [eax] /* UYVY */ \
__asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
- __asm movdqu xmm0, [eax] /* UV */ \
+ __asm movdqu xmm0, [eax] /* UV */ \
__asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
- __asm lea eax, [eax + 16] \
- }
+ __asm lea eax, [eax + 16]}
// Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB(YuvConstants) __asm { \
+#define YUVTORGB(YuvConstants) \
+ __asm { \
__asm movdqa xmm1, xmm0 \
__asm movdqa xmm2, xmm0 \
__asm movdqa xmm3, xmm0 \
@@ -2522,129 +2450,125 @@ void I422ToRGBARow_AVX2(const uint8* y_buf,
__asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
__asm psubw xmm2, xmm3 \
__asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
- __asm paddsw xmm0, xmm4 /* B += Y */ \
- __asm paddsw xmm1, xmm4 /* G += Y */ \
- __asm paddsw xmm2, xmm4 /* R += Y */ \
+ __asm paddsw xmm0, xmm4 /* B += Y */ \
+ __asm paddsw xmm1, xmm4 /* G += Y */ \
+ __asm paddsw xmm2, xmm4 /* R += Y */ \
__asm psraw xmm0, 6 \
__asm psraw xmm1, 6 \
__asm psraw xmm2, 6 \
- __asm packuswb xmm0, xmm0 /* B */ \
- __asm packuswb xmm1, xmm1 /* G */ \
- __asm packuswb xmm2, xmm2 /* R */ \
+ __asm packuswb xmm0, xmm0 /* B */ \
+ __asm packuswb xmm1, xmm1 /* G */ \
+ __asm packuswb xmm2, xmm2 /* R */ \
}
// Store 8 ARGB values.
-#define STOREARGB __asm { \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm5 /* RA */ \
+#define STOREARGB \
+ __asm { \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm5 /* RA */ \
__asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
+ __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
__asm movdqu 0[edx], xmm0 \
__asm movdqu 16[edx], xmm1 \
- __asm lea edx, [edx + 32] \
- }
+ __asm lea edx, [edx + 32]}
// Store 8 BGRA values.
-#define STOREBGRA __asm { \
- __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
- __asm punpcklbw xmm1, xmm0 /* GB */ \
- __asm punpcklbw xmm5, xmm2 /* AR */ \
+#define STOREBGRA \
+ __asm { \
+ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
+ __asm punpcklbw xmm1, xmm0 /* GB */ \
+ __asm punpcklbw xmm5, xmm2 /* AR */ \
__asm movdqa xmm0, xmm5 \
- __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
- __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
+ __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
+ __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
__asm movdqu 0[edx], xmm5 \
__asm movdqu 16[edx], xmm0 \
- __asm lea edx, [edx + 32] \
- }
+ __asm lea edx, [edx + 32]}
// Store 8 RGBA values.
-#define STORERGBA __asm { \
- __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
- __asm punpcklbw xmm1, xmm2 /* GR */ \
- __asm punpcklbw xmm5, xmm0 /* AB */ \
+#define STORERGBA \
+ __asm { \
+ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
+ __asm punpcklbw xmm1, xmm2 /* GR */ \
+ __asm punpcklbw xmm5, xmm0 /* AB */ \
__asm movdqa xmm0, xmm5 \
- __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
- __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
+ __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
+ __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
__asm movdqu 0[edx], xmm5 \
__asm movdqu 16[edx], xmm0 \
- __asm lea edx, [edx + 32] \
- }
+ __asm lea edx, [edx + 32]}
// Store 8 RGB24 values.
-#define STORERGB24 __asm { \
- /* Weave into RRGB */ \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm2 /* RR */ \
+#define STORERGB24 \
+ __asm {/* Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
__asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
- /* RRGB -> RGB24 */ \
- __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
- __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
- __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
- __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
- __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
- __asm lea edx, [edx + 24] \
- }
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \
+ __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
+ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
+ __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
+ __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
+ __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
+ __asm lea edx, [edx + 24]}
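
// STORERGB24 drops alpha and packs pixels to 3 bytes each; the two pshufb
// masks plus the palignr are a vectorized form of (scalar sketch, taking
// the weaved B,G,R,R dwords as input):
static void StoreRGB24_Sketch(uint8* dst, const uint8* bgrr, int pixels) {
  for (int i = 0; i < pixels; ++i) {
    dst[3 * i + 0] = bgrr[4 * i + 0];  // B
    dst[3 * i + 1] = bgrr[4 * i + 1];  // G
    dst[3 * i + 2] = bgrr[4 * i + 2];  // R; the duplicated R byte is discarded
  }
}
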
// Store 8 RGB565 values.
-#define STORERGB565 __asm { \
- /* Weave into RRGB */ \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm2 /* RR */ \
+#define STORERGB565 \
+ __asm {/* Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
__asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
- /* RRGB -> RGB565 */ \
- __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
- __asm movdqa xmm2, xmm0 /* G */ \
- __asm pslld xmm0, 8 /* R */ \
- __asm psrld xmm3, 3 /* B */ \
- __asm psrld xmm2, 5 /* G */ \
- __asm psrad xmm0, 16 /* R */ \
- __asm pand xmm3, xmm5 /* B */ \
- __asm pand xmm2, xmm6 /* G */ \
- __asm pand xmm0, xmm7 /* R */ \
- __asm por xmm3, xmm2 /* BG */ \
- __asm por xmm0, xmm3 /* BGR */ \
- __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
- __asm movdqa xmm2, xmm1 /* G */ \
- __asm pslld xmm1, 8 /* R */ \
- __asm psrld xmm3, 3 /* B */ \
- __asm psrld xmm2, 5 /* G */ \
- __asm psrad xmm1, 16 /* R */ \
- __asm pand xmm3, xmm5 /* B */ \
- __asm pand xmm2, xmm6 /* G */ \
- __asm pand xmm1, xmm7 /* R */ \
- __asm por xmm3, xmm2 /* BG */ \
- __asm por xmm1, xmm3 /* BGR */ \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \
+ __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
+ __asm movdqa xmm2, xmm0 /* G */ \
+ __asm pslld xmm0, 8 /* R */ \
+ __asm psrld xmm3, 3 /* B */ \
+ __asm psrld xmm2, 5 /* G */ \
+ __asm psrad xmm0, 16 /* R */ \
+ __asm pand xmm3, xmm5 /* B */ \
+ __asm pand xmm2, xmm6 /* G */ \
+ __asm pand xmm0, xmm7 /* R */ \
+ __asm por xmm3, xmm2 /* BG */ \
+ __asm por xmm0, xmm3 /* BGR */ \
+ __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
+ __asm movdqa xmm2, xmm1 /* G */ \
+ __asm pslld xmm1, 8 /* R */ \
+ __asm psrld xmm3, 3 /* B */ \
+ __asm psrld xmm2, 5 /* G */ \
+ __asm psrad xmm1, 16 /* R */ \
+ __asm pand xmm3, xmm5 /* B */ \
+ __asm pand xmm2, xmm6 /* G */ \
+ __asm pand xmm1, xmm7 /* R */ \
+ __asm por xmm3, xmm2 /* BG */ \
+ __asm por xmm1, xmm3 /* BGR */ \
__asm packssdw xmm0, xmm1 \
- __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
- __asm lea edx, [edx + 16] \
- }
+ __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
+ __asm lea edx, [edx + 16]}
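
// The mask-and-shift sequence in STORERGB565 packs 8:8:8 bits down to
// 5:6:5. Per pixel it is equivalent to (scalar sketch):
static uint16 PackRGB565_Sketch(uint8 r, uint8 g, uint8 b) {
  return (uint16)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}
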
// 8 pixels.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void I444ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
+__declspec(naked) void I444ToARGBRow_SSSE3(
+ const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
__asm {
push esi
push edi
push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // argb
mov ebx, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width
sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
convertloop:
READYUV444
@@ -2663,19 +2587,19 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
-__declspec(naked)
-void I422ToRGB24Row_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_rgb24,
- const struct YuvConstants* yuvconstants,
- int width) {
+__declspec(naked) void I422ToRGB24Row_SSSE3(
+ const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
__asm {
push esi
push edi
push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // argb
mov ebx, [esp + 12 + 20] // yuvconstants
@@ -2701,30 +2625,30 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf,
// 8 pixels
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
-__declspec(naked)
-void I422ToRGB565Row_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb565_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
+__declspec(naked) void I422ToRGB565Row_SSSE3(
+ const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb565_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
__asm {
push esi
push edi
push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // argb
mov ebx, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width
sub edi, esi
- pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
+ pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
psrld xmm5, 27
- pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
+ pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
psrld xmm6, 26
pslld xmm6, 5
- pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
+ pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
pslld xmm7, 11
convertloop:
@@ -2744,25 +2668,25 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void I422ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
+__declspec(naked) void I422ToARGBRow_SSSE3(
+ const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
__asm {
push esi
push edi
push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // argb
mov ebx, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width
sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
convertloop:
READYUV422
@@ -2781,21 +2705,21 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
-__declspec(naked)
-void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
+__declspec(naked) void I422AlphaToARGBRow_SSSE3(
+ const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ const uint8* a_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
__asm {
push esi
push edi
push ebx
push ebp
- mov eax, [esp + 16 + 4] // Y
- mov esi, [esp + 16 + 8] // U
+ mov eax, [esp + 16 + 4] // Y
+ mov esi, [esp + 16 + 8] // U
mov edi, [esp + 16 + 12] // V
mov ebp, [esp + 16 + 16] // A
mov edx, [esp + 16 + 20] // argb
@@ -2820,62 +2744,22 @@ void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
}
// 8 pixels.
-// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-// Similar to I420 but duplicate UV once more.
-__declspec(naked)
-void I411ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- push ebp
- mov eax, [esp + 16 + 4] // Y
- mov esi, [esp + 16 + 8] // U
- mov edi, [esp + 16 + 12] // V
- mov edx, [esp + 16 + 16] // abgr
- mov ebp, [esp + 16 + 20] // yuvconstants
- mov ecx, [esp + 16 + 24] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
-
- convertloop:
- READYUV411_EBX
- YUVTORGB(ebp)
- STOREARGB
-
- sub ecx, 8
- jg convertloop
-
- pop ebp
- pop ebx
- pop edi
- pop esi
- ret
- }
-}
-
-// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void NV12ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
+__declspec(naked) void NV12ToARGBRow_SSSE3(
+ const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
__asm {
push esi
push ebx
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // UV
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // UV
mov edx, [esp + 8 + 12] // argb
mov ebx, [esp + 8 + 16] // yuvconstants
mov ecx, [esp + 8 + 20] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
convertloop:
READNV12
@@ -2893,21 +2777,21 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void NV21ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* vu_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
+__declspec(naked) void NV21ToARGBRow_SSSE3(
+ const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
__asm {
push esi
push ebx
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // VU
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // VU
mov edx, [esp + 8 + 12] // argb
mov ebx, [esp + 8 + 16] // yuvconstants
mov ecx, [esp + 8 + 20] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
convertloop:
READNV21
@@ -2925,18 +2809,18 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf,
// 8 pixels.
// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
-__declspec(naked)
-void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
+__declspec(naked) void YUY2ToARGBRow_SSSE3(
+ const uint8* src_yuy2,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
__asm {
push ebx
- mov eax, [esp + 4 + 4] // yuy2
- mov edx, [esp + 4 + 8] // argb
+ mov eax, [esp + 4 + 4] // yuy2
+ mov edx, [esp + 4 + 8] // argb
mov ebx, [esp + 4 + 12] // yuvconstants
mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
convertloop:
READYUY2
@@ -2953,18 +2837,18 @@ void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
// 8 pixels.
// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
-__declspec(naked)
-void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
+__declspec(naked) void UYVYToARGBRow_SSSE3(
+ const uint8* src_uyvy,
+ uint8* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
__asm {
push ebx
- mov eax, [esp + 4 + 4] // uyvy
- mov edx, [esp + 4 + 8] // argb
+ mov eax, [esp + 4 + 4] // uyvy
+ mov edx, [esp + 4 + 8] // argb
mov ebx, [esp + 4 + 12] // yuvconstants
mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
convertloop:
READUYVY
@@ -2979,19 +2863,19 @@ void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
}
}
-__declspec(naked)
-void I422ToRGBARow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_rgba,
- const struct YuvConstants* yuvconstants,
- int width) {
+__declspec(naked) void I422ToRGBARow_SSSE3(
+ const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width) {
__asm {
push esi
push edi
push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // argb
mov ebx, [esp + 12 + 20] // yuvconstants
@@ -3016,39 +2900,38 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
#ifdef HAS_I400TOARGBROW_SSE2
// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
-__declspec(naked)
-void I400ToARGBRow_SSE2(const uint8* y_buf,
- uint8* rgb_buf,
- int width) {
+__declspec(naked) void I400ToARGBRow_SSE2(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) {
__asm {
- mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
+ mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
movd xmm2, eax
    pshufd xmm2, xmm2, 0
- mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
+ mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
movd xmm3, eax
pshufd xmm3, xmm3, 0
- pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
pslld xmm4, 24
- mov eax, [esp + 4] // Y
- mov edx, [esp + 8] // rgb
- mov ecx, [esp + 12] // width
+ mov eax, [esp + 4] // Y
+ mov edx, [esp + 8] // rgb
+ mov ecx, [esp + 12] // width
convertloop:
- // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+ // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
movq xmm0, qword ptr [eax]
lea eax, [eax + 8]
- punpcklbw xmm0, xmm0 // Y.Y
+ punpcklbw xmm0, xmm0 // Y.Y
pmulhuw xmm0, xmm2
psubusw xmm0, xmm3
psrlw xmm0, 6
- packuswb xmm0, xmm0 // G
+ packuswb xmm0, xmm0 // G
// Step 2: Weave into ARGB
- punpcklbw xmm0, xmm0 // GG
+ punpcklbw xmm0, xmm0 // GG
movdqa xmm1, xmm0
- punpcklwd xmm0, xmm0 // BGRA first 4 pixels
- punpckhwd xmm1, xmm1 // BGRA next 4 pixels
+ punpcklwd xmm0, xmm0 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm1 // BGRA next 4 pixels
por xmm0, xmm4
por xmm1, xmm4
movdqu [edx], xmm0
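
// The constants above implement G = (y - 16) * 1.164 in fixed point. A
// scalar sketch of one Y value through the same steps (helper name ours):
static uint8 I400ToG_Sketch(uint8 y) {
  int g = (y * 0x0101 * 18997) >> 16;  // punpcklbw doubles Y; pmulhuw by 0x4a35
  g = g < 1160 ? 0 : g - 1160;         // psubusw 0x0488 saturates at zero
  g >>= 6;                             // psrlw 6
  return (uint8)(g > 255 ? 255 : g);   // packuswb saturates at 255
}
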
@@ -3064,41 +2947,40 @@ void I400ToARGBRow_SSE2(const uint8* y_buf,
#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
-__declspec(naked)
-void I400ToARGBRow_AVX2(const uint8* y_buf,
- uint8* rgb_buf,
- int width) {
+__declspec(naked) void I400ToARGBRow_AVX2(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) {
__asm {
- mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
+ mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
vmovd xmm2, eax
vbroadcastss ymm2, xmm2
- mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
+ mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
vmovd xmm3, eax
vbroadcastss ymm3, xmm3
- vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
vpslld ymm4, ymm4, 24
- mov eax, [esp + 4] // Y
- mov edx, [esp + 8] // rgb
- mov ecx, [esp + 12] // width
+ mov eax, [esp + 4] // Y
+ mov edx, [esp + 8] // rgb
+ mov ecx, [esp + 12] // width
convertloop:
- // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164
+      // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
vmovdqu xmm0, [eax]
lea eax, [eax + 16]
- vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
- vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
+ vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
+ vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
vpmulhuw ymm0, ymm0, ymm2
vpsubusw ymm0, ymm0, ymm3
vpsrlw ymm0, ymm0, 6
- vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
+ vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
// TODO(fbarchard): Weave alpha with unpack.
// Step 2: Weave into ARGB
- vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
+ vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
vpermq ymm1, ymm1, 0xd8
- vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
- vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
+ vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
+ vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
vpor ymm0, ymm0, ymm4
vpor ymm1, ymm1, ymm4
vmovdqu [edx], ymm0
@@ -3114,16 +2996,16 @@ void I400ToARGBRow_AVX2(const uint8* y_buf,
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
-static const uvec8 kShuffleMirror = {
- 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
// TODO(fbarchard): Replace lea with -16 offset.
-__declspec(naked)
-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void MirrorRow_SSSE3(const uint8* src,
+ uint8* dst,
+ int width) {
__asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
movdqa xmm5, xmmword ptr kShuffleMirror
@@ -3140,11 +3022,10 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
#endif // HAS_MIRRORROW_SSSE3
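
// kShuffleMirror is just the byte-reversal permutation for one 16-byte
// lane; the whole row does (scalar sketch):
static void MirrorRow_Sketch(const uint8* src, uint8* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}
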
#ifdef HAS_MIRRORROW_AVX2
-__declspec(naked)
-void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
__asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
@@ -3164,17 +3045,17 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
#ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
-static const uvec8 kShuffleMirrorUV = {
- 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
-};
+static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+ 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
-__declspec(naked)
-void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
- int width) {
+__declspec(naked) void MirrorUVRow_SSSE3(const uint8* src,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
__asm {
push edi
- mov eax, [esp + 4 + 4] // src
- mov edx, [esp + 4 + 8] // dst_u
+ mov eax, [esp + 4 + 4] // src
+ mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // width
movdqa xmm1, xmmword ptr kShuffleMirrorUV
@@ -3198,11 +3079,12 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
#endif // HAS_MIRRORUVROW_SSSE3
#ifdef HAS_ARGBMIRRORROW_SSE2
-__declspec(naked)
-void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBMirrorRow_SSE2(const uint8* src,
+ uint8* dst,
+ int width) {
__asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
@@ -3221,15 +3103,14 @@ void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
#ifdef HAS_ARGBMIRRORROW_AVX2
// Shuffle table for reversing the bytes.
-static const ulvec32 kARGBShuffleMirror_AVX2 = {
- 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
+static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
-__declspec(naked)
-void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBMirrorRow_AVX2(const uint8* src,
+ uint8* dst,
+ int width) {
__asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2
@@ -3246,16 +3127,17 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
#endif // HAS_ARGBMIRRORROW_AVX2
#ifdef HAS_SPLITUVROW_SSE2
-__declspec(naked)
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width) {
+__declspec(naked) void SplitUVRow_SSE2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
__asm {
push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
@@ -3265,10 +3147,10 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
lea eax, [eax + 32]
movdqa xmm2, xmm0
movdqa xmm3, xmm1
- pand xmm0, xmm5 // even bytes
+ pand xmm0, xmm5 // even bytes
pand xmm1, xmm5
packuswb xmm0, xmm1
- psrlw xmm2, 8 // odd bytes
+ psrlw xmm2, 8 // odd bytes
psrlw xmm3, 8
packuswb xmm2, xmm3
movdqu [edx], xmm0
@@ -3285,16 +3167,17 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
#endif // HAS_SPLITUVROW_SSE2
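
// SplitUVRow deinterleaves a packed UV plane; the pand/psrlw/packuswb trio
// above selects even and odd bytes respectively. Scalar sketch:
static void SplitUVRow_Sketch(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                              int width) {
  for (int x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x];      // even bytes: pand with 0x00ff00ff
    dst_v[x] = src_uv[2 * x + 1];  // odd bytes: psrlw 8
  }
}
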
#ifdef HAS_SPLITUVROW_AVX2
-__declspec(naked)
-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width) {
+__declspec(naked) void SplitUVRow_AVX2(const uint8* src_uv,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
__asm {
push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8
sub edi, edx
@@ -3302,9 +3185,9 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
- vpsrlw ymm2, ymm0, 8 // odd bytes
+ vpsrlw ymm2, ymm0, 8 // odd bytes
vpsrlw ymm3, ymm1, 8
- vpand ymm0, ymm0, ymm5 // even bytes
+ vpand ymm0, ymm0, ymm5 // even bytes
vpand ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
vpackuswb ymm2, ymm2, ymm3
@@ -3324,24 +3207,25 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
#endif // HAS_SPLITUVROW_AVX2
#ifdef HAS_MERGEUVROW_SSE2
-__declspec(naked)
-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
- int width) {
+__declspec(naked) void MergeUVRow_SSE2(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
+ int width) {
__asm {
push edi
- mov eax, [esp + 4 + 4] // src_u
- mov edx, [esp + 4 + 8] // src_v
- mov edi, [esp + 4 + 12] // dst_uv
- mov ecx, [esp + 4 + 16] // width
+ mov eax, [esp + 4 + 4] // src_u
+ mov edx, [esp + 4 + 8] // src_v
+ mov edi, [esp + 4 + 12] // dst_uv
+ mov ecx, [esp + 4 + 16] // width
sub edx, eax
convertloop:
- movdqu xmm0, [eax] // read 16 U's
+ movdqu xmm0, [eax] // read 16 U's
movdqu xmm1, [eax + edx] // and 16 V's
lea eax, [eax + 16]
movdqa xmm2, xmm0
- punpcklbw xmm0, xmm1 // first 8 UV pairs
- punpckhbw xmm2, xmm1 // next 8 UV pairs
+ punpcklbw xmm0, xmm1 // first 8 UV pairs
+ punpckhbw xmm2, xmm1 // next 8 UV pairs
movdqu [edi], xmm0
movdqu [edi + 16], xmm2
lea edi, [edi + 32]
@@ -3355,24 +3239,25 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
#endif // HAS_MERGEUVROW_SSE2
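
MergeUV is the inverse: punpcklbw/punpckhbw pair one U byte with one V byte, 16 pairs at a time. A scalar sketch:

#include <stdint.h>

// Interleave separate U and V planes back into a UVUVUV... row.
static void MergeUVRow_C_sketch(const uint8_t* src_u, const uint8_t* src_v,
                                uint8_t* dst_uv, int width) {
  for (int x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = src_u[x];
    dst_uv[2 * x + 1] = src_v[x];
  }
}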
#ifdef HAS_MERGEUVROW_AVX2
-__declspec(naked)
-void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
- int width) {
+__declspec(naked) void MergeUVRow_AVX2(const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uv,
+ int width) {
__asm {
push edi
- mov eax, [esp + 4 + 4] // src_u
- mov edx, [esp + 4 + 8] // src_v
- mov edi, [esp + 4 + 12] // dst_uv
- mov ecx, [esp + 4 + 16] // width
+ mov eax, [esp + 4 + 4] // src_u
+ mov edx, [esp + 4 + 8] // src_v
+ mov edi, [esp + 4 + 12] // dst_uv
+ mov ecx, [esp + 4 + 16] // width
sub edx, eax
convertloop:
- vmovdqu ymm0, [eax] // read 32 U's
- vmovdqu ymm1, [eax + edx] // and 32 V's
+ vmovdqu ymm0, [eax] // read 32 U's
+ vmovdqu ymm1, [eax + edx] // and 32 V's
lea eax, [eax + 32]
- vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
- vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
- vextractf128 [edi], ymm2, 0 // bytes 0..15
+ vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
+ vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
+ vextractf128 [edi], ymm2, 0 // bytes 0..15
vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
    vextractf128 [edi + 48], ymm0, 1  // bytes 48..63
@@ -3389,11 +3274,10 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
#ifdef HAS_COPYROW_SSE2
// CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time.
-__declspec(naked)
-void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+__declspec(naked) void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
__asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // count
test eax, 15
jne convertloopu
@@ -3427,11 +3311,10 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
#ifdef HAS_COPYROW_AVX
// CopyRow copies 'count' bytes using a 32 byte load/store, 64 bytes at a time.
-__declspec(naked)
-void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+__declspec(naked) void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
__asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // count
convertloop:
@@ -3451,13 +3334,12 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
#endif // HAS_COPYROW_AVX
// Multiple of 1: works for any 'count'.
-__declspec(naked)
-void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
+__declspec(naked) void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
__asm {
mov eax, esi
mov edx, edi
- mov esi, [esp + 4] // src
- mov edi, [esp + 8] // dst
+ mov esi, [esp + 4] // src
+ mov edi, [esp + 8] // dst
mov ecx, [esp + 12] // count
rep movsb
mov edi, edx
@@ -3468,15 +3350,16 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
-__declspec(naked)
-void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8* src,
+ uint8* dst,
+ int width) {
__asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // count
- pcmpeqb xmm0, xmm0 // generate mask 0xff000000
+ pcmpeqb xmm0, xmm0 // generate mask 0xff000000
pslld xmm0, 24
- pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
+ pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
psrld xmm1, 8
convertloop:
@@ -3504,14 +3387,15 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
#ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
-__declspec(naked)
-void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8* src,
+ uint8* dst,
+ int width) {
__asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // count
vpcmpeqb ymm0, ymm0, ymm0
- vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
+ vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
convertloop:
vmovdqu ymm1, [eax]
@@ -3533,11 +3417,12 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
// width in pixels
-__declspec(naked)
-void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
+__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8* src_argb,
+ uint8* dst_a,
+ int width) {
__asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_a
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_a
mov ecx, [esp + 12] // width
extractloop:
@@ -3558,17 +3443,54 @@ void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
}
#endif // HAS_ARGBEXTRACTALPHAROW_SSE2
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+// width in pixels
+__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8* src_argb,
+ uint8* dst_a,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_a
+ mov ecx, [esp + 12] // width
+ vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX
+
+ extractloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vpsrld ymm0, ymm0, 24
+ vpsrld ymm1, ymm1, 24
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ lea eax, [eax + 128]
+ vpackssdw ymm0, ymm0, ymm1 // mutates
+ vpsrld ymm2, ymm2, 24
+ vpsrld ymm3, ymm3, 24
+ vpackssdw ymm2, ymm2, ymm3 // mutates
+ vpackuswb ymm0, ymm0, ymm2 // mutates
+ vpermd ymm0, ymm4, ymm0 // unmutate
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg extractloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBEXTRACTALPHAROW_AVX2
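
The new AVX2 extract function shifts each pixel right by 24 so only the alpha byte survives, packs four registers down to bytes, and uses vpermd to undo the lane mutation of the packs. Its scalar meaning is simply (a sketch):

#include <stdint.h>

// Copy the A byte (byte 3 of every ARGB group of 4) to a separate plane.
static void ARGBExtractAlphaRow_C_sketch(const uint8_t* src_argb,
                                         uint8_t* dst_a, int width) {
  for (int x = 0; x < width; ++x) {
    dst_a[x] = src_argb[x * 4 + 3];
  }
}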
+
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
-__declspec(naked)
-void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8* src,
+ uint8* dst,
+ int width) {
__asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // count
- pcmpeqb xmm0, xmm0 // generate mask 0xff000000
+ pcmpeqb xmm0, xmm0 // generate mask 0xff000000
pslld xmm0, 24
- pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
+ pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
psrld xmm1, 8
convertloop:
@@ -3598,14 +3520,15 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels
-__declspec(naked)
-void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8* src,
+ uint8* dst,
+ int width) {
__asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // count
vpcmpeqb ymm0, ymm0, ymm0
- vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
+ vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
convertloop:
vpmovzxbd ymm1, qword ptr [eax]
@@ -3630,14 +3553,13 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
#ifdef HAS_SETROW_X86
// Write 'count' bytes using an 8 bit value repeated.
// Count should be a multiple of 4.
-__declspec(naked)
-void SetRow_X86(uint8* dst, uint8 v8, int count) {
+__declspec(naked) void SetRow_X86(uint8* dst, uint8 v8, int count) {
__asm {
- movzx eax, byte ptr [esp + 8] // v8
+ movzx eax, byte ptr [esp + 8] // v8
mov edx, 0x01010101 // Duplicate byte to all bytes.
- mul edx // overwrites edx with upper part of result.
+ mul edx // overwrites edx with upper part of result.
mov edx, edi
- mov edi, [esp + 4] // dst
+ mov edi, [esp + 4] // dst
mov ecx, [esp + 12] // count
shr ecx, 2
rep stosd
@@ -3647,12 +3569,11 @@ void SetRow_X86(uint8* dst, uint8 v8, int count) {
}
// Write 'count' bytes using an 8 bit value repeated.
-__declspec(naked)
-void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
+__declspec(naked) void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
__asm {
mov edx, edi
- mov edi, [esp + 4] // dst
- mov eax, [esp + 8] // v8
+ mov edi, [esp + 4] // dst
+ mov eax, [esp + 8] // v8
mov ecx, [esp + 12] // count
rep stosb
mov edi, edx
@@ -3661,12 +3582,11 @@ void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
}
// Write 'count' 32 bit values.
-__declspec(naked)
-void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
+__declspec(naked) void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
__asm {
mov edx, edi
- mov edi, [esp + 4] // dst
- mov eax, [esp + 8] // v32
+ mov edi, [esp + 4] // dst
+ mov eax, [esp + 8] // v32
mov ecx, [esp + 12] // count
rep stosd
mov edi, edx
@@ -3676,12 +3596,13 @@ void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
#endif // HAS_SETROW_X86
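
SetRow_X86 relies on 'mul 0x01010101' replicating the byte into all four bytes of eax, then 'rep stosd' storing one dword per count/4 iterations. A scalar sketch of the same trick:

#include <stdint.h>
#include <string.h>

static void SetRow_C_sketch(uint8_t* dst, uint8_t v8, int count) {
  uint32_t v32 = v8 * 0x01010101u;  // byte replicated into all 4 lanes.
  for (int i = 0; i + 4 <= count; i += 4) {  // count is a multiple of 4.
    memcpy(dst + i, &v32, 4);  // one dword per store, like rep stosd.
  }
}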
#ifdef HAS_YUY2TOYROW_AVX2
-__declspec(naked)
-void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
+__declspec(naked) void YUY2ToYRow_AVX2(const uint8* src_yuy2,
+ uint8* dst_y,
+ int width) {
__asm {
- mov eax, [esp + 4] // src_yuy2
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // width
+ mov eax, [esp + 4] // src_yuy2
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // width
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8
@@ -3689,9 +3610,9 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
- vpand ymm0, ymm0, ymm5 // even bytes are Y
+ vpand ymm0, ymm0, ymm5 // even bytes are Y
vpand ymm1, ymm1, ymm5
- vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
vpermq ymm0, ymm0, 0xd8
vmovdqu [edx], ymm0
lea edx, [edx + 32]
@@ -3702,18 +3623,20 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
}
}
-__declspec(naked)
-void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ mov eax, [esp + 8 + 4] // src_yuy2
+ mov esi, [esp + 8 + 8] // stride_yuy2
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8
sub edi, edx
@@ -3723,18 +3646,18 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
vpavgb ymm0, ymm0, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32]
lea eax, [eax + 64]
- vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
+ vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
vpsrlw ymm1, ymm1, 8
- vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
vpermq ymm0, ymm0, 0xd8
vpand ymm1, ymm0, ymm5 // U
- vpsrlw ymm0, ymm0, 8 // V
+ vpsrlw ymm0, ymm0, 8 // V
vpackuswb ymm1, ymm1, ymm1 // mutates.
vpackuswb ymm0, ymm0, ymm0 // mutates.
vpermq ymm1, ymm1, 0xd8
vpermq ymm0, ymm0, 0xd8
vextractf128 [edx], ymm1, 0 // U
- vextractf128 [edx + edi], ymm0, 0 // V
+ vextractf128 [edx + edi], ymm0, 0 // V
lea edx, [edx + 16]
sub ecx, 32
jg convertloop
@@ -3746,16 +3669,17 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
}
}
-__declspec(naked)
-void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
__asm {
push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ mov eax, [esp + 4 + 4] // src_yuy2
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8
sub edi, edx
@@ -3763,18 +3687,18 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
- vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
+ vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
vpsrlw ymm1, ymm1, 8
- vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
vpermq ymm0, ymm0, 0xd8
vpand ymm1, ymm0, ymm5 // U
- vpsrlw ymm0, ymm0, 8 // V
+ vpsrlw ymm0, ymm0, 8 // V
vpackuswb ymm1, ymm1, ymm1 // mutates.
vpackuswb ymm0, ymm0, ymm0 // mutates.
vpermq ymm1, ymm1, 0xd8
vpermq ymm0, ymm0, 0xd8
vextractf128 [edx], ymm1, 0 // U
- vextractf128 [edx + edi], ymm0, 0 // V
+ vextractf128 [edx + edi], ymm0, 0 // V
lea edx, [edx + 16]
sub ecx, 32
jg convertloop
@@ -3785,21 +3709,21 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
}
}
-__declspec(naked)
-void UYVYToYRow_AVX2(const uint8* src_uyvy,
- uint8* dst_y, int width) {
+__declspec(naked) void UYVYToYRow_AVX2(const uint8* src_uyvy,
+ uint8* dst_y,
+ int width) {
__asm {
- mov eax, [esp + 4] // src_uyvy
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // width
+ mov eax, [esp + 4] // src_uyvy
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // width
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
- vpsrlw ymm0, ymm0, 8 // odd bytes are Y
+ vpsrlw ymm0, ymm0, 8 // odd bytes are Y
vpsrlw ymm1, ymm1, 8
- vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
vpermq ymm0, ymm0, 0xd8
vmovdqu [edx], ymm0
lea edx, [edx + 32]
@@ -3810,18 +3734,20 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
}
}
-__declspec(naked)
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void UYVYToUVRow_AVX2(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+        mov        eax, [esp + 8 + 4]  // src_uyvy
+        mov        esi, [esp + 8 + 8]  // stride_uyvy
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8
sub edi, edx
@@ -3831,18 +3757,18 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
vpavgb ymm0, ymm0, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32]
lea eax, [eax + 64]
- vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
+ vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
vpand ymm1, ymm1, ymm5
- vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
vpermq ymm0, ymm0, 0xd8
vpand ymm1, ymm0, ymm5 // U
- vpsrlw ymm0, ymm0, 8 // V
+ vpsrlw ymm0, ymm0, 8 // V
vpackuswb ymm1, ymm1, ymm1 // mutates.
vpackuswb ymm0, ymm0, ymm0 // mutates.
vpermq ymm1, ymm1, 0xd8
vpermq ymm0, ymm0, 0xd8
vextractf128 [edx], ymm1, 0 // U
- vextractf128 [edx + edi], ymm0, 0 // V
+ vextractf128 [edx + edi], ymm0, 0 // V
lea edx, [edx + 16]
sub ecx, 32
jg convertloop
@@ -3854,16 +3780,17 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
}
}
-__declspec(naked)
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
__asm {
push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+        mov        eax, [esp + 4 + 4]  // src_uyvy
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8
sub edi, edx
@@ -3871,18 +3798,18 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
- vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
+ vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
vpand ymm1, ymm1, ymm5
- vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
vpermq ymm0, ymm0, 0xd8
vpand ymm1, ymm0, ymm5 // U
- vpsrlw ymm0, ymm0, 8 // V
+ vpsrlw ymm0, ymm0, 8 // V
vpackuswb ymm1, ymm1, ymm1 // mutates.
vpackuswb ymm0, ymm0, ymm0 // mutates.
vpermq ymm1, ymm1, 0xd8
vpermq ymm0, ymm0, 0xd8
vextractf128 [edx], ymm1, 0 // U
- vextractf128 [edx + edi], ymm0, 0 // V
+ vextractf128 [edx + edi], ymm0, 0 // V
lea edx, [edx + 16]
sub ecx, 32
jg convertloop
@@ -3895,21 +3822,21 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
#endif // HAS_YUY2TOYROW_AVX2
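
All of these rows exploit the packed layouts: YUY2 is Y0 U Y1 V per 2 pixels and UYVY is U Y0 V Y1, so Y lives in the even or odd bytes respectively, and the full UV variants average two source rows for 4:2:0 chroma. A scalar sketch for the YUY2 case (width assumed even):

#include <stdint.h>

static void YUY2ToYRow_C_sketch(const uint8_t* src_yuy2, uint8_t* dst_y,
                                int width) {
  for (int x = 0; x < width; ++x) {
    dst_y[x] = src_yuy2[2 * x];  // the pand 0x00ff00ff keeps these bytes.
  }
}

static void YUY2ToUVRow_C_sketch(const uint8_t* src_yuy2, int stride_yuy2,
                                 uint8_t* dst_u, uint8_t* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {
    const uint8_t* p = src_yuy2 + 2 * x;
    dst_u[x / 2] = (uint8_t)((p[1] + p[stride_yuy2 + 1] + 1) >> 1);  // pavgb
    dst_v[x / 2] = (uint8_t)((p[3] + p[stride_yuy2 + 3] + 1) >> 1);
  }
}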
#ifdef HAS_YUY2TOYROW_SSE2
-__declspec(naked)
-void YUY2ToYRow_SSE2(const uint8* src_yuy2,
- uint8* dst_y, int width) {
- __asm {
- mov eax, [esp + 4] // src_yuy2
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+__declspec(naked) void YUY2ToYRow_SSE2(const uint8* src_yuy2,
+ uint8* dst_y,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_yuy2
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
- pand xmm0, xmm5 // even bytes are Y
+ pand xmm0, xmm5 // even bytes are Y
pand xmm1, xmm5
packuswb xmm0, xmm1
movdqu [edx], xmm0
@@ -3920,18 +3847,20 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
}
}
-__declspec(naked)
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ mov eax, [esp + 8 + 4] // src_yuy2
+ mov esi, [esp + 8 + 8] // stride_yuy2
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
@@ -3943,13 +3872,13 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
lea eax, [eax + 32]
pavgb xmm0, xmm2
pavgb xmm1, xmm3
- psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm0, 8 // YUYV -> UVUV
psrlw xmm1, 8
packuswb xmm0, xmm1
movdqa xmm1, xmm0
pand xmm0, xmm5 // U
packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
+ psrlw xmm1, 8 // V
packuswb xmm1, xmm1
movq qword ptr [edx], xmm0
movq qword ptr [edx + edi], xmm1
@@ -3963,16 +3892,17 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
}
}
-__declspec(naked)
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
__asm {
push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ mov eax, [esp + 4 + 4] // src_yuy2
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
@@ -3980,13 +3910,13 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
- psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm0, 8 // YUYV -> UVUV
psrlw xmm1, 8
packuswb xmm0, xmm1
movdqa xmm1, xmm0
pand xmm0, xmm5 // U
packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
+ psrlw xmm1, 8 // V
packuswb xmm1, xmm1
movq qword ptr [edx], xmm0
movq qword ptr [edx + edi], xmm1
@@ -3999,19 +3929,19 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
}
}
-__declspec(naked)
-void UYVYToYRow_SSE2(const uint8* src_uyvy,
- uint8* dst_y, int width) {
+__declspec(naked) void UYVYToYRow_SSE2(const uint8* src_uyvy,
+ uint8* dst_y,
+ int width) {
__asm {
- mov eax, [esp + 4] // src_uyvy
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // width
+ mov eax, [esp + 4] // src_uyvy
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // width
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
- psrlw xmm0, 8 // odd bytes are Y
+ psrlw xmm0, 8 // odd bytes are Y
psrlw xmm1, 8
packuswb xmm0, xmm1
movdqu [edx], xmm0
@@ -4022,18 +3952,20 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
}
}
-__declspec(naked)
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void UYVYToUVRow_SSE2(const uint8* src_uyvy,
+ int stride_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+        mov        eax, [esp + 8 + 4]  // src_uyvy
+        mov        esi, [esp + 8 + 8]  // stride_uyvy
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
@@ -4045,13 +3977,13 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
lea eax, [eax + 32]
pavgb xmm0, xmm2
pavgb xmm1, xmm3
- pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm0, xmm5 // UYVY -> UVUV
pand xmm1, xmm5
packuswb xmm0, xmm1
movdqa xmm1, xmm0
pand xmm0, xmm5 // U
packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
+ psrlw xmm1, 8 // V
packuswb xmm1, xmm1
movq qword ptr [edx], xmm0
movq qword ptr [edx + edi], xmm1
@@ -4065,16 +3997,17 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
}
}
-__declspec(naked)
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u,
+ uint8* dst_v,
+ int width) {
__asm {
push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+        mov        eax, [esp + 4 + 4]  // src_uyvy
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
@@ -4082,13 +4015,13 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
- pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm0, xmm5 // UYVY -> UVUV
pand xmm1, xmm5
packuswb xmm0, xmm1
movdqa xmm1, xmm0
pand xmm0, xmm5 // U
packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
+ psrlw xmm1, 8 // V
packuswb xmm1, xmm1
movq qword ptr [edx], xmm0
movq qword ptr [edx + edi], xmm1
@@ -4108,13 +4041,15 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-__declspec(naked)
-void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width) {
+__declspec(naked) void BlendPlaneRow_SSSE3(const uint8* src0,
+ const uint8* src1,
+ const uint8* alpha,
+ uint8* dst,
+ int width) {
__asm {
push esi
push edi
- pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
+ pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
psllw xmm5, 8
mov eax, 0x80808080 // 128 for biasing image to signed.
movd xmm6, eax
@@ -4123,8 +4058,8 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
mov eax, 0x807f807f // 32768 + 127 for unbias and round.
movd xmm7, eax
pshufd xmm7, xmm7, 0x00
- mov eax, [esp + 8 + 4] // src0
- mov edx, [esp + 8 + 8] // src1
+ mov eax, [esp + 8 + 4] // src0
+ mov edx, [esp + 8 + 8] // src1
mov esi, [esp + 8 + 12] // alpha
mov edi, [esp + 8 + 16] // dst
mov ecx, [esp + 8 + 20] // width
@@ -4134,15 +4069,15 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
// 8 pixel loop.
convertloop8:
- movq xmm0, qword ptr [esi] // alpha
+ movq xmm0, qword ptr [esi] // alpha
punpcklbw xmm0, xmm0
- pxor xmm0, xmm5 // a, 255-a
+ pxor xmm0, xmm5 // a, 255-a
movq xmm1, qword ptr [eax + esi] // src0
movq xmm2, qword ptr [edx + esi] // src1
punpcklbw xmm1, xmm2
- psubb xmm1, xmm6 // bias src0/1 - 128
+ psubb xmm1, xmm6 // bias src0/1 - 128
pmaddubsw xmm0, xmm1
- paddw xmm0, xmm7 // unbias result - 32768 and round.
+ paddw xmm0, xmm7 // unbias result - 32768 and round.
psrlw xmm0, 8
packuswb xmm0, xmm0
movq qword ptr [edi + esi], xmm0
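
The signed variant in the comments exists because pmaddubsw multiplies an unsigned byte by a signed byte, so both sources are biased by 128 and the 32768 + 127 constant unbiases and rounds. The unsigned formula itself, as a scalar sketch:

#include <stdint.h>

static void BlendPlaneRow_C_sketch(const uint8_t* src0, const uint8_t* src1,
                                   const uint8_t* alpha, uint8_t* dst,
                                   int width) {
  for (int x = 0; x < width; ++x) {
    int a = alpha[x];
    // dst = (src0 * a + src1 * (255 - a) + 255) / 256
    dst[x] = (uint8_t)((src0[x] * a + src1[x] * (255 - a) + 255) >> 8);
  }
}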
@@ -4163,13 +4098,15 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-__declspec(naked)
-void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width) {
+__declspec(naked) void BlendPlaneRow_AVX2(const uint8* src0,
+ const uint8* src1,
+ const uint8* alpha,
+ uint8* dst,
+ int width) {
__asm {
push esi
push edi
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00
vpsllw ymm5, ymm5, 8
mov eax, 0x80808080 // 128 for biasing image to signed.
vmovd xmm6, eax
@@ -4177,8 +4114,8 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
mov eax, 0x807f807f // 32768 + 127 for unbias and round.
vmovd xmm7, eax
vbroadcastss ymm7, xmm7
- mov eax, [esp + 8 + 4] // src0
- mov edx, [esp + 8 + 8] // src1
+ mov eax, [esp + 8 + 4] // src0
+ mov edx, [esp + 8 + 8] // src1
mov esi, [esp + 8 + 12] // alpha
mov edi, [esp + 8 + 16] // dst
mov ecx, [esp + 8 + 20] // width
@@ -4188,21 +4125,21 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
// 32 pixel loop.
convertloop32:
- vmovdqu ymm0, [esi] // alpha
- vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
- vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23
- vpxor ymm3, ymm3, ymm5 // a, 255-a
- vpxor ymm0, ymm0, ymm5 // a, 255-a
+ vmovdqu ymm0, [esi] // alpha
+ vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
+ vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23
+ vpxor ymm3, ymm3, ymm5 // a, 255-a
+ vpxor ymm0, ymm0, ymm5 // a, 255-a
vmovdqu ymm1, [eax + esi] // src0
vmovdqu ymm2, [edx + esi] // src1
vpunpckhbw ymm4, ymm1, ymm2
vpunpcklbw ymm1, ymm1, ymm2
- vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128
- vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
+ vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128
+ vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
vpmaddubsw ymm3, ymm3, ymm4
vpmaddubsw ymm0, ymm0, ymm1
- vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round.
- vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
+ vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round.
+ vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
vpsrlw ymm3, ymm3, 8
vpsrlw ymm0, ymm0, 8
vpackuswb ymm0, ymm0, ymm3
@@ -4221,52 +4158,51 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
-static const uvec8 kShuffleAlpha = {
- 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
- 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
-};
+static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+ 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
// Blend 8 pixels at a time.
-__declspec(naked)
-void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+__declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm7, xmm7 // generate constant 0x0001
+ pcmpeqb xmm7, xmm7 // generate constant 0x0001
psrlw xmm7, 15
- pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
+ pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
psrlw xmm6, 8
- pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
+ pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
psllw xmm5, 8
- pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
pslld xmm4, 24
sub ecx, 4
- jl convertloop4b // less than 4 pixels?
+ jl convertloop4b // less than 4 pixels?
// 4 pixel loop.
convertloop4:
- movdqu xmm3, [eax] // src argb
+ movdqu xmm3, [eax] // src argb
lea eax, [eax + 16]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movdqu xmm2, [esi] // _r_b
- pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movdqu xmm1, [esi] // _a_g
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movdqu xmm2, [esi] // _r_b
+ pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movdqu xmm1, [esi] // _a_g
lea esi, [esi + 16]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
@@ -4278,24 +4214,24 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
// 1 pixel loop.
convertloop1:
- movd xmm3, [eax] // src argb
+ movd xmm3, [eax] // src argb
lea eax, [eax + 4]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movd xmm2, [esi] // _r_b
- pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movd xmm1, [esi] // _a_g
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movd xmm2, [esi] // _r_b
+ pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movd xmm1, [esi] // _a_g
lea esi, [esi + 4]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
movd [edx], xmm0
lea edx, [edx + 4]
sub ecx, 1
@@ -4311,41 +4247,42 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha0 = {
- 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+ 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
static const uvec8 kShuffleAlpha1 = {
- 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
- 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+ 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+ 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
-__declspec(naked)
-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ int width) {
__asm {
- mov eax, [esp + 4] // src_argb0
- mov edx, [esp + 8] // dst_argb
+        mov        eax, [esp + 4]  // src_argb
+ mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
- pcmpeqb xmm3, xmm3 // generate mask 0xff000000
+ pcmpeqb xmm3, xmm3 // generate mask 0xff000000
pslld xmm3, 24
movdqa xmm4, xmmword ptr kShuffleAlpha0
movdqa xmm5, xmmword ptr kShuffleAlpha1
convertloop:
- movdqu xmm0, [eax] // read 4 pixels
- pshufb xmm0, xmm4 // isolate first 2 alphas
- movdqu xmm1, [eax] // read 4 pixels
- punpcklbw xmm1, xmm1 // first 2 pixel rgbs
- pmulhuw xmm0, xmm1 // rgb * a
- movdqu xmm1, [eax] // read 4 pixels
- pshufb xmm1, xmm5 // isolate next 2 alphas
- movdqu xmm2, [eax] // read 4 pixels
- punpckhbw xmm2, xmm2 // next 2 pixel rgbs
- pmulhuw xmm1, xmm2 // rgb * a
- movdqu xmm2, [eax] // mask original alpha
+ movdqu xmm0, [eax] // read 4 pixels
+ pshufb xmm0, xmm4 // isolate first 2 alphas
+ movdqu xmm1, [eax] // read 4 pixels
+ punpcklbw xmm1, xmm1 // first 2 pixel rgbs
+ pmulhuw xmm0, xmm1 // rgb * a
+ movdqu xmm1, [eax] // read 4 pixels
+ pshufb xmm1, xmm5 // isolate next 2 alphas
+ movdqu xmm2, [eax] // read 4 pixels
+ punpckhbw xmm2, xmm2 // next 2 pixel rgbs
+ pmulhuw xmm1, xmm2 // rgb * a
+ movdqu xmm2, [eax] // mask original alpha
lea eax, [eax + 16]
pand xmm2, xmm3
psrlw xmm0, 8
psrlw xmm1, 8
packuswb xmm0, xmm1
- por xmm0, xmm2 // copy original alpha
+ por xmm0, xmm2 // copy original alpha
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
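
Attenuation premultiplies each color channel by its pixel's alpha. punpcklbw's self-unpack turns a byte v into the 16-bit value v * 257, the pshufb tables replicate alpha the same way, and pmulhuw followed by the 8-bit shift closely approximates v * a / 255. A scalar sketch (rounding differs slightly from the fixed-point path):

#include <stdint.h>

static void ARGBAttenuateRow_C_sketch(const uint8_t* src_argb,
                                      uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int a = src_argb[x * 4 + 3];
    for (int c = 0; c < 3; ++c) {
      dst_argb[x * 4 + c] = (uint8_t)(src_argb[x * 4 + c] * a / 255);
    }
    dst_argb[x * 4 + 3] = (uint8_t)a;  // alpha passes through unchanged.
  }
}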
@@ -4358,22 +4295,23 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
-static const uvec8 kShuffleAlpha_AVX2 = {
- 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
-};
-__declspec(naked)
-void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
- __asm {
- mov eax, [esp + 4] // src_argb0
- mov edx, [esp + 8] // dst_argb
+static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
+ 128u, 128u, 14u, 15u, 14u, 15u,
+ 14u, 15u, 128u, 128u};
+__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+        mov        eax, [esp + 4]  // src_argb
+ mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
vpslld ymm5, ymm5, 24
convertloop:
- vmovdqu ymm6, [eax] // read 8 pixels.
+ vmovdqu ymm6, [eax] // read 8 pixels.
vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
vpshufb ymm2, ymm0, ymm4 // low 4 alphas
@@ -4398,40 +4336,40 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
-__declspec(naked)
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
- int width) {
+__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ int width) {
__asm {
push ebx
push esi
push edi
- mov eax, [esp + 12 + 4] // src_argb
- mov edx, [esp + 12 + 8] // dst_argb
+ mov eax, [esp + 12 + 4] // src_argb
+ mov edx, [esp + 12 + 8] // dst_argb
mov ecx, [esp + 12 + 12] // width
lea ebx, fixed_invtbl8
convertloop:
- movdqu xmm0, [eax] // read 4 pixels
+ movdqu xmm0, [eax] // read 4 pixels
movzx esi, byte ptr [eax + 3] // first alpha
movzx edi, byte ptr [eax + 7] // second alpha
- punpcklbw xmm0, xmm0 // first 2
+ punpcklbw xmm0, xmm0 // first 2
movd xmm2, dword ptr [ebx + esi * 4]
movd xmm3, dword ptr [ebx + edi * 4]
- pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
- pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
+ pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
+ pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
movlhps xmm2, xmm3
- pmulhuw xmm0, xmm2 // rgb * a
+ pmulhuw xmm0, xmm2 // rgb * a
- movdqu xmm1, [eax] // read 4 pixels
+ movdqu xmm1, [eax] // read 4 pixels
movzx esi, byte ptr [eax + 11] // third alpha
    movzx      edi, byte ptr [eax + 15]  // fourth alpha
- punpckhbw xmm1, xmm1 // next 2
+ punpckhbw xmm1, xmm1 // next 2
movd xmm2, dword ptr [ebx + esi * 4]
movd xmm3, dword ptr [ebx + edi * 4]
- pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
- pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
+ pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
+ pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
movlhps xmm2, xmm3
- pmulhuw xmm1, xmm2 // rgb * a
+ pmulhuw xmm1, xmm2 // rgb * a
lea eax, [eax + 16]
packuswb xmm0, xmm1
movdqu [edx], xmm0
@@ -4450,25 +4388,24 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
- 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
-};
+ 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
// USE_GATHER is not on by default, as vpgatherdd is a slow instruction.
#ifdef USE_GATHER
-__declspec(naked)
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
- int width) {
+__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
+ int width) {
__asm {
- mov eax, [esp + 4] // src_argb0
- mov edx, [esp + 8] // dst_argb
+        mov        eax, [esp + 4]  // src_argb
+ mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
convertloop:
- vmovdqu ymm6, [eax] // read 8 pixels.
+ vmovdqu ymm6, [eax] // read 8 pixels.
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
- vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
+ vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
@@ -4488,50 +4425,50 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
ret
}
}
-#else // USE_GATHER
-__declspec(naked)
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
- int width) {
+#else // USE_GATHER
+__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
+ int width) {
__asm {
push ebx
push esi
push edi
- mov eax, [esp + 12 + 4] // src_argb
- mov edx, [esp + 12 + 8] // dst_argb
+ mov eax, [esp + 12 + 4] // src_argb
+ mov edx, [esp + 12 + 8] // dst_argb
mov ecx, [esp + 12 + 12] // width
sub edx, eax
lea ebx, fixed_invtbl8
vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
convertloop:
- // replace VPGATHER
- movzx esi, byte ptr [eax + 3] // alpha0
- movzx edi, byte ptr [eax + 7] // alpha1
+ // replace VPGATHER
+ movzx esi, byte ptr [eax + 3] // alpha0
+ movzx edi, byte ptr [eax + 7] // alpha1
vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0]
vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1]
- movzx esi, byte ptr [eax + 11] // alpha2
- movzx edi, byte ptr [eax + 15] // alpha3
- vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
+ movzx esi, byte ptr [eax + 11] // alpha2
+ movzx edi, byte ptr [eax + 15] // alpha3
+ vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2]
vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3]
- movzx esi, byte ptr [eax + 19] // alpha4
- movzx edi, byte ptr [eax + 23] // alpha5
- vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
+ movzx esi, byte ptr [eax + 19] // alpha4
+ movzx edi, byte ptr [eax + 23] // alpha5
+ vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4]
vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5]
- movzx esi, byte ptr [eax + 27] // alpha6
- movzx edi, byte ptr [eax + 31] // alpha7
- vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
+ movzx esi, byte ptr [eax + 27] // alpha6
+ movzx edi, byte ptr [eax + 31] // alpha7
+ vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6]
vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7]
- vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
- vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
- vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
- vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
+ vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
+ vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
+ vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
+ vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
// end of VPGATHER
- vmovdqu ymm6, [eax] // read 8 pixels.
+ vmovdqu ymm6, [eax] // read 8 pixels.
vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
@@ -4540,7 +4477,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
- vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
vmovdqu [eax + edx], ymm0
lea eax, [eax + 32]
sub ecx, 8
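
Unattenuate undoes the premultiply by scaling each channel back up by 255 / a; fixed_invtbl8 turns that division into a multiply by a fixed-point reciprocal, and this non-gather path assembles eight table entries by hand because vpgatherdd is slow. A scalar sketch of the intent (not of the table's exact encoding; the a == 0 case here is arbitrary):

#include <stdint.h>

static void ARGBUnattenuateRow_C_sketch(const uint8_t* src_argb,
                                        uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int a = src_argb[x * 4 + 3];
    for (int c = 0; c < 3; ++c) {
      int v = a ? src_argb[x * 4 + c] * 255 / a : src_argb[x * 4 + c];
      dst_argb[x * 4 + c] = (uint8_t)(v > 255 ? 255 : v);
    }
    dst_argb[x * 4 + 3] = (uint8_t)a;
  }
}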
@@ -4558,12 +4495,13 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
-__declspec(naked)
-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+__declspec(naked) void ARGBGrayRow_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ int width) {
__asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_argb */
- mov ecx, [esp + 12] /* width */
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* width */
movdqa xmm4, xmmword ptr kARGBToYJ
movdqa xmm5, xmmword ptr kAddYJ64
@@ -4575,20 +4513,20 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
phaddw xmm0, xmm1
paddw xmm0, xmm5 // Add .5 for rounding.
psrlw xmm0, 7
- packuswb xmm0, xmm0 // 8 G bytes
+ packuswb xmm0, xmm0 // 8 G bytes
movdqu xmm2, [eax] // A
movdqu xmm3, [eax + 16]
lea eax, [eax + 32]
psrld xmm2, 24
psrld xmm3, 24
packuswb xmm2, xmm3
- packuswb xmm2, xmm2 // 8 A bytes
- movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
- punpcklbw xmm0, xmm0 // 8 GG words
- punpcklbw xmm3, xmm2 // 8 GA words
+ packuswb xmm2, xmm2 // 8 A bytes
+ movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
+ punpcklbw xmm0, xmm0 // 8 GG words
+ punpcklbw xmm3, xmm2 // 8 GA words
movdqa xmm1, xmm0
- punpcklwd xmm0, xmm3 // GGGA first 4
- punpckhwd xmm1, xmm3 // GGGA next 4
+ punpcklwd xmm0, xmm3 // GGGA first 4
+ punpckhwd xmm1, xmm3 // GGGA next 4
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
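
Gray replicates a rounded luma into B, G and R and re-packs the original alpha, which is what the GG/GA weave above builds. A scalar sketch; the weights here are believed to match kARGBToYJ (they sum to 128, hence the >> 7, with +64 rounding from kAddYJ64), but those tables are authoritative:

#include <stdint.h>

static void ARGBGrayRow_C_sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                 int width) {
  for (int x = 0; x < width; ++x) {
    int b = src_argb[x * 4 + 0];
    int g = src_argb[x * 4 + 1];
    int r = src_argb[x * 4 + 2];
    uint8_t y = (uint8_t)((b * 15 + g * 75 + r * 38 + 64) >> 7);
    dst_argb[x * 4 + 0] = y;
    dst_argb[x * 4 + 1] = y;
    dst_argb[x * 4 + 2] = y;
    dst_argb[x * 4 + 3] = src_argb[x * 4 + 3];  // keep original A.
  }
}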
@@ -4604,24 +4542,20 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone.
-static const vec8 kARGBToSepiaB = {
- 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
-};
+static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
+ 17, 68, 35, 0, 17, 68, 35, 0};
-static const vec8 kARGBToSepiaG = {
- 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
-};
+static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
+ 22, 88, 45, 0, 22, 88, 45, 0};
-static const vec8 kARGBToSepiaR = {
- 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
-};
+static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
+ 24, 98, 50, 0, 24, 98, 50, 0};
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-__declspec(naked)
-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+__declspec(naked) void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
__asm {
- mov eax, [esp + 4] /* dst_argb */
- mov ecx, [esp + 8] /* width */
+ mov eax, [esp + 4] /* dst_argb */
+ mov ecx, [esp + 8] /* width */
movdqa xmm2, xmmword ptr kARGBToSepiaB
movdqa xmm3, xmmword ptr kARGBToSepiaG
movdqa xmm4, xmmword ptr kARGBToSepiaR
@@ -4633,32 +4567,32 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
pmaddubsw xmm6, xmm2
phaddw xmm0, xmm6
psrlw xmm0, 7
- packuswb xmm0, xmm0 // 8 B values
+ packuswb xmm0, xmm0 // 8 B values
movdqu xmm5, [eax] // G
movdqu xmm1, [eax + 16]
pmaddubsw xmm5, xmm3
pmaddubsw xmm1, xmm3
phaddw xmm5, xmm1
psrlw xmm5, 7
- packuswb xmm5, xmm5 // 8 G values
- punpcklbw xmm0, xmm5 // 8 BG values
+ packuswb xmm5, xmm5 // 8 G values
+ punpcklbw xmm0, xmm5 // 8 BG values
movdqu xmm5, [eax] // R
movdqu xmm1, [eax + 16]
pmaddubsw xmm5, xmm4
pmaddubsw xmm1, xmm4
phaddw xmm5, xmm1
psrlw xmm5, 7
- packuswb xmm5, xmm5 // 8 R values
+ packuswb xmm5, xmm5 // 8 R values
movdqu xmm6, [eax] // A
movdqu xmm1, [eax + 16]
psrld xmm6, 24
psrld xmm1, 24
packuswb xmm6, xmm1
- packuswb xmm6, xmm6 // 8 A values
- punpcklbw xmm5, xmm6 // 8 RA values
- movdqa xmm1, xmm0 // Weave BG, RA together
- punpcklwd xmm0, xmm5 // BGRA first 4
- punpckhwd xmm1, xmm5 // BGRA next 4
+ packuswb xmm6, xmm6 // 8 A values
+ punpcklbw xmm5, xmm6 // 8 RA values
+ movdqa xmm1, xmm0 // Weave BG, RA together
+ punpcklwd xmm0, xmm5 // BGRA first 4
+ punpckhwd xmm1, xmm5 // BGRA next 4
movdqu [eax], xmm0
movdqu [eax + 16], xmm1
lea eax, [eax + 32]
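
The three sepia tables apply the weights from the comments to the B/G/R bytes of each pixel (kARGBToSepiaB = {17, 68, 35, ...} is ordered B, G, R in memory), and the final weave re-packs BGRA. A scalar sketch, clamping as packuswb would:

#include <stdint.h>

static uint8_t Clamp255_sketch(int v) {
  return (uint8_t)(v > 255 ? 255 : v);
}

static void ARGBSepiaRow_C_sketch(uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int b = dst_argb[x * 4 + 0];
    int g = dst_argb[x * 4 + 1];
    int r = dst_argb[x * 4 + 2];
    dst_argb[x * 4 + 0] = Clamp255_sketch((r * 35 + g * 68 + b * 17) >> 7);
    dst_argb[x * 4 + 1] = Clamp255_sketch((r * 45 + g * 88 + b * 22) >> 7);
    dst_argb[x * 4 + 2] = Clamp255_sketch((r * 50 + g * 98 + b * 24) >> 7);
    // Alpha (byte 3) is unchanged; the asm re-packs the original A bytes.
  }
}

The color matrix row that follows generalizes this with caller-provided coefficients in place of the fixed sepia tables.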
@@ -4674,19 +4608,20 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
// Same as Sepia except matrix is provided.
// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
-__declspec(naked)
-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const int8* matrix_argb, int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_argb */
- mov ecx, [esp + 12] /* matrix_argb */
+__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ const int8* matrix_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* matrix_argb */
movdqu xmm5, [ecx]
pshufd xmm2, xmm5, 0x00
pshufd xmm3, xmm5, 0x55
pshufd xmm4, xmm5, 0xaa
pshufd xmm5, xmm5, 0xff
- mov ecx, [esp + 16] /* width */
+ mov ecx, [esp + 16] /* width */
convertloop:
movdqu xmm0, [eax] // B
@@ -4697,31 +4632,31 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
movdqu xmm1, [eax + 16]
pmaddubsw xmm6, xmm3
pmaddubsw xmm1, xmm3
- phaddsw xmm0, xmm7 // B
- phaddsw xmm6, xmm1 // G
- psraw xmm0, 6 // B
- psraw xmm6, 6 // G
- packuswb xmm0, xmm0 // 8 B values
- packuswb xmm6, xmm6 // 8 G values
- punpcklbw xmm0, xmm6 // 8 BG values
+ phaddsw xmm0, xmm7 // B
+ phaddsw xmm6, xmm1 // G
+ psraw xmm0, 6 // B
+ psraw xmm6, 6 // G
+ packuswb xmm0, xmm0 // 8 B values
+ packuswb xmm6, xmm6 // 8 G values
+ punpcklbw xmm0, xmm6 // 8 BG values
movdqu xmm1, [eax] // R
movdqu xmm7, [eax + 16]
pmaddubsw xmm1, xmm4
pmaddubsw xmm7, xmm4
- phaddsw xmm1, xmm7 // R
+ phaddsw xmm1, xmm7 // R
movdqu xmm6, [eax] // A
movdqu xmm7, [eax + 16]
pmaddubsw xmm6, xmm5
pmaddubsw xmm7, xmm5
- phaddsw xmm6, xmm7 // A
- psraw xmm1, 6 // R
- psraw xmm6, 6 // A
- packuswb xmm1, xmm1 // 8 R values
- packuswb xmm6, xmm6 // 8 A values
- punpcklbw xmm1, xmm6 // 8 RA values
- movdqa xmm6, xmm0 // Weave BG, RA together
- punpcklwd xmm0, xmm1 // BGRA first 4
- punpckhwd xmm6, xmm1 // BGRA next 4
+ phaddsw xmm6, xmm7 // A
+ psraw xmm1, 6 // R
+ psraw xmm6, 6 // A
+ packuswb xmm1, xmm1 // 8 R values
+ packuswb xmm6, xmm6 // 8 A values
+ punpcklbw xmm1, xmm6 // 8 RA values
+ movdqa xmm6, xmm0 // Weave BG, RA together
+ punpcklwd xmm0, xmm1 // BGRA first 4
+ punpckhwd xmm6, xmm1 // BGRA next 4
movdqu [edx], xmm0
movdqu [edx + 16], xmm6
lea eax, [eax + 32]
@@ -4735,15 +4670,17 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
-__declspec(naked)
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
- int interval_offset, int width) {
- __asm {
- mov eax, [esp + 4] /* dst_argb */
- movd xmm2, [esp + 8] /* scale */
- movd xmm3, [esp + 12] /* interval_size */
- movd xmm4, [esp + 16] /* interval_offset */
- mov ecx, [esp + 20] /* width */
+__declspec(naked) void ARGBQuantizeRow_SSE2(uint8* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* dst_argb */
+ movd xmm2, [esp + 8] /* scale */
+ movd xmm3, [esp + 12] /* interval_size */
+ movd xmm4, [esp + 16] /* interval_offset */
+ mov ecx, [esp + 20] /* width */
pshuflw xmm2, xmm2, 040h
pshufd xmm2, xmm2, 044h
pshuflw xmm3, xmm3, 040h
@@ -4756,16 +4693,16 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
convertloop:
movdqu xmm0, [eax] // read 4 pixels
- punpcklbw xmm0, xmm5 // first 2 pixels
- pmulhuw xmm0, xmm2 // pixel * scale >> 16
+ punpcklbw xmm0, xmm5 // first 2 pixels
+ pmulhuw xmm0, xmm2 // pixel * scale >> 16
movdqu xmm1, [eax] // read 4 pixels
- punpckhbw xmm1, xmm5 // next 2 pixels
+ punpckhbw xmm1, xmm5 // next 2 pixels
pmulhuw xmm1, xmm2
- pmullw xmm0, xmm3 // * interval_size
+ pmullw xmm0, xmm3 // * interval_size
movdqu xmm7, [eax] // read 4 pixels
pmullw xmm1, xmm3
- pand xmm7, xmm6 // mask alpha
- paddw xmm0, xmm4 // + interval_size / 2
+ pand xmm7, xmm6 // mask alpha
+        paddw      xmm0, xmm4  // + interval_offset
paddw xmm1, xmm4
packuswb xmm0, xmm1
por xmm0, xmm7
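
Quantize posterizes the color channels: scale each value down into a bucket index, multiply back up by the interval size, and add the offset, leaving alpha untouched via the 0xff000000 mask. A scalar sketch:

#include <stdint.h>

static void ARGBQuantizeRow_C_sketch(uint8_t* dst_argb, int scale,
                                     int interval_size, int interval_offset,
                                     int width) {
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 3; ++c) {  // B, G, R only; byte 3 (A) is kept.
      int v = dst_argb[x * 4 + c];
      dst_argb[x * 4 + c] =
          (uint8_t)((v * scale >> 16) * interval_size + interval_offset);
    }
  }
}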
@@ -4780,25 +4717,26 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
-__declspec(naked)
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
- uint32 value) {
+__declspec(naked) void ARGBShadeRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
+ uint32 value) {
__asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
movd xmm2, [esp + 16] // value
punpcklbw xmm2, xmm2
punpcklqdq xmm2, xmm2
convertloop:
- movdqu xmm0, [eax] // read 4 pixels
+ movdqu xmm0, [eax] // read 4 pixels
lea eax, [eax + 16]
movdqa xmm1, xmm0
- punpcklbw xmm0, xmm0 // first 2
- punpckhbw xmm1, xmm1 // next 2
- pmulhuw xmm0, xmm2 // argb * value
- pmulhuw xmm1, xmm2 // argb * value
+ punpcklbw xmm0, xmm0 // first 2
+ punpckhbw xmm1, xmm1 // next 2
+ pmulhuw xmm0, xmm2 // argb * value
+ pmulhuw xmm1, xmm2 // argb * value
psrlw xmm0, 8
psrlw xmm1, 8
packuswb xmm0, xmm1
@@ -4814,28 +4752,29 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked)
-void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
pxor xmm5, xmm5 // constant 0
convertloop:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
- movdqu xmm2, [esi] // read 4 pixels from src_argb1
+ movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm2, [esi] // read 4 pixels from src_argb1
movdqu xmm1, xmm0
movdqu xmm3, xmm2
- punpcklbw xmm0, xmm0 // first 2
- punpckhbw xmm1, xmm1 // next 2
- punpcklbw xmm2, xmm5 // first 2
- punpckhbw xmm3, xmm5 // next 2
- pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
- pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
+ punpcklbw xmm0, xmm0 // first 2
+ punpckhbw xmm1, xmm1 // next 2
+ punpcklbw xmm2, xmm5 // first 2
+ punpckhbw xmm3, xmm5 // next 2
+ pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
+ pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
lea eax, [eax + 16]
lea esi, [esi + 16]
packuswb xmm0, xmm1
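
Multiply widens one source by self-unpack (v becomes v * 257) and the other with zeros, so each pmulhuw lane computes roughly a * b / 255. In scalar form (a sketch; the fixed-point path rounds slightly differently):

#include <stdint.h>

static void ARGBMultiplyRow_C_sketch(const uint8_t* src_argb0,
                                     const uint8_t* src_argb1,
                                     uint8_t* dst_argb, int width) {
  for (int i = 0; i < width * 4; ++i) {  // all 4 channels, alpha included.
    dst_argb[i] = (uint8_t)(src_argb0[i] * src_argb1[i] / 255);
  }
}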
@@ -4853,13 +4792,14 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// TODO(fbarchard): Port this to posix, neon and other math functions.
-__declspec(naked)
-void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+__declspec(naked) void ARGBAddRow_SSE2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@@ -4867,11 +4807,11 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
jl convertloop49
convertloop4:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm0, [eax] // read 4 pixels from src_argb0
lea eax, [eax + 16]
- movdqu xmm1, [esi] // read 4 pixels from src_argb1
+ movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
- paddusb xmm0, xmm1 // src_argb0 + src_argb1
+ paddusb xmm0, xmm1 // src_argb0 + src_argb1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
@@ -4882,11 +4822,11 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
jl convertloop19
convertloop1:
- movd xmm0, [eax] // read 1 pixels from src_argb0
+        movd       xmm0, [eax]  // read 1 pixel from src_argb0
lea eax, [eax + 4]
- movd xmm1, [esi] // read 1 pixels from src_argb1
+        movd       xmm1, [esi]  // read 1 pixel from src_argb1
lea esi, [esi + 4]
- paddusb xmm0, xmm1 // src_argb0 + src_argb1
+ paddusb xmm0, xmm1 // src_argb0 + src_argb1
movd [edx], xmm0
lea edx, [edx + 4]
sub ecx, 1
@@ -4901,22 +4841,23 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
-__declspec(naked)
-void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+__declspec(naked) void ARGBSubtractRow_SSE2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
convertloop:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm0, [eax] // read 4 pixels from src_argb0
lea eax, [eax + 16]
- movdqu xmm1, [esi] // read 4 pixels from src_argb1
+ movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
- psubusb xmm0, xmm1 // src_argb0 - src_argb1
+ psubusb xmm0, xmm1 // src_argb0 - src_argb1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
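
Both rows lean on the saturating byte ops: paddusb clamps overflow to 255 and psubusb clamps underflow to 0, instead of wrapping. Scalar sketches of the two primitives:

#include <stdint.h>

static uint8_t AddSat_sketch(uint8_t a, uint8_t b) {
  int v = a + b;
  return (uint8_t)(v > 255 ? 255 : v);  // paddusb, per byte.
}

static uint8_t SubSat_sketch(uint8_t a, uint8_t b) {
  int v = a - b;
  return (uint8_t)(v < 0 ? 0 : v);  // psubusb, per byte.
}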
@@ -4930,28 +4871,29 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked)
-void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
- vpxor ymm5, ymm5, ymm5 // constant 0
+ vpxor ymm5, ymm5, ymm5 // constant 0
convertloop:
- vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
+ vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
lea eax, [eax + 32]
- vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
+ vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
lea esi, [esi + 32]
- vpunpcklbw ymm0, ymm1, ymm1 // low 4
- vpunpckhbw ymm1, ymm1, ymm1 // high 4
- vpunpcklbw ymm2, ymm3, ymm5 // low 4
- vpunpckhbw ymm3, ymm3, ymm5 // high 4
- vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
- vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
+ vpunpcklbw ymm0, ymm1, ymm1 // low 4
+ vpunpckhbw ymm1, ymm1, ymm1 // high 4
+ vpunpcklbw ymm2, ymm3, ymm5 // low 4
+ vpunpckhbw ymm3, ymm3, ymm5 // high 4
+ vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
+ vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
vpackuswb ymm0, ymm0, ymm1
vmovdqu [edx], ymm0
lea edx, [edx + 32]
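The unpack/multiply sequence above is the usual byte-multiply approximation: unpacking a byte with itself yields a * 257 in 16 bits, and vpmulhuw keeps the high 16 bits of the product, so each channel becomes (a * 257 * b) >> 16, a close approximation of (a * b) / 255. A scalar sketch (illustrative, not from the patch):

#include <stdint.h>
static uint8_t MulChannel_Sketch(uint8_t a, uint8_t b) {
  uint32_t a257 = (uint32_t)a * 257u;  // vpunpcklbw a,a == (a << 8) | a
  return (uint8_t)((a257 * (uint32_t)b) >> 16);  // vpmulhuw keeps high half
}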
@@ -4967,20 +4909,21 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked)
-void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+__declspec(naked) void ARGBAddRow_AVX2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
convertloop:
- vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
+ vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
lea eax, [eax + 32]
- vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
+ vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
lea esi, [esi + 32]
vmovdqu [edx], ymm0
lea edx, [edx + 32]
@@ -4996,20 +4939,21 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract one row of ARGB pixels from another, 8 pixels at a time.
-__declspec(naked)
-void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+__declspec(naked) void ARGBSubtractRow_AVX2(const uint8* src_argb0,
+ const uint8* src_argb1,
+ uint8* dst_argb,
+ int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
- mov esi, [esp + 4 + 8] // src_argb1
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
convertloop:
- vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
+ vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
lea eax, [eax + 32]
- vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
+ vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
lea esi, [esi + 32]
vmovdqu [edx], ymm0
lea edx, [edx + 32]
@@ -5028,14 +4972,16 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
// -1 0 1
// -2 0 2
// -1 0 1
-__declspec(naked)
-void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
- const uint8* src_y2, uint8* dst_sobelx, int width) {
+__declspec(naked) void SobelXRow_SSE2(const uint8* src_y0,
+ const uint8* src_y1,
+ const uint8* src_y2,
+ uint8* dst_sobelx,
+ int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_y0
- mov esi, [esp + 8 + 8] // src_y1
+ mov eax, [esp + 8 + 4] // src_y0
+ mov esi, [esp + 8 + 8] // src_y1
mov edi, [esp + 8 + 12] // src_y2
mov edx, [esp + 8 + 16] // dst_sobelx
mov ecx, [esp + 8 + 20] // width
@@ -5045,17 +4991,17 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
pxor xmm5, xmm5 // constant 0
convertloop:
- movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
- movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
+ movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
+ movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
punpcklbw xmm0, xmm5
punpcklbw xmm1, xmm5
psubw xmm0, xmm1
- movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
+ movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
punpcklbw xmm1, xmm5
punpcklbw xmm2, xmm5
psubw xmm1, xmm2
- movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
+ movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
punpcklbw xmm2, xmm5
punpcklbw xmm3, xmm5
@@ -5063,7 +5009,7 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
paddw xmm0, xmm2
paddw xmm0, xmm1
paddw xmm0, xmm1
- pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
+ pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
psubw xmm1, xmm0
pmaxsw xmm0, xmm1
packuswb xmm0, xmm0
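A scalar sketch of one SobelX output (illustrative; the psubw/paddw pairs above implement the -1 0 1 / -2 0 2 / -1 0 1 kernel, and pxor/psubw/pmaxsw computes the absolute value as max(x, -x)):

#include <stdint.h>
static uint8_t SobelX_Sketch(const uint8_t* y0, const uint8_t* y1,
                             const uint8_t* y2, int x) {
  int s = (y0[x] - y0[x + 2]) + 2 * (y1[x] - y1[x + 2]) + (y2[x] - y2[x + 2]);
  if (s < 0) s = -s;                    // abs = max(xmm0, -xmm0)
  return (uint8_t)(s > 255 ? 255 : s);  // packuswb saturates to uint8
}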
@@ -5084,13 +5030,14 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
// -1 -2 -1
// 0 0 0
// 1 2 1
-__declspec(naked)
-void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
- uint8* dst_sobely, int width) {
+__declspec(naked) void SobelYRow_SSE2(const uint8* src_y0,
+ const uint8* src_y1,
+ uint8* dst_sobely,
+ int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_y0
- mov esi, [esp + 4 + 8] // src_y1
+ mov eax, [esp + 4 + 4] // src_y0
+ mov esi, [esp + 4 + 8] // src_y1
mov edx, [esp + 4 + 12] // dst_sobely
mov ecx, [esp + 4 + 16] // width
sub esi, eax
@@ -5098,17 +5045,17 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
pxor xmm5, xmm5 // constant 0
convertloop:
- movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
- movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
+ movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
+ movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
punpcklbw xmm0, xmm5
punpcklbw xmm1, xmm5
psubw xmm0, xmm1
- movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
+ movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
punpcklbw xmm1, xmm5
punpcklbw xmm2, xmm5
psubw xmm1, xmm2
- movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
+ movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
punpcklbw xmm2, xmm5
punpcklbw xmm3, xmm5
@@ -5116,7 +5063,7 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
paddw xmm0, xmm2
paddw xmm0, xmm1
paddw xmm0, xmm1
- pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
+ pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
psubw xmm1, xmm0
pmaxsw xmm0, xmm1
packuswb xmm0, xmm0
@@ -5137,36 +5084,37 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
// R = Sobel
// G = Sobel
// B = Sobel
-__declspec(naked)
-void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
+__declspec(naked) void SobelRow_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_sobelx
- mov esi, [esp + 4 + 8] // src_sobely
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
sub esi, eax
- pcmpeqb xmm5, xmm5 // alpha 255
- pslld xmm5, 24 // 0xff000000
+ pcmpeqb xmm5, xmm5 // alpha 255
+ pslld xmm5, 24 // 0xff000000
convertloop:
- movdqu xmm0, [eax] // read 16 pixels src_sobelx
- movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
- paddusb xmm0, xmm1 // sobel = sobelx + sobely
- movdqa xmm2, xmm0 // GG
- punpcklbw xmm2, xmm0 // First 8
- punpckhbw xmm0, xmm0 // Next 8
- movdqa xmm1, xmm2 // GGGG
- punpcklwd xmm1, xmm2 // First 4
- punpckhwd xmm2, xmm2 // Next 4
- por xmm1, xmm5 // GGGA
+ paddusb xmm0, xmm1 // sobel = sobelx + sobely
+ movdqa xmm2, xmm0 // GG
+ punpcklbw xmm2, xmm0 // First 8
+ punpckhbw xmm0, xmm0 // Next 8
+ movdqa xmm1, xmm2 // GGGG
+ punpcklwd xmm1, xmm2 // First 4
+ punpckhwd xmm2, xmm2 // Next 4
+ por xmm1, xmm5 // GGGA
por xmm2, xmm5
- movdqa xmm3, xmm0 // GGGG
- punpcklwd xmm3, xmm0 // Next 4
- punpckhwd xmm0, xmm0 // Last 4
- por xmm3, xmm5 // GGGA
+ movdqa xmm3, xmm0 // GGGG
+ punpcklwd xmm3, xmm0 // Next 4
+ punpckhwd xmm0, xmm0 // Last 4
+ por xmm3, xmm5 // GGGA
por xmm0, xmm5
movdqu [edx], xmm1
movdqu [edx + 16], xmm2
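The punpck ladder above replicates the 8-bit magnitude into B, G and R and ORs in the 0xff000000 alpha mask. One output pixel, as a scalar sketch (illustrative):

#include <stdint.h>
static uint32_t SobelPixel_Sketch(uint8_t sobelx, uint8_t sobely) {
  uint32_t s = sobelx + sobely;                   // paddusb, saturating
  if (s > 255) s = 255;
  return 0xff000000u | (s << 16) | (s << 8) | s;  // A = 255, R = G = B = s
}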
@@ -5184,22 +5132,23 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
-__declspec(naked)
-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_y, int width) {
+__declspec(naked) void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_y,
+ int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_sobelx
- mov esi, [esp + 4 + 8] // src_sobely
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
mov edx, [esp + 4 + 12] // dst_y
mov ecx, [esp + 4 + 16] // width
sub esi, eax
convertloop:
- movdqu xmm0, [eax] // read 16 pixels src_sobelx
- movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
- paddusb xmm0, xmm1 // sobel = sobelx + sobely
+ paddusb xmm0, xmm1 // sobel = sobelx + sobely
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
@@ -5217,36 +5166,37 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// R = Sobel X
// G = Sobel
// B = Sobel Y
-__declspec(naked)
-void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
+__declspec(naked) void SobelXYRow_SSE2(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst_argb,
+ int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_sobelx
- mov esi, [esp + 4 + 8] // src_sobely
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
sub esi, eax
- pcmpeqb xmm5, xmm5 // alpha 255
+ pcmpeqb xmm5, xmm5 // alpha 255
convertloop:
- movdqu xmm0, [eax] // read 16 pixels src_sobelx
- movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
movdqa xmm2, xmm0
- paddusb xmm2, xmm1 // sobel = sobelx + sobely
- movdqa xmm3, xmm0 // XA
+ paddusb xmm2, xmm1 // sobel = sobelx + sobely
+ movdqa xmm3, xmm0 // XA
punpcklbw xmm3, xmm5
punpckhbw xmm0, xmm5
- movdqa xmm4, xmm1 // YS
+ movdqa xmm4, xmm1 // YS
punpcklbw xmm4, xmm2
punpckhbw xmm1, xmm2
- movdqa xmm6, xmm4 // YSXA
- punpcklwd xmm6, xmm3 // First 4
- punpckhwd xmm4, xmm3 // Next 4
- movdqa xmm7, xmm1 // YSXA
- punpcklwd xmm7, xmm0 // Next 4
- punpckhwd xmm1, xmm0 // Last 4
+ movdqa xmm6, xmm4 // YSXA
+ punpcklwd xmm6, xmm3 // First 4
+ punpckhwd xmm4, xmm3 // Next 4
+ movdqa xmm7, xmm1 // YSXA
+ punpcklwd xmm7, xmm0 // Next 4
+ punpckhwd xmm1, xmm0 // Last 4
movdqu [edx], xmm6
movdqu [edx + 16], xmm4
movdqu [edx + 32], xmm7
@@ -5275,8 +5225,11 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// count is number of averaged pixels to produce.
// Does 4 pixels at a time.
// This function requires alignment on accumulation buffer pointers.
-void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
- int width, int area, uint8* dst,
+void CumulativeSumToAverageRow_SSE2(const int32* topleft,
+ const int32* botleft,
+ int width,
+ int area,
+ uint8* dst,
int count) {
__asm {
mov eax, topleft // eax topleft
@@ -5294,18 +5247,18 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
cmp area, 128 // 128 pixels will not overflow 15 bits.
ja l4
- pshufd xmm5, xmm5, 0 // area
- pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
+ pshufd xmm5, xmm5, 0 // area
+ pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
psrld xmm6, 16
cvtdq2ps xmm6, xmm6
- addps xmm5, xmm6 // (65536.0 + area - 1)
- mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
- cvtps2dq xmm5, xmm5 // 0.16 fixed point
- packssdw xmm5, xmm5 // 16 bit shorts
+ addps xmm5, xmm6 // (65536.0 + area - 1)
+ mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
+ cvtps2dq xmm5, xmm5 // 0.16 fixed point
+ packssdw xmm5, xmm5 // 16 bit shorts
// 4 pixel loop small blocks.
s4:
- // top left
+ // top left
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
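The s4 loop (continued in the next hunk) uses the 0.16 fixed-point reciprocal built above: because area <= 128 keeps the packed sums within 15 bits, one pmulhuw replaces a divide per lane. A scalar sketch (illustrative; the l4 loop further down handles larger areas with float multiplies instead):

#include <stdint.h>
static uint8_t AverageSmall_Sketch(uint32_t sum, uint32_t area) {  // area <= 128
  uint32_t recip = (65536u + area - 1u) / area;  // (65536.0 + area - 1) / area
  return (uint8_t)((sum * recip) >> 16);         // pmulhuw-style high half
}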
@@ -5347,7 +5300,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
// 4 pixel loop
l4:
- // top left
+ // top left
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
@@ -5373,7 +5326,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
paddd xmm3, [esi + edx * 4 + 48]
lea esi, [esi + 64]
- cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
+ cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
cvtdq2ps xmm1, xmm1
mulps xmm0, xmm4
mulps xmm1, xmm4
@@ -5422,8 +5375,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value.
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
- const int32* previous_cumsum, int width) {
+void ComputeCumulativeSumRow_SSE2(const uint8* row,
+ int32* cumsum,
+ const int32* previous_cumsum,
+ int width) {
__asm {
mov eax, row
mov edx, cumsum
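The remainder of the routine falls outside this hunk; what it computes is the per-row recurrence of a summed-area table. A scalar sketch, assuming the same semantics as the library's C fallback (names illustrative):

#include <stdint.h>
static void CumulativeSumRow_Sketch(const uint8_t* row, int32_t* cumsum,
                                    const int32_t* previous_cumsum, int width) {
  int32_t sum[4] = {0, 0, 0, 0};  // running totals for the 4 channels
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}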
@@ -5505,10 +5460,11 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
-__declspec(naked)
-LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
- uint8* dst_argb, const float* uv_dudv, int width) {
+__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
+ int src_argb_stride,
+ uint8* dst_argb,
+ const float* uv_dudv,
+ int width) {
__asm {
push esi
push edi
@@ -5519,7 +5475,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
movq xmm2, qword ptr [ecx] // uv
movq xmm7, qword ptr [ecx + 8] // dudv
mov ecx, [esp + 28] // width
- shl esi, 16 // 4, stride
+ shl esi, 16 // 4, stride
add esi, 4
movd xmm5, esi
sub ecx, 4
@@ -5528,37 +5484,37 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
// setup for 4 pixel loop
pshufd xmm7, xmm7, 0x44 // dup dudv
pshufd xmm5, xmm5, 0 // dup 4, stride
- movdqa xmm0, xmm2 // x0, y0, x1, y1
+ movdqa xmm0, xmm2 // x0, y0, x1, y1
addps xmm0, xmm7
movlhps xmm2, xmm0
movdqa xmm4, xmm7
- addps xmm4, xmm4 // dudv *= 2
- movdqa xmm3, xmm2 // x2, y2, x3, y3
+ addps xmm4, xmm4 // dudv *= 2
+ movdqa xmm3, xmm2 // x2, y2, x3, y3
addps xmm3, xmm4
- addps xmm4, xmm4 // dudv *= 4
+ addps xmm4, xmm4 // dudv *= 4
// 4 pixel loop
l4:
- cvttps2dq xmm0, xmm2 // x, y float to int first 2
- cvttps2dq xmm1, xmm3 // x, y float to int next 2
- packssdw xmm0, xmm1 // x, y as 8 shorts
- pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
+ cvttps2dq xmm0, xmm2 // x, y float to int first 2
+ cvttps2dq xmm1, xmm3 // x, y float to int next 2
+ packssdw xmm0, xmm1 // x, y as 8 shorts
+ pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
movd esi, xmm0
pshufd xmm0, xmm0, 0x39 // shift right
movd edi, xmm0
pshufd xmm0, xmm0, 0x39 // shift right
movd xmm1, [eax + esi] // read pixel 0
movd xmm6, [eax + edi] // read pixel 1
- punpckldq xmm1, xmm6 // combine pixel 0 and 1
- addps xmm2, xmm4 // x, y += dx, dy first 2
+ punpckldq xmm1, xmm6 // combine pixel 0 and 1
+ addps xmm2, xmm4 // x, y += dx, dy first 2
movq qword ptr [edx], xmm1
movd esi, xmm0
pshufd xmm0, xmm0, 0x39 // shift right
movd edi, xmm0
movd xmm6, [eax + esi] // read pixel 2
movd xmm0, [eax + edi] // read pixel 3
- punpckldq xmm6, xmm0 // combine pixel 2 and 3
- addps xmm3, xmm4 // x, y += dx, dy next 2
+ punpckldq xmm6, xmm0 // combine pixel 2 and 3
+ addps xmm3, xmm4 // x, y += dx, dy next 2
movq qword ptr 8[edx], xmm6
lea edx, [edx + 16]
sub ecx, 4
@@ -5570,10 +5526,10 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
// 1 pixel loop
l1:
- cvttps2dq xmm0, xmm2 // x, y float to int
- packssdw xmm0, xmm0 // x, y as shorts
- pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
- addps xmm2, xmm7 // x, y += dx, dy
+ cvttps2dq xmm0, xmm2 // x, y float to int
+ packssdw xmm0, xmm0 // x, y as shorts
+ pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
+ addps xmm2, xmm7 // x, y += dx, dy
movd esi, xmm0
movd xmm0, [eax + esi] // copy a pixel
movd [edx], xmm0
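Both loops step a float (u, v) coordinate: uv_dudv holds the start coordinate and the per-pixel step (du, dv), cvttps2dq truncates to integers, and pmaddwd folds x * 4 + y * stride into a single byte offset. A scalar sketch (illustrative names):

#include <stdint.h>
static void ARGBAffineRow_Sketch(const uint8_t* src, int src_stride,
                                 uint8_t* dst, const float* uv_dudv,
                                 int width) {
  float u = uv_dudv[0], v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    int x = (int)u, y = (int)v;  // cvttps2dq truncates toward zero
    *(uint32_t*)(dst + i * 4) =
        *(const uint32_t*)(src + y * src_stride + x * 4);
    u += uv_dudv[2];  // du
    v += uv_dudv[3];  // dv
  }
}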
@@ -5590,15 +5546,16 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
-__declspec(naked)
-void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
+__declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
__asm {
push esi
push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
@@ -5607,7 +5564,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
je xloop100 // 0 / 256. Blend 100 / 0.
sub edi, esi
cmp eax, 128
- je xloop50 // 128 /256 is 0.50. Blend 50 / 50.
+ je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
vmovd xmm0, eax // high fraction 0..255
neg eax
@@ -5634,14 +5591,14 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
vpaddw ymm0, ymm0, ymm4
vpsrlw ymm1, ymm1, 8
vpsrlw ymm0, ymm0, 8
- vpackuswb ymm0, ymm0, ymm1 // unmutates
+ vpackuswb ymm0, ymm0, ymm1 // unmutates
vmovdqu [esi + edi], ymm0
lea esi, [esi + 32]
sub ecx, 32
jg xloop
jmp xloop99
- // Blend 50 / 50.
+ // Blend 50 / 50.
xloop50:
vmovdqu ymm0, [esi]
vpavgb ymm0, ymm0, [esi + edx]
@@ -5651,7 +5608,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
jg xloop50
jmp xloop99
- // Blend 100 / 0 - Copy row unchanged.
+ // Blend 100 / 0 - Copy row unchanged.
xloop100:
rep movsb
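With f = source_y_fraction in 0..255, each output byte is a weighted average of the two source rows; f == 0 is the rep movsb copy above and f == 128 is the vpavgb fast path. A scalar sketch of the general blend, with rounding as in the SIMD path (illustrative):

#include <stdint.h>
static uint8_t BlendRows_Sketch(uint8_t row0, uint8_t row1, int f) {
  return (uint8_t)((row0 * (256 - f) + row1 * f + 128) >> 8);
}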
@@ -5666,16 +5623,17 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
// Bilinear filter 16x2 -> 16x1
// TODO(fbarchard): Consider allowing 256 using memcpy.
-__declspec(naked)
-void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
+__declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
+ const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
__asm {
push esi
push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
@@ -5684,7 +5642,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
cmp eax, 0
je xloop100 // 0 / 256. Blend 100 / 0.
cmp eax, 128
- je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
+ je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
movd xmm0, eax // high fraction 0..255
neg eax
@@ -5703,7 +5661,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movdqu xmm1, xmm0
punpcklbw xmm0, xmm2
punpckhbw xmm1, xmm2
- psubb xmm0, xmm4 // bias image by -128
+ psubb xmm0, xmm4 // bias image by -128
psubb xmm1, xmm4
movdqa xmm2, xmm5
movdqa xmm3, xmm5
@@ -5747,15 +5705,16 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-__declspec(naked)
-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // shuffler
+__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // shuffler
movdqu xmm5, [ecx]
- mov ecx, [esp + 16] // width
+ mov ecx, [esp + 16] // width
wloop:
movdqu xmm0, [eax]
@@ -5773,15 +5732,16 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
}
#ifdef HAS_ARGBSHUFFLEROW_AVX2
-__declspec(naked)
-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width) {
+__declspec(naked) void ARGBShuffleRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width) {
__asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // shuffler
- vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
- mov ecx, [esp + 16] // width
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // shuffler
+ vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
+ mov ecx, [esp + 16] // width
wloop:
vmovdqu ymm0, [eax]
@@ -5801,19 +5761,20 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
}
#endif // HAS_ARGBSHUFFLEROW_AVX2
-__declspec(naked)
-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width) {
+__declspec(naked) void ARGBShuffleRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ const uint8* shuffler,
+ int width) {
__asm {
push ebx
push esi
- mov eax, [esp + 8 + 4] // src_argb
- mov edx, [esp + 8 + 8] // dst_argb
- mov esi, [esp + 8 + 12] // shuffler
- mov ecx, [esp + 8 + 16] // width
+ mov eax, [esp + 8 + 4] // src_argb
+ mov edx, [esp + 8 + 8] // dst_argb
+ mov esi, [esp + 8 + 12] // shuffler
+ mov ecx, [esp + 8 + 16] // width
pxor xmm5, xmm5
- mov ebx, [esi] // shuffler
+ mov ebx, [esi] // shuffler
cmp ebx, 0x03000102
je shuf_3012
cmp ebx, 0x00010203
@@ -5823,7 +5784,7 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
cmp ebx, 0x02010003
je shuf_2103
- // TODO(fbarchard): Use one source pointer and 3 offsets.
+ // TODO(fbarchard): Use one source pointer and 3 offsets.
shuf_any1:
movzx ebx, byte ptr [esi]
movzx ebx, byte ptr [eax + ebx]
@@ -5849,7 +5810,7 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
movdqa xmm1, xmm0
punpcklbw xmm0, xmm5
punpckhbw xmm1, xmm5
- pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB
+ pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB
pshuflw xmm0, xmm0, 01Bh
pshufhw xmm1, xmm1, 01Bh
pshuflw xmm1, xmm1, 01Bh
@@ -5866,7 +5827,7 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
movdqa xmm1, xmm0
punpcklbw xmm0, xmm5
punpckhbw xmm1, xmm5
- pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB
+ pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB
pshuflw xmm0, xmm0, 039h
pshufhw xmm1, xmm1, 039h
pshuflw xmm1, xmm1, 039h
@@ -5883,7 +5844,7 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
movdqa xmm1, xmm0
punpcklbw xmm0, xmm5
punpckhbw xmm1, xmm5
- pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA
+ pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA
pshuflw xmm0, xmm0, 093h
pshufhw xmm1, xmm1, 093h
pshuflw xmm1, xmm1, 093h
@@ -5900,7 +5861,7 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
movdqa xmm1, xmm0
punpcklbw xmm0, xmm5
punpckhbw xmm1, xmm5
- pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB
+ pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB
pshuflw xmm0, xmm0, 0C6h
pshufhw xmm1, xmm1, 0C6h
pshuflw xmm1, xmm1, 0C6h
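The four fast paths above handle the common channel orders with fixed pshufhw/pshuflw constants; any other shuffler takes the byte-at-a-time shuf_any1 path. A scalar sketch of that generic behaviour, assuming the same semantics as the C fallback (names illustrative):

#include <stdint.h>
static void ARGBShuffle_Sketch(const uint8_t* src, uint8_t* dst,
                               const uint8_t* shuffler, int width) {
  for (int i = 0; i < width; ++i)
    for (int j = 0; j < 4; ++j)
      dst[i * 4 + j] = src[i * 4 + shuffler[j]];  // byte j picks shuffler[j]
}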
@@ -5923,30 +5884,30 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
// UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1
-__declspec(naked)
-void I422ToYUY2Row_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame, int width) {
+__declspec(naked) void I422ToYUY2Row_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame,
+ int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_y
- mov esi, [esp + 8 + 8] // src_u
- mov edx, [esp + 8 + 12] // src_v
- mov edi, [esp + 8 + 16] // dst_frame
- mov ecx, [esp + 8 + 20] // width
+ mov eax, [esp + 8 + 4] // src_y
+ mov esi, [esp + 8 + 8] // src_u
+ mov edx, [esp + 8 + 12] // src_v
+ mov edi, [esp + 8 + 16] // dst_frame
+ mov ecx, [esp + 8 + 20] // width
sub edx, esi
convertloop:
- movq xmm2, qword ptr [esi] // U
- movq xmm3, qword ptr [esi + edx] // V
+ movq xmm2, qword ptr [esi] // U
+ movq xmm3, qword ptr [esi + edx] // V
lea esi, [esi + 8]
- punpcklbw xmm2, xmm3 // UV
- movdqu xmm0, [eax] // Y
+ punpcklbw xmm2, xmm3 // UV
+ movdqu xmm0, [eax] // Y
lea eax, [eax + 16]
movdqa xmm1, xmm0
- punpcklbw xmm0, xmm2 // YUYV
+ punpcklbw xmm0, xmm2 // YUYV
punpckhbw xmm1, xmm2
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
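In YUY2 two luma samples share one U and one V, so every 4 output bytes pack Y0 U Y1 V; the UYVY variant below emits the same data as U Y0 V Y1. A scalar sketch (illustrative names):

#include <stdint.h>
static void I422ToYUY2_Sketch(const uint8_t* y, const uint8_t* u,
                              const uint8_t* v, uint8_t* dst, int width) {
  for (int i = 0; i < width / 2; ++i) {  // one macro-pixel per 2 image pixels
    dst[i * 4 + 0] = y[i * 2 + 0];
    dst[i * 4 + 1] = u[i];
    dst[i * 4 + 2] = y[i * 2 + 1];
    dst[i * 4 + 3] = v[i];
  }
}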
@@ -5960,30 +5921,30 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
}
}
-__declspec(naked)
-void I422ToUYVYRow_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame, int width) {
+__declspec(naked) void I422ToUYVYRow_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame,
+ int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_y
- mov esi, [esp + 8 + 8] // src_u
- mov edx, [esp + 8 + 12] // src_v
- mov edi, [esp + 8 + 16] // dst_frame
- mov ecx, [esp + 8 + 20] // width
+ mov eax, [esp + 8 + 4] // src_y
+ mov esi, [esp + 8 + 8] // src_u
+ mov edx, [esp + 8 + 12] // src_v
+ mov edi, [esp + 8 + 16] // dst_frame
+ mov ecx, [esp + 8 + 20] // width
sub edx, esi
convertloop:
- movq xmm2, qword ptr [esi] // U
- movq xmm3, qword ptr [esi + edx] // V
+ movq xmm2, qword ptr [esi] // U
+ movq xmm3, qword ptr [esi + edx] // V
lea esi, [esi + 8]
- punpcklbw xmm2, xmm3 // UV
- movdqu xmm0, [eax] // Y
+ punpcklbw xmm2, xmm3 // UV
+ movdqu xmm0, [eax] // Y
movdqa xmm1, xmm2
lea eax, [eax + 16]
- punpcklbw xmm1, xmm0 // UYVY
+ punpcklbw xmm1, xmm0 // UYVY
punpckhbw xmm2, xmm0
movdqu [edi], xmm1
movdqu [edi + 16], xmm2
@@ -5998,22 +5959,22 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
}
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-__declspec(naked)
-void ARGBPolynomialRow_SSE2(const uint8* src_argb,
- uint8* dst_argb, const float* poly,
- int width) {
+__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb,
+ const float* poly,
+ int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] /* src_argb */
- mov edx, [esp + 4 + 8] /* dst_argb */
- mov esi, [esp + 4 + 12] /* poly */
- mov ecx, [esp + 4 + 16] /* width */
+ mov eax, [esp + 4 + 4] /* src_argb */
+ mov edx, [esp + 4 + 8] /* dst_argb */
+ mov esi, [esp + 4 + 12] /* poly */
+ mov ecx, [esp + 4 + 16] /* width */
pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
// 2 pixel loop.
convertloop:
-// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
-// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
+ // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
+ // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
movq xmm0, qword ptr [eax] // BGRABGRA
lea eax, [eax + 8]
punpcklbw xmm0, xmm3
@@ -6057,25 +6018,25 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
#endif // HAS_ARGBPOLYNOMIALROW_SSE2
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-__declspec(naked)
-void ARGBPolynomialRow_AVX2(const uint8* src_argb,
- uint8* dst_argb, const float* poly,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_argb */
- mov ecx, [esp + 12] /* poly */
- vbroadcastf128 ymm4, [ecx] // C0
+__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb,
+ const float* poly,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* poly */
+ vbroadcastf128 ymm4, [ecx] // C0
vbroadcastf128 ymm5, [ecx + 16] // C1
vbroadcastf128 ymm6, [ecx + 32] // C2
vbroadcastf128 ymm7, [ecx + 48] // C3
- mov ecx, [esp + 16] /* width */
+ mov ecx, [esp + 16] /* width */
// 2 pixel loop.
convertloop:
vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
lea eax, [eax + 8]
- vcvtdq2ps ymm0, ymm0 // X 8 floats
+ vcvtdq2ps ymm0, ymm0 // X 8 floats
vmulps ymm2, ymm0, ymm0 // X * X
vmulps ymm3, ymm0, ymm7 // C3 * X
vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
@@ -6095,16 +6056,125 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
}
#endif // HAS_ARGBPOLYNOMIALROW_AVX2
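Both polynomial rows evaluate a per-channel cubic: poly stores four coefficients per channel, laid out as C0[4], C1[4], C2[4], C3[4], which is why each vbroadcastf128 grabs 16 bytes. One channel, as a scalar sketch (illustrative):

#include <stdint.h>
static uint8_t Poly_Sketch(uint8_t v, const float* poly, int channel) {
  float x = (float)v;
  float r = poly[channel] + poly[4 + channel] * x +
            poly[8 + channel] * x * x + poly[12 + channel] * x * x * x;
  if (r < 0.0f) r = 0.0f;  // the final integer packs saturate the same way
  if (r > 255.0f) r = 255.0f;
  return (uint8_t)r;
}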
+#ifdef HAS_HALFFLOATROW_SSE2
+static float kExpBias = 1.9259299444e-34f;
+__declspec(naked) void HalfFloatRow_SSE2(const uint16* src,
+ uint16* dst,
+ float scale,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src */
+ mov edx, [esp + 8] /* dst */
+ movd xmm4, dword ptr [esp + 12] /* scale */
+ mov ecx, [esp + 16] /* width */
+ mulss xmm4, kExpBias
+ pshufd xmm4, xmm4, 0
+ pxor xmm5, xmm5
+ sub edx, eax
+
+ // 8 pixel loop.
+ convertloop:
+ movdqu xmm2, xmmword ptr [eax] // 8 shorts
+ add eax, 16
+ movdqa xmm3, xmm2
+ punpcklwd xmm2, xmm5
+ cvtdq2ps xmm2, xmm2 // convert 8 ints to floats
+ punpckhwd xmm3, xmm5
+ cvtdq2ps xmm3, xmm3
+ mulps xmm2, xmm4
+ mulps xmm3, xmm4
+ psrld xmm2, 13
+ psrld xmm3, 13
+ packssdw xmm2, xmm3
+ movdqu [eax + edx - 16], xmm2
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_HALFFLOATROW_SSE2
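kExpBias is 2^-112, the power of two that bridges the float exponent bias (127) and the half-float bias (15). Multiplying by scale * 2^-112 rebiases the exponent so the top bits of the float already form the half-float pattern, and shifting the raw bits right by 13 drops the extra mantissa bits (truncating). A scalar sketch of the trick (illustrative, not from the patch):

#include <stdint.h>
#include <string.h>
static uint16_t UShortToHalf_Sketch(uint16_t v, float scale) {
  float f = (float)v * scale * 1.9259299444e-34f;  // kExpBias == 2^-112
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));  // reinterpret the float's raw bits
  return (uint16_t)(bits >> 13);    // the psrld 13 + pack in the SIMD path
}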
+
+#ifdef HAS_HALFFLOATROW_AVX2
+__declspec(naked) void HalfFloatRow_AVX2(const uint16* src,
+ uint16* dst,
+ float scale,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src */
+ mov edx, [esp + 8] /* dst */
+ movd xmm4, dword ptr [esp + 12] /* scale */
+ mov ecx, [esp + 16] /* width */
+
+ vmulss xmm4, xmm4, kExpBias
+ vbroadcastss ymm4, xmm4
+ vpxor ymm5, ymm5, ymm5
+ sub edx, eax
+
+ // 16 pixel loop.
+ convertloop:
+ vmovdqu ymm2, [eax] // 16 shorts
+ add eax, 32
+ vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints
+ vpunpcklwd ymm2, ymm2, ymm5
+ vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats
+ vcvtdq2ps ymm2, ymm2
+ vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range.
+ vmulps ymm2, ymm2, ymm4
+ vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate
+ vpsrld ymm2, ymm2, 13
+ vpackssdw ymm2, ymm2, ymm3
+ vmovdqu [eax + edx - 32], ymm2
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_HALFFLOATROW_AVX2
+
+#ifdef HAS_HALFFLOATROW_F16C
+__declspec(naked) void HalfFloatRow_F16C(const uint16* src,
+ uint16* dst,
+ float scale,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src */
+ mov edx, [esp + 8] /* dst */
+ vbroadcastss ymm4, [esp + 12] /* scale */
+ mov ecx, [esp + 16] /* width */
+ sub edx, eax
+
+ // 16 pixel loop.
+ convertloop:
+ vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
+ vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
+ add eax, 32
+ vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
+ vcvtdq2ps ymm3, ymm3
+ vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1
+ vmulps ymm3, ymm3, ymm4
+ vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate
+ vcvtps2ph xmm3, ymm3, 3
+ vmovdqu [eax + edx - 32], xmm2
+ vmovdqu [eax + edx - 32 + 16], xmm3
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_HALFFLOATROW_F16C
+
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
-__declspec(naked)
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
- int width) {
+__declspec(naked) void ARGBColorTableRow_X86(uint8* dst_argb,
+ const uint8* table_argb,
+ int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] /* dst_argb */
- mov esi, [esp + 4 + 8] /* table_argb */
- mov ecx, [esp + 4 + 12] /* width */
+ mov eax, [esp + 4 + 4] /* dst_argb */
+ mov esi, [esp + 4 + 8] /* table_argb */
+ mov ecx, [esp + 4 + 12] /* width */
// 1 pixel loop.
convertloop:
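The 1 pixel loop (its body falls outside this hunk) does four dependent byte lookups per pixel: the table holds 256 four-byte entries indexed per channel. A scalar sketch, assuming the same layout as the C fallback (names illustrative):

#include <stdint.h>
static void ARGBColorTable_Sketch(uint8_t* dst, const uint8_t* table,
                                  int width) {
  for (int i = 0; i < width * 4; ++i)
    dst[i] = table[dst[i] * 4 + (i & 3)];  // channel c reads table[v * 4 + c]
}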
@@ -6131,13 +6201,14 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
-__declspec(naked)
-void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+__declspec(naked) void RGBColorTableRow_X86(uint8* dst_argb,
+ const uint8* table_argb,
+ int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] /* dst_argb */
- mov esi, [esp + 4 + 8] /* table_argb */
- mov ecx, [esp + 4 + 12] /* width */
+ mov eax, [esp + 4 + 4] /* dst_argb */
+ mov esi, [esp + 4 + 8] /* table_argb */
+ mov ecx, [esp + 4 + 12] /* width */
// 1 pixel loop.
convertloop:
@@ -6162,27 +6233,28 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
-__declspec(naked)
-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
- int width,
- const uint8* luma, uint32 lumacoeff) {
+__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
+ uint8* dst_argb,
+ int width,
+ const uint8* luma,
+ uint32 lumacoeff) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] /* src_argb */
- mov edi, [esp + 8 + 8] /* dst_argb */
- mov ecx, [esp + 8 + 12] /* width */
+ mov eax, [esp + 8 + 4] /* src_argb */
+ mov edi, [esp + 8 + 8] /* dst_argb */
+ mov ecx, [esp + 8 + 12] /* width */
movd xmm2, dword ptr [esp + 8 + 16] // luma table
movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
pshufd xmm2, xmm2, 0
pshufd xmm3, xmm3, 0
- pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
+ pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
psllw xmm4, 8
pxor xmm5, xmm5
// 4 pixel loop.
convertloop:
- movdqu xmm0, xmmword ptr [eax] // generate luma ptr
+ movdqu xmm0, xmmword ptr [eax] // generate luma ptr
pmaddubsw xmm0, xmm3
phaddw xmm0, xmm0
pand xmm0, xmm4 // mask out low bits
diff --git a/files/source/scale.cc b/files/source/scale.cc
index 36e3fe52..a5c7f7ad 100644
--- a/files/source/scale.cc
+++ b/files/source/scale.cc
@@ -33,17 +33,24 @@ static __inline int Abs(int v) {
// This is an optimized version for scaling down a plane to 1/2 of
// its original size.
-static void ScalePlaneDown2(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
+static void ScalePlaneDown2(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_ptr,
+ uint8* dst_ptr,
enum FilterMode filtering) {
int y;
void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) =
- filtering == kFilterNone ? ScaleRowDown2_C :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C);
+ filtering == kFilterNone ? ScaleRowDown2_C : (filtering == kFilterLinear
+ ? ScaleRowDown2Linear_C
+ : ScaleRowDown2Box_C);
int row_stride = src_stride << 1;
+ (void)src_width;
+ (void)src_height;
if (!filtering) {
src_ptr += src_stride; // Point to odd rows.
src_stride = 0;
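For reference, the three row kernels being selected here differ only in sampling: kFilterNone picks the odd source column of the odd row (hence the pointer bump above), kFilterLinear averages two horizontal neighbours, and kFilterBox averages the full 2x2. A sketch of the box kernel, matching the C fallback's rounding (names illustrative):

#include <stdint.h>
static void ScaleRowDown2Box_Sketch(const uint8_t* s, int src_stride,
                                    uint8_t* dst, int dst_width) {
  const uint8_t* t = s + src_stride;  // second source row
  for (int x = 0; x < dst_width; ++x, s += 2, t += 2)
    dst[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);  // rounded mean
}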
@@ -51,37 +58,47 @@ static void ScalePlaneDown2(int src_width, int src_height,
#if defined(HAS_SCALEROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON :
- ScaleRowDown2Box_Any_NEON);
+ ScaleRowDown2 =
+ filtering == kFilterNone
+ ? ScaleRowDown2_Any_NEON
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON
+ : ScaleRowDown2Box_Any_NEON);
if (IS_ALIGNED(dst_width, 16)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON :
- ScaleRowDown2Box_NEON);
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON
+ : (filtering == kFilterLinear
+ ? ScaleRowDown2Linear_NEON
+ : ScaleRowDown2Box_NEON);
}
}
#endif
#if defined(HAS_SCALEROWDOWN2_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 :
- ScaleRowDown2Box_Any_SSSE3);
+ ScaleRowDown2 =
+ filtering == kFilterNone
+ ? ScaleRowDown2_Any_SSSE3
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3
+ : ScaleRowDown2Box_Any_SSSE3);
if (IS_ALIGNED(dst_width, 16)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 :
- ScaleRowDown2Box_SSSE3);
+ ScaleRowDown2 =
+ filtering == kFilterNone
+ ? ScaleRowDown2_SSSE3
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3
+ : ScaleRowDown2Box_SSSE3);
}
}
#endif
#if defined(HAS_SCALEROWDOWN2_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 :
- ScaleRowDown2Box_Any_AVX2);
+ ScaleRowDown2 =
+ filtering == kFilterNone
+ ? ScaleRowDown2_Any_AVX2
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2
+ : ScaleRowDown2Box_Any_AVX2);
if (IS_ALIGNED(dst_width, 32)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 :
- ScaleRowDown2Box_AVX2);
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2
+ : (filtering == kFilterLinear
+ ? ScaleRowDown2Linear_AVX2
+ : ScaleRowDown2Box_AVX2);
}
}
#endif
@@ -89,8 +106,22 @@ static void ScalePlaneDown2(int src_width, int src_height,
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
- ScaleRowDown2 = filtering ?
- ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2;
+ ScaleRowDown2 = filtering ? ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2;
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN2_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleRowDown2 =
+ filtering == kFilterNone
+ ? ScaleRowDown2_Any_MSA
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA
+ : ScaleRowDown2Box_Any_MSA);
+ if (IS_ALIGNED(dst_width, 32)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA
+ : (filtering == kFilterLinear
+ ? ScaleRowDown2Linear_MSA
+ : ScaleRowDown2Box_MSA);
+ }
}
#endif
@@ -105,18 +136,25 @@ static void ScalePlaneDown2(int src_width, int src_height,
}
}
-static void ScalePlaneDown2_16(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint16* src_ptr, uint16* dst_ptr,
+static void ScalePlaneDown2_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16* src_ptr,
+ uint16* dst_ptr,
enum FilterMode filtering) {
int y;
void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int dst_width) =
- filtering == kFilterNone ? ScaleRowDown2_16_C :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C :
- ScaleRowDown2Box_16_C);
+ filtering == kFilterNone
+ ? ScaleRowDown2_16_C
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C
+ : ScaleRowDown2Box_16_C);
int row_stride = src_stride << 1;
+ (void)src_width;
+ (void)src_height;
if (!filtering) {
src_ptr += src_stride; // Point to odd rows.
src_stride = 0;
@@ -124,23 +162,25 @@ static void ScalePlaneDown2_16(int src_width, int src_height,
#if defined(HAS_SCALEROWDOWN2_16_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
- ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON :
- ScaleRowDown2_16_NEON;
+ ScaleRowDown2 =
+ filtering ? ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON;
}
#endif
#if defined(HAS_SCALEROWDOWN2_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
- ScaleRowDown2Box_16_SSE2);
+ ScaleRowDown2 =
+ filtering == kFilterNone
+ ? ScaleRowDown2_16_SSE2
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2
+ : ScaleRowDown2Box_16_SSE2);
}
#endif
#if defined(HAS_SCALEROWDOWN2_16_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
- ScaleRowDown2 = filtering ?
- ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2;
+ ScaleRowDown2 =
+ filtering ? ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2;
}
#endif
@@ -159,24 +199,30 @@ static void ScalePlaneDown2_16(int src_width, int src_height,
// This is an optimized version for scaling down a plane to 1/4 of
// its original size.
-static void ScalePlaneDown4(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
+static void ScalePlaneDown4(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_ptr,
+ uint8* dst_ptr,
enum FilterMode filtering) {
int y;
void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) =
filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
int row_stride = src_stride << 2;
+ (void)src_width;
+ (void)src_height;
if (!filtering) {
src_ptr += src_stride * 2; // Point to row 2.
src_stride = 0;
}
#if defined(HAS_SCALEROWDOWN4_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- ScaleRowDown4 = filtering ?
- ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
+ ScaleRowDown4 =
+ filtering ? ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
}
@@ -184,8 +230,8 @@ static void ScalePlaneDown4(int src_width, int src_height,
#endif
#if defined(HAS_SCALEROWDOWN4_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ScaleRowDown4 = filtering ?
- ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3;
+ ScaleRowDown4 =
+ filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3;
if (IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3;
}
@@ -193,8 +239,8 @@ static void ScalePlaneDown4(int src_width, int src_height,
#endif
#if defined(HAS_SCALEROWDOWN4_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ScaleRowDown4 = filtering ?
- ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
+ ScaleRowDown4 =
+ filtering ? ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
if (IS_ALIGNED(dst_width, 16)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2;
}
@@ -204,8 +250,16 @@ static void ScalePlaneDown4(int src_width, int src_height,
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
- ScaleRowDown4 = filtering ?
- ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2;
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2;
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN4_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleRowDown4 =
+ filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA;
+ }
}
#endif
@@ -219,38 +273,44 @@ static void ScalePlaneDown4(int src_width, int src_height,
}
}
-static void ScalePlaneDown4_16(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint16* src_ptr, uint16* dst_ptr,
+static void ScalePlaneDown4_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16* src_ptr,
+ uint16* dst_ptr,
enum FilterMode filtering) {
int y;
void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int dst_width) =
filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C;
int row_stride = src_stride << 2;
+ (void)src_width;
+ (void)src_height;
if (!filtering) {
src_ptr += src_stride * 2; // Point to row 2.
src_stride = 0;
}
#if defined(HAS_SCALEROWDOWN4_16_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
- ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON :
- ScaleRowDown4_16_NEON;
+ ScaleRowDown4 =
+ filtering ? ScaleRowDown4Box_16_NEON : ScaleRowDown4_16_NEON;
}
#endif
#if defined(HAS_SCALEROWDOWN4_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
- ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 :
- ScaleRowDown4_16_SSE2;
+ ScaleRowDown4 =
+ filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2;
}
#endif
#if defined(HAS_SCALEROWDOWN4_16_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
- ScaleRowDown4 = filtering ?
- ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2;
+ ScaleRowDown4 =
+ filtering ? ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2;
}
#endif
@@ -265,11 +325,14 @@ static void ScalePlaneDown4_16(int src_width, int src_height,
}
// Scale plane down, 3/4
-
-static void ScalePlaneDown34(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
+static void ScalePlaneDown34(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_ptr,
+ uint8* dst_ptr,
enum FilterMode filtering) {
int y;
void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -277,6 +340,8 @@ static void ScalePlaneDown34(int src_width, int src_height,
void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+ (void)src_width;
+ (void)src_height;
assert(dst_width % 3 == 0);
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_C;
@@ -346,8 +411,7 @@ static void ScalePlaneDown34(int src_width, int src_height,
ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
src_ptr += src_stride;
dst_ptr += dst_stride;
- ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
- dst_ptr, dst_width);
+ ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width);
src_ptr += src_stride * 2;
dst_ptr += dst_stride;
}
@@ -363,10 +427,14 @@ static void ScalePlaneDown34(int src_width, int src_height,
}
}
-static void ScalePlaneDown34_16(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint16* src_ptr, uint16* dst_ptr,
+static void ScalePlaneDown34_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16* src_ptr,
+ uint16* dst_ptr,
enum FilterMode filtering) {
int y;
void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride,
@@ -374,6 +442,8 @@ static void ScalePlaneDown34_16(int src_width, int src_height,
void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int dst_width);
const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+ (void)src_width;
+ (void)src_height;
assert(dst_width % 3 == 0);
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_16_C;
@@ -425,8 +495,7 @@ static void ScalePlaneDown34_16(int src_width, int src_height,
ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
src_ptr += src_stride;
dst_ptr += dst_stride;
- ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
- dst_ptr, dst_width);
+ ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width);
src_ptr += src_stride * 2;
dst_ptr += dst_stride;
}
@@ -442,7 +511,6 @@ static void ScalePlaneDown34_16(int src_width, int src_height,
}
}
-
// Scale plane, 3/8
// This is an optimized version for scaling down a plane to 3/8
// of its original size.
@@ -458,10 +526,14 @@ static void ScalePlaneDown34_16(int src_width, int src_height,
// ggghhhii
// Boxes are 3x3, 2x3, 3x2 and 2x2
-static void ScalePlaneDown38(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
+static void ScalePlaneDown38(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_ptr,
+ uint8* dst_ptr,
enum FilterMode filtering) {
int y;
void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -470,6 +542,8 @@ static void ScalePlaneDown38(int src_width, int src_height,
uint8* dst_ptr, int dst_width);
const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
assert(dst_width % 3 == 0);
+ (void)src_width;
+ (void)src_height;
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_C;
ScaleRowDown38_2 = ScaleRowDown38_C;
@@ -530,6 +604,26 @@ static void ScalePlaneDown38(int src_width, int src_height,
}
}
#endif
+#if defined(HAS_SCALEROWDOWN38_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_Any_MSA;
+ ScaleRowDown38_2 = ScaleRowDown38_Any_MSA;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_MSA;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_MSA;
+ }
+ if (dst_width % 12 == 0) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_MSA;
+ ScaleRowDown38_2 = ScaleRowDown38_MSA;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_MSA;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_MSA;
+ }
+ }
+ }
+#endif
for (y = 0; y < dst_height - 2; y += 3) {
ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
@@ -554,10 +648,14 @@ static void ScalePlaneDown38(int src_width, int src_height,
}
}
-static void ScalePlaneDown38_16(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint16* src_ptr, uint16* dst_ptr,
+static void ScalePlaneDown38_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16* src_ptr,
+ uint16* dst_ptr,
enum FilterMode filtering) {
int y;
void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride,
@@ -565,6 +663,8 @@ static void ScalePlaneDown38_16(int src_width, int src_height,
void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int dst_width);
const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+ (void)src_width;
+ (void)src_height;
assert(dst_width % 3 == 0);
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_16_C;
@@ -654,8 +754,12 @@ static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) {
return sum;
}
-static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
- const uint16* src_ptr, uint8* dst_ptr) {
+static void ScaleAddCols2_C(int dst_width,
+ int boxheight,
+ int x,
+ int dx,
+ const uint16* src_ptr,
+ uint8* dst_ptr) {
int i;
int scaletbl[2];
int minboxwidth = dx >> 16;
@@ -666,13 +770,18 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
int ix = x >> 16;
x += dx;
boxwidth = MIN1((x >> 16) - ix);
- *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) *
- scaletbl[boxwidth - minboxwidth] >> 16;
+ *dst_ptr++ =
+ SumPixels(boxwidth, src_ptr + ix) * scaletbl[boxwidth - minboxwidth] >>
+ 16;
}
}
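A worked example of the 16.16 stepping above (numbers illustrative): scaling 100 source columns to 30 outputs gives dx = (100 << 16) / 30 = 218453, so successive boxes are 3 or 4 columns wide, and scaletbl[] caches 65536 / (boxwidth * boxheight) for the two possible widths so each output is a single multiply and shift.

int dx = (100 << 16) / 30;  // 218453, about 3.333 source columns per output
int x = 0;
for (int i = 0; i < 30; ++i) {
  int ix = x >> 16;               // first source column of this box
  x += dx;
  int boxwidth = (x >> 16) - ix;  // 3 most steps, 4 when the fraction carries
}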
-static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
- const uint32* src_ptr, uint16* dst_ptr) {
+static void ScaleAddCols2_16_C(int dst_width,
+ int boxheight,
+ int x,
+ int dx,
+ const uint32* src_ptr,
+ uint16* dst_ptr) {
int i;
int scaletbl[2];
int minboxwidth = dx >> 16;
@@ -684,12 +793,17 @@ static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
x += dx;
boxwidth = MIN1((x >> 16) - ix);
*dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *
- scaletbl[boxwidth - minboxwidth] >> 16;
+ scaletbl[boxwidth - minboxwidth] >>
+ 16;
}
}
-static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int,
- const uint16* src_ptr, uint8* dst_ptr) {
+static void ScaleAddCols0_C(int dst_width,
+ int boxheight,
+ int x,
+ int,
+ const uint16* src_ptr,
+ uint8* dst_ptr) {
int scaleval = 65536 / boxheight;
int i;
src_ptr += (x >> 16);
@@ -698,8 +812,12 @@ static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int,
}
}
-static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
- const uint16* src_ptr, uint8* dst_ptr) {
+static void ScaleAddCols1_C(int dst_width,
+ int boxheight,
+ int x,
+ int dx,
+ const uint16* src_ptr,
+ uint8* dst_ptr) {
int boxwidth = MIN1(dx >> 16);
int scaleval = 65536 / (boxwidth * boxheight);
int i;
@@ -710,8 +828,12 @@ static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
}
}
-static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx,
- const uint32* src_ptr, uint16* dst_ptr) {
+static void ScaleAddCols1_16_C(int dst_width,
+ int boxheight,
+ int x,
+ int dx,
+ const uint32* src_ptr,
+ uint16* dst_ptr) {
int boxwidth = MIN1(dx >> 16);
int scaleval = 65536 / (boxwidth * boxheight);
int i;
@@ -728,10 +850,14 @@ static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx,
// one pixel of destination using fixed point (16.16) to step
// through source, sampling a box of pixels with simple
// averaging.
-static void ScalePlaneBox(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
+static void ScalePlaneBox(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_ptr,
+ uint8* dst_ptr) {
int j, k;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@@ -739,16 +865,16 @@ static void ScalePlaneBox(int src_width, int src_height,
int dx = 0;
int dy = 0;
const int max_y = (src_height << 16);
- ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
- &x, &y, &dx, &dy);
+ ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y,
+ &dx, &dy);
src_width = Abs(src_width);
{
// Allocate a row buffer of uint16.
align_buffer_64(row16, src_width * 2);
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
- const uint16* src_ptr, uint8* dst_ptr) =
- (dx & 0xffff) ? ScaleAddCols2_C:
- ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
+ const uint16* src_ptr, uint8* dst_ptr) =
+ (dx & 0xffff) ? ScaleAddCols2_C
+ : ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
ScaleAddRow_C;
#if defined(HAS_SCALEADDROW_SSE2)
@@ -775,6 +901,22 @@ static void ScalePlaneBox(int src_width, int src_height,
}
}
#endif
+#if defined(HAS_SCALEADDROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleAddRow = ScaleAddRow_Any_MSA;
+ if (IS_ALIGNED(src_width, 16)) {
+ ScaleAddRow = ScaleAddRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SCALEADDROW_DSPR2)
+ if (TestCpuFlag(kCpuHasDSPR2)) {
+ ScaleAddRow = ScaleAddRow_Any_DSPR2;
+ if (IS_ALIGNED(src_width, 16)) {
+ ScaleAddRow = ScaleAddRow_DSPR2;
+ }
+ }
+#endif
for (j = 0; j < dst_height; ++j) {
int boxheight;
@@ -787,7 +929,7 @@ static void ScalePlaneBox(int src_width, int src_height,
boxheight = MIN1((y >> 16) - iy);
memset(row16, 0, src_width * 2);
for (k = 0; k < boxheight; ++k) {
- ScaleAddRow(src, (uint16 *)(row16), src_width);
+ ScaleAddRow(src, (uint16*)(row16), src_width);
src += src_stride;
}
ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
@@ -797,10 +939,14 @@ static void ScalePlaneBox(int src_width, int src_height,
}
}
-static void ScalePlaneBox_16(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint16* src_ptr, uint16* dst_ptr) {
+static void ScalePlaneBox_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16* src_ptr,
+ uint16* dst_ptr) {
int j, k;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@@ -808,15 +954,15 @@ static void ScalePlaneBox_16(int src_width, int src_height,
int dx = 0;
int dy = 0;
const int max_y = (src_height << 16);
- ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
- &x, &y, &dx, &dy);
+ ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y,
+ &dx, &dy);
src_width = Abs(src_width);
{
// Allocate a row buffer of uint32.
align_buffer_64(row32, src_width * 4);
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
- const uint32* src_ptr, uint16* dst_ptr) =
- (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
+ const uint32* src_ptr, uint16* dst_ptr) =
+ (dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C;
void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
ScaleAddRow_16_C;
@@ -837,7 +983,7 @@ static void ScalePlaneBox_16(int src_width, int src_height,
boxheight = MIN1((y >> 16) - iy);
memset(row32, 0, src_width * 4);
for (k = 0; k < boxheight; ++k) {
- ScaleAddRow(src, (uint32 *)(row32), src_width);
+ ScaleAddRow(src, (uint32*)(row32), src_width);
src += src_stride;
}
ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
@@ -848,10 +994,14 @@ static void ScalePlaneBox_16(int src_width, int src_height,
}
// Scale plane down with bilinear interpolation.
-void ScalePlaneBilinearDown(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
+void ScalePlaneBilinearDown(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_ptr,
+ uint8* dst_ptr,
enum FilterMode filtering) {
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@@ -864,14 +1014,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
const int max_y = (src_height - 1) << 16;
int j;
- void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) =
+ void (*ScaleFilterCols)(uint8 * dst_ptr, const uint8* src_ptr, int dst_width,
+ int x, int dx) =
(src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
- void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
- InterpolateRow_C;
- ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
- &x, &y, &dx, &dy);
+ void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+ &dx, &dy);
src_width = Abs(src_width);
#if defined(HAS_INTERPOLATEROW_SSSE3)
@@ -906,7 +1056,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
}
}
#endif
-
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(src_width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_SCALEFILTERCOLS_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -944,10 +1101,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
free_aligned_buffer_64(row);
}
-void ScalePlaneBilinearDown_16(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint16* src_ptr, uint16* dst_ptr,
+void ScalePlaneBilinearDown_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16* src_ptr,
+ uint16* dst_ptr,
enum FilterMode filtering) {
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@@ -960,14 +1121,14 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
const int max_y = (src_height - 1) << 16;
int j;
- void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
- int dst_width, int x, int dx) =
+ void (*ScaleFilterCols)(uint16 * dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx) =
(src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
- void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
- ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
- InterpolateRow_16_C;
- ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
- &x, &y, &dx, &dy);
+ void (*InterpolateRow)(uint16 * dst_ptr, const uint16* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_16_C;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+ &dx, &dy);
src_width = Abs(src_width);
#if defined(HAS_INTERPOLATEROW_16_SSE2)
@@ -1011,7 +1172,6 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
}
#endif
-
#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleFilterCols = ScaleFilterCols_16_SSSE3;
@@ -1041,10 +1201,14 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
}
// Scale plane up with bilinear interpolation.
-void ScalePlaneBilinearUp(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
+void ScalePlaneBilinearUp(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_ptr,
+ uint8* dst_ptr,
enum FilterMode filtering) {
int j;
// Initial source x/y coordinate and step values as 16.16 fixed point.
@@ -1053,14 +1217,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
int dx = 0;
int dy = 0;
const int max_y = (src_height - 1) << 16;
- void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
- InterpolateRow_C;
- void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) =
+ void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ void (*ScaleFilterCols)(uint8 * dst_ptr, const uint8* src_ptr, int dst_width,
+ int x, int dx) =
filtering ? ScaleFilterCols_C : ScaleCols_C;
- ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
- &x, &y, &dx, &dy);
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+ &dx, &dy);
src_width = Abs(src_width);
#if defined(HAS_INTERPOLATEROW_SSSE3)
@@ -1172,10 +1336,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
}
}
-void ScalePlaneBilinearUp_16(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint16* src_ptr, uint16* dst_ptr,
+void ScalePlaneBilinearUp_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16* src_ptr,
+ uint16* dst_ptr,
enum FilterMode filtering) {
int j;
// Initial source x/y coordinate and step values as 16.16 fixed point.
@@ -1184,14 +1352,14 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
int dx = 0;
int dy = 0;
const int max_y = (src_height - 1) << 16;
- void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
- ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
- InterpolateRow_16_C;
- void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
- int dst_width, int x, int dx) =
+ void (*InterpolateRow)(uint16 * dst_ptr, const uint16* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_16_C;
+ void (*ScaleFilterCols)(uint16 * dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx) =
filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
- ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
- &x, &y, &dx, &dy);
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+ &dx, &dy);
src_width = Abs(src_width);
#if defined(HAS_INTERPOLATEROW_16_SSE2)
@@ -1308,20 +1476,24 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
// of x and dx is the integer part of the source position and
// the lower 16 bits are the fixed decimal part.
-static void ScalePlaneSimple(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
+static void ScalePlaneSimple(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_ptr,
+ uint8* dst_ptr) {
int i;
- void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) = ScaleCols_C;
+ void (*ScaleCols)(uint8 * dst_ptr, const uint8* src_ptr, int dst_width, int x,
+ int dx) = ScaleCols_C;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
int y = 0;
int dx = 0;
int dy = 0;
- ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
- &x, &y, &dx, &dy);
+ ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y,
+ &dx, &dy);
src_width = Abs(src_width);
if (src_width * 2 == dst_width && x < 0x8000) {
@@ -1340,20 +1512,24 @@ static void ScalePlaneSimple(int src_width, int src_height,
}
}
-static void ScalePlaneSimple_16(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint16* src_ptr, uint16* dst_ptr) {
+static void ScalePlaneSimple_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16* src_ptr,
+ uint16* dst_ptr) {
int i;
- void (*ScaleCols)(uint16* dst_ptr, const uint16* src_ptr,
- int dst_width, int x, int dx) = ScaleCols_16_C;
+ void (*ScaleCols)(uint16 * dst_ptr, const uint16* src_ptr, int dst_width,
+ int x, int dx) = ScaleCols_16_C;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
int y = 0;
int dx = 0;
int dy = 0;
- ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
- &x, &y, &dx, &dy);
+ ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y,
+ &dx, &dy);
src_width = Abs(src_width);
if (src_width * 2 == dst_width && x < 0x8000) {
@@ -1366,8 +1542,7 @@ static void ScalePlaneSimple_16(int src_width, int src_height,
}
for (i = 0; i < dst_height; ++i) {
- ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride,
- dst_width, x, dx);
+ ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
dst_ptr += dst_stride;
y += dy;
}
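
For reference, the column scalers above walk the source in 16.16 fixed point: the upper 16 bits of x are the source pixel index, the lower 16 bits the fraction, and dx comes from a (num << 16) / div division. A minimal sketch of that stepping, assuming the libyuv FixedDiv idiom (names here are illustrative, not library entry points):

#include <stdint.h>

/* Illustrative nearest-pixel column scaler in the spirit of ScaleCols_C:
   x carries the 16.16 source position, dx the per-output-pixel step. */
static void ScaleColsSketch(uint8_t* dst, const uint8_t* src,
                            int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  /* integer part selects the source pixel. */
    x += dx;                /* fraction accumulates until it carries. */
  }
}

/* Assumed FixedDiv idiom: 640 -> 320 yields dx = 0x20000, i.e. a step
   of exactly 2.0 source pixels per output pixel. */
static int FixedDivSketch(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}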
@@ -1377,14 +1552,18 @@ static void ScalePlaneSimple_16(int src_width, int src_height,
// This function dispatches to a specialized scaler based on scale factor.
LIBYUV_API
-void ScalePlane(const uint8* src, int src_stride,
- int src_width, int src_height,
- uint8* dst, int dst_stride,
- int dst_width, int dst_height,
+void ScalePlane(const uint8* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint8* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
enum FilterMode filtering) {
// Simplify filtering when possible.
- filtering = ScaleFilterReduce(src_width, src_height,
- dst_width, dst_height, filtering);
+ filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+ filtering);
// Negative height means invert the image.
if (src_height < 0) {
@@ -1403,46 +1582,42 @@ void ScalePlane(const uint8* src, int src_stride,
if (dst_width == src_width && filtering != kFilterBox) {
int dy = FixedDiv(src_height, dst_height);
// Arbitrary scale vertically, but unscaled horizontally.
- ScalePlaneVertical(src_height,
- dst_width, dst_height,
- src_stride, dst_stride, src, dst,
- 0, 0, dy, 1, filtering);
+ ScalePlaneVertical(src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst, 0, 0, dy, 1, filtering);
return;
}
if (dst_width <= Abs(src_width) && dst_height <= src_height) {
// Scale down.
- if (4 * dst_width == 3 * src_width &&
- 4 * dst_height == 3 * src_height) {
+ if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
// optimized, 3/4
- ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
+ ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst, filtering);
return;
}
if (2 * dst_width == src_width && 2 * dst_height == src_height) {
// optimized, 1/2
- ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
+ ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst, filtering);
return;
}
// 3/8 rounded up for odd sized chroma height.
- if (8 * dst_width == 3 * src_width &&
- dst_height == ((src_height * 3 + 7) / 8)) {
+ if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
// optimized, 3/8
- ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
+ ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst, filtering);
return;
}
if (4 * dst_width == src_width && 4 * dst_height == src_height &&
(filtering == kFilterBox || filtering == kFilterNone)) {
// optimized, 1/4
- ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
+ ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst, filtering);
return;
}
}
if (filtering == kFilterBox && dst_height * 2 < src_height) {
- ScalePlaneBox(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst);
+ ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst);
return;
}
if (filtering && dst_height > src_height) {
@@ -1455,19 +1630,23 @@ void ScalePlane(const uint8* src, int src_stride,
src_stride, dst_stride, src, dst, filtering);
return;
}
- ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst);
+ ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst);
}
LIBYUV_API
-void ScalePlane_16(const uint16* src, int src_stride,
- int src_width, int src_height,
- uint16* dst, int dst_stride,
- int dst_width, int dst_height,
- enum FilterMode filtering) {
+void ScalePlane_16(const uint16* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint16* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
// Simplify filtering when possible.
- filtering = ScaleFilterReduce(src_width, src_height,
- dst_width, dst_height, filtering);
+ filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+ filtering);
// Negative height means invert the image.
if (src_height < 0) {
@@ -1486,16 +1665,13 @@ void ScalePlane_16(const uint16* src, int src_stride,
if (dst_width == src_width) {
int dy = FixedDiv(src_height, dst_height);
// Arbitrary scale vertically, but unscaled horizontally.
- ScalePlaneVertical_16(src_height,
- dst_width, dst_height,
- src_stride, dst_stride, src, dst,
- 0, 0, dy, 1, filtering);
+ ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst, 0, 0, dy, 1, filtering);
return;
}
if (dst_width <= Abs(src_width) && dst_height <= src_height) {
// Scale down.
- if (4 * dst_width == 3 * src_width &&
- 4 * dst_height == 3 * src_height) {
+ if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
// optimized, 3/4
ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
@@ -1508,15 +1684,14 @@ void ScalePlane_16(const uint16* src, int src_stride,
return;
}
// 3/8 rounded up for odd sized chroma height.
- if (8 * dst_width == 3 * src_width &&
- dst_height == ((src_height * 3 + 7) / 8)) {
+ if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
// optimized, 3/8
ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
return;
}
if (4 * dst_width == src_width && 4 * dst_height == src_height &&
- filtering != kFilterBilinear) {
+ filtering != kFilterBilinear) {
// optimized, 1/4
ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
@@ -1524,8 +1699,8 @@ void ScalePlane_16(const uint16* src, int src_stride,
}
}
if (filtering == kFilterBox && dst_height * 2 < src_height) {
- ScalePlaneBox_16(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst);
+ ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst);
return;
}
if (filtering && dst_height > src_height) {
@@ -1538,101 +1713,121 @@ void ScalePlane_16(const uint16* src, int src_stride,
src_stride, dst_stride, src, dst, filtering);
return;
}
- ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst);
+ ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst);
}
// Scale an I420 image.
// This function in turn calls a scaling function for each plane.
LIBYUV_API
-int I420Scale(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- int src_width, int src_height,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int dst_width, int dst_height,
+int I420Scale(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8* dst_y,
+ int dst_stride_y,
+ uint8* dst_u,
+ int dst_stride_u,
+ uint8* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
enum FilterMode filtering) {
int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
int src_halfheight = SUBSAMPLE(src_height, 1, 1);
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
- src_width > 32768 || src_height > 32768 ||
- !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
return -1;
}
- ScalePlane(src_y, src_stride_y, src_width, src_height,
- dst_y, dst_stride_y, dst_width, dst_height,
- filtering);
- ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
- dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
- filtering);
- ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
- dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
- filtering);
+ ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+ dst_width, dst_height, filtering);
+ ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
+ dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
+ ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
+ dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
return 0;
}
LIBYUV_API
-int I420Scale_16(const uint16* src_y, int src_stride_y,
- const uint16* src_u, int src_stride_u,
- const uint16* src_v, int src_stride_v,
- int src_width, int src_height,
- uint16* dst_y, int dst_stride_y,
- uint16* dst_u, int dst_stride_u,
- uint16* dst_v, int dst_stride_v,
- int dst_width, int dst_height,
+int I420Scale_16(const uint16* src_y,
+ int src_stride_y,
+ const uint16* src_u,
+ int src_stride_u,
+ const uint16* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16* dst_y,
+ int dst_stride_y,
+ uint16* dst_u,
+ int dst_stride_u,
+ uint16* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
enum FilterMode filtering) {
int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
int src_halfheight = SUBSAMPLE(src_height, 1, 1);
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
- src_width > 32768 || src_height > 32768 ||
- !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
return -1;
}
- ScalePlane_16(src_y, src_stride_y, src_width, src_height,
- dst_y, dst_stride_y, dst_width, dst_height,
- filtering);
- ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight,
- dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
- filtering);
- ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight,
- dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
- filtering);
+ ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+ dst_width, dst_height, filtering);
+ ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
+ dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
+ ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
+ dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
return 0;
}
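
The half-size chroma dimensions passed to the U and V plane calls above come from SUBSAMPLE(v, 1, 1), a rounded-up halving. A worked sketch of that arithmetic, assuming libyuv's (v + a) >> s convention for the macro body:

/* Rounded-up halving for I420 chroma dimensions (assumed macro expansion);
   the negative branch mirrors inverted-image heights. */
static int SubsampleSketch(int v) {
  return (v < 0) ? -((-v + 1) >> 1) : ((v + 1) >> 1);
}
/* SubsampleSketch(1280) == 640 and SubsampleSketch(1279) == 640, so an odd
   luma width still maps to a full chroma column. */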
// Deprecated api
LIBYUV_API
-int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
- int src_stride_y, int src_stride_u, int src_stride_v,
- int src_width, int src_height,
- uint8* dst_y, uint8* dst_u, uint8* dst_v,
- int dst_stride_y, int dst_stride_u, int dst_stride_v,
- int dst_width, int dst_height,
+int Scale(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ int src_stride_y,
+ int src_stride_u,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8* dst_y,
+ uint8* dst_u,
+ uint8* dst_v,
+ int dst_stride_y,
+ int dst_stride_u,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
LIBYUV_BOOL interpolate) {
- return I420Scale(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- src_width, src_height,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- dst_width, dst_height,
- interpolate ? kFilterBox : kFilterNone);
+ return I420Scale(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, src_width, src_height, dst_y, dst_stride_y,
+ dst_u, dst_stride_u, dst_v, dst_stride_v, dst_width,
+ dst_height, interpolate ? kFilterBox : kFilterNone);
}
// Deprecated api
LIBYUV_API
-int ScaleOffset(const uint8* src, int src_width, int src_height,
- uint8* dst, int dst_width, int dst_height, int dst_yoffset,
+int ScaleOffset(const uint8* src,
+ int src_width,
+ int src_height,
+ uint8* dst,
+ int dst_width,
+ int dst_height,
+ int dst_yoffset,
LIBYUV_BOOL interpolate) {
// Chroma requires the offset to be a multiple of 2.
int dst_yoffset_even = dst_yoffset & ~1;
@@ -1643,26 +1838,21 @@ int ScaleOffset(const uint8* src, int src_width, int src_height,
int aheight = dst_height - dst_yoffset_even * 2; // actual output height
const uint8* src_y = src;
const uint8* src_u = src + src_width * src_height;
- const uint8* src_v = src + src_width * src_height +
- src_halfwidth * src_halfheight;
+ const uint8* src_v =
+ src + src_width * src_height + src_halfwidth * src_halfheight;
uint8* dst_y = dst + dst_yoffset_even * dst_width;
- uint8* dst_u = dst + dst_width * dst_height +
- (dst_yoffset_even >> 1) * dst_halfwidth;
+ uint8* dst_u =
+ dst + dst_width * dst_height + (dst_yoffset_even >> 1) * dst_halfwidth;
uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
(dst_yoffset_even >> 1) * dst_halfwidth;
- if (!src || src_width <= 0 || src_height <= 0 ||
- !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 ||
+ if (!src || src_width <= 0 || src_height <= 0 || !dst || dst_width <= 0 ||
+ dst_height <= 0 || dst_yoffset_even < 0 ||
dst_yoffset_even >= dst_height) {
return -1;
}
- return I420Scale(src_y, src_width,
- src_u, src_halfwidth,
- src_v, src_halfwidth,
- src_width, src_height,
- dst_y, dst_width,
- dst_u, dst_halfwidth,
- dst_v, dst_halfwidth,
- dst_width, aheight,
+ return I420Scale(src_y, src_width, src_u, src_halfwidth, src_v, src_halfwidth,
+ src_width, src_height, dst_y, dst_width, dst_u,
+ dst_halfwidth, dst_v, dst_halfwidth, dst_width, aheight,
interpolate ? kFilterBox : kFilterNone);
}
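
The MSA and DSPR2 additions in this CL all follow the dispatch pattern scale.cc already uses for SSE2/AVX2/NEON: start from the portable C row function, switch to the Any_ wrapper when the CPU flag is present, and promote to the full-vector kernel only when the width meets the lane alignment. A self-contained sketch of that selection, with stubs standing in for the real kernels and CPU probe (all names here are placeholders):

#include <stdint.h>

typedef void (*RowFn)(const uint8_t* src, uint16_t* dst, int width);

/* Stubs standing in for e.g. ScaleAddRow_C / _Any_MSA / _MSA and
   TestCpuFlag(kCpuHasMSA). */
static void RowFn_C(const uint8_t* s, uint16_t* d, int w) { (void)s; (void)d; (void)w; }
static void RowFn_Any(const uint8_t* s, uint16_t* d, int w) { (void)s; (void)d; (void)w; }
static void RowFn_Vec(const uint8_t* s, uint16_t* d, int w) { (void)s; (void)d; (void)w; }
static int HasFeature(void) { return 0; }

static RowFn ChooseRowFn(int src_width) {
  RowFn fn = RowFn_C;            /* portable fallback, always correct. */
  if (HasFeature()) {
    fn = RowFn_Any;              /* any width: vector body plus C remainder. */
    if ((src_width & 15) == 0) {
      fn = RowFn_Vec;            /* width is a whole number of vectors. */
    }
  }
  return fn;
}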
diff --git a/files/source/scale_any.cc b/files/source/scale_any.cc
index ed76a9e4..d64ba7a9 100644
--- a/files/source/scale_any.cc
+++ b/files/source/scale_any.cc
@@ -19,16 +19,15 @@ extern "C" {
#endif
// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
-#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
- void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
- int dst_width, int x, int dx) { \
- int n = dst_width & ~MASK; \
- if (n > 0) { \
- TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
- } \
- TERP_C(dst_ptr + n * BPP, src_ptr, \
- dst_width & MASK, x + n * dx, dx); \
- }
+#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
+ void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, \
+ int dx) { \
+ int n = dst_width & ~MASK; \
+ if (n > 0) { \
+ TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
+ } \
+ TERP_C(dst_ptr + n * BPP, src_ptr, dst_width & MASK, x + n * dx, dx); \
+ }
#ifdef HAS_SCALEFILTERCOLS_NEON
CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
@@ -37,167 +36,378 @@ CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
#endif
#ifdef HAS_SCALEARGBFILTERCOLS_NEON
-CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
- ScaleARGBFilterCols_C, 4, 3)
+CANY(ScaleARGBFilterCols_Any_NEON,
+ ScaleARGBFilterCols_NEON,
+ ScaleARGBFilterCols_C,
+ 4,
+ 3)
#endif
#undef CANY
// Fixed scale down.
-#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
- void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \
- uint8* dst_ptr, int dst_width) { \
- int r = (int)((unsigned int)dst_width % (MASK + 1)); \
- int n = dst_width - r; \
- if (n > 0) { \
- SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
- } \
- SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
- dst_ptr + n * BPP, r); \
- }
+#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, \
+ int dst_width) { \
+ int r = (int)((unsigned int)dst_width % (MASK + 1)); \
+ int n = dst_width - r; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
+ dst_ptr + n * BPP, r); \
+ }
// Fixed scale down for odd source width. Used by I420Blend subsampling.
// Since dst_width is (width + 1) / 2, this function scales one less pixel
// and copies the last pixel.
-#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
- void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \
- uint8* dst_ptr, int dst_width) { \
- int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); \
- int n = dst_width - r; \
- if (n > 0) { \
- SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
- } \
- SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
- dst_ptr + n * BPP, r); \
- }
+#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, \
+ int dst_width) { \
+ int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); \
+ int n = dst_width - r; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
+ dst_ptr + n * BPP, r); \
+ }
#ifdef HAS_SCALEROWDOWN2_SSSE3
SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
-SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3,
- ScaleRowDown2Linear_C, 2, 1, 15)
-SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C,
- 2, 1, 15)
-SDODD(ScaleRowDown2Box_Odd_SSSE3, ScaleRowDown2Box_SSSE3,
- ScaleRowDown2Box_Odd_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_SSSE3,
+ ScaleRowDown2Linear_SSSE3,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 15)
+SDANY(ScaleRowDown2Box_Any_SSSE3,
+ ScaleRowDown2Box_SSSE3,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 15)
+SDODD(ScaleRowDown2Box_Odd_SSSE3,
+ ScaleRowDown2Box_SSSE3,
+ ScaleRowDown2Box_Odd_C,
+ 2,
+ 1,
+ 15)
#endif
#ifdef HAS_SCALEROWDOWN2_AVX2
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
-SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
- ScaleRowDown2Linear_C, 2, 1, 31)
-SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
- 2, 1, 31)
-SDODD(ScaleRowDown2Box_Odd_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_Odd_C,
- 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_AVX2,
+ ScaleRowDown2Linear_AVX2,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 31)
+SDANY(ScaleRowDown2Box_Any_AVX2,
+ ScaleRowDown2Box_AVX2,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 31)
+SDODD(ScaleRowDown2Box_Odd_AVX2,
+ ScaleRowDown2Box_AVX2,
+ ScaleRowDown2Box_Odd_C,
+ 2,
+ 1,
+ 31)
#endif
#ifdef HAS_SCALEROWDOWN2_NEON
SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
-SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
- ScaleRowDown2Linear_C, 2, 1, 15)
-SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
- ScaleRowDown2Box_C, 2, 1, 15)
-SDODD(ScaleRowDown2Box_Odd_NEON, ScaleRowDown2Box_NEON,
- ScaleRowDown2Box_Odd_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_NEON,
+ ScaleRowDown2Linear_NEON,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 15)
+SDANY(ScaleRowDown2Box_Any_NEON,
+ ScaleRowDown2Box_NEON,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 15)
+SDODD(ScaleRowDown2Box_Odd_NEON,
+ ScaleRowDown2Box_NEON,
+ ScaleRowDown2Box_Odd_C,
+ 2,
+ 1,
+ 15)
+#endif
+#ifdef HAS_SCALEROWDOWN2_MSA
+SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_MSA,
+ ScaleRowDown2Linear_MSA,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 31)
+SDANY(ScaleRowDown2Box_Any_MSA,
+ ScaleRowDown2Box_MSA,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 31)
#endif
#ifdef HAS_SCALEROWDOWN4_SSSE3
SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
-SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C,
- 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_SSSE3,
+ ScaleRowDown4Box_SSSE3,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 7)
#endif
#ifdef HAS_SCALEROWDOWN4_AVX2
SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
-SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C,
- 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_AVX2,
+ ScaleRowDown4Box_AVX2,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 15)
#endif
#ifdef HAS_SCALEROWDOWN4_NEON
SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
-SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C,
- 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_NEON,
+ ScaleRowDown4Box_NEON,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 7)
+#endif
+#ifdef HAS_SCALEROWDOWN4_MSA
+SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_MSA,
+ ScaleRowDown4Box_MSA,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 15)
#endif
#ifdef HAS_SCALEROWDOWN34_SSSE3
-SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3,
- ScaleRowDown34_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3,
- ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3,
- ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_Any_SSSE3,
+ ScaleRowDown34_SSSE3,
+ ScaleRowDown34_C,
+ 4 / 3,
+ 1,
+ 23)
+SDANY(ScaleRowDown34_0_Box_Any_SSSE3,
+ ScaleRowDown34_0_Box_SSSE3,
+ ScaleRowDown34_0_Box_C,
+ 4 / 3,
+ 1,
+ 23)
+SDANY(ScaleRowDown34_1_Box_Any_SSSE3,
+ ScaleRowDown34_1_Box_SSSE3,
+ ScaleRowDown34_1_Box_C,
+ 4 / 3,
+ 1,
+ 23)
#endif
#ifdef HAS_SCALEROWDOWN34_NEON
-SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON,
- ScaleRowDown34_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON,
- ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON,
- ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_Any_NEON,
+ ScaleRowDown34_NEON,
+ ScaleRowDown34_C,
+ 4 / 3,
+ 1,
+ 23)
+SDANY(ScaleRowDown34_0_Box_Any_NEON,
+ ScaleRowDown34_0_Box_NEON,
+ ScaleRowDown34_0_Box_C,
+ 4 / 3,
+ 1,
+ 23)
+SDANY(ScaleRowDown34_1_Box_Any_NEON,
+ ScaleRowDown34_1_Box_NEON,
+ ScaleRowDown34_1_Box_C,
+ 4 / 3,
+ 1,
+ 23)
#endif
#ifdef HAS_SCALEROWDOWN38_SSSE3
-SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3,
- ScaleRowDown38_C, 8 / 3, 1, 11)
-SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3,
- ScaleRowDown38_3_Box_C, 8 / 3, 1, 5)
-SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3,
- ScaleRowDown38_2_Box_C, 8 / 3, 1, 5)
+SDANY(ScaleRowDown38_Any_SSSE3,
+ ScaleRowDown38_SSSE3,
+ ScaleRowDown38_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_3_Box_Any_SSSE3,
+ ScaleRowDown38_3_Box_SSSE3,
+ ScaleRowDown38_3_Box_C,
+ 8 / 3,
+ 1,
+ 5)
+SDANY(ScaleRowDown38_2_Box_Any_SSSE3,
+ ScaleRowDown38_2_Box_SSSE3,
+ ScaleRowDown38_2_Box_C,
+ 8 / 3,
+ 1,
+ 5)
#endif
#ifdef HAS_SCALEROWDOWN38_NEON
-SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON,
- ScaleRowDown38_C, 8 / 3, 1, 11)
-SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON,
- ScaleRowDown38_3_Box_C, 8 / 3, 1, 11)
-SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON,
- ScaleRowDown38_2_Box_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_Any_NEON,
+ ScaleRowDown38_NEON,
+ ScaleRowDown38_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_3_Box_Any_NEON,
+ ScaleRowDown38_3_Box_NEON,
+ ScaleRowDown38_3_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_2_Box_Any_NEON,
+ ScaleRowDown38_2_Box_NEON,
+ ScaleRowDown38_2_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+#endif
+#ifdef HAS_SCALEROWDOWN38_MSA
+SDANY(ScaleRowDown38_Any_MSA,
+ ScaleRowDown38_MSA,
+ ScaleRowDown38_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_3_Box_Any_MSA,
+ ScaleRowDown38_3_Box_MSA,
+ ScaleRowDown38_3_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_2_Box_Any_MSA,
+ ScaleRowDown38_2_Box_MSA,
+ ScaleRowDown38_2_Box_C,
+ 8 / 3,
+ 1,
+ 11)
#endif
#ifdef HAS_SCALEARGBROWDOWN2_SSE2
-SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2,
- ScaleARGBRowDown2_C, 2, 4, 3)
-SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2,
- ScaleARGBRowDown2Linear_C, 2, 4, 3)
-SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2,
- ScaleARGBRowDown2Box_C, 2, 4, 3)
+SDANY(ScaleARGBRowDown2_Any_SSE2,
+ ScaleARGBRowDown2_SSE2,
+ ScaleARGBRowDown2_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Linear_Any_SSE2,
+ ScaleARGBRowDown2Linear_SSE2,
+ ScaleARGBRowDown2Linear_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Box_Any_SSE2,
+ ScaleARGBRowDown2Box_SSE2,
+ ScaleARGBRowDown2Box_C,
+ 2,
+ 4,
+ 3)
#endif
#ifdef HAS_SCALEARGBROWDOWN2_NEON
-SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON,
- ScaleARGBRowDown2_C, 2, 4, 7)
-SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON,
- ScaleARGBRowDown2Linear_C, 2, 4, 7)
-SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON,
- ScaleARGBRowDown2Box_C, 2, 4, 7)
+SDANY(ScaleARGBRowDown2_Any_NEON,
+ ScaleARGBRowDown2_NEON,
+ ScaleARGBRowDown2_C,
+ 2,
+ 4,
+ 7)
+SDANY(ScaleARGBRowDown2Linear_Any_NEON,
+ ScaleARGBRowDown2Linear_NEON,
+ ScaleARGBRowDown2Linear_C,
+ 2,
+ 4,
+ 7)
+SDANY(ScaleARGBRowDown2Box_Any_NEON,
+ ScaleARGBRowDown2Box_NEON,
+ ScaleARGBRowDown2Box_C,
+ 2,
+ 4,
+ 7)
+#endif
+#ifdef HAS_SCALEARGBROWDOWN2_MSA
+SDANY(ScaleARGBRowDown2_Any_MSA,
+ ScaleARGBRowDown2_MSA,
+ ScaleARGBRowDown2_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Linear_Any_MSA,
+ ScaleARGBRowDown2Linear_MSA,
+ ScaleARGBRowDown2Linear_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Box_Any_MSA,
+ ScaleARGBRowDown2Box_MSA,
+ ScaleARGBRowDown2Box_C,
+ 2,
+ 4,
+ 3)
#endif
#undef SDANY
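
Expanded by hand for one instantiation, the SDANY wrapper above splits each row into a SIMD-sized body and a C remainder. A sketch with FACTOR=2, BPP=1, MASK=15 (the ScaleRowDown2 SSSE3 shape); the kernel below is a C stand-in for both halves:

#include <stddef.h>
#include <stdint.h>

/* Stand-in kernel; the real pair would be ScaleRowDown2_SSSE3 (body) and
   ScaleRowDown2_C (tail). Point sampling keeps the second of each pair. */
static void RowDown2Kernel(const uint8_t* src, ptrdiff_t src_stride,
                           uint8_t* dst, int width) {
  int i;
  (void)src_stride;  /* unused by the point-sampling variant. */
  for (i = 0; i < width; ++i) {
    dst[i] = src[i * 2 + 1];
  }
}

/* Hand expansion of SDANY(..., 2, 1, 15): the multiple-of-16 prefix goes to
   the SIMD kernel, the leftover r pixels to the C kernel, with the source
   advanced by n * FACTOR * BPP and the destination by n * BPP. */
static void ScaleRowDown2_Any_Sketch(const uint8_t* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8_t* dst_ptr, int dst_width) {
  int r = (int)((unsigned int)dst_width % 16);  /* MASK + 1 == 16. */
  int n = dst_width - r;
  if (n > 0) {
    RowDown2Kernel(src_ptr, src_stride, dst_ptr, n);  /* SIMD in real code. */
  }
  RowDown2Kernel(src_ptr + n * 2, src_stride, dst_ptr + n, r);  /* C tail. */
}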
// Scale down by even scale factor.
-#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \
- void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \
- uint8* dst_ptr, int dst_width) { \
- int r = (int)((unsigned int)dst_width % (MASK + 1)); \
- int n = dst_width - r; \
- if (n > 0) { \
- SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \
- } \
- SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, \
- src_stepx, dst_ptr + n * BPP, r); \
- }
+#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \
+ uint8* dst_ptr, int dst_width) { \
+ int r = (int)((unsigned int)dst_width % (MASK + 1)); \
+ int n = dst_width - r; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \
+ dst_ptr + n * BPP, r); \
+ }
#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
-SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2,
- ScaleARGBRowDownEven_C, 4, 3)
-SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2,
- ScaleARGBRowDownEvenBox_C, 4, 3)
+SDAANY(ScaleARGBRowDownEven_Any_SSE2,
+ ScaleARGBRowDownEven_SSE2,
+ ScaleARGBRowDownEven_C,
+ 4,
+ 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2,
+ ScaleARGBRowDownEvenBox_SSE2,
+ ScaleARGBRowDownEvenBox_C,
+ 4,
+ 3)
#endif
#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
-SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON,
- ScaleARGBRowDownEven_C, 4, 3)
-SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
- ScaleARGBRowDownEvenBox_C, 4, 3)
+SDAANY(ScaleARGBRowDownEven_Any_NEON,
+ ScaleARGBRowDownEven_NEON,
+ ScaleARGBRowDownEven_C,
+ 4,
+ 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_NEON,
+ ScaleARGBRowDownEvenBox_NEON,
+ ScaleARGBRowDownEvenBox_C,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_MSA
+SDAANY(ScaleARGBRowDownEven_Any_MSA,
+ ScaleARGBRowDownEven_MSA,
+ ScaleARGBRowDownEven_C,
+ 4,
+ 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_MSA,
+ ScaleARGBRowDownEvenBox_MSA,
+ ScaleARGBRowDownEvenBox_C,
+ 4,
+ 3)
#endif
// Add rows box filter scale down.
-#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
- void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \
- int n = src_width & ~MASK; \
- if (n > 0) { \
- SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
- } \
- SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
- }
+#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \
+ int n = src_width & ~MASK; \
+ if (n > 0) { \
+ SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
+ }
#ifdef HAS_SCALEADDROW_SSE2
SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
@@ -208,14 +418,15 @@ SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
#ifdef HAS_SCALEADDROW_NEON
SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
#endif
+#ifdef HAS_SCALEADDROW_MSA
+SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_DSPR2
+SAANY(ScaleAddRow_Any_DSPR2, ScaleAddRow_DSPR2, ScaleAddRow_C, 15)
+#endif
#undef SAANY
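
The SAANY wrappers feed ScalePlaneBox, which accumulates boxheight source rows into a uint16 buffer and then averages per column. A reduced sketch of that two-stage box filter, assuming the accumulate-then-divide structure of ScaleAddRow_C and ScaleAddCols (the real column pass also steps x by dx in 16.16 fixed point and scales by 65536/boxheight rather than dividing):

#include <stdint.h>

/* Stage 1: add one 8-bit source row into the 16-bit accumulator. Up to 257
   rows fit without overflow, since 255 * 257 == 65535. */
static void AddRowSketch(const uint8_t* src, uint16_t* acc, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    acc[i] = (uint16_t)(acc[i] + src[i]);
  }
}

/* Stage 2: average the accumulated columns, shown for the unscaled-width
   case only. */
static void AverageColsSketch(uint8_t* dst, const uint16_t* acc,
                              int width, int boxheight) {
  int i;
  for (i = 0; i < width; ++i) {
    dst[i] = (uint8_t)(acc[i] / boxheight);
  }
}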
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
-
-
-
-
-
diff --git a/files/source/scale_argb.cc b/files/source/scale_argb.cc
index 17f51ae9..1ea28f0d 100644
--- a/files/source/scale_argb.cc
+++ b/files/source/scale_argb.cc
@@ -30,20 +30,31 @@ static __inline int Abs(int v) {
// ScaleARGB ARGB, 1/2
// This is an optimized version for scaling down an ARGB image to 1/2 of
// its original size.
-static void ScaleARGBDown2(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_argb, uint8* dst_argb,
- int x, int dx, int y, int dy,
+static void ScaleARGBDown2(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_argb,
+ uint8* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy,
enum FilterMode filtering) {
int j;
int row_stride = src_stride * (dy >> 16);
void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) =
- filtering == kFilterNone ? ScaleARGBRowDown2_C :
- (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
- ScaleARGBRowDown2Box_C);
- assert(dx == 65536 * 2); // Test scale factor of 2.
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_C
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C
+ : ScaleARGBRowDown2Box_C);
+ (void)src_width;
+ (void)src_height;
+ (void)dx;
+ assert(dx == 65536 * 2); // Test scale factor of 2.
assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
// Advance to odd row, even column.
if (filtering == kFilterBilinear) {
@@ -54,25 +65,49 @@ static void ScaleARGBDown2(int src_width, int src_height,
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
- ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
- (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
- ScaleARGBRowDown2Box_Any_SSE2);
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_Any_SSE2
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2
+ : ScaleARGBRowDown2Box_Any_SSE2);
if (IS_ALIGNED(dst_width, 4)) {
- ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
- (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
- ScaleARGBRowDown2Box_SSE2);
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_SSE2
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2
+ : ScaleARGBRowDown2Box_SSE2);
}
}
#endif
#if defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
- (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
- ScaleARGBRowDown2Box_Any_NEON);
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_Any_NEON
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON
+ : ScaleARGBRowDown2Box_Any_NEON);
if (IS_ALIGNED(dst_width, 8)) {
- ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
- (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
- ScaleARGBRowDown2Box_NEON);
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_NEON
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON
+ : ScaleARGBRowDown2Box_NEON);
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_Any_MSA
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MSA
+ : ScaleARGBRowDown2Box_Any_MSA);
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_MSA
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MSA
+ : ScaleARGBRowDown2Box_MSA);
}
}
#endif
@@ -90,21 +125,32 @@ static void ScaleARGBDown2(int src_width, int src_height,
// ScaleARGB ARGB, 1/4
// This is an optimized version for scaling down an ARGB image to 1/4 of
// its original size.
-static void ScaleARGBDown4Box(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_argb, uint8* dst_argb,
- int x, int dx, int y, int dy) {
+static void ScaleARGBDown4Box(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_argb,
+ uint8* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy) {
int j;
// Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
int row_stride = src_stride * (dy >> 16);
void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
+ uint8* dst_argb, int dst_width) =
+ ScaleARGBRowDown2Box_C;
// Advance to odd row, even column.
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
- assert(dx == 65536 * 4); // Test scale factor of 4.
+ (void)src_width;
+ (void)src_height;
+ (void)dx;
+ assert(dx == 65536 * 4); // Test scale factor of 4.
assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
@@ -125,8 +171,8 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
for (j = 0; j < dst_height; ++j) {
ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
- ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
- row + kRowSize, dst_width * 2);
+ ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize,
+ dst_width * 2);
ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
src_argb += row_stride;
dst_argb += dst_stride;
@@ -137,11 +183,18 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
// ScaleARGB ARGB Even
// This is an optimized version for scaling down an ARGB image to an even
// multiple of its original size.
-static void ScaleARGBDownEven(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_argb, uint8* dst_argb,
- int x, int dx, int y, int dy,
+static void ScaleARGBDownEven(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_argb,
+ uint8* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy,
enum FilterMode filtering) {
int j;
int col_step = dx >> 16;
@@ -149,26 +202,38 @@ static void ScaleARGBDownEven(int src_width, int src_height,
void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
int src_step, uint8* dst_argb, int dst_width) =
filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
+ (void)src_width;
+ (void)src_height;
assert(IS_ALIGNED(src_width, 2));
assert(IS_ALIGNED(src_height, 2));
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
- ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
- ScaleARGBRowDownEven_Any_SSE2;
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2
+ : ScaleARGBRowDownEven_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
- ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
- ScaleARGBRowDownEven_SSE2;
+ ScaleARGBRowDownEven =
+ filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven_SSE2;
}
}
#endif
#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
- ScaleARGBRowDownEven_Any_NEON;
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON
+ : ScaleARGBRowDownEven_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
- ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
- ScaleARGBRowDownEven_NEON;
+ ScaleARGBRowDownEven =
+ filtering ? ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA
+ : ScaleARGBRowDownEven_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDownEven =
+ filtering ? ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA;
}
}
#endif
@@ -184,25 +249,32 @@ static void ScaleARGBDownEven(int src_width, int src_height,
}
// Scale ARGB down with bilinear interpolation.
-static void ScaleARGBBilinearDown(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_argb, uint8* dst_argb,
- int x, int dx, int y, int dy,
+static void ScaleARGBBilinearDown(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_argb,
+ uint8* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy,
enum FilterMode filtering) {
int j;
- void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
- ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
- InterpolateRow_C;
- void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) =
+ void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) =
(src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
int64 xlast = x + (int64)(dst_width - 1) * dx;
int64 xl = (dx >= 0) ? x : xlast;
int64 xr = (dx >= 0) ? xlast : x;
int clip_src_width;
- xl = (xl >> 16) & ~3; // Left edge aligned.
- xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels.
+ xl = (xl >> 16) & ~3; // Left edge aligned.
+ xr = (xr >> 16) + 1; // Rightmost pixel used. Bilinear uses 2 pixels.
xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel.
if (xr > src_width) {
xr = src_width;
@@ -235,14 +307,22 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) &&
- IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) &&
+ IS_ALIGNED(src_stride, 4)) {
InterpolateRow = InterpolateRow_Any_DSPR2;
if (IS_ALIGNED(clip_src_width, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
@@ -286,18 +366,25 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
}
// Scale ARGB up with bilinear interpolation.
-static void ScaleARGBBilinearUp(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_argb, uint8* dst_argb,
- int x, int dx, int y, int dy,
+static void ScaleARGBBilinearUp(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_argb,
+ uint8* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy,
enum FilterMode filtering) {
int j;
- void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
- ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
- InterpolateRow_C;
- void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) =
+ void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) =
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
const int max_y = (src_height - 1) << 16;
#if defined(HAS_INTERPOLATEROW_SSSE3)
@@ -325,14 +412,22 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) &&
- IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_argb, 4) &&
+ IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
}
#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
if (src_width >= 32768) {
- ScaleARGBFilterCols = filtering ?
- ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+ ScaleARGBFilterCols =
+ filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
}
#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -423,8 +518,10 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
#ifdef YUVSCALEUP
// Scale YUV to ARGB up with bilinear interpolation.
-static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
- int dst_width, int dst_height,
+static void ScaleYUVToARGBBilinearUp(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
int src_stride_y,
int src_stride_u,
int src_stride_v,
@@ -433,14 +530,15 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- int x, int dx, int y, int dy,
+ int x,
+ int dx,
+ int y,
+ int dy,
enum FilterMode filtering) {
int j;
- void (*I422ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToARGBRow_C;
+ void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
+ const uint8* v_buf, uint8* rgb_buf, int width) =
+ I422ToARGBRow_C;
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
@@ -474,10 +572,18 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
I422ToARGBRow = I422ToARGBRow_DSPR2;
}
#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(src_width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_MSA;
+ }
+ }
+#endif
- void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
- ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
- InterpolateRow_C;
+ void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -503,18 +609,26 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) &&
- IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_argb, 4) &&
+ IS_ALIGNED(dst_stride_argb, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
}
#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
- void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) =
+ void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) =
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
if (src_width >= 32768) {
- ScaleARGBFilterCols = filtering ?
- ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+ ScaleARGBFilterCols =
+ filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
}
#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -635,15 +749,23 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
// of x and dx is the integer part of the source position and
// the lower 16 bits are the fixed decimal part.
-static void ScaleARGBSimple(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_argb, uint8* dst_argb,
- int x, int dx, int y, int dy) {
+static void ScaleARGBSimple(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_argb,
+ uint8* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy) {
int j;
- void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) =
+ void (*ScaleARGBCols)(uint8 * dst_argb, const uint8* src_argb, int dst_width,
+ int x, int dx) =
(src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
+ (void)src_height;
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBCols = ScaleARGBCols_SSE2;
@@ -667,8 +789,8 @@ static void ScaleARGBSimple(int src_width, int src_height,
}
for (j = 0; j < dst_height; ++j) {
- ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,
- dst_width, x, dx);
+ ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x,
+ dx);
dst_argb += dst_stride;
y += dy;
}
@@ -677,11 +799,18 @@ static void ScaleARGBSimple(int src_width, int src_height,
// Scale an ARGB image.
// This function in turn calls a scaling function
// suitable for handling the desired resolutions.
-static void ScaleARGB(const uint8* src, int src_stride,
- int src_width, int src_height,
- uint8* dst, int dst_stride,
- int dst_width, int dst_height,
- int clip_x, int clip_y, int clip_width, int clip_height,
+static void ScaleARGB(const uint8* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint8* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ int clip_x,
+ int clip_y,
+ int clip_width,
+ int clip_height,
enum FilterMode filtering) {
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@@ -690,8 +819,7 @@ static void ScaleARGB(const uint8* src, int src_stride,
int dy = 0;
// ARGB does not support box filter yet, but allow the user to pass it.
// Simplify filtering when possible.
- filtering = ScaleFilterReduce(src_width, src_height,
- dst_width, dst_height,
+ filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
filtering);
// Negative src_height means invert the image.
@@ -700,17 +828,17 @@ static void ScaleARGB(const uint8* src, int src_stride,
src = src + (src_height - 1) * src_stride;
src_stride = -src_stride;
}
- ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
- &x, &y, &dx, &dy);
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+ &dx, &dy);
src_width = Abs(src_width);
if (clip_x) {
- int64 clipf = (int64)(clip_x) * dx;
+ int64 clipf = (int64)(clip_x)*dx;
x += (clipf & 0xffff);
src += (clipf >> 16) * 4;
dst += clip_x * 4;
}
if (clip_y) {
- int64 clipf = (int64)(clip_y) * dy;
+ int64 clipf = (int64)(clip_y)*dy;
y += (clipf & 0xffff);
src += (clipf >> 16) * src_stride;
dst += clip_y * dst_stride;
@@ -725,24 +853,20 @@ static void ScaleARGB(const uint8* src, int src_stride,
if (!(dx & 0x10000) && !(dy & 0x10000)) {
if (dx == 0x20000) {
// Optimized 1/2 downsample.
- ScaleARGBDown2(src_width, src_height,
- clip_width, clip_height,
- src_stride, dst_stride, src, dst,
- x, dx, y, dy, filtering);
+ ScaleARGBDown2(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
return;
}
if (dx == 0x40000 && filtering == kFilterBox) {
// Optimized 1/4 box downsample.
- ScaleARGBDown4Box(src_width, src_height,
- clip_width, clip_height,
- src_stride, dst_stride, src, dst,
- x, dx, y, dy);
+ ScaleARGBDown4Box(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy);
return;
}
- ScaleARGBDownEven(src_width, src_height,
- clip_width, clip_height,
- src_stride, dst_stride, src, dst,
- x, dx, y, dy, filtering);
+ ScaleARGBDownEven(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
return;
}
   // Optimized odd scale down, e.g. 3, 5, 7, 9x.
@@ -759,96 +883,105 @@ static void ScaleARGB(const uint8* src, int src_stride,
}
if (dx == 0x10000 && (x & 0xffff) == 0) {
   // Arbitrary scale vertically, but unscaled horizontally.
- ScalePlaneVertical(src_height,
- clip_width, clip_height,
- src_stride, dst_stride, src, dst,
- x, y, dy, 4, filtering);
+ ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
+ dst_stride, src, dst, x, y, dy, 4, filtering);
return;
}
if (filtering && dy < 65536) {
- ScaleARGBBilinearUp(src_width, src_height,
- clip_width, clip_height,
- src_stride, dst_stride, src, dst,
- x, dx, y, dy, filtering);
+ ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
return;
}
if (filtering) {
- ScaleARGBBilinearDown(src_width, src_height,
- clip_width, clip_height,
- src_stride, dst_stride, src, dst,
- x, dx, y, dy, filtering);
+ ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
return;
}
- ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
- src_stride, dst_stride, src, dst,
- x, dx, y, dy);
+ ScaleARGBSimple(src_width, src_height, clip_width, clip_height, src_stride,
+ dst_stride, src, dst, x, dx, y, dy);
}
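
For orientation (this note is not part of the patch): because dx and dy are 16.16 steps, dx == 0x20000 means two source pixels per destination pixel, i.e. an exact 1/2 downsample, and 0x40000 an exact 1/4, which is why those constants select the specialized paths above. A sketch of how such steps arise, modeled on FixedDiv_C:

    /* Sketch, same shape as FixedDiv_C: 16.16 ratio of two extents. */
    static int FixedDivDemo(int num, int div) {
      return (int)(((long long)num << 16) / div);
    }
    /* FixedDivDemo(640, 640) == 0x10000  -> unscaled
       FixedDivDemo(640, 320) == 0x20000  -> ScaleARGBDown2 path
       FixedDivDemo(640, 160) == 0x40000  -> ScaleARGBDown4Box (kFilterBox) */
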
LIBYUV_API
-int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
- int src_width, int src_height,
- uint8* dst_argb, int dst_stride_argb,
- int dst_width, int dst_height,
- int clip_x, int clip_y, int clip_width, int clip_height,
+int ARGBScaleClip(const uint8* src_argb,
+ int src_stride_argb,
+ int src_width,
+ int src_height,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int dst_width,
+ int dst_height,
+ int clip_x,
+ int clip_y,
+ int clip_width,
+ int clip_height,
enum FilterMode filtering) {
- if (!src_argb || src_width == 0 || src_height == 0 ||
- !dst_argb || dst_width <= 0 || dst_height <= 0 ||
- clip_x < 0 || clip_y < 0 ||
+ if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb ||
+ dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 ||
clip_width > 32768 || clip_height > 32768 ||
(clip_x + clip_width) > dst_width ||
(clip_y + clip_height) > dst_height) {
return -1;
}
- ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
- dst_argb, dst_stride_argb, dst_width, dst_height,
- clip_x, clip_y, clip_width, clip_height, filtering);
+ ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
+ dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width,
+ clip_height, filtering);
return 0;
}
// Scale an ARGB image.
LIBYUV_API
-int ARGBScale(const uint8* src_argb, int src_stride_argb,
- int src_width, int src_height,
- uint8* dst_argb, int dst_stride_argb,
- int dst_width, int dst_height,
+int ARGBScale(const uint8* src_argb,
+ int src_stride_argb,
+ int src_width,
+ int src_height,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int dst_width,
+ int dst_height,
enum FilterMode filtering) {
- if (!src_argb || src_width == 0 || src_height == 0 ||
- src_width > 32768 || src_height > 32768 ||
- !dst_argb || dst_width <= 0 || dst_height <= 0) {
+ if (!src_argb || src_width == 0 || src_height == 0 || src_width > 32768 ||
+ src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) {
return -1;
}
- ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
- dst_argb, dst_stride_argb, dst_width, dst_height,
- 0, 0, dst_width, dst_height, filtering);
+ ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
+ dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, dst_height,
+ filtering);
return 0;
}
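
A minimal usage sketch for ARGBScale as declared above (the dimensions and the packed stride-equals-width*4 layout are hypothetical):

    #include "libyuv/scale_argb.h"

    /* Halve a packed 640x480 ARGB image to 320x240. */
    int HalveArgbDemo(const uint8* src, uint8* dst) {
      return ARGBScale(src, 640 * 4, 640, 480,
                       dst, 320 * 4, 320, 240,
                       kFilterBilinear);  /* 0 on success, -1 on bad args */
    }
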
// Scale with YUV conversion to ARGB and clipping.
LIBYUV_API
-int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
+int YUVToARGBScaleClip(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
uint32 src_fourcc,
- int src_width, int src_height,
- uint8* dst_argb, int dst_stride_argb,
+ int src_width,
+ int src_height,
+ uint8* dst_argb,
+ int dst_stride_argb,
uint32 dst_fourcc,
- int dst_width, int dst_height,
- int clip_x, int clip_y, int clip_width, int clip_height,
+ int dst_width,
+ int dst_height,
+ int clip_x,
+ int clip_y,
+ int clip_width,
+ int clip_height,
enum FilterMode filtering) {
uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4);
int r;
- I420ToARGB(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- argb_buffer, src_width * 4,
- src_width, src_height);
-
- r = ARGBScaleClip(argb_buffer, src_width * 4,
- src_width, src_height,
- dst_argb, dst_stride_argb,
- dst_width, dst_height,
- clip_x, clip_y, clip_width, clip_height,
- filtering);
+ (void)src_fourcc; // TODO(fbarchard): implement and/or assert.
+ (void)dst_fourcc;
+ I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ argb_buffer, src_width * 4, src_width, src_height);
+
+ r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb,
+ dst_stride_argb, dst_width, dst_height, clip_x, clip_y,
+ clip_width, clip_height, filtering);
free(argb_buffer);
return r;
}
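
One robustness note on YUVToARGBScaleClip as committed: the intermediate allocation is used without a NULL check. A guarded variant might start like this (a sketch, not part of the patch):

    uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4);
    if (!argb_buffer) {
      return -1;  /* mirror the existing bad-argument error convention */
    }
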
diff --git a/files/source/scale_common.cc b/files/source/scale_common.cc
index 3507aa4d..1bef39df 100644
--- a/files/source/scale_common.cc
+++ b/files/source/scale_common.cc
@@ -28,9 +28,12 @@ static __inline int Abs(int v) {
}
// CPU agnostic row functions
-void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown2_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
int x;
+ (void)src_stride;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = src_ptr[1];
dst[1] = src_ptr[3];
@@ -42,9 +45,12 @@ void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
-void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst, int dst_width) {
+void ScaleRowDown2_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width) {
int x;
+ (void)src_stride;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = src_ptr[1];
dst[1] = src_ptr[3];
@@ -56,10 +62,13 @@ void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
}
-void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown2Linear_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
const uint8* s = src_ptr;
int x;
+ (void)src_stride;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (s[0] + s[1] + 1) >> 1;
dst[1] = (s[2] + s[3] + 1) >> 1;
@@ -71,10 +80,13 @@ void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
-void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst, int dst_width) {
+void ScaleRowDown2Linear_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width) {
const uint16* s = src_ptr;
int x;
+ (void)src_stride;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (s[0] + s[1] + 1) >> 1;
dst[1] = (s[2] + s[3] + 1) >> 1;
@@ -86,8 +98,10 @@ void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
}
-void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown2Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
int x;
@@ -103,8 +117,10 @@ void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
-void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown2Box_Odd_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
int x;
@@ -125,8 +141,10 @@ void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
dst[0] = (s[0] + t[0] + 1) >> 1;
}
-void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst, int dst_width) {
+void ScaleRowDown2Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width) {
const uint16* s = src_ptr;
const uint16* t = src_ptr + src_stride;
int x;
@@ -142,9 +160,12 @@ void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
}
-void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown4_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
int x;
+ (void)src_stride;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = src_ptr[2];
dst[1] = src_ptr[6];
@@ -156,9 +177,12 @@ void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
-void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst, int dst_width) {
+void ScaleRowDown4_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width) {
int x;
+ (void)src_stride;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = src_ptr[2];
dst[1] = src_ptr[6];
@@ -170,81 +194,88 @@ void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
}
-void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown4Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
intptr_t stride = src_stride;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2] + src_ptr[stride + 3] +
- src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
- src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
- src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
- src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
- 8) >> 4;
+ src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+ src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+ src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+ src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+ src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+ src_ptr[stride * 3 + 3] + 8) >>
+ 4;
dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
- src_ptr[stride + 4] + src_ptr[stride + 5] +
- src_ptr[stride + 6] + src_ptr[stride + 7] +
- src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
- src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
- src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
- src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
- 8) >> 4;
+ src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] +
+ src_ptr[stride + 7] + src_ptr[stride * 2 + 4] +
+ src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] +
+ src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] +
+ src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] +
+ src_ptr[stride * 3 + 7] + 8) >>
+ 4;
dst += 2;
src_ptr += 8;
}
if (dst_width & 1) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2] + src_ptr[stride + 3] +
- src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
- src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
- src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
- src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
- 8) >> 4;
+ src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+ src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+ src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+ src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+ src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+ src_ptr[stride * 3 + 3] + 8) >>
+ 4;
}
}
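
The "+ 8) >> 4" pattern in the 4x4 box filters above is round-to-nearest for a 16-sample average, with ties rounding up:

    static unsigned BoxAvg16(unsigned sum) {
      return (sum + 8) >> 4;  /* nearest integer to sum / 16 */
    }
    /* BoxAvg16(40) == 3 (2.5 rounds up); BoxAvg16(39) == 2 (2.4375 rounds down). */
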
-void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst, int dst_width) {
+void ScaleRowDown4Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width) {
intptr_t stride = src_stride;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2] + src_ptr[stride + 3] +
- src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
- src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
- src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
- src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
- 8) >> 4;
+ src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+ src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+ src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+ src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+ src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+ src_ptr[stride * 3 + 3] + 8) >>
+ 4;
dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
- src_ptr[stride + 4] + src_ptr[stride + 5] +
- src_ptr[stride + 6] + src_ptr[stride + 7] +
- src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
- src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
- src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
- src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
- 8) >> 4;
+ src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] +
+ src_ptr[stride + 7] + src_ptr[stride * 2 + 4] +
+ src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] +
+ src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] +
+ src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] +
+ src_ptr[stride * 3 + 7] + 8) >>
+ 4;
dst += 2;
src_ptr += 8;
}
if (dst_width & 1) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2] + src_ptr[stride + 3] +
- src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
- src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
- src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
- src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
- 8) >> 4;
+ src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+ src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+ src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+ src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+ src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+ src_ptr[stride * 3 + 3] + 8) >>
+ 4;
}
}
-void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown34_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
int x;
+ (void)src_stride;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 3) {
dst[0] = src_ptr[0];
@@ -255,9 +286,12 @@ void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
-void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst, int dst_width) {
+void ScaleRowDown34_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width) {
int x;
+ (void)src_stride;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 3) {
dst[0] = src_ptr[0];
@@ -269,8 +303,10 @@ void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
// Filter rows 0 and 1 together, 3 : 1
-void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* d, int dst_width) {
+void ScaleRowDown34_0_Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* d,
+ int dst_width) {
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
int x;
@@ -291,8 +327,10 @@ void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
-void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* d, int dst_width) {
+void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* d,
+ int dst_width) {
const uint16* s = src_ptr;
const uint16* t = src_ptr + src_stride;
int x;
@@ -314,8 +352,10 @@ void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
// Filter rows 1 and 2 together, 1 : 1
-void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* d, int dst_width) {
+void ScaleRowDown34_1_Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* d,
+ int dst_width) {
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
int x;
@@ -336,8 +376,10 @@ void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
-void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* d, int dst_width) {
+void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* d,
+ int dst_width) {
const uint16* s = src_ptr;
const uint16* t = src_ptr + src_stride;
int x;
@@ -359,8 +401,11 @@ void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
// Scales a single row of pixels using point sampling.
-void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) {
+void ScaleCols_C(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
int j;
for (j = 0; j < dst_width - 1; j += 2) {
dst_ptr[0] = src_ptr[x >> 16];
@@ -374,8 +419,11 @@ void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
}
}
-void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
- int dst_width, int x, int dx) {
+void ScaleCols_16_C(uint16* dst_ptr,
+ const uint16* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
int j;
for (j = 0; j < dst_width - 1; j += 2) {
dst_ptr[0] = src_ptr[x >> 16];
@@ -390,9 +438,14 @@ void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
}
// Scales a single row of pixels up by 2x using point sampling.
-void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) {
+void ScaleColsUp2_C(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
int j;
+ (void)x;
+ (void)dx;
for (j = 0; j < dst_width - 1; j += 2) {
dst_ptr[1] = dst_ptr[0] = src_ptr[0];
src_ptr += 1;
@@ -403,9 +456,14 @@ void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
}
}
-void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
- int dst_width, int x, int dx) {
+void ScaleColsUp2_16_C(uint16* dst_ptr,
+ const uint16* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
int j;
+ (void)x;
+ (void)dx;
for (j = 0; j < dst_width - 1; j += 2) {
dst_ptr[1] = dst_ptr[0] = src_ptr[0];
src_ptr += 1;
@@ -418,16 +476,19 @@ void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
// (1-f)a + fb can be replaced with a + f(b-a)
#if defined(__arm__) || defined(__aarch64__)
-#define BLENDER(a, b, f) (uint8)((int)(a) + \
- ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+#define BLENDER(a, b, f) \
+ (uint8)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
#else
-// inteluses 7 bit math with rounding.
-#define BLENDER(a, b, f) (uint8)((int)(a) + \
- (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
+// Intel uses 7 bit math with rounding.
+#define BLENDER(a, b, f) \
+ (uint8)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
#endif
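
The identity behind the comment above, (1-f)a + fb = a + f(b-a), trades a second multiply for a subtract; the + 0x8000 rounds the 16-bit fraction before the shift. A small self-check, assuming the usual arithmetic right shift:

    /* f is a 16-bit fraction (0..65535 spanning 0.0..1.0). */
    static int BlendFormsAgree(int a, int b, int f) {
      int one_mul = a + (((f * (b - a)) + 0x8000) >> 16);
      int two_mul = (((65536 - f) * a + f * b) + 0x8000) >> 16;
      return one_mul == two_mul;  /* identical: 65536 * a has zero low bits */
    }
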
-void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) {
+void ScaleFilterCols_C(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
int j;
for (j = 0; j < dst_width - 1; j += 2) {
int xi = x >> 16;
@@ -450,8 +511,11 @@ void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
}
}
-void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x32, int dx) {
+void ScaleFilterCols64_C(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x32,
+ int dx) {
int64 x = (int64)(x32);
int j;
for (j = 0; j < dst_width - 1; j += 2) {
@@ -477,11 +541,14 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
#undef BLENDER
 // Same as the 8 bit ARM blender, but the result is cast to uint16
-#define BLENDER(a, b, f) (uint16)((int)(a) + \
- ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+#define BLENDER(a, b, f) \
+ (uint16)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
-void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
- int dst_width, int x, int dx) {
+void ScaleFilterCols_16_C(uint16* dst_ptr,
+ const uint16* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
int j;
for (j = 0; j < dst_width - 1; j += 2) {
int xi = x >> 16;
@@ -504,8 +571,11 @@ void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
}
}
-void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
- int dst_width, int x32, int dx) {
+void ScaleFilterCols64_16_C(uint16* dst_ptr,
+ const uint16* src_ptr,
+ int dst_width,
+ int x32,
+ int dx) {
int64 x = (int64)(x32);
int j;
for (j = 0; j < dst_width - 1; j += 2) {
@@ -530,9 +600,12 @@ void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
}
#undef BLENDER
-void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown38_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
int x;
+ (void)src_stride;
assert(dst_width % 3 == 0);
for (x = 0; x < dst_width; x += 3) {
dst[0] = src_ptr[0];
@@ -543,9 +616,12 @@ void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
-void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst, int dst_width) {
+void ScaleRowDown38_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst,
+ int dst_width) {
int x;
+ (void)src_stride;
assert(dst_width % 3 == 0);
for (x = 0; x < dst_width; x += 3) {
dst[0] = src_ptr[0];
@@ -559,25 +635,29 @@ void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
// 8x3 -> 3x1
void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+ uint8* dst_ptr,
+ int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
- dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
- src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
- (65536 / 9) >> 16;
- dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
- src_ptr[stride + 3] + src_ptr[stride + 4] +
- src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
- src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
- (65536 / 9) >> 16;
- dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
- src_ptr[stride + 6] + src_ptr[stride + 7] +
- src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
- (65536 / 6) >> 16;
+ dst_ptr[0] =
+ (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+ src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+ src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+ (65536 / 9) >>
+ 16;
+ dst_ptr[1] =
+ (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+ src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+ (65536 / 9) >>
+ 16;
+ dst_ptr[2] =
+ (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+ (65536 / 6) >>
+ 16;
src_ptr += 8;
dst_ptr += 3;
}
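
The "* (65536 / 9) >> 16" above is division by a fixed-point reciprocal: 65536 / 9 truncates to 7281, so the result approximates sum / 9 and can come out one low at the top of the range. A sketch of that bound:

    static unsigned Avg9(unsigned sum) {
      return (sum * (65536 / 9)) >> 16;  /* ~sum / 9, truncating */
    }
    /* Nine samples of 255 sum to 2295; the exact mean is 255,
       but Avg9(2295) == (2295 * 7281) >> 16 == 254. */
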
@@ -585,66 +665,80 @@ void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
- uint16* dst_ptr, int dst_width) {
+ uint16* dst_ptr,
+ int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
- dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
- src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
- (65536 / 9) >> 16;
- dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
- src_ptr[stride + 3] + src_ptr[stride + 4] +
- src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
- src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
- (65536 / 9) >> 16;
- dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
- src_ptr[stride + 6] + src_ptr[stride + 7] +
- src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
- (65536 / 6) >> 16;
+ dst_ptr[0] =
+ (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+ src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+ src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+ (65536 / 9) >>
+ 16;
+ dst_ptr[1] =
+ (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+ src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+ (65536 / 9) >>
+ 16;
+ dst_ptr[2] =
+ (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+ (65536 / 6) >>
+ 16;
src_ptr += 8;
dst_ptr += 3;
}
}
// 8x2 -> 3x1
-void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown38_2_Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
- dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2]) * (65536 / 6) >> 16;
- dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
- src_ptr[stride + 3] + src_ptr[stride + 4] +
- src_ptr[stride + 5]) * (65536 / 6) >> 16;
- dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
- src_ptr[stride + 6] + src_ptr[stride + 7]) *
- (65536 / 4) >> 16;
+ dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+ src_ptr[stride + 1] + src_ptr[stride + 2]) *
+ (65536 / 6) >>
+ 16;
+ dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+ src_ptr[stride + 4] + src_ptr[stride + 5]) *
+ (65536 / 6) >>
+ 16;
+ dst_ptr[2] =
+ (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
+ (65536 / 4) >>
+ 16;
src_ptr += 8;
dst_ptr += 3;
}
}
-void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int dst_width) {
+void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst_ptr,
+ int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
- dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2]) * (65536 / 6) >> 16;
- dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
- src_ptr[stride + 3] + src_ptr[stride + 4] +
- src_ptr[stride + 5]) * (65536 / 6) >> 16;
- dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
- src_ptr[stride + 6] + src_ptr[stride + 7]) *
- (65536 / 4) >> 16;
+ dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+ src_ptr[stride + 1] + src_ptr[stride + 2]) *
+ (65536 / 6) >>
+ 16;
+ dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+ src_ptr[stride + 4] + src_ptr[stride + 5]) *
+ (65536 / 6) >>
+ 16;
+ dst_ptr[2] =
+ (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
+ (65536 / 4) >>
+ 16;
src_ptr += 8;
dst_ptr += 3;
}
@@ -680,11 +774,12 @@ void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
void ScaleARGBRowDown2_C(const uint8* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
+ uint8* dst_argb,
+ int dst_width) {
const uint32* src = (const uint32*)(src_argb);
uint32* dst = (uint32*)(dst_argb);
-
int x;
+ (void)src_stride;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = src[1];
dst[1] = src[3];
@@ -698,8 +793,10 @@ void ScaleARGBRowDown2_C(const uint8* src_argb,
void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
+ uint8* dst_argb,
+ int dst_width) {
int x;
+ (void)src_stride;
for (x = 0; x < dst_width; ++x) {
dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1;
dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1;
@@ -710,29 +807,37 @@ void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
}
}
-void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
+void ScaleARGBRowDown2Box_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width) {
int x;
for (x = 0; x < dst_width; ++x) {
- dst_argb[0] = (src_argb[0] + src_argb[4] +
- src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
- dst_argb[1] = (src_argb[1] + src_argb[5] +
- src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
- dst_argb[2] = (src_argb[2] + src_argb[6] +
- src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
- dst_argb[3] = (src_argb[3] + src_argb[7] +
- src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+ dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] +
+ src_argb[src_stride + 4] + 2) >>
+ 2;
+ dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] +
+ src_argb[src_stride + 5] + 2) >>
+ 2;
+ dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] +
+ src_argb[src_stride + 6] + 2) >>
+ 2;
+ dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] +
+ src_argb[src_stride + 7] + 2) >>
+ 2;
src_argb += 8;
dst_argb += 4;
}
}
-void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
+void ScaleARGBRowDownEven_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb, int dst_width) {
+ uint8* dst_argb,
+ int dst_width) {
const uint32* src = (const uint32*)(src_argb);
uint32* dst = (uint32*)(dst_argb);
-
+ (void)src_stride;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = src[0];
@@ -748,25 +853,33 @@ void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb, int dst_width) {
+ uint8* dst_argb,
+ int dst_width) {
int x;
for (x = 0; x < dst_width; ++x) {
- dst_argb[0] = (src_argb[0] + src_argb[4] +
- src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
- dst_argb[1] = (src_argb[1] + src_argb[5] +
- src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
- dst_argb[2] = (src_argb[2] + src_argb[6] +
- src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
- dst_argb[3] = (src_argb[3] + src_argb[7] +
- src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+ dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] +
+ src_argb[src_stride + 4] + 2) >>
+ 2;
+ dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] +
+ src_argb[src_stride + 5] + 2) >>
+ 2;
+ dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] +
+ src_argb[src_stride + 6] + 2) >>
+ 2;
+ dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] +
+ src_argb[src_stride + 7] + 2) >>
+ 2;
src_argb += src_stepx * 4;
dst_argb += 4;
}
}
// Scales a single row of pixels using point sampling.
-void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
+void ScaleARGBCols_C(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
const uint32* src = (const uint32*)(src_argb);
uint32* dst = (uint32*)(dst_argb);
int j;
@@ -782,8 +895,11 @@ void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
}
}
-void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x32, int dx) {
+void ScaleARGBCols64_C(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x32,
+ int dx) {
int64 x = (int64)(x32);
const uint32* src = (const uint32*)(src_argb);
uint32* dst = (uint32*)(dst_argb);
@@ -801,11 +917,16 @@ void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
}
// Scales a single row of pixels up by 2x using point sampling.
-void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
+void ScaleARGBColsUp2_C(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
const uint32* src = (const uint32*)(src_argb);
uint32* dst = (uint32*)(dst_argb);
int j;
+ (void)x;
+ (void)dx;
for (j = 0; j < dst_width - 1; j += 2) {
dst[1] = dst[0] = src[0];
src += 1;
@@ -818,15 +939,18 @@ void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607.
// Mimics SSSE3 blender
-#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
-#define BLENDERC(a, b, f, s) (uint32)( \
- BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
-#define BLENDER(a, b, f) \
- BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
- BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7
+#define BLENDERC(a, b, f, s) \
+ (uint32)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+#define BLENDER(a, b, f) \
+ BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | BLENDERC(a, b, f, 8) | \
+ BLENDERC(a, b, f, 0)
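
In the packed-pixel BLENDER above, f is a 7-bit fraction (0..127) applied independently to the four channels at shifts 24/16/8/0. A worked value, and why the TODO mentions 128 - f:

    static unsigned Blend1Demo(unsigned a, unsigned b, unsigned f) {
      return ((a) * (0x7f ^ f) + (b)*f) >> 7;  /* same shape as BLENDER1 */
    }
    /* Blend1Demo(0, 255, 64) == (0 * 63 + 255 * 64) >> 7 == 127: close to
       the midpoint, biased slightly low because the weights sum to 127,
       not 128. */
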
-void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
+void ScaleARGBFilterCols_C(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
const uint32* src = (const uint32*)(src_argb);
uint32* dst = (uint32*)(dst_argb);
int j;
@@ -854,8 +978,11 @@ void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
}
}
-void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x32, int dx) {
+void ScaleARGBFilterCols64_C(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x32,
+ int dx) {
int64 x = (int64)(x32);
const uint32* src = (const uint32*)(src_argb);
uint32* dst = (uint32*)(dst_argb);
@@ -889,16 +1016,22 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
// Scale plane vertically with bilinear interpolation.
void ScalePlaneVertical(int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_argb, uint8* dst_argb,
- int x, int y, int dy,
- int bpp, enum FilterMode filtering) {
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8* src_argb,
+ uint8* dst_argb,
+ int x,
+ int y,
+ int dy,
+ int bpp,
+ enum FilterMode filtering) {
// TODO(fbarchard): Allow higher bpp.
int dst_width_bytes = dst_width * bpp;
- void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
- ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
- InterpolateRow_C;
+ void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
int j;
assert(bpp >= 1 && bpp <= 4);
@@ -931,15 +1064,23 @@ void ScalePlaneVertical(int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) &&
- IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
- IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) &&
+ IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_argb, 4) &&
+ IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_Any_DSPR2;
if (IS_ALIGNED(dst_width_bytes, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(dst_width_bytes, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
for (j = 0; j < dst_height; ++j) {
int yi;
int yf;
@@ -948,23 +1089,29 @@ void ScalePlaneVertical(int src_height,
}
yi = y >> 16;
yf = filtering ? ((y >> 8) & 255) : 0;
- InterpolateRow(dst_argb, src_argb + yi * src_stride,
- src_stride, dst_width_bytes, yf);
+ InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
+ dst_width_bytes, yf);
dst_argb += dst_stride;
y += dy;
}
}
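
The InterpolateRow selection above is libyuv's standard runtime-dispatch idiom: start from the portable C row function, upgrade to the _Any_ SIMD variant when the CPU flag is set, and to the fully aligned variant when the row width allows it. Schematically (the names here stand in for the pattern, not literal code):

    RowFunc f = Row_C;                /* always-correct portable fallback */
    if (TestCpuFlag(kCpuHasSSSE3)) {
      f = Row_Any_SSSE3;              /* any width; handles the ragged tail */
      if (IS_ALIGNED(width, 16)) {
        f = Row_SSSE3;                /* full-width SIMD, no tail handling */
      }
    }
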
void ScalePlaneVertical_16(int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint16* src_argb, uint16* dst_argb,
- int x, int y, int dy,
- int wpp, enum FilterMode filtering) {
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16* src_argb,
+ uint16* dst_argb,
+ int x,
+ int y,
+ int dy,
+ int wpp,
+ enum FilterMode filtering) {
// TODO(fbarchard): Allow higher wpp.
int dst_width_words = dst_width * wpp;
- void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb,
- ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
- InterpolateRow_16_C;
+ void (*InterpolateRow)(uint16 * dst_argb, const uint16* src_argb,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_16_C;
const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
int j;
assert(wpp >= 1 && wpp <= 2);
@@ -1005,9 +1152,9 @@ void ScalePlaneVertical_16(int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_16_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) &&
- IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
- IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+ if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) &&
+ IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_argb, 4) &&
+ IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_Any_16_DSPR2;
if (IS_ALIGNED(dst_width_bytes, 4)) {
InterpolateRow = InterpolateRow_16_DSPR2;
@@ -1022,16 +1169,18 @@ void ScalePlaneVertical_16(int src_height,
}
yi = y >> 16;
yf = filtering ? ((y >> 8) & 255) : 0;
- InterpolateRow(dst_argb, src_argb + yi * src_stride,
- src_stride, dst_width_words, yf);
+ InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
+ dst_width_words, yf);
dst_argb += dst_stride;
y += dy;
}
}
// Simplify the filtering based on scale factors.
-enum FilterMode ScaleFilterReduce(int src_width, int src_height,
- int dst_width, int dst_height,
+enum FilterMode ScaleFilterReduce(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
enum FilterMode filtering) {
if (src_width < 0) {
src_width = -src_width;
@@ -1078,17 +1227,21 @@ int FixedDiv_C(int num, int div) {
// Divide num by div and return as 16.16 fixed point result.
int FixedDiv1_C(int num, int div) {
- return (int)((((int64)(num) << 16) - 0x00010001) /
- (div - 1));
+ return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1));
}
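
FixedDiv_C returns the plain 16.16 ratio (num << 16) / div; FixedDiv1_C, used for filtered upscaling, steps across div - 1 intervals and backs off just under one sub-pixel so the final destination sample lands inside the final source pixel. Worked numbers (editor's arithmetic, not from the patch):

    /* Map 4 source pixels onto 8 destination pixels: */
    int dx = (int)((((int64)4 << 16) - 0x00010001) / (8 - 1));  /* 28086 */
    /* Seven steps advance x to 7 * 28086 = 196602, just under
       (4 - 1) << 16 = 196608, so the last tap stays in bounds. */
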
#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
// Compute slope values for stepping.
-void ScaleSlope(int src_width, int src_height,
- int dst_width, int dst_height,
+void ScaleSlope(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
enum FilterMode filtering,
- int* x, int* y, int* dx, int* dy) {
+ int* x,
+ int* y,
+ int* dx,
+ int* dy) {
assert(x != NULL);
assert(y != NULL);
assert(dx != NULL);
@@ -1120,7 +1273,7 @@ void ScaleSlope(int src_width, int src_height,
*x = 0;
}
if (dst_height <= src_height) {
- *dy = FixedDiv(src_height, dst_height);
+ *dy = FixedDiv(src_height, dst_height);
*y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter.
} else if (dst_height > 1) {
*dy = FixedDiv1(src_height, dst_height);
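
On CENTERSTART just above: *y = CENTERSTART(*dy, -32768) is half a step minus half a source pixel (0.5 is 32768 in 16.16), which appears intended to align filter taps with pixel centers rather than pixel edges. For example:

    /* dy = 0x20000 (2:1 vertical downsample):
       CENTERSTART(0x20000, -32768) == (0x20000 >> 1) - 32768 == 32768,
       i.e. the first output row samples half a source pixel in. */
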
diff --git a/files/source/scale_mips.cc b/files/source/scale_dspr2.cc
index ae953073..ddedcbf4 100644
--- a/files/source/scale_mips.cc
+++ b/files/source/scale_dspr2.cc
@@ -17,168 +17,167 @@ extern "C" {
#endif
// This module is for GCC MIPS DSPR2
-#if !defined(LIBYUV_DISABLE_MIPS) && \
- defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
- (_MIPS_SIM == _MIPS_SIM_ABI32)
+#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
+ (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
-void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown2_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
__asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t9, %[dst_width], 4 \n" // iterations -> by 16
- "beqz $t9, 2f \n"
- " nop \n"
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
- "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
- "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
- "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
- "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
- "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
- "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
- // TODO(fbarchard): Use odd pixels instead of even.
- "precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0|
- "precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8|
- "precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16|
- "precr.qb.ph $t2, $t7, $t6 \n" // |30|28|26|24|
- "addiu %[src_ptr], %[src_ptr], 32 \n"
- "addiu $t9, $t9, -1 \n"
- "sw $t8, 0(%[dst]) \n"
- "sw $t0, 4(%[dst]) \n"
- "sw $t1, 8(%[dst]) \n"
- "sw $t2, 12(%[dst]) \n"
- "bgtz $t9, 1b \n"
- " addiu %[dst], %[dst], 16 \n"
-
- "2: \n"
- "andi $t9, %[dst_width], 0xf \n" // residue
- "beqz $t9, 3f \n"
- " nop \n"
-
- "21: \n"
- "lbu $t0, 0(%[src_ptr]) \n"
- "addiu %[src_ptr], %[src_ptr], 2 \n"
- "addiu $t9, $t9, -1 \n"
- "sb $t0, 0(%[dst]) \n"
- "bgtz $t9, 21b \n"
- " addiu %[dst], %[dst], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_ptr] "+r" (src_ptr),
- [dst] "+r" (dst)
- : [dst_width] "r" (dst_width)
- : "t0", "t1", "t2", "t3", "t4", "t5",
- "t6", "t7", "t8", "t9"
- );
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 4 \n" // iterations -> by 16
+ "beqz $t9, 2f \n"
+ " nop \n"
+
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
+ // TODO(fbarchard): Use odd pixels instead of even.
+ "precrq.qb.ph $t8, $t1, $t0 \n" // |7|5|3|1|
+ "precrq.qb.ph $t0, $t3, $t2 \n" // |15|13|11|9|
+ "precrq.qb.ph $t1, $t5, $t4 \n" // |23|21|19|17|
+ "precrq.qb.ph $t2, $t7, $t6 \n" // |31|29|27|25|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sw $t8, 0(%[dst]) \n"
+ "sw $t0, 4(%[dst]) \n"
+ "sw $t1, 8(%[dst]) \n"
+ "sw $t2, 12(%[dst]) \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 16 \n"
+
+ "2: \n"
+ "andi $t9, %[dst_width], 0xf \n" // residue
+ "beqz $t9, 3f \n"
+ " nop \n"
+
+ "21: \n"
+ "lbu $t0, 1(%[src_ptr]) \n"
+ "addiu %[src_ptr], %[src_ptr], 2 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "bgtz $t9, 21b \n"
+ " addiu %[dst], %[dst], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
+ : [dst_width] "r"(dst_width)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
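
The substantive change in this function, beyond reformatting, is precr.qb.ph becoming precrq.qb.ph and the residue load moving from offset 0 to offset 1: the routine now samples odd source pixels, matching the C reference ScaleRowDown2_C earlier in this patch. A scalar equivalent of what the rewritten loop computes (a sketch; the _Ref name is illustrative):

    static void ScaleRowDown2_Ref(const uint8* src_ptr, uint8* dst,
                                  int dst_width) {
      int x;
      for (x = 0; x < dst_width; ++x) {
        dst[x] = src_ptr[2 * x + 1];  /* odd source pixels */
      }
    }
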
-void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown2Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
const uint8* t = src_ptr + src_stride;
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t9, %[dst_width], 3 \n" // iterations -> step 8
- "bltz $t9, 2f \n"
- " nop \n"
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
- "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
- "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
- "lw $t4, 0(%[t]) \n" // |19|18|17|16|
- "lw $t5, 4(%[t]) \n" // |23|22|21|20|
- "lw $t6, 8(%[t]) \n" // |27|26|25|24|
- "lw $t7, 12(%[t]) \n" // |31|30|29|28|
- "addiu $t9, $t9, -1 \n"
- "srl $t8, $t0, 16 \n" // |X|X|3|2|
- "ins $t0, $t4, 16, 16 \n" // |17|16|1|0|
- "ins $t4, $t8, 0, 16 \n" // |19|18|3|2|
- "raddu.w.qb $t0, $t0 \n" // |17+16+1+0|
- "raddu.w.qb $t4, $t4 \n" // |19+18+3+2|
- "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2
- "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2
- "srl $t8, $t1, 16 \n" // |X|X|7|6|
- "ins $t1, $t5, 16, 16 \n" // |21|20|5|4|
- "ins $t5, $t8, 0, 16 \n" // |22|23|7|6|
- "raddu.w.qb $t1, $t1 \n" // |21+20+5+4|
- "raddu.w.qb $t5, $t5 \n" // |23+22+7+6|
- "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2
- "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2
- "srl $t8, $t2, 16 \n" // |X|X|11|10|
- "ins $t2, $t6, 16, 16 \n" // |25|24|9|8|
- "ins $t6, $t8, 0, 16 \n" // |27|26|11|10|
- "raddu.w.qb $t2, $t2 \n" // |25+24+9+8|
- "raddu.w.qb $t6, $t6 \n" // |27+26+11+10|
- "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2
- "shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2
- "srl $t8, $t3, 16 \n" // |X|X|15|14|
- "ins $t3, $t7, 16, 16 \n" // |29|28|13|12|
- "ins $t7, $t8, 0, 16 \n" // |31|30|15|14|
- "raddu.w.qb $t3, $t3 \n" // |29+28+13+12|
- "raddu.w.qb $t7, $t7 \n" // |31+30+15+14|
- "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2
- "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2
- "addiu %[src_ptr], %[src_ptr], 16 \n"
- "addiu %[t], %[t], 16 \n"
- "sb $t0, 0(%[dst]) \n"
- "sb $t4, 1(%[dst]) \n"
- "sb $t1, 2(%[dst]) \n"
- "sb $t5, 3(%[dst]) \n"
- "sb $t2, 4(%[dst]) \n"
- "sb $t6, 5(%[dst]) \n"
- "sb $t3, 6(%[dst]) \n"
- "sb $t7, 7(%[dst]) \n"
- "bgtz $t9, 1b \n"
- " addiu %[dst], %[dst], 8 \n"
-
- "2: \n"
- "andi $t9, %[dst_width], 0x7 \n" // x = residue
- "beqz $t9, 3f \n"
- " nop \n"
-
- "21: \n"
- "lwr $t1, 0(%[src_ptr]) \n"
- "lwl $t1, 3(%[src_ptr]) \n"
- "lwr $t2, 0(%[t]) \n"
- "lwl $t2, 3(%[t]) \n"
- "srl $t8, $t1, 16 \n"
- "ins $t1, $t2, 16, 16 \n"
- "ins $t2, $t8, 0, 16 \n"
- "raddu.w.qb $t1, $t1 \n"
- "raddu.w.qb $t2, $t2 \n"
- "shra_r.w $t1, $t1, 2 \n"
- "shra_r.w $t2, $t2, 2 \n"
- "sb $t1, 0(%[dst]) \n"
- "sb $t2, 1(%[dst]) \n"
- "addiu %[src_ptr], %[src_ptr], 4 \n"
- "addiu $t9, $t9, -2 \n"
- "addiu %[t], %[t], 4 \n"
- "bgtz $t9, 21b \n"
- " addiu %[dst], %[dst], 2 \n"
-
- "3: \n"
- ".set pop \n"
-
- : [src_ptr] "+r" (src_ptr),
- [dst] "+r" (dst), [t] "+r" (t)
- : [dst_width] "r" (dst_width)
- : "t0", "t1", "t2", "t3", "t4", "t5",
- "t6", "t7", "t8", "t9"
- );
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 3 \n" // iterations -> step 8
+ "bltz $t9, 2f \n"
+ " nop \n"
+
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t4, 0(%[t]) \n" // |19|18|17|16|
+ "lw $t5, 4(%[t]) \n" // |23|22|21|20|
+ "lw $t6, 8(%[t]) \n" // |27|26|25|24|
+ "lw $t7, 12(%[t]) \n" // |31|30|29|28|
+ "addiu $t9, $t9, -1 \n"
+ "srl $t8, $t0, 16 \n" // |X|X|3|2|
+ "ins $t0, $t4, 16, 16 \n" // |17|16|1|0|
+ "ins $t4, $t8, 0, 16 \n" // |19|18|3|2|
+ "raddu.w.qb $t0, $t0 \n" // |17+16+1+0|
+ "raddu.w.qb $t4, $t4 \n" // |19+18+3+2|
+ "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2
+ "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2
+ "srl $t8, $t1, 16 \n" // |X|X|7|6|
+ "ins $t1, $t5, 16, 16 \n" // |21|20|5|4|
+ "ins $t5, $t8, 0, 16 \n" // |22|23|7|6|
+ "raddu.w.qb $t1, $t1 \n" // |21+20+5+4|
+ "raddu.w.qb $t5, $t5 \n" // |23+22+7+6|
+ "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2
+ "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2
+ "srl $t8, $t2, 16 \n" // |X|X|11|10|
+ "ins $t2, $t6, 16, 16 \n" // |25|24|9|8|
+ "ins $t6, $t8, 0, 16 \n" // |27|26|11|10|
+ "raddu.w.qb $t2, $t2 \n" // |25+24+9+8|
+ "raddu.w.qb $t6, $t6 \n" // |27+26+11+10|
+ "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2
+ "shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2
+ "srl $t8, $t3, 16 \n" // |X|X|15|14|
+ "ins $t3, $t7, 16, 16 \n" // |29|28|13|12|
+ "ins $t7, $t8, 0, 16 \n" // |31|30|15|14|
+ "raddu.w.qb $t3, $t3 \n" // |29+28+13+12|
+ "raddu.w.qb $t7, $t7 \n" // |31+30+15+14|
+ "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2
+ "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2
+ "addiu %[src_ptr], %[src_ptr], 16 \n"
+ "addiu %[t], %[t], 16 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "sb $t4, 1(%[dst]) \n"
+ "sb $t1, 2(%[dst]) \n"
+ "sb $t5, 3(%[dst]) \n"
+ "sb $t2, 4(%[dst]) \n"
+ "sb $t6, 5(%[dst]) \n"
+ "sb $t3, 6(%[dst]) \n"
+ "sb $t7, 7(%[dst]) \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 8 \n"
+
+ "2: \n"
+ "andi $t9, %[dst_width], 0x7 \n" // x = residue
+ "beqz $t9, 3f \n"
+ " nop \n"
+
+ "21: \n"
+ "lwr $t1, 0(%[src_ptr]) \n"
+ "lwl $t1, 3(%[src_ptr]) \n"
+ "lwr $t2, 0(%[t]) \n"
+ "lwl $t2, 3(%[t]) \n"
+ "srl $t8, $t1, 16 \n"
+ "ins $t1, $t2, 16, 16 \n"
+ "ins $t2, $t8, 0, 16 \n"
+ "raddu.w.qb $t1, $t1 \n"
+ "raddu.w.qb $t2, $t2 \n"
+ "shra_r.w $t1, $t1, 2 \n"
+ "shra_r.w $t2, $t2, 2 \n"
+ "sb $t1, 0(%[dst]) \n"
+ "sb $t2, 1(%[dst]) \n"
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "addiu $t9, $t9, -2 \n"
+ "addiu %[t], %[t], 4 \n"
+ "bgtz $t9, 21b \n"
+ " addiu %[dst], %[dst], 2 \n"
+
+ "3: \n"
+ ".set pop \n"
+
+ : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [t] "+r"(t)
+ : [dst_width] "r"(dst_width)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
-void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- __asm__ __volatile__ (
+void ScaleRowDown4_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ __asm__ __volatile__(
".set push \n"
".set noreorder \n"
@@ -186,7 +185,7 @@ void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"beqz $t9, 2f \n"
" nop \n"
- "1: \n"
+ "1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
@@ -199,8 +198,8 @@ void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8|
"precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16|
"precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24|
- "precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0|
- "precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16|
+ "precrq.qb.ph $t1, $t2, $t1 \n" // |14|10|6|2|
+ "precrq.qb.ph $t5, $t6, $t5 \n" // |30|26|22|18|
"addiu %[src_ptr], %[src_ptr], 32 \n"
"addiu $t9, $t9, -1 \n"
"sw $t1, 0(%[dst]) \n"
@@ -208,44 +207,43 @@ void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 8 \n"
- "2: \n"
+ "2: \n"
"andi $t9, %[dst_width], 7 \n" // residue
"beqz $t9, 3f \n"
" nop \n"
- "21: \n"
- "lbu $t1, 0(%[src_ptr]) \n"
+ "21: \n"
+ "lbu $t1, 2(%[src_ptr]) \n"
"addiu %[src_ptr], %[src_ptr], 4 \n"
"addiu $t9, $t9, -1 \n"
"sb $t1, 0(%[dst]) \n"
"bgtz $t9, 21b \n"
" addiu %[dst], %[dst], 1 \n"
- "3: \n"
+ "3: \n"
".set pop \n"
- : [src_ptr] "+r" (src_ptr),
- [dst] "+r" (dst)
- : [dst_width] "r" (dst_width)
- : "t1", "t2", "t3", "t4", "t5",
- "t6", "t7", "t8", "t9"
- );
+ : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
+ : [dst_width] "r"(dst_width)
+ : "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
-void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown4Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
const uint8* s2 = s1 + stride;
const uint8* s3 = s2 + stride;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
".set push \n"
".set noreorder \n"
"srl $t9, %[dst_width], 1 \n"
"andi $t8, %[dst_width], 1 \n"
- "1: \n"
+ "1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 0(%[s1]) \n" // |7|6|5|4|
"lw $t2, 0(%[s2]) \n" // |11|10|9|8|
@@ -299,23 +297,20 @@ void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"2: \n"
".set pop \n"
- : [src_ptr] "+r" (src_ptr),
- [dst] "+r" (dst),
- [s1] "+r" (s1),
- [s2] "+r" (s2),
- [s3] "+r" (s3)
- : [dst_width] "r" (dst_width)
- : "t0", "t1", "t2", "t3", "t4", "t5",
- "t6","t7", "t8", "t9"
- );
+ : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [s1] "+r"(s1), [s2] "+r"(s2),
+ [s3] "+r"(s3)
+ : [dst_width] "r"(dst_width)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
-void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- __asm__ __volatile__ (
+void ScaleRowDown34_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ __asm__ __volatile__(
".set push \n"
".set noreorder \n"
- "1: \n"
+ "1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
@@ -347,23 +342,21 @@ void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"bnez %[dst_width], 1b \n"
" addiu %[dst], %[dst], 24 \n"
".set pop \n"
- : [src_ptr] "+r" (src_ptr),
- [dst] "+r" (dst),
- [dst_width] "+r" (dst_width)
+ : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
:
- : "t0", "t1", "t2", "t3", "t4", "t5",
- "t6","t7", "t8", "t9"
- );
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
-void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* d, int dst_width) {
- __asm__ __volatile__ (
+void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* d,
+ int dst_width) {
+ __asm__ __volatile__(
".set push \n"
".set noreorder \n"
"repl.ph $t3, 3 \n" // 0x00030003
- "1: \n"
+ "1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
"rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1|
@@ -400,26 +393,24 @@ void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"sb $t6, 2(%[d]) \n"
"bgtz %[dst_width], 1b \n"
" addiu %[d], %[d], 3 \n"
- "3: \n"
+ "3: \n"
".set pop \n"
- : [src_ptr] "+r" (src_ptr),
- [src_stride] "+r" (src_stride),
- [d] "+r" (d),
- [dst_width] "+r" (dst_width)
+ : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
+ [dst_width] "+r"(dst_width)
:
- : "t0", "t1", "t2", "t3",
- "t4", "t5", "t6"
- );
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}
-void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* d, int dst_width) {
- __asm__ __volatile__ (
+void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* d,
+ int dst_width) {
+ __asm__ __volatile__(
".set push \n"
".set noreorder \n"
"repl.ph $t2, 3 \n" // 0x00030003
- "1: \n"
+ "1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
"rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1|
@@ -452,25 +443,23 @@ void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"sb $t6, 2(%[d]) \n"
"bgtz %[dst_width], 1b \n"
" addiu %[d], %[d], 3 \n"
- "3: \n"
+ "3: \n"
".set pop \n"
- : [src_ptr] "+r" (src_ptr),
- [src_stride] "+r" (src_stride),
- [d] "+r" (d),
- [dst_width] "+r" (dst_width)
+ : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
+ [dst_width] "+r"(dst_width)
:
- : "t0", "t1", "t2", "t3",
- "t4", "t5", "t6"
- );
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}
-void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- __asm__ __volatile__ (
+void ScaleRowDown38_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ __asm__ __volatile__(
".set push \n"
".set noreorder \n"
- "1: \n"
+ "1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
@@ -501,26 +490,24 @@ void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"bgez $t8, 1b \n"
" addiu %[dst], %[dst], 12 \n"
".set pop \n"
- : [src_ptr] "+r" (src_ptr),
- [dst] "+r" (dst),
- [dst_width] "+r" (dst_width)
+ : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
:
- : "t0", "t1", "t2", "t3", "t4",
- "t5", "t6", "t7", "t8"
- );
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
}
-void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
intptr_t stride = src_stride;
const uint8* t = src_ptr + stride;
const int c = 0x2AAA;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
".set push \n"
".set noreorder \n"
- "1: \n"
+ "1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
"lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0|
@@ -554,18 +541,16 @@ void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"bgtz %[dst_width], 1b \n"
" sb $t0, -3(%[dst_ptr]) \n"
".set pop \n"
- : [src_ptr] "+r" (src_ptr),
- [dst_ptr] "+r" (dst_ptr),
- [t] "+r" (t),
- [dst_width] "+r" (dst_width)
- : [c] "r" (c)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6"
- );
+ : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [t] "+r"(t),
+ [dst_width] "+r"(dst_width)
+ : [c] "r"(c)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}
void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+ uint8* dst_ptr,
+ int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
stride += stride;
@@ -573,11 +558,11 @@ void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
const int c1 = 0x1C71;
const int c2 = 0x2AAA;
- __asm__ __volatile__ (
+ __asm__ __volatile__(
".set push \n"
".set noreorder \n"
- "1: \n"
+ "1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
"lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0|
@@ -624,15 +609,55 @@ void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
"bgtz %[dst_width], 1b \n"
" sb $t0, -3(%[dst_ptr]) \n"
".set pop \n"
- : [src_ptr] "+r" (src_ptr),
- [dst_ptr] "+r" (dst_ptr),
- [s1] "+r" (s1),
- [s2] "+r" (s2),
- [dst_width] "+r" (dst_width)
- : [c1] "r" (c1), [c2] "r" (c2)
- : "t0", "t1", "t2", "t3", "t4",
- "t5", "t6", "t7", "t8"
- );
+ : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [s1] "+r"(s1),
+ [s2] "+r"(s2), [dst_width] "+r"(dst_width)
+ : [c1] "r"(c1), [c2] "r"(c2)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
+}
+
+void ScaleAddRow_DSPR2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+ int x;
+  for (x = 0; x < (src_width - 1); x += 8) {
+ uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4;
+ uint32 tmp_t5, tmp_t6, tmp_t7, tmp_t8;
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ "lw %[tmp_t5], 0(%[src_ptr]) \n"
+ "lw %[tmp_t6], 4(%[src_ptr]) \n"
+ "lw %[tmp_t1], 0(%[dst_ptr]) \n"
+ "lw %[tmp_t2], 4(%[dst_ptr]) \n"
+ "lw %[tmp_t3], 8(%[dst_ptr]) \n"
+ "lw %[tmp_t4], 12(%[dst_ptr]) \n"
+ "preceu.ph.qbr %[tmp_t7], %[tmp_t5] \n"
+ "preceu.ph.qbl %[tmp_t8], %[tmp_t5] \n"
+ "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t7] \n"
+ "addu.ph %[tmp_t2], %[tmp_t2], %[tmp_t8] \n"
+ "preceu.ph.qbr %[tmp_t7], %[tmp_t6] \n"
+ "preceu.ph.qbl %[tmp_t8], %[tmp_t6] \n"
+ "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t7] \n"
+ "addu.ph %[tmp_t4], %[tmp_t4], %[tmp_t8] \n"
+ "sw %[tmp_t1], 0(%[dst_ptr]) \n"
+ "sw %[tmp_t2], 4(%[dst_ptr]) \n"
+ "sw %[tmp_t3], 8(%[dst_ptr]) \n"
+ "sw %[tmp_t4], 12(%[dst_ptr]) \n"
+ ".set pop \n"
+ :
+ [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), [tmp_t3] "=&r"(tmp_t3),
+ [tmp_t4] "=&r"(tmp_t4), [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+ [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [src_ptr] "+r"(src_ptr)
+ : [dst_ptr] "r"(dst_ptr));
+ src_ptr += 8;
+ dst_ptr += 8;
+ }
+
+  if (src_width & 7) {
+ for (x = 0; x < ((src_width - 1) & 7); x += 1) {
+ dst_ptr[0] += src_ptr[0];
+ src_ptr += 1;
+ dst_ptr += 1;
+ }
+ }
}
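+
+// For reference: a minimal scalar sketch (illustrative only, hypothetical
+// name) of the accumulation above. The DSPR2 path widens eight bytes per
+// iteration with preceu.ph.qbr/qbl before the packed 16-bit adds.
+//
+//   void ScaleAddRow_sketch(const uint8* src, uint16* dst, int width) {
+//     int i;
+//     for (i = 0; i < width; ++i) {
+//       dst[i] += src[i];  // widen each byte, accumulate into 16 bits
+//     }
+//   }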
#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
@@ -641,4 +666,3 @@ void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
} // extern "C"
} // namespace libyuv
#endif
-
diff --git a/files/source/scale_gcc.cc b/files/source/scale_gcc.cc
index e2f88544..f0ac56fc 100644
--- a/files/source/scale_gcc.cc
+++ b/files/source/scale_gcc.cc
@@ -21,85 +21,82 @@ extern "C" {
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
// Offsets for source bytes 0 to 9
-static uvec8 kShuf0 =
- { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
+ 128, 128, 128, 128, 128, 128, 128, 128};
// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-static uvec8 kShuf1 =
- { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
+ 128, 128, 128, 128, 128, 128, 128, 128};
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
-static uvec8 kShuf2 =
- { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
+ 128, 128, 128, 128, 128, 128, 128, 128};
// Offsets for source bytes 0 to 10
-static uvec8 kShuf01 =
- { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
-static uvec8 kShuf11 =
- { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
-static uvec8 kShuf21 =
- { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+static uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
+ 10, 11, 12, 13, 13, 14, 14, 15};
// Coefficients for source bytes 0 to 10
-static uvec8 kMadd01 =
- { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
// Coefficients for source bytes 10 to 21
-static uvec8 kMadd11 =
- { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
// Coefficients for source bytes 21 to 31
-static uvec8 kMadd21 =
- { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
// Rounding constant for the 3/4 box filters.
-static vec16 kRound34 =
- { 2, 2, 2, 2, 2, 2, 2, 2 };
+static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
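+// Taken together, these constants appear to implement the 3/4 box filter:
+// pmaddubsw pairs each kShuf-arranged byte pair with a (3,1), (2,2) or
+// (1,3) weight from kMadd01/kMadd11/kMadd21, kRound34 adds the rounding
+// bias, and a final >> 2 completes the filter. Per-pixel sketch:
+//   out0 = (3 * s0 + 1 * s1 + 2) >> 2
+//   out1 = (2 * s1 + 2 * s2 + 2) >> 2  // == (s1 + s2 + 1) >> 1
+//   out2 = (1 * s2 + 3 * s3 + 2) >> 2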
-static uvec8 kShuf38a =
- { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128};
-static uvec8 kShuf38b =
- { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
+ 6, 8, 11, 14, 128, 128, 128, 128};
// Arrange words 0,3,6 into 0,1,2
-static uvec8 kShufAc =
- { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128};
// Arrange words 0,3,6 into 3,4,5
-static uvec8 kShufAc3 =
- { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
+ 6, 7, 12, 13, 128, 128, 128, 128};
// Scaling values for boxes of 3x3 and 2x3
-static uvec16 kScaleAc33 =
- { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
+ 65536 / 9, 65536 / 6, 0, 0};
// Arrange first value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb0 =
- { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+static uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
+ 11, 128, 14, 128, 128, 128, 128, 128};
// Arrange second value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb1 =
- { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+static uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
+ 12, 128, 15, 128, 128, 128, 128, 128};
// Arrange third value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb2 =
- { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
+ 13, 128, 128, 128, 128, 128, 128, 128};
// Scaling values for boxes of 3x2 and 2x2
-static uvec16 kScaleAb2 =
- { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
+ 65536 / 3, 65536 / 2, 0, 0};
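+// The kScale* tables hold 16.16 reciprocals: pmulhuw keeps the high 16 bits
+// of the product, so multiplying a box sum s by 65536 / n yields s / n
+// without a divide, e.g. a 3x3 sum becomes (s * (65536 / 9)) >> 16 ~= s / 9.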
// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt
-void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
LABELALIGN
"1: \n"
@@ -120,8 +117,11 @@ void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
-void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
@@ -149,8 +149,10 @@ void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
-void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
@@ -189,8 +191,11 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
}
#ifdef HAS_SCALEROWDOWN2_AVX2
-void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
LABELALIGN
"1: \n"
@@ -213,8 +218,11 @@ void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
-void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
@@ -244,8 +252,10 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
-void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
@@ -286,8 +296,11 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
}
#endif // HAS_SCALEROWDOWN2_AVX2
-void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown4_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrld $0x18,%%xmm5 \n"
@@ -314,8 +327,10 @@ void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
-void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
intptr_t stridex3;
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
@@ -368,10 +383,12 @@ void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
-
#ifdef HAS_SCALEROWDOWN4_AVX2
-void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown4_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrld $0x18,%%ymm5,%%ymm5 \n"
@@ -400,8 +417,10 @@ void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
-void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
@@ -455,17 +474,20 @@ void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
}
#endif // HAS_SCALEROWDOWN4_AVX2
-void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %0,%%xmm3 \n"
- "movdqa %1,%%xmm4 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kShuf0), // %0
- "m"(kShuf1), // %1
- "m"(kShuf2) // %2
- );
+void ScaleRowDown34_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "movdqa %0,%%xmm3 \n"
+ "movdqa %1,%%xmm4 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kShuf0), // %0
+ "m"(kShuf1), // %1
+ "m"(kShuf2) // %2
+ );
asm volatile (
LABELALIGN
"1: \n"
@@ -492,25 +514,26 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %0,%%xmm2 \n" // kShuf01
- "movdqa %1,%%xmm3 \n" // kShuf11
- "movdqa %2,%%xmm4 \n" // kShuf21
- :
- : "m"(kShuf01), // %0
- "m"(kShuf11), // %1
- "m"(kShuf21) // %2
- );
- asm volatile (
- "movdqa %0,%%xmm5 \n" // kMadd01
- "movdqa %1,%%xmm0 \n" // kMadd11
- "movdqa %2,%%xmm1 \n" // kRound34
- :
- : "m"(kMadd01), // %0
- "m"(kMadd11), // %1
- "m"(kRound34) // %2
- );
+ uint8* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
+ :
+ : "m"(kShuf01), // %0
+ "m"(kShuf11), // %1
+ "m"(kShuf21) // %2
+ );
+ asm volatile(
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
+ :
+ : "m"(kMadd01), // %0
+ "m"(kMadd11), // %1
+ "m"(kRound34) // %2
+ );
asm volatile (
LABELALIGN
"1: \n"
@@ -557,25 +580,26 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %0,%%xmm2 \n" // kShuf01
- "movdqa %1,%%xmm3 \n" // kShuf11
- "movdqa %2,%%xmm4 \n" // kShuf21
- :
- : "m"(kShuf01), // %0
- "m"(kShuf11), // %1
- "m"(kShuf21) // %2
- );
- asm volatile (
- "movdqa %0,%%xmm5 \n" // kMadd01
- "movdqa %1,%%xmm0 \n" // kMadd11
- "movdqa %2,%%xmm1 \n" // kRound34
- :
- : "m"(kMadd01), // %0
- "m"(kMadd11), // %1
- "m"(kRound34) // %2
- );
+ uint8* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
+ :
+ : "m"(kShuf01), // %0
+ "m"(kShuf11), // %1
+ "m"(kShuf21) // %2
+ );
+ asm volatile(
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
+ :
+ : "m"(kMadd01), // %0
+ "m"(kMadd11), // %1
+ "m"(kRound34) // %2
+ );
asm volatile (
LABELALIGN
@@ -624,8 +648,11 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
);
}
-void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown38_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
@@ -655,18 +682,19 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %0,%%xmm2 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm4 \n"
- "movdqa %3,%%xmm5 \n"
- :
- : "m"(kShufAb0), // %0
- "m"(kShufAb1), // %1
- "m"(kShufAb2), // %2
- "m"(kScaleAb2) // %3
- );
+ uint8* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "movdqa %3,%%xmm5 \n"
+ :
+ : "m"(kShufAb0), // %0
+ "m"(kShufAb1), // %1
+ "m"(kShufAb2), // %2
+ "m"(kScaleAb2) // %3
+ );
asm volatile (
LABELALIGN
"1: \n"
@@ -700,17 +728,18 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %0,%%xmm2 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
- :
- : "m"(kShufAc), // %0
- "m"(kShufAc3), // %1
- "m"(kScaleAc33) // %2
- );
+ uint8* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ :
+ : "m"(kShufAc), // %0
+ "m"(kShufAc3), // %1
+ "m"(kScaleAc33) // %2
+ );
asm volatile (
LABELALIGN
"1: \n"
@@ -790,7 +819,6 @@ void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
);
}
-
#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
@@ -823,17 +851,19 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
// Constant for making pixels signed to avoid pmaddubsw
// saturation.
-static uvec8 kFsub80 =
- { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
+static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
// Constant for making pixels unsigned and adding .5 for rounding.
-static uvec16 kFadd40 =
- { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
+static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
+ 0x4040, 0x4040, 0x4040, 0x4040};
// Bilinear column filtering. SSSE3 version.
-void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) {
+void ScaleFilterCols_SSSE3(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
intptr_t x0, x1, temp_pixel;
asm volatile (
"movd %6,%%xmm2 \n"
@@ -867,7 +897,7 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"pshufb %%xmm5,%%xmm1 \n"
"punpcklwd %%xmm4,%%xmm0 \n"
"psubb %8,%%xmm0 \n" // make pixels signed.
- "pxor %%xmm6,%%xmm1 \n" // 128 -f = (f ^ 127 ) + 1
+      "pxor      %%xmm6,%%xmm1                 \n"  // 128 - f = (f ^ 127) + 1
"paddusb %%xmm7,%%xmm1 \n"
"pmaddubsw %%xmm0,%%xmm1 \n"
"pextrw $0x1,%%xmm2,%k3 \n"
@@ -925,8 +955,13 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) {
+void ScaleColsUp2_SSE2(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ (void)x;
+ (void)dx;
asm volatile (
LABELALIGN
"1: \n"
@@ -950,7 +985,9 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
+ uint8* dst_argb,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
LABELALIGN
"1: \n"
@@ -971,7 +1008,9 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
+ uint8* dst_argb,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
LABELALIGN
"1: \n"
@@ -995,7 +1034,8 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
+ uint8* dst_argb,
+ int dst_width) {
asm volatile (
LABELALIGN
"1: \n"
@@ -1025,10 +1065,14 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
- int src_stepx, uint8* dst_argb, int dst_width) {
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width) {
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
+ (void)src_stride;
asm volatile (
"lea " MEMLEA3(0x00,1,4) ",%1 \n"
"lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
@@ -1059,8 +1103,10 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
- ptrdiff_t src_stride, int src_stepx,
- uint8* dst_argb, int dst_width) {
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width) {
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
intptr_t row1 = (intptr_t)(src_stride);
@@ -1102,8 +1148,11 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
);
}
-void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
+void ScaleARGBCols_SSE2(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
intptr_t x0, x1;
asm volatile (
"movd %5,%%xmm2 \n"
@@ -1171,8 +1220,13 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ (void)x;
+ (void)dx;
asm volatile (
LABELALIGN
"1: \n"
@@ -1197,26 +1251,29 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
- 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
- 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
+ 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
+ 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
};
// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
- 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+ 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};
// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
-void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
intptr_t x0, x1;
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm5 \n"
- :
- : "m"(kShuffleColARGB), // %0
- "m"(kShuffleFractions) // %1
- );
+ asm volatile(
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm5 \n"
+ :
+ : "m"(kShuffleColARGB), // %0
+ "m"(kShuffleFractions) // %1
+ );
asm volatile (
"movd %5,%%xmm2 \n"
@@ -1283,34 +1340,32 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
- asm volatile (
- "cdq \n"
- "shld $0x10,%%eax,%%edx \n"
- "shl $0x10,%%eax \n"
- "idiv %1 \n"
- "mov %0, %%eax \n"
- : "+a"(num) // %0
- : "c"(div) // %1
- : "memory", "cc", "edx"
- );
+ asm volatile(
+ "cdq \n"
+ "shld $0x10,%%eax,%%edx \n"
+ "shl $0x10,%%eax \n"
+ "idiv %1 \n"
+ "mov %0, %%eax \n"
+ : "+a"(num) // %0
+ : "c"(div) // %1
+ : "memory", "cc", "edx");
return num;
}
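
// A portable sketch of the same 16.16 division (hypothetical helper,
// assuming the int64 typedef from libyuv/basic_types.h); for example,
// FixedDiv_X86(1, 4) yields 0x4000, i.e. 0.25 in 16.16 fixed point:
//   int FixedDiv_sketch(int num, int div) {
//     return (int)(((int64)(num) << 16) / div);
//   }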
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
- asm volatile (
- "cdq \n"
- "shld $0x10,%%eax,%%edx \n"
- "shl $0x10,%%eax \n"
- "sub $0x10001,%%eax \n"
- "sbb $0x0,%%edx \n"
- "sub $0x1,%1 \n"
- "idiv %1 \n"
- "mov %0, %%eax \n"
- : "+a"(num) // %0
- : "c"(div) // %1
- : "memory", "cc", "edx"
- );
+ asm volatile(
+ "cdq \n"
+ "shld $0x10,%%eax,%%edx \n"
+ "shl $0x10,%%eax \n"
+ "sub $0x10001,%%eax \n"
+ "sbb $0x0,%%edx \n"
+ "sub $0x1,%1 \n"
+ "idiv %1 \n"
+ "mov %0, %%eax \n"
+ : "+a"(num) // %0
+ : "c"(div) // %1
+ : "memory", "cc", "edx");
return num;
}
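
// Likewise, a hypothetical portable sketch of the variant above, folding
// the num - 1 and div - 1 adjustments into the 16.16 arithmetic:
//   int FixedDiv1_sketch(int num, int div) {
//     return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1));
//   }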
diff --git a/files/source/scale_msa.cc b/files/source/scale_msa.cc
new file mode 100644
index 00000000..bfcd10fc
--- /dev/null
+++ b/files/source/scale_msa.cc
@@ -0,0 +1,553 @@
+/*
+ * Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "libyuv/scale_row.h"
+
+// This module is for GCC MSA
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include "libyuv/macros_msa.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ int x;
+ v16u8 src0, src1, dst0;
+ (void)src_stride;
+
+ for (x = 0; x < dst_width; x += 4) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
+ ST_UB(dst0, dst_argb);
+ src_argb += 32;
+ dst_argb += 16;
+ }
+}
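+
+// pckod_w keeps the odd 32-bit lanes, i.e. every second ARGB pixel; as a
+// scalar sketch:
+//   ((uint32_t*)dst_argb)[x] = ((const uint32_t*)src_argb)[2 * x + 1];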
+
+void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ int x;
+ v16u8 src0, src1, vec0, vec1, dst0;
+ (void)src_stride;
+
+ for (x = 0; x < dst_width; x += 4) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
+ vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
+ dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1);
+ ST_UB(dst0, dst_argb);
+ src_argb += 32;
+ dst_argb += 16;
+ }
+}
+
+void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ int x;
+ const uint8_t* s = src_argb;
+ const uint8_t* t = src_argb + src_stride;
+ v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
+ v8u16 reg0, reg1, reg2, reg3;
+ v16i8 shuffler = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15};
+
+ for (x = 0; x < dst_width; x += 4) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+ src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+ vec0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src0, (v16i8)src0);
+ vec1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
+ vec2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src2, (v16i8)src2);
+ vec3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src3, (v16i8)src3);
+ reg0 = __msa_hadd_u_h(vec0, vec0);
+ reg1 = __msa_hadd_u_h(vec1, vec1);
+ reg2 = __msa_hadd_u_h(vec2, vec2);
+ reg3 = __msa_hadd_u_h(vec3, vec3);
+ reg0 += reg2;
+ reg1 += reg3;
+ reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2);
+ reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+ ST_UB(dst0, dst_argb);
+ s += 32;
+ t += 32;
+ dst_argb += 16;
+ }
+}
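+
+// Per channel this is a rounded 2x2 box average: vshf_b interleaves the
+// bytes so hadd_u_h sums the same channel of adjacent pixels, the two row
+// sums are added, and srari_h(sum, 2) yields (a + b + c + d + 2) >> 2.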
+
+void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ int x;
+ int32_t stepx = src_stepx * 4;
+ int32_t data0, data1, data2, data3;
+ (void)src_stride;
+
+ for (x = 0; x < dst_width; x += 4) {
+ data0 = LW(src_argb);
+ data1 = LW(src_argb + stepx);
+ data2 = LW(src_argb + stepx * 2);
+ data3 = LW(src_argb + stepx * 3);
+ SW(data0, dst_argb);
+ SW(data1, dst_argb + 4);
+ SW(data2, dst_argb + 8);
+ SW(data3, dst_argb + 12);
+ src_argb += stepx * 4;
+ dst_argb += 16;
+ }
+}
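+
+// LW/SW (word load/store macros from libyuv/macros_msa.h) gather one pixel
+// at a time; per output pixel this is the scalar
+//   ((uint32_t*)dst_argb)[x] = ((const uint32_t*)src_argb)[x * src_stepx];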
+
+void ScaleARGBRowDownEvenBox_MSA(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width) {
+ int x;
+ const uint8* nxt_argb = src_argb + src_stride;
+ int32_t stepx = src_stepx * 4;
+ int64_t data0, data1, data2, data3;
+ v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0};
+ v16u8 vec0, vec1, vec2, vec3;
+ v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ v16u8 dst0;
+
+ for (x = 0; x < dst_width; x += 4) {
+ data0 = LD(src_argb);
+ data1 = LD(src_argb + stepx);
+ data2 = LD(src_argb + stepx * 2);
+ data3 = LD(src_argb + stepx * 3);
+ src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0);
+ src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1);
+ src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2);
+ src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3);
+ data0 = LD(nxt_argb);
+ data1 = LD(nxt_argb + stepx);
+ data2 = LD(nxt_argb + stepx * 2);
+ data3 = LD(nxt_argb + stepx * 3);
+ src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0);
+ src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1);
+ src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2);
+ src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3);
+ vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+ vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+ vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+ vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+ reg0 = __msa_hadd_u_h(vec0, vec0);
+ reg1 = __msa_hadd_u_h(vec1, vec1);
+ reg2 = __msa_hadd_u_h(vec2, vec2);
+ reg3 = __msa_hadd_u_h(vec3, vec3);
+ reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0);
+ reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1);
+ reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0);
+ reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1);
+ reg4 += reg6;
+ reg5 += reg7;
+ reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2);
+ reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
+ ST_UB(dst0, dst_argb);
+ src_argb += stepx * 4;
+ nxt_argb += stepx * 4;
+ dst_argb += 16;
+ }
+}
+
+void ScaleRowDown2_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1;
+ (void)src_stride;
+
+ for (x = 0; x < dst_width; x += 32) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
+ dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+ ST_UB2(dst0, dst1, dst, 16);
+ src_ptr += 64;
+ dst += 32;
+ }
+}
+
+void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1;
+ (void)src_stride;
+
+ for (x = 0; x < dst_width; x += 32) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
+ vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+ vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+ dst0 = __msa_aver_u_b(vec1, vec0);
+ dst1 = __msa_aver_u_b(vec3, vec2);
+ ST_UB2(dst0, dst1, dst, 16);
+ src_ptr += 64;
+ dst += 32;
+ }
+}
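+
+// __msa_aver_u_b is a rounded byte average, so each output is the scalar
+//   dst[x] = (src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1;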
+
+void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1;
+ v8u16 vec0, vec1, vec2, vec3;
+
+ for (x = 0; x < dst_width; x += 32) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
+ src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+ src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+ src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+ src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
+ vec0 = __msa_hadd_u_h(src0, src0);
+ vec1 = __msa_hadd_u_h(src1, src1);
+ vec2 = __msa_hadd_u_h(src2, src2);
+ vec3 = __msa_hadd_u_h(src3, src3);
+ vec0 += __msa_hadd_u_h(src4, src4);
+ vec1 += __msa_hadd_u_h(src5, src5);
+ vec2 += __msa_hadd_u_h(src6, src6);
+ vec3 += __msa_hadd_u_h(src7, src7);
+ vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2);
+ vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2);
+ vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2);
+ vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ ST_UB2(dst0, dst1, dst, 16);
+ s += 64;
+ t += 64;
+ dst += 32;
+ }
+}
+
+void ScaleRowDown4_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ v16u8 src0, src1, src2, src3, vec0, vec1, dst0;
+ (void)src_stride;
+
+ for (x = 0; x < dst_width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
+ vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+ dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst);
+ src_ptr += 64;
+ dst += 16;
+ }
+}
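+
+// The pckev_b/pckod_b pair selects byte 2 of every 4, i.e. the scalar
+// point sample dst[x] = src_ptr[4 * x + 2];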
+
+void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ const uint8_t* s = src_ptr;
+ const uint8_t* t0 = s + src_stride;
+ const uint8_t* t1 = s + src_stride * 2;
+ const uint8_t* t2 = s + src_stride * 3;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0;
+ v8u16 vec0, vec1, vec2, vec3;
+ v4u32 reg0, reg1, reg2, reg3;
+
+ for (x = 0; x < dst_width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
+ src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
+ src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
+ src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32);
+ src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48);
+ vec0 = __msa_hadd_u_h(src0, src0);
+ vec1 = __msa_hadd_u_h(src1, src1);
+ vec2 = __msa_hadd_u_h(src2, src2);
+ vec3 = __msa_hadd_u_h(src3, src3);
+ vec0 += __msa_hadd_u_h(src4, src4);
+ vec1 += __msa_hadd_u_h(src5, src5);
+ vec2 += __msa_hadd_u_h(src6, src6);
+ vec3 += __msa_hadd_u_h(src7, src7);
+ src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48);
+ src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0);
+ src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16);
+ src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32);
+ src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48);
+ vec0 += __msa_hadd_u_h(src0, src0);
+ vec1 += __msa_hadd_u_h(src1, src1);
+ vec2 += __msa_hadd_u_h(src2, src2);
+ vec3 += __msa_hadd_u_h(src3, src3);
+ vec0 += __msa_hadd_u_h(src4, src4);
+ vec1 += __msa_hadd_u_h(src5, src5);
+ vec2 += __msa_hadd_u_h(src6, src6);
+ vec3 += __msa_hadd_u_h(src7, src7);
+ reg0 = __msa_hadd_u_w(vec0, vec0);
+ reg1 = __msa_hadd_u_w(vec1, vec1);
+ reg2 = __msa_hadd_u_w(vec2, vec2);
+ reg3 = __msa_hadd_u_w(vec3, vec3);
+ reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4);
+ reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4);
+ reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4);
+ reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst);
+ s += 64;
+ t0 += 64;
+ t1 += 64;
+ t2 += 64;
+ dst += 16;
+ }
+}
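+
+// Each output byte is a rounded 4x4 box average: sixteen source bytes are
+// widened and summed across the four rows, then srari_w(sum, 4) computes
+// (sum + 8) >> 4.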
+
+void ScaleRowDown38_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x, width;
+ uint64_t dst0;
+ uint32_t dst1;
+ v16u8 src0, src1, vec0;
+ v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
+ (void)src_stride;
+
+ assert(dst_width % 3 == 0);
+ width = dst_width / 3;
+
+ for (x = 0; x < width; x += 4) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+ vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0);
+ dst0 = __msa_copy_u_d((v2i64)vec0, 0);
+ dst1 = __msa_copy_u_w((v4i32)vec0, 2);
+ SD(dst0, dst);
+ SW(dst1, dst + 8);
+ src_ptr += 32;
+ dst += 12;
+ }
+}
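+
+// The shuffle mask point-samples offsets 0, 3 and 6 of every 8 bytes,
+// matching the scalar sketch:
+//   dst[0] = src_ptr[0]; dst[1] = src_ptr[3]; dst[2] = src_ptr[6];
+//   src_ptr += 8; dst += 3;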
+
+void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ int x, width;
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
+ uint64_t dst0;
+ uint32_t dst1;
+ v16u8 src0, src1, src2, src3, out;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
+ v8i16 zero = {0};
+ v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
+ v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
+ v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);
+ v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000);
+
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ width = dst_width / 3;
+
+ for (x = 0; x < width; x += 4) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+ src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+ vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+ vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+ vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+ vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+ vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+ vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+ vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
+ vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0);
+ vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1);
+ vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2);
+ vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3);
+ vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+ vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
+ vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+ tmp0 = __msa_hadd_u_w(vec4, vec4);
+ tmp1 = __msa_hadd_u_w(vec5, vec5);
+ tmp2 = __msa_hadd_u_w(vec6, vec6);
+ tmp3 = __msa_hadd_u_w(vec7, vec7);
+ tmp4 = __msa_hadd_u_w(vec0, vec0);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+ tmp0 = __msa_hadd_u_w(vec0, vec0);
+ tmp1 = __msa_hadd_u_w(vec1, vec1);
+ tmp0 *= const_0x2AAA;
+ tmp1 *= const_0x2AAA;
+ tmp4 *= const_0x4000;
+ tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
+ tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
+ tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
+ out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
+ dst0 = __msa_copy_u_d((v2i64)out, 0);
+ dst1 = __msa_copy_u_w((v4i32)out, 2);
+ SD(dst0, dst_ptr);
+ SW(dst1, dst_ptr + 8);
+ s += 32;
+ t += 32;
+ dst_ptr += 12;
+ }
+}
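+
+// const_0x2AAA and const_0x4000 act as 16.16 reciprocals: after the >> 16,
+// 0x2AAA ~= 65536 / 6 averages the six pixels of each 3x2 box, and
+// 0x4000 == 65536 / 4 averages the four pixels of the trailing 2x2 box.
+// The 3-row variant below uses 0x1C71 ~= 65536 / 9 the same way.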
+
+void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ int x, width;
+ const uint8_t* s = src_ptr;
+ const uint8_t* t0 = s + src_stride;
+ const uint8_t* t1 = s + src_stride * 2;
+ uint64_t dst0;
+ uint32_t dst1;
+ v16u8 src0, src1, src2, src3, src4, src5, out;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
+ v8u16 zero = {0};
+ v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
+ v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
+ v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71);
+ v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);
+
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ width = dst_width / 3;
+
+ for (x = 0; x < width; x += 4) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
+ src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
+ src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
+ src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
+ vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+ vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+ vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+ vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+ vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4);
+ vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4);
+ vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5);
+ vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5);
+ vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+ vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+ vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+ vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
+ vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
+ vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
+ vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
+ vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
+ vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0);
+ vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1);
+ vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2);
+ vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3);
+ vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+ vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
+ vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+ tmp0 = __msa_hadd_u_w(vec4, vec4);
+ tmp1 = __msa_hadd_u_w(vec5, vec5);
+ tmp2 = __msa_hadd_u_w(vec6, vec6);
+ tmp3 = __msa_hadd_u_w(vec7, vec7);
+ tmp4 = __msa_hadd_u_w(vec0, vec0);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+ tmp0 = __msa_hadd_u_w(vec0, vec0);
+ tmp1 = __msa_hadd_u_w(vec1, vec1);
+ tmp0 *= const_0x1C71;
+ tmp1 *= const_0x1C71;
+ tmp4 *= const_0x2AAA;
+ tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
+ tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
+ tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
+ vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
+ out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
+ dst0 = __msa_copy_u_d((v2i64)out, 0);
+ dst1 = __msa_copy_u_w((v4i32)out, 2);
+ SD(dst0, dst_ptr);
+ SW(dst1, dst_ptr + 8);
+ s += 32;
+ t0 += 32;
+ t1 += 32;
+ dst_ptr += 12;
+ }
+}
+
+void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
+ int x;
+ v16u8 src0;
+ v8u16 dst0, dst1;
+ v16i8 zero = {0};
+
+ assert(src_width > 0);
+
+ for (x = 0; x < src_width; x += 16) {
+ src0 = LD_UB(src_ptr);
+ dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0);
+ dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16);
+ dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
+ dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
+ ST_UH2(dst0, dst1, dst_ptr, 8);
+ src_ptr += 16;
+ dst_ptr += 16;
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
diff --git a/files/source/scale_neon.cc b/files/source/scale_neon.cc
index 44b0c808..9b4dce33 100644
--- a/files/source/scale_neon.cc
+++ b/files/source/scale_neon.cc
@@ -23,8 +23,11 @@ extern "C" {
// Provided by Fritz Koenig
// Read 32x1, throw away even pixels, and write 16x1.
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown2_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
"1: \n"
// load even pixels into q0, odd into q1
@@ -43,8 +46,11 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Read 32x1 average down and write 16x1.
-void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -66,8 +72,10 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Read 32x2 average down and write 16x1.
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown2Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
@@ -95,8 +103,11 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown4_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -113,12 +124,14 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
-void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown4Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride;
const uint8* src_ptr2 = src_ptr + src_stride * 2;
const uint8* src_ptr3 = src_ptr + src_stride * 3;
-asm volatile (
+ asm volatile (
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load up 16x4
@@ -155,7 +168,9 @@ asm volatile (
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -175,7 +190,8 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+ uint8* dst_ptr,
+ int dst_width) {
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
@@ -234,7 +250,8 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+ uint8* dst_ptr,
+ int dst_width) {
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
@@ -274,21 +291,20 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
}
#define HAS_SCALEROWDOWN38_NEON
-static uvec8 kShuf38 =
- { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
-static uvec8 kShuf38_2 =
- { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
-static vec16 kMult38_Div6 =
- { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
- 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
-static vec16 kMult38_Div9 =
- { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
- 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+static uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
+static uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12,
+ 18, 6, 14, 19, 0, 0, 0, 0};
+static vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12};
+static vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18};
// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
MEMACCESS(3)
"vld1.8 {q3}, [%3] \n"
@@ -314,7 +330,8 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+ uint8* dst_ptr,
+ int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride * 2;
asm volatile (
@@ -433,7 +450,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+ uint8* dst_ptr,
+ int dst_width) {
asm volatile (
MEMACCESS(4)
"vld1.16 {q13}, [%4] \n"
@@ -530,8 +548,11 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
);
}
-void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int src_width, int src_height) {
+void ScaleAddRows_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst_ptr,
+ int src_width,
+ int src_height) {
const uint8* src_tmp;
asm volatile (
"1: \n"
@@ -563,6 +584,7 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
+// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n) \
@@ -571,13 +593,17 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
+// clang-format on
-// The NEON version mimics this formula:
+// The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8)((int)(a) +
-// ((int)(f) * ((int)(b) - (int)(a)) >> 16))
+// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
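+// For example, with a = 10, b = 20 and f = 0x8000 (0.5 in 16.16 fixed
+// point): 10 + ((0x8000 * 10 + 0x8000) >> 16) = 10 + 5 = 15, the rounded
+// midpoint.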
-void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) {
+void ScaleFilterCols_NEON(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_ptr;
@@ -640,8 +666,10 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
- const uint8* src_ptr, ptrdiff_t src_stride,
- int dst_width, int source_y_fraction) {
+ const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
asm volatile (
"cmp %4, #0 \n"
"beq 100f \n"
@@ -737,8 +765,11 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
);
}
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
"1: \n"
// load even pixels into q0, odd into q1
@@ -760,8 +791,11 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
-void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -788,8 +822,10 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
);
}
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
@@ -829,8 +865,12 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
- int src_stepx, uint8* dst_argb, int dst_width) {
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
"mov r12, %3, lsl #2 \n"
"1: \n"
@@ -856,9 +896,11 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb, int dst_width) {
+ uint8* dst_argb,
+ int dst_width) {
asm volatile (
"mov r12, %4, lsl #2 \n"
"add %1, %1, %0 \n"
@@ -902,17 +944,22 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
);
}
+// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
-#define LOAD1_DATA32_LANE(dn, n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5, lsl #2 \n" \
- "add %3, %3, %4 \n" \
- MEMACCESS(6) \
- "vld1.32 {"#dn"["#n"]}, [%6] \n"
-
-void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
+#define LOAD1_DATA32_LANE(dn, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ MEMACCESS(6) \
+ "vld1.32 {" #dn "[" #n "]}, [%6] \n"
+// clang-format on
+
+void ScaleARGBCols_NEON(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
int tmp;
const uint8* src_tmp = src_argb;
asm volatile (
@@ -944,17 +991,22 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
#undef LOAD1_DATA32_LANE
+// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
-#define LOAD2_DATA32_LANE(dn1, dn2, n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5, lsl #2 \n" \
- "add %3, %3, %4 \n" \
- MEMACCESS(6) \
- "vld2.32 {"#dn1"["#n"], "#dn2"["#n"]}, [%6] \n"
-
-void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
+#define LOAD2_DATA32_LANE(dn1, dn2, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ MEMACCESS(6) \
+ "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
+// clang-format on
+
+void ScaleARGBFilterCols_NEON(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_argb;
diff --git a/files/source/scale_neon64.cc b/files/source/scale_neon64.cc
index ff277f26..a98b9d03 100644
--- a/files/source/scale_neon64.cc
+++ b/files/source/scale_neon64.cc
@@ -21,8 +21,11 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
// Read 32x1, throw away even pixels, and write 16x1.
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown2_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
"1: \n"
// load even pixels into v0, odd into v1
@@ -41,8 +44,11 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Read 32x1 average down and write 16x1.
-void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -64,8 +70,10 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Read 32x2 average down and write 16x1.
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown2Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
@@ -93,8 +101,11 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown4_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -111,12 +122,14 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
-void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown4Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride;
const uint8* src_ptr2 = src_ptr + src_stride * 2;
const uint8* src_ptr3 = src_ptr + src_stride * 3;
-asm volatile (
+ asm volatile (
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
@@ -152,15 +165,17 @@ asm volatile (
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
"1: \n"
MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #24 \n"
"orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
MEMACCESS(1)
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -172,15 +187,16 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+ uint8* dst_ptr,
+ int dst_width) {
asm volatile (
"movi v20.8b, #3 \n"
"add %3, %3, %0 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
MEMACCESS(3)
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"subs %w2, %w2, #24 \n"
// filter src line 0 with src line 1
@@ -232,7 +248,8 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+ uint8* dst_ptr,
+ int dst_width) {
asm volatile (
"movi v20.8b, #3 \n"
"add %3, %3, %0 \n"
@@ -273,29 +290,28 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
);
}
-static uvec8 kShuf38 =
- { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
-static uvec8 kShuf38_2 =
- { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 };
-static vec16 kMult38_Div6 =
- { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
- 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
-static vec16 kMult38_Div9 =
- { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
- 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+static uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
+static uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20,
+ 34, 6, 22, 35, 0, 0, 0, 0};
+static vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12};
+static vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18};
// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+ uint8* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
MEMACCESS(3)
"ld1 {v3.16b}, [%3] \n"
"1: \n"
MEMACCESS(0)
- "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
+ "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #12 \n"
- "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
+ "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
MEMACCESS(1)
"st1 {v2.8b}, [%1], #8 \n"
MEMACCESS(1)
@@ -312,7 +328,8 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+ uint8* dst_ptr,
+ int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride * 2;
ptrdiff_t tmp_src_stride = src_stride;
@@ -441,7 +458,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+ uint8* dst_ptr,
+ int dst_width) {
// TODO(fbarchard): use src_stride directly for clang 3.5+.
ptrdiff_t tmp_src_stride = src_stride;
asm volatile (
@@ -545,8 +563,11 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
);
}
-void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int src_width, int src_height) {
+void ScaleAddRows_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst_ptr,
+ int src_width,
+ int src_height) {
const uint8* src_tmp;
asm volatile (
"1: \n"
@@ -578,23 +599,32 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
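
ScaleAddRows folds src_height rows into one row of 16-bit column sums, which the caller later divides to finish a box filter. A minimal scalar sketch (name illustrative, not the library's C fallback):

void ScaleAddRowsSketch(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint16* dst_ptr, int src_width, int src_height) {
  for (int x = 0; x < src_width; ++x) {
    uint16 sum = 0;
    for (int y = 0; y < src_height; ++y) {
      sum += src_ptr[x + y * src_stride];  // column sum down the rows
    }
    dst_ptr[x] = sum;
  }
}
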
+// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
-#define LOAD2_DATA8_LANE(n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5 \n" \
- "add %3, %3, %4 \n" \
- MEMACCESS(6) \
- "ld2 {v4.b, v5.b}["#n"], [%6] \n"
-
-void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) {
+#define LOAD2_DATA8_LANE(n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5 \n" \
+ "add %3, %3, %4 \n" \
+ MEMACCESS(6) \
+ "ld2 {v4.b, v5.b}[" #n "], [%6] \n"
+// clang-format on
+
+// The NEON version mimics this formula (from row_common.cc):
+// #define BLENDER(a, b, f) (uint8)((int)(a) +
+// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
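
A worked instance of that blend, as a hedged scalar sketch (helper name illustrative):

static inline uint8 BlendPixel(uint8 a, uint8 b, int f16) {
  // f16 is a 16.16 fraction of the distance from a to b; the 0x8000
  // term rounds before the shift.
  return (uint8)((int)a + ((f16 * ((int)b - (int)a) + 0x8000) >> 16));
}

For a = 100, b = 200 and f16 = 0x8000 (one half), this yields 150.
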
+
+void ScaleFilterCols_NEON(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_ptr;
- int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
- int64 x64 = (int64) x;
- int64 dx64 = (int64) dx;
+ int64 dst_width64 = (int64)dst_width; // Work around ios 64 bit warning.
+ int64 x64 = (int64)x;
+ int64 dx64 = (int64)dx;
asm volatile (
"dup v0.4s, %w3 \n" // x
"dup v1.4s, %w4 \n" // dx
@@ -626,8 +656,8 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
"ushll2 v6.4s, v6.8h, #0 \n"
"mul v16.4s, v16.4s, v7.4s \n"
"mul v17.4s, v17.4s, v6.4s \n"
- "rshrn v6.4h, v16.4s, #16 \n"
- "rshrn2 v6.8h, v17.4s, #16 \n"
+ "rshrn v6.4h, v16.4s, #16 \n"
+ "rshrn2 v6.8h, v17.4s, #16 \n"
"add v4.8h, v4.8h, v6.8h \n"
"xtn v4.8b, v4.8h \n"
@@ -654,9 +684,11 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
- const uint8* src_ptr, ptrdiff_t src_stride,
- int dst_width, int source_y_fraction) {
- int y_fraction = 256 - source_y_fraction;
+ const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ int y_fraction = 256 - source_y_fraction;
asm volatile (
"cmp %w4, #0 \n"
"b.eq 100f \n"
@@ -752,8 +784,11 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
);
}
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
"1: \n"
// load even pixels into q0, odd into q1
@@ -775,8 +810,11 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
-void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
"1: \n"
MEMACCESS (0)
@@ -802,8 +840,10 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
);
}
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst,
+ int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
@@ -839,8 +879,12 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
- int src_stepx, uint8* dst_argb, int dst_width) {
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width) {
+ (void)src_stride;
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -867,9 +911,11 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
// Alignment requirement: src_argb 4 byte aligned.
// TODO(Yang Zhang): Might be worth another optimization pass in future.
// It could be upgraded to 8 pixels at a time to start with.
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb, int dst_width) {
+ uint8* dst_argb,
+ int dst_width) {
asm volatile (
"add %1, %1, %0 \n"
"1: \n"
@@ -916,21 +962,26 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
);
}
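
For orientation, a minimal scalar equivalent of the even-pixel ARGB down-samplers above (sketch only; src_stride is used by the box variant, not here):

void ScaleARGBRowDownEvenSketch(const uint8* src_argb, int src_stepx,
                                uint8* dst_argb, int dst_width) {
  const uint32* src = (const uint32*)src_argb;  // one ARGB pixel per word
  uint32* dst = (uint32*)dst_argb;
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];  // keep every src_stepx-th pixel
  }
}
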
+// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
-#define LOAD1_DATA32_LANE(vn, n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5, lsl #2 \n" \
- "add %3, %3, %4 \n" \
- MEMACCESS(6) \
- "ld1 {"#vn".s}["#n"], [%6] \n"
-
-void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
+#define LOAD1_DATA32_LANE(vn, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ MEMACCESS(6) \
+ "ld1 {" #vn ".s}[" #n "], [%6] \n"
+// clang-format on
+
+void ScaleARGBCols_NEON(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
const uint8* src_tmp = src_argb;
- int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
- int64 x64 = (int64) x;
- int64 dx64 = (int64) dx;
+ int64 dst_width64 = (int64)dst_width; // Work around ios 64 bit warning.
+ int64 x64 = (int64)x;
+ int64 dx64 = (int64)dx;
int64 tmp64;
asm volatile (
"1: \n"
@@ -961,23 +1012,28 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
#undef LOAD1_DATA32_LANE
+// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
-#define LOAD2_DATA32_LANE(vn1, vn2, n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5, lsl #2 \n" \
- "add %3, %3, %4 \n" \
- MEMACCESS(6) \
- "ld2 {"#vn1".s, "#vn2".s}["#n"], [%6] \n"
-
-void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
+#define LOAD2_DATA32_LANE(vn1, vn2, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ MEMACCESS(6) \
+ "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n"
+// clang-format on
+
+void ScaleARGBFilterCols_NEON(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_argb;
- int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
- int64 x64 = (int64) x;
- int64 dx64 = (int64) dx;
+ int64 dst_width64 = (int64)dst_width; // Work around ios 64 bit warning.
+ int64 x64 = (int64)x;
+ int64 dx64 = (int64)dx;
asm volatile (
"dup v0.4s, %w3 \n" // x
"dup v1.4s, %w4 \n" // dx
diff --git a/files/source/scale_win.cc b/files/source/scale_win.cc
index f1709736..0c5b3a1e 100644
--- a/files/source/scale_win.cc
+++ b/files/source/scale_win.cc
@@ -20,94 +20,89 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
// Offsets for source bytes 0 to 9
-static uvec8 kShuf0 =
- { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
+ 128, 128, 128, 128, 128, 128, 128, 128};
// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-static uvec8 kShuf1 =
- { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
+ 128, 128, 128, 128, 128, 128, 128, 128};
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
-static uvec8 kShuf2 =
- { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
+ 128, 128, 128, 128, 128, 128, 128, 128};
// Offsets for source bytes 0 to 10
-static uvec8 kShuf01 =
- { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
-static uvec8 kShuf11 =
- { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
-static uvec8 kShuf21 =
- { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+static uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
+ 10, 11, 12, 13, 13, 14, 14, 15};
// Coefficients for source bytes 0 to 10
-static uvec8 kMadd01 =
- { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
// Coefficients for source bytes 10 to 21
-static uvec8 kMadd11 =
- { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
// Coefficients for source bytes 21 to 31
-static uvec8 kMadd21 =
- { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
// Coefficients for source bytes 21 to 31
-static vec16 kRound34 =
- { 2, 2, 2, 2, 2, 2, 2, 2 };
+static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
-static uvec8 kShuf38a =
- { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128};
-static uvec8 kShuf38b =
- { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
+ 6, 8, 11, 14, 128, 128, 128, 128};
// Arrange words 0,3,6 into 0,1,2
-static uvec8 kShufAc =
- { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128};
// Arrange words 0,3,6 into 3,4,5
-static uvec8 kShufAc3 =
- { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
+ 6, 7, 12, 13, 128, 128, 128, 128};
// Scaling values for boxes of 3x3 and 2x3
-static uvec16 kScaleAc33 =
- { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
+ 65536 / 9, 65536 / 6, 0, 0};
// Arrange first value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb0 =
- { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+static uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
+ 11, 128, 14, 128, 128, 128, 128, 128};
// Arrange second value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb1 =
- { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+static uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
+ 12, 128, 15, 128, 128, 128, 128, 128};
// Arrange third value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb2 =
- { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
+ 13, 128, 128, 128, 128, 128, 128, 128};
// Scaling values for boxes of 3x2 and 2x2
-static uvec16 kScaleAb2 =
- { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
+ 65536 / 3, 65536 / 2, 0, 0};
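
The kScaleAc33/kScaleAb2 words turn pmulhuw into a divider: pmulhuw keeps (a * b) >> 16 per 16-bit lane, so multiplying a box sum by 65536 / 9 approximates division by 9, truncating slightly low since 7281 * 9 = 65529. A hedged scalar check:

int sum = 9 * 37;                     // a 3x3 box, all samples 37
int avg = (sum * (65536 / 9)) >> 16;  // 36, one low of the exact 37
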
// Reads 32 pixels, throws half away and writes 16 pixels.
-__declspec(naked)
-void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
__asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
- psrlw xmm0, 8 // isolate odd pixels.
+ psrlw xmm0, 8 // isolate odd pixels.
psrlw xmm1, 8
packuswb xmm0, xmm1
movdqu [edx], xmm0
@@ -120,27 +115,28 @@ void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
}
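
The psrlw/packuswb pair above is a pure point sampler: each 16-bit lane holds two adjacent pixels, the shift keeps the odd one, and the pack narrows back to bytes. A scalar sketch of the same selection (name illustrative):

void ScaleRowDown2Sketch(const uint8* src_ptr, uint8* dst_ptr,
                         int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[2 * x + 1];  // keep odd pixels, drop even
  }
}
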
// Blends 32x1 rectangle to 16x1.
-__declspec(naked)
-void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
__asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
+ mov eax, [esp + 4] // src_ptr
+ // src_stride
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
- pcmpeqb xmm4, xmm4 // constant 0x0101
+ pcmpeqb xmm4, xmm4 // constant 0x0101
psrlw xmm4, 15
packuswb xmm4, xmm4
- pxor xmm5, xmm5 // constant 0
+ pxor xmm5, xmm5 // constant 0
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
- pmaddubsw xmm0, xmm4 // horizontal add
+ pmaddubsw xmm0, xmm4 // horizontal add
pmaddubsw xmm1, xmm4
- pavgw xmm0, xmm5 // (x + 1) / 2
+ pavgw xmm0, xmm5 // (x + 1) / 2
pavgw xmm1, xmm5
packuswb xmm0, xmm1
movdqu [edx], xmm0
@@ -153,20 +149,21 @@ void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Blends 32x2 rectangle to 16x1.
-__declspec(naked)
-void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
- pcmpeqb xmm4, xmm4 // constant 0x0101
+ pcmpeqb xmm4, xmm4 // constant 0x0101
psrlw xmm4, 15
packuswb xmm4, xmm4
- pxor xmm5, xmm5 // constant 0
+ pxor xmm5, xmm5 // constant 0
wloop:
movdqu xmm0, [eax]
@@ -174,15 +171,15 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16]
lea eax, [eax + 32]
- pmaddubsw xmm0, xmm4 // horizontal add
+ pmaddubsw xmm0, xmm4 // horizontal add
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
pmaddubsw xmm3, xmm4
- paddw xmm0, xmm2 // vertical add
+ paddw xmm0, xmm2 // vertical add
paddw xmm1, xmm3
psrlw xmm0, 1
psrlw xmm1, 1
- pavgw xmm0, xmm5 // (x + 1) / 2
+ pavgw xmm0, xmm5 // (x + 1) / 2
pavgw xmm1, xmm5
packuswb xmm0, xmm1
movdqu [edx], xmm0
@@ -197,23 +194,24 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
-__declspec(naked)
-void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
__asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
wloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
- vpsrlw ymm0, ymm0, 8 // isolate odd pixels.
+ vpsrlw ymm0, ymm0, 8 // isolate odd pixels.
vpsrlw ymm1, ymm1, 8
vpackuswb ymm0, ymm0, ymm1
- vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
@@ -225,30 +223,31 @@ void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Blends 64x1 rectangle to 32x1.
-__declspec(naked)
-void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
__asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
+ mov eax, [esp + 4] // src_ptr
+ // src_stride
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
- vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
+ vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
vpsrlw ymm4, ymm4, 15
vpackuswb ymm4, ymm4, ymm4
- vpxor ymm5, ymm5, ymm5 // constant 0
+ vpxor ymm5, ymm5, ymm5 // constant 0
wloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
- vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
+ vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
vpmaddubsw ymm1, ymm1, ymm4
- vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
+ vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
vpavgw ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
- vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
@@ -262,20 +261,21 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
// For rounding, average = (sum + 2) / 4
// becomes average((sum >> 1), 0)
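
A hedged scalar check of that identity (pavgw with zero computes (x + 1) >> 1):

int sum = 7;                    // vertical sum of two row sums
int a = (sum + 2) / 4;          // direct rounded average: 2
int b = ((sum >> 1) + 1) >> 1;  // psrlw by 1, then pavgw with 0: also 2

The two agree for every sum, which is why the shift-then-average pair below needs no explicit rounding constant.
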
// Blends 64x2 rectangle to 32x1.
-__declspec(naked)
-void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
- vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
+ vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
vpsrlw ymm4, ymm4, 15
vpackuswb ymm4, ymm4, ymm4
- vpxor ymm5, ymm5, ymm5 // constant 0
+ vpxor ymm5, ymm5, ymm5 // constant 0
wloop:
vmovdqu ymm0, [eax]
@@ -283,18 +283,18 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
vmovdqu ymm2, [eax + esi]
vmovdqu ymm3, [eax + esi + 32]
lea eax, [eax + 64]
- vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
+ vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
vpmaddubsw ymm1, ymm1, ymm4
vpmaddubsw ymm2, ymm2, ymm4
vpmaddubsw ymm3, ymm3, ymm4
- vpaddw ymm0, ymm0, ymm2 // vertical add
+ vpaddw ymm0, ymm0, ymm2 // vertical add
vpaddw ymm1, ymm1, ymm3
- vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2
+ vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2
vpsrlw ymm1, ymm1, 1
- vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
+ vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
vpavgw ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
- vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
@@ -308,15 +308,16 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
#endif // HAS_SCALEROWDOWN2_AVX2
// Point samples 32 pixels to 8 pixels.
-__declspec(naked)
-void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown4_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
__asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000
psrld xmm5, 24
pslld xmm5, 16
@@ -339,50 +340,51 @@ void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Blends 32x4 rectangle to 8x1.
-__declspec(naked)
-void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_ptr
- mov esi, [esp + 8 + 8] // src_stride
- mov edx, [esp + 8 + 12] // dst_ptr
- mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 4] // src_ptr
+ mov esi, [esp + 8 + 8] // src_stride
+ mov edx, [esp + 8 + 12] // dst_ptr
+ mov ecx, [esp + 8 + 16] // dst_width
lea edi, [esi + esi * 2] // src_stride * 3
- pcmpeqb xmm4, xmm4 // constant 0x0101
+ pcmpeqb xmm4, xmm4 // constant 0x0101
psrlw xmm4, 15
movdqa xmm5, xmm4
packuswb xmm4, xmm4
- psllw xmm5, 3 // constant 0x0008
+ psllw xmm5, 3 // constant 0x0008
wloop:
- movdqu xmm0, [eax] // average rows
+ movdqu xmm0, [eax] // average rows
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16]
- pmaddubsw xmm0, xmm4 // horizontal add
+ pmaddubsw xmm0, xmm4 // horizontal add
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
pmaddubsw xmm3, xmm4
- paddw xmm0, xmm2 // vertical add rows 0, 1
+ paddw xmm0, xmm2 // vertical add rows 0, 1
paddw xmm1, xmm3
movdqu xmm2, [eax + esi * 2]
movdqu xmm3, [eax + esi * 2 + 16]
pmaddubsw xmm2, xmm4
pmaddubsw xmm3, xmm4
- paddw xmm0, xmm2 // add row 2
+ paddw xmm0, xmm2 // add row 2
paddw xmm1, xmm3
movdqu xmm2, [eax + edi]
movdqu xmm3, [eax + edi + 16]
lea eax, [eax + 32]
pmaddubsw xmm2, xmm4
pmaddubsw xmm3, xmm4
- paddw xmm0, xmm2 // add row 3
+ paddw xmm0, xmm2 // add row 3
paddw xmm1, xmm3
phaddw xmm0, xmm1
- paddw xmm0, xmm5 // + 8 for round
- psrlw xmm0, 4 // /16 for average of 4 * 4
+ paddw xmm0, xmm5 // + 8 for round
+ psrlw xmm0, 4 // /16 for average of 4 * 4
packuswb xmm0, xmm0
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
@@ -397,15 +399,16 @@ void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
-__declspec(naked)
-void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown4_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
__asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000
vpsrld ymm5, ymm5, 24
vpslld ymm5, ymm5, 16
@@ -416,10 +419,10 @@ void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
vpand ymm0, ymm0, ymm5
vpand ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
- vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vpsrlw ymm0, ymm0, 8
vpackuswb ymm0, ymm0, ymm0
- vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
@@ -431,52 +434,53 @@ void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Blends 64x4 rectangle to 16x1.
-__declspec(naked)
-void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_ptr
- mov esi, [esp + 8 + 8] // src_stride
- mov edx, [esp + 8 + 12] // dst_ptr
- mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 4] // src_ptr
+ mov esi, [esp + 8 + 8] // src_stride
+ mov edx, [esp + 8 + 12] // dst_ptr
+ mov ecx, [esp + 8 + 16] // dst_width
lea edi, [esi + esi * 2] // src_stride * 3
- vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101
+ vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101
vpsrlw ymm4, ymm4, 15
- vpsllw ymm5, ymm4, 3 // constant 0x0008
+ vpsllw ymm5, ymm4, 3 // constant 0x0008
vpackuswb ymm4, ymm4, ymm4
wloop:
- vmovdqu ymm0, [eax] // average rows
+ vmovdqu ymm0, [eax] // average rows
vmovdqu ymm1, [eax + 32]
vmovdqu ymm2, [eax + esi]
vmovdqu ymm3, [eax + esi + 32]
- vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
+ vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
vpmaddubsw ymm1, ymm1, ymm4
vpmaddubsw ymm2, ymm2, ymm4
vpmaddubsw ymm3, ymm3, ymm4
- vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1
+ vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1
vpaddw ymm1, ymm1, ymm3
vmovdqu ymm2, [eax + esi * 2]
vmovdqu ymm3, [eax + esi * 2 + 32]
vpmaddubsw ymm2, ymm2, ymm4
vpmaddubsw ymm3, ymm3, ymm4
- vpaddw ymm0, ymm0, ymm2 // add row 2
+ vpaddw ymm0, ymm0, ymm2 // add row 2
vpaddw ymm1, ymm1, ymm3
vmovdqu ymm2, [eax + edi]
vmovdqu ymm3, [eax + edi + 32]
lea eax, [eax + 64]
vpmaddubsw ymm2, ymm2, ymm4
vpmaddubsw ymm3, ymm3, ymm4
- vpaddw ymm0, ymm0, ymm2 // add row 3
+ vpaddw ymm0, ymm0, ymm2 // add row 3
vpaddw ymm1, ymm1, ymm3
- vphaddw ymm0, ymm0, ymm1 // mutates
- vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw
- vpaddw ymm0, ymm0, ymm5 // + 8 for round
- vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4
+ vphaddw ymm0, ymm0, ymm1 // mutates
+ vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw
+ vpaddw ymm0, ymm0, ymm5 // + 8 for round
+    vpsrlw     ymm0, ymm0, 4                 // /16 for average of 4 * 4
vpackuswb ymm0, ymm0, ymm0
- vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
@@ -494,14 +498,15 @@ void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
-__declspec(naked)
-void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown34_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
__asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
movdqa xmm3, xmmword ptr kShuf0
movdqa xmm4, xmmword ptr kShuf1
movdqa xmm5, xmmword ptr kShuf2
@@ -541,16 +546,16 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
// xmm7 kRound34
// Note that movdqa+palign may be better than movdqu.
-__declspec(naked)
-void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
movdqa xmm2, xmmword ptr kShuf01
movdqa xmm3, xmmword ptr kShuf11
movdqa xmm4, xmmword ptr kShuf21
@@ -559,7 +564,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
movdqa xmm7, xmmword ptr kRound34
wloop:
- movdqu xmm0, [eax] // pixels 0..7
+ movdqu xmm0, [eax] // pixels 0..7
movdqu xmm1, [eax + esi]
pavgb xmm0, xmm1
pshufb xmm0, xmm2
@@ -568,7 +573,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
psrlw xmm0, 2
packuswb xmm0, xmm0
movq qword ptr [edx], xmm0
- movdqu xmm0, [eax + 8] // pixels 8..15
+ movdqu xmm0, [eax + 8] // pixels 8..15
movdqu xmm1, [eax + esi + 8]
pavgb xmm0, xmm1
pshufb xmm0, xmm3
@@ -577,7 +582,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
psrlw xmm0, 2
packuswb xmm0, xmm0
movq qword ptr [edx + 8], xmm0
- movdqu xmm0, [eax + 16] // pixels 16..23
+ movdqu xmm0, [eax + 16] // pixels 16..23
movdqu xmm1, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm1
@@ -598,16 +603,16 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
}
// Note that movdqa+palign may be better than movdqu.
-__declspec(naked)
-void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
movdqa xmm2, xmmword ptr kShuf01
movdqa xmm3, xmmword ptr kShuf11
movdqa xmm4, xmmword ptr kShuf21
@@ -616,7 +621,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
movdqa xmm7, xmmword ptr kRound34
wloop:
- movdqu xmm0, [eax] // pixels 0..7
+ movdqu xmm0, [eax] // pixels 0..7
movdqu xmm1, [eax + esi]
pavgb xmm1, xmm0
pavgb xmm0, xmm1
@@ -626,7 +631,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
psrlw xmm0, 2
packuswb xmm0, xmm0
movq qword ptr [edx], xmm0
- movdqu xmm0, [eax + 8] // pixels 8..15
+ movdqu xmm0, [eax + 8] // pixels 8..15
movdqu xmm1, [eax + esi + 8]
pavgb xmm1, xmm0
pavgb xmm0, xmm1
@@ -636,7 +641,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
psrlw xmm0, 2
packuswb xmm0, xmm0
movq qword ptr [edx + 8], xmm0
- movdqu xmm0, [eax + 16] // pixels 16..23
+ movdqu xmm0, [eax + 16] // pixels 16..23
movdqu xmm1, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm1, xmm0
@@ -660,26 +665,27 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
// 3/8 point sampler
// Scale 32 pixels to 12
-__declspec(naked)
-void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown38_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
__asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
movdqa xmm4, xmmword ptr kShuf38a
movdqa xmm5, xmmword ptr kShuf38b
xloop:
- movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
- movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
+ movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
+ movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
lea eax, [eax + 32]
pshufb xmm0, xmm4
pshufb xmm1, xmm5
paddusb xmm0, xmm1
- movq qword ptr [edx], xmm0 // write 12 pixels
+ movq qword ptr [edx], xmm0 // write 12 pixels
movhlps xmm1, xmm0
movd [edx + 8], xmm1
lea edx, [edx + 12]
@@ -691,23 +697,23 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Scale 16x3 pixels to 6x1 with interpolation
-__declspec(naked)
-void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
movdqa xmm2, xmmword ptr kShufAc
movdqa xmm3, xmmword ptr kShufAc3
movdqa xmm4, xmmword ptr kScaleAc33
pxor xmm5, xmm5
xloop:
- movdqu xmm0, [eax] // sum up 3 rows into xmm0/1
+ movdqu xmm0, [eax] // sum up 3 rows into xmm0/1
movdqu xmm6, [eax + esi]
movhlps xmm1, xmm0
movhlps xmm7, xmm6
@@ -725,14 +731,14 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
paddusw xmm0, xmm6
paddusw xmm1, xmm7
- movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6
+ movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6
psrldq xmm0, 2
paddusw xmm6, xmm0
psrldq xmm0, 2
paddusw xmm6, xmm0
pshufb xmm6, xmm2
- movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6
+ movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6
psrldq xmm1, 2
paddusw xmm7, xmm1
psrldq xmm1, 2
@@ -740,10 +746,10 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
pshufb xmm7, xmm3
paddusw xmm6, xmm7
- pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
+ pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
packuswb xmm6, xmm6
- movd [edx], xmm6 // write 6 pixels
+ movd [edx], xmm6 // write 6 pixels
psrlq xmm6, 16
movd [edx + 2], xmm6
lea edx, [edx + 6]
@@ -756,28 +762,28 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
}
// Scale 16x2 pixels to 6x1 with interpolation
-__declspec(naked)
-void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr,
+ int dst_width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
movdqa xmm2, xmmword ptr kShufAb0
movdqa xmm3, xmmword ptr kShufAb1
movdqa xmm4, xmmword ptr kShufAb2
movdqa xmm5, xmmword ptr kScaleAb2
xloop:
- movdqu xmm0, [eax] // average 2 rows into xmm0
+ movdqu xmm0, [eax] // average 2 rows into xmm0
movdqu xmm1, [eax + esi]
lea eax, [eax + 16]
pavgb xmm0, xmm1
- movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
+ movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
pshufb xmm1, xmm2
movdqa xmm6, xmm0
pshufb xmm6, xmm3
@@ -785,10 +791,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
pshufb xmm0, xmm4
paddusw xmm1, xmm0
- pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
+ pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
packuswb xmm1, xmm1
- movd [edx], xmm1 // write 6 pixels
+ movd [edx], xmm1 // write 6 pixels
psrlq xmm1, 16
movd [edx + 2], xmm1
lea edx, [edx + 6]
@@ -801,26 +807,27 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
}
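
The ScaleAddRow routines below accumulate one source row into 16-bit column sums that a later pass divides to complete a box filter. A hedged scalar form (name illustrative):

void ScaleAddRowSketch(const uint8* src_ptr, uint16* dst_ptr,
                       int src_width) {
  for (int x = 0; x < src_width; ++x) {
    dst_ptr[x] += src_ptr[x];  // widen each byte and accumulate
  }
}
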
// Reads 16 bytes and accumulates to 16 shorts at a time.
-__declspec(naked)
-void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+__declspec(naked) void ScaleAddRow_SSE2(const uint8* src_ptr,
+ uint16* dst_ptr,
+ int src_width) {
__asm {
- mov eax, [esp + 4] // src_ptr
- mov edx, [esp + 8] // dst_ptr
+ mov eax, [esp + 4] // src_ptr
+ mov edx, [esp + 8] // dst_ptr
mov ecx, [esp + 12] // src_width
pxor xmm5, xmm5
- // sum rows
+ // sum rows
xloop:
- movdqu xmm3, [eax] // read 16 bytes
+ movdqu xmm3, [eax] // read 16 bytes
lea eax, [eax + 16]
- movdqu xmm0, [edx] // read 16 words from destination
+ movdqu xmm0, [edx] // read 16 words from destination
movdqu xmm1, [edx + 16]
movdqa xmm2, xmm3
punpcklbw xmm2, xmm5
punpckhbw xmm3, xmm5
- paddusw xmm0, xmm2 // sum 16 words
+ paddusw xmm0, xmm2 // sum 16 words
paddusw xmm1, xmm3
- movdqu [edx], xmm0 // write 16 words to destination
+ movdqu [edx], xmm0 // write 16 words to destination
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 16
@@ -831,24 +838,25 @@ void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
-__declspec(naked)
-void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+__declspec(naked) void ScaleAddRow_AVX2(const uint8* src_ptr,
+ uint16* dst_ptr,
+ int src_width) {
__asm {
- mov eax, [esp + 4] // src_ptr
- mov edx, [esp + 8] // dst_ptr
+ mov eax, [esp + 4] // src_ptr
+ mov edx, [esp + 8] // dst_ptr
mov ecx, [esp + 12] // src_width
vpxor ymm5, ymm5, ymm5
- // sum rows
+ // sum rows
xloop:
- vmovdqu ymm3, [eax] // read 32 bytes
+ vmovdqu ymm3, [eax] // read 32 bytes
lea eax, [eax + 32]
vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck
vpunpcklbw ymm2, ymm3, ymm5
vpunpckhbw ymm3, ymm3, ymm5
- vpaddusw ymm0, ymm2, [edx] // sum 16 words
+ vpaddusw ymm0, ymm2, [edx] // sum 16 words
vpaddusw ymm1, ymm3, [edx + 32]
- vmovdqu [edx], ymm0 // write 32 words to destination
+ vmovdqu [edx], ymm0 // write 32 words to destination
vmovdqu [edx + 32], ymm1
lea edx, [edx + 64]
sub ecx, 32
@@ -862,86 +870,87 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
// Constant for making pixels signed to avoid pmaddubsw
// saturation.
-static uvec8 kFsub80 =
- { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
+static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
// Constant for making pixels unsigned and adding .5 for rounding.
-static uvec16 kFadd40 =
- { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
+static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
+ 0x4040, 0x4040, 0x4040, 0x4040};
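
The bias arithmetic, sketched in scalar form: with 7-bit fractions f0 + f1 = 128 and pixels biased by kFsub80 (p - 0x80), a pmaddubsw lane yields f0*(a-128) + f1*(b-128) = 128*blend - 16384. Adding kFadd40 (0x4040 = 16384 + 64) removes the bias and rounds, so the later psrlw by 7 leaves the blended pixel:

int a = 100, b = 200, f1 = 64, f0 = 128 - f1;  // a half/half blend
int acc = f0 * (a - 128) + f1 * (b - 128);     // one pmaddubsw lane
int pix = (acc + 0x4040) >> 7;                 // kFadd40, then psrlw: 150
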
// Bilinear column filtering. SSSE3 version.
-__declspec(naked)
-void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) {
+__declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
__asm {
push ebx
push esi
push edi
- mov edi, [esp + 12 + 4] // dst_ptr
- mov esi, [esp + 12 + 8] // src_ptr
- mov ecx, [esp + 12 + 12] // dst_width
+ mov edi, [esp + 12 + 4] // dst_ptr
+ mov esi, [esp + 12 + 8] // src_ptr
+ mov ecx, [esp + 12 + 12] // dst_width
movd xmm2, [esp + 12 + 16] // x
movd xmm3, [esp + 12 + 20] // dx
- mov eax, 0x04040000 // shuffle to line up fractions with pixel.
+ mov eax, 0x04040000 // shuffle to line up fractions with pixel.
movd xmm5, eax
- pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
+ pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
psrlw xmm6, 9
- pcmpeqb xmm7, xmm7 // generate 0x0001
+ pcmpeqb xmm7, xmm7 // generate 0x0001
psrlw xmm7, 15
- pextrw eax, xmm2, 1 // get x0 integer. preroll
+ pextrw eax, xmm2, 1 // get x0 integer. preroll
sub ecx, 2
jl xloop29
- movdqa xmm0, xmm2 // x1 = x0 + dx
+ movdqa xmm0, xmm2 // x1 = x0 + dx
paddd xmm0, xmm3
- punpckldq xmm2, xmm0 // x0 x1
- punpckldq xmm3, xmm3 // dx dx
- paddd xmm3, xmm3 // dx * 2, dx * 2
- pextrw edx, xmm2, 3 // get x1 integer. preroll
+ punpckldq xmm2, xmm0 // x0 x1
+ punpckldq xmm3, xmm3 // dx dx
+ paddd xmm3, xmm3 // dx * 2, dx * 2
+ pextrw edx, xmm2, 3 // get x1 integer. preroll
// 2 Pixel loop.
xloop2:
- movdqa xmm1, xmm2 // x0, x1 fractions.
- paddd xmm2, xmm3 // x += dx
+ movdqa xmm1, xmm2 // x0, x1 fractions.
+ paddd xmm2, xmm3 // x += dx
movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
movd xmm0, ebx
- psrlw xmm1, 9 // 7 bit fractions.
+ psrlw xmm1, 9 // 7 bit fractions.
movzx ebx, word ptr [esi + edx] // 2 source x1 pixels
movd xmm4, ebx
- pshufb xmm1, xmm5 // 0011
+ pshufb xmm1, xmm5 // 0011
punpcklwd xmm0, xmm4
psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
- pxor xmm1, xmm6 // 0..7f and 7f..0
- paddusb xmm1, xmm7 // +1 so 0..7f and 80..1
- pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels.
- pextrw eax, xmm2, 1 // get x0 integer. next iteration.
- pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ pxor xmm1, xmm6 // 0..7f and 7f..0
+ paddusb xmm1, xmm7 // +1 so 0..7f and 80..1
+ pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels.
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round.
- psrlw xmm1, 7 // 8.7 fixed point to low 8 bits.
- packuswb xmm1, xmm1 // 8 bits, 2 pixels.
+ psrlw xmm1, 7 // 8.7 fixed point to low 8 bits.
+ packuswb xmm1, xmm1 // 8 bits, 2 pixels.
movd ebx, xmm1
mov [edi], bx
lea edi, [edi + 2]
- sub ecx, 2 // 2 pixels
+ sub ecx, 2 // 2 pixels
jge xloop2
xloop29:
add ecx, 2 - 1
jl xloop99
- // 1 pixel remainder
+ // 1 pixel remainder
movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
movd xmm0, ebx
- psrlw xmm2, 9 // 7 bit fractions.
- pshufb xmm2, xmm5 // 0011
+ psrlw xmm2, 9 // 7 bit fractions.
+ pshufb xmm2, xmm5 // 0011
psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
- pxor xmm2, xmm6 // 0..7f and 7f..0
- paddusb xmm2, xmm7 // +1 so 0..7f and 80..1
- pmaddubsw xmm2, xmm0 // 16 bit
+ pxor xmm2, xmm6 // 0..7f and 7f..0
+ paddusb xmm2, xmm7 // +1 so 0..7f and 80..1
+ pmaddubsw xmm2, xmm0 // 16 bit
paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round.
- psrlw xmm2, 7 // 8.7 fixed point to low 8 bits.
- packuswb xmm2, xmm2 // 8 bits
+ psrlw xmm2, 7 // 8.7 fixed point to low 8 bits.
+ packuswb xmm2, xmm2 // 8 bits
movd ebx, xmm2
mov [edi], bl
@@ -955,13 +964,15 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
}
// Reads 16 pixels, duplicates them and writes 32 pixels.
-__declspec(naked)
-void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) {
+__declspec(naked) void ScaleColsUp2_SSE2(uint8* dst_ptr,
+ const uint8* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
__asm {
- mov edx, [esp + 4] // dst_ptr
- mov eax, [esp + 8] // src_ptr
- mov ecx, [esp + 12] // dst_width
+ mov edx, [esp + 4] // dst_ptr
+ mov eax, [esp + 8] // src_ptr
+ mov ecx, [esp + 12] // dst_width
wloop:
movdqu xmm0, [eax]
@@ -980,15 +991,15 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
}
// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
-__declspec(naked)
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
- ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width) {
__asm {
- mov eax, [esp + 4] // src_argb
- // src_stride ignored
- mov edx, [esp + 12] // dst_argb
- mov ecx, [esp + 16] // dst_width
+ mov eax, [esp + 4] // src_argb
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_argb
+ mov ecx, [esp + 16] // dst_width
wloop:
movdqu xmm0, [eax]
@@ -1005,23 +1016,23 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
}
// Blends 8x1 rectangle to 4x1.
-__declspec(naked)
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
- ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width) {
__asm {
- mov eax, [esp + 4] // src_argb
- // src_stride ignored
- mov edx, [esp + 12] // dst_argb
- mov ecx, [esp + 16] // dst_width
+ mov eax, [esp + 4] // src_argb
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_argb
+ mov ecx, [esp + 16] // dst_width
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
movdqa xmm2, xmm0
- shufps xmm0, xmm1, 0x88 // even pixels
- shufps xmm2, xmm1, 0xdd // odd pixels
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2
movdqu [edx], xmm0
lea edx, [edx + 16]
@@ -1033,16 +1044,16 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
}
// Blends 8x2 rectangle to 4x1.
-__declspec(naked)
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
- ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb,
+ int dst_width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // dst_width
+ mov eax, [esp + 4 + 4] // src_argb
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // dst_width
wloop:
movdqu xmm0, [eax]
@@ -1050,11 +1061,11 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16]
lea eax, [eax + 32]
- pavgb xmm0, xmm2 // average rows
+ pavgb xmm0, xmm2 // average rows
pavgb xmm1, xmm3
- movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
- shufps xmm0, xmm1, 0x88 // even pixels
- shufps xmm2, xmm1, 0xdd // odd pixels
+ movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2
movdqu [edx], xmm0
lea edx, [edx + 16]
@@ -1067,18 +1078,19 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
}
// Reads 4 pixels at a time.
-__declspec(naked)
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
- int src_stepx,
- uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width) {
__asm {
push ebx
push edi
- mov eax, [esp + 8 + 4] // src_argb
- // src_stride ignored
- mov ebx, [esp + 8 + 12] // src_stepx
- mov edx, [esp + 8 + 16] // dst_argb
- mov ecx, [esp + 8 + 20] // dst_width
+ mov eax, [esp + 8 + 4] // src_argb
+ // src_stride ignored
+ mov ebx, [esp + 8 + 12] // src_stepx
+ mov edx, [esp + 8 + 16] // dst_argb
+ mov ecx, [esp + 8 + 20] // dst_width
lea ebx, [ebx * 4]
lea edi, [ebx + ebx * 2]
@@ -1103,21 +1115,21 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
}
// Blends four 2x2 to 4x1.
-__declspec(naked)
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb,
+ int dst_width) {
__asm {
push ebx
push esi
push edi
- mov eax, [esp + 12 + 4] // src_argb
- mov esi, [esp + 12 + 8] // src_stride
- mov ebx, [esp + 12 + 12] // src_stepx
- mov edx, [esp + 12 + 16] // dst_argb
- mov ecx, [esp + 12 + 20] // dst_width
- lea esi, [eax + esi] // row1 pointer
+ mov eax, [esp + 12 + 4] // src_argb
+ mov esi, [esp + 12 + 8] // src_stride
+ mov ebx, [esp + 12 + 12] // src_stepx
+ mov edx, [esp + 12 + 16] // dst_argb
+ mov ecx, [esp + 12 + 20] // dst_width
+ lea esi, [eax + esi] // row1 pointer
lea ebx, [ebx * 4]
lea edi, [ebx + ebx * 2]
@@ -1132,11 +1144,11 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
movq xmm3, qword ptr [esi + ebx * 2]
movhps xmm3, qword ptr [esi + edi]
lea esi, [esi + ebx * 4]
- pavgb xmm0, xmm2 // average rows
+ pavgb xmm0, xmm2 // average rows
pavgb xmm1, xmm3
- movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
- shufps xmm0, xmm1, 0x88 // even pixels
- shufps xmm2, xmm1, 0xdd // odd pixels
+ movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2
movdqu [edx], xmm0
lea edx, [edx + 16]
@@ -1151,29 +1163,31 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
}
// Column scaling unfiltered. SSE2 version.
-__declspec(naked)
-void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
+__declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
__asm {
push edi
push esi
- mov edi, [esp + 8 + 4] // dst_argb
- mov esi, [esp + 8 + 8] // src_argb
- mov ecx, [esp + 8 + 12] // dst_width
+ mov edi, [esp + 8 + 4] // dst_argb
+ mov esi, [esp + 8 + 8] // src_argb
+ mov ecx, [esp + 8 + 12] // dst_width
movd xmm2, [esp + 8 + 16] // x
movd xmm3, [esp + 8 + 20] // dx
- pshufd xmm2, xmm2, 0 // x0 x0 x0 x0
- pshufd xmm0, xmm3, 0x11 // dx 0 dx 0
+ pshufd xmm2, xmm2, 0 // x0 x0 x0 x0
+ pshufd xmm0, xmm3, 0x11 // dx 0 dx 0
paddd xmm2, xmm0
- paddd xmm3, xmm3 // 0, 0, 0, dx * 2
- pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0
- paddd xmm2, xmm0 // x3 x2 x1 x0
- paddd xmm3, xmm3 // 0, 0, 0, dx * 4
- pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4
+ paddd xmm3, xmm3 // 0, 0, 0, dx * 2
+ pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0
+ paddd xmm2, xmm0 // x3 x2 x1 x0
+ paddd xmm3, xmm3 // 0, 0, 0, dx * 4
+ pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4
- pextrw eax, xmm2, 1 // get x0 integer.
- pextrw edx, xmm2, 3 // get x1 integer.
+ pextrw eax, xmm2, 1 // get x0 integer.
+ pextrw edx, xmm2, 3 // get x1 integer.
cmp ecx, 0
jle xloop99
@@ -1184,20 +1198,20 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
xloop4:
movd xmm0, [esi + eax * 4] // 1 source x0 pixels
movd xmm1, [esi + edx * 4] // 1 source x1 pixels
- pextrw eax, xmm2, 5 // get x2 integer.
- pextrw edx, xmm2, 7 // get x3 integer.
- paddd xmm2, xmm3 // x += dx
- punpckldq xmm0, xmm1 // x0 x1
+ pextrw eax, xmm2, 5 // get x2 integer.
+ pextrw edx, xmm2, 7 // get x3 integer.
+ paddd xmm2, xmm3 // x += dx
+ punpckldq xmm0, xmm1 // x0 x1
movd xmm1, [esi + eax * 4] // 1 source x2 pixels
movd xmm4, [esi + edx * 4] // 1 source x3 pixels
- pextrw eax, xmm2, 1 // get x0 integer. next iteration.
- pextrw edx, xmm2, 3 // get x1 integer. next iteration.
- punpckldq xmm1, xmm4 // x2 x3
- punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ punpckldq xmm1, xmm4 // x2 x3
+ punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
movdqu [edi], xmm0
lea edi, [edi + 16]
- sub ecx, 4 // 4 pixels
+ sub ecx, 4 // 4 pixels
jge xloop4
xloop49:
@@ -1207,8 +1221,8 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
// 2 Pixels.
movd xmm0, [esi + eax * 4] // 1 source x0 pixels
movd xmm1, [esi + edx * 4] // 1 source x1 pixels
- pextrw eax, xmm2, 5 // get x2 integer.
- punpckldq xmm0, xmm1 // x0 x1
+ pextrw eax, xmm2, 5 // get x2 integer.
+ punpckldq xmm0, xmm1 // x0 x1
movq qword ptr [edi], xmm0
lea edi, [edi + 8]
@@ -1233,59 +1247,61 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
- 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
- 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
+ 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
+ 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
};
// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
- 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+ 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};
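
Those two tables set up one pmaddubsw blend per pixel: after the pshufb shuffles, each 16-bit lane pairs the same channel from both source pixels with its fraction pair. Idealized scalar check (the asm uses a 7-bit complement of the fraction rather than an exact 128 - f):

int f = 32;                               // 7-bit fraction toward pixel 1
int b = ((128 - f) * 40 + f * 120) >> 7;  // blended blue channel: 60
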
-__declspec(naked)
-void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
+__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
__asm {
push esi
push edi
- mov edi, [esp + 8 + 4] // dst_argb
- mov esi, [esp + 8 + 8] // src_argb
- mov ecx, [esp + 8 + 12] // dst_width
+ mov edi, [esp + 8 + 4] // dst_argb
+ mov esi, [esp + 8 + 8] // src_argb
+ mov ecx, [esp + 8 + 12] // dst_width
movd xmm2, [esp + 8 + 16] // x
movd xmm3, [esp + 8 + 20] // dx
movdqa xmm4, xmmword ptr kShuffleColARGB
movdqa xmm5, xmmword ptr kShuffleFractions
- pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
+ pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
psrlw xmm6, 9
- pextrw eax, xmm2, 1 // get x0 integer. preroll
+ pextrw eax, xmm2, 1 // get x0 integer. preroll
sub ecx, 2
jl xloop29
- movdqa xmm0, xmm2 // x1 = x0 + dx
+ movdqa xmm0, xmm2 // x1 = x0 + dx
paddd xmm0, xmm3
- punpckldq xmm2, xmm0 // x0 x1
- punpckldq xmm3, xmm3 // dx dx
- paddd xmm3, xmm3 // dx * 2, dx * 2
- pextrw edx, xmm2, 3 // get x1 integer. preroll
+ punpckldq xmm2, xmm0 // x0 x1
+ punpckldq xmm3, xmm3 // dx dx
+ paddd xmm3, xmm3 // dx * 2, dx * 2
+ pextrw edx, xmm2, 3 // get x1 integer. preroll
// 2 Pixel loop.
xloop2:
- movdqa xmm1, xmm2 // x0, x1 fractions.
- paddd xmm2, xmm3 // x += dx
+ movdqa xmm1, xmm2 // x0, x1 fractions.
+ paddd xmm2, xmm3 // x += dx
movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
- psrlw xmm1, 9 // 7 bit fractions.
+ psrlw xmm1, 9 // 7 bit fractions.
movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels
- pshufb xmm1, xmm5 // 0000000011111111
- pshufb xmm0, xmm4 // arrange pixels into pairs
- pxor xmm1, xmm6 // 0..7f and 7f..0
- pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels.
- pextrw eax, xmm2, 1 // get x0 integer. next iteration.
- pextrw edx, xmm2, 3 // get x1 integer. next iteration.
- psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits.
- packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels.
+ pshufb xmm1, xmm5 // 0000000011111111
+ pshufb xmm0, xmm4 // arrange pixels into pairs
+ pxor xmm1, xmm6 // 0..7f and 7f..0
+ pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels.
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits.
+ packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels.
movq qword ptr [edi], xmm0
lea edi, [edi + 8]
- sub ecx, 2 // 2 pixels
+ sub ecx, 2 // 2 pixels
jge xloop2
xloop29:
@@ -1293,15 +1309,15 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
add ecx, 2 - 1
jl xloop99
- // 1 pixel remainder
- psrlw xmm2, 9 // 7 bit fractions.
+ // 1 pixel remainder
+ psrlw xmm2, 9 // 7 bit fractions.
movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
- pshufb xmm2, xmm5 // 00000000
- pshufb xmm0, xmm4 // arrange pixels into pairs
- pxor xmm2, xmm6 // 0..7f and 7f..0
- pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel.
+ pshufb xmm2, xmm5 // 00000000
+ pshufb xmm0, xmm4 // arrange pixels into pairs
+ pxor xmm2, xmm6 // 0..7f and 7f..0
+ pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel.
psrlw xmm0, 7
- packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
+ packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
movd [edi], xmm0
xloop99:
@@ -1313,13 +1329,15 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
}
// Reads 4 pixels, duplicates them and writes 8 pixels.
-__declspec(naked)
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
+__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
+ const uint8* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
__asm {
- mov edx, [esp + 4] // dst_argb
- mov eax, [esp + 8] // src_argb
- mov ecx, [esp + 12] // dst_width
+ mov edx, [esp + 4] // dst_argb
+ mov eax, [esp + 8] // src_argb
+ mov ecx, [esp + 12] // dst_width
wloop:
movdqu xmm0, [eax]
@@ -1338,12 +1356,11 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
}
// Divide num by div and return as 16.16 fixed point result.
-__declspec(naked)
-int FixedDiv_X86(int num, int div) {
+__declspec(naked) int FixedDiv_X86(int num, int div) {
__asm {
- mov eax, [esp + 4] // num
- cdq // extend num to 64 bits
- shld edx, eax, 16 // 32.16
+ mov eax, [esp + 4] // num
+ cdq // extend num to 64 bits
+ shld edx, eax, 16 // 32.16
shl eax, 16
idiv dword ptr [esp + 8]
ret
@@ -1351,13 +1368,12 @@ int FixedDiv_X86(int num, int div) {
}
// Divide num by div and return as 16.16 fixed point result.
-__declspec(naked)
-int FixedDiv1_X86(int num, int div) {
+__declspec(naked) int FixedDiv1_X86(int num, int div) {
__asm {
- mov eax, [esp + 4] // num
- mov ecx, [esp + 8] // denom
- cdq // extend num to 64 bits
- shld edx, eax, 16 // 32.16
+ mov eax, [esp + 4] // num
+ mov ecx, [esp + 8] // denom
+ cdq // extend num to 64 bits
+ shld edx, eax, 16 // 32.16
shl eax, 16
sub eax, 0x00010001
sbb edx, 0
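
In C terms, both helpers build the 64-bit value num << 16 in edx:eax (the shld/shl pair) and divide it with idiv. A hedged equivalent of the first, name illustrative:

static inline int FixedDivSketch(int num, int div) {
  return (int)(((int64)num << 16) / div);  // 16.16 quotient
}

FixedDiv1 first subtracts 0x00010001 from that 64-bit numerator; the divide itself falls outside this hunk, but in libyuv's C fallback it pairs with a div - 1 denominator so the first and last samples land on the interval endpoints.
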
diff --git a/files/source/video_common.cc b/files/source/video_common.cc
index 00fb71e1..3e9c6a29 100644
--- a/files/source/video_common.cc
+++ b/files/source/video_common.cc
@@ -8,7 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
#include "libyuv/video_common.h"
#ifdef __cplusplus
@@ -24,24 +23,24 @@ struct FourCCAliasEntry {
};
static const struct FourCCAliasEntry kFourCCAliases[] = {
- {FOURCC_IYUV, FOURCC_I420},
- {FOURCC_YU12, FOURCC_I420},
- {FOURCC_YU16, FOURCC_I422},
- {FOURCC_YU24, FOURCC_I444},
- {FOURCC_YUYV, FOURCC_YUY2},
- {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs
- {FOURCC_HDYC, FOURCC_UYVY},
- {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8
- {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not.
- {FOURCC_DMB1, FOURCC_MJPG},
- {FOURCC_BA81, FOURCC_BGGR}, // deprecated.
- {FOURCC_RGB3, FOURCC_RAW },
- {FOURCC_BGR3, FOURCC_24BG},
- {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB
- {FOURCC_CM24, FOURCC_RAW }, // kCMPixelFormat_24RGB
- {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555
- {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565
- {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551
+ {FOURCC_IYUV, FOURCC_I420},
+ {FOURCC_YU12, FOURCC_I420},
+ {FOURCC_YU16, FOURCC_I422},
+ {FOURCC_YU24, FOURCC_I444},
+ {FOURCC_YUYV, FOURCC_YUY2},
+ {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs
+ {FOURCC_HDYC, FOURCC_UYVY},
+ {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8
+ {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not.
+ {FOURCC_DMB1, FOURCC_MJPG},
+ {FOURCC_BA81, FOURCC_BGGR}, // deprecated.
+ {FOURCC_RGB3, FOURCC_RAW},
+ {FOURCC_BGR3, FOURCC_24BG},
+ {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB
+ {FOURCC_CM24, FOURCC_RAW}, // kCMPixelFormat_24RGB
+ {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555
+ {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565
+ {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551
};
// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB.
// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA
@@ -62,4 +61,3 @@ uint32 CanonicalFourCC(uint32 fourcc) {
} // extern "C"
} // namespace libyuv
#endif
-
diff --git a/files/tools_libyuv/OWNERS b/files/tools_libyuv/OWNERS
new file mode 100644
index 00000000..aca046d4
--- /dev/null
+++ b/files/tools_libyuv/OWNERS
@@ -0,0 +1 @@
+kjellander@chromium.org
diff --git a/files/tools_libyuv/autoroller/roll_deps.py b/files/tools_libyuv/autoroller/roll_deps.py
new file mode 100755
index 00000000..a9eb307e
--- /dev/null
+++ b/files/tools_libyuv/autoroller/roll_deps.py
@@ -0,0 +1,482 @@
+#!/usr/bin/env python
+# Copyright 2017 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# This is a modified copy of the script in
+# https://chromium.googlesource.com/external/webrtc/+/master/tools-webrtc/autoroller/roll_deps.py
+# customized for libyuv.
+
+
+"""Script to automatically roll dependencies in the libyuv DEPS file."""
+
+import argparse
+import base64
+import collections
+import logging
+import os
+import re
+import subprocess
+import sys
+import urllib
+
+
+# Skip these dependencies (list without solution name prefix).
+DONT_AUTOROLL_THESE = [
+ 'src/third_party/gflags/src',
+]
+
+LIBYUV_URL = 'https://chromium.googlesource.com/libyuv/libyuv'
+CHROMIUM_SRC_URL = 'https://chromium.googlesource.com/chromium/src'
+CHROMIUM_COMMIT_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s'
+CHROMIUM_LOG_TEMPLATE = CHROMIUM_SRC_URL + '/+log/%s'
+CHROMIUM_FILE_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s/%s'
+
+COMMIT_POSITION_RE = re.compile('^Cr-Commit-Position: .*#([0-9]+).*$')
+CLANG_REVISION_RE = re.compile(r'^CLANG_REVISION = \'(\d+)\'$')
+ROLL_BRANCH_NAME = 'roll_chromium_revision'
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+CHECKOUT_SRC_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.pardir,
+ os.pardir))
+CHECKOUT_ROOT_DIR = os.path.realpath(os.path.join(CHECKOUT_SRC_DIR, os.pardir))
+
+sys.path.append(os.path.join(CHECKOUT_SRC_DIR, 'build'))
+import find_depot_tools
+find_depot_tools.add_depot_tools_to_path()
+from gclient import GClientKeywords
+
+CLANG_UPDATE_SCRIPT_URL_PATH = 'tools/clang/scripts/update.py'
+CLANG_UPDATE_SCRIPT_LOCAL_PATH = os.path.join(CHECKOUT_SRC_DIR, 'tools',
+ 'clang', 'scripts', 'update.py')
+
+DepsEntry = collections.namedtuple('DepsEntry', 'path url revision')
+ChangedDep = collections.namedtuple('ChangedDep',
+ 'path url current_rev new_rev')
+
+class RollError(Exception):
+ pass
+
+
+def ParseDepsDict(deps_content):
+ local_scope = {}
+ var = GClientKeywords.VarImpl({}, local_scope)
+ global_scope = {
+ 'From': GClientKeywords.FromImpl,
+ 'Var': var.Lookup,
+ 'deps_os': {},
+ }
+ exec(deps_content, global_scope, local_scope)
+ return local_scope
+
+
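+# ParseDepsDict above executes the DEPS content as Python with gclient's
+# Var/From keywords bound, so Var() references resolve against the 'vars'
+# dict. A sketch with a minimal DEPS string (same shape as the unit-test
+# data further down; '52f7afec' is an illustrative, shortened revision):
+#
+#   content = ("vars = {'chromium_git': 'https://chromium.googlesource.com'}\n"
+#              "deps = {'src/build':\n"
+#              "        Var('chromium_git') + '/chromium/src/build@52f7afec'}")
+#   ParseDepsDict(content)['deps']['src/build']
+#   # -> 'https://chromium.googlesource.com/chromium/src/build@52f7afec'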
+def ParseLocalDepsFile(filename):
+ with open(filename, 'rb') as f:
+ deps_content = f.read()
+ return ParseDepsDict(deps_content)
+
+
+def ParseRemoteCrDepsFile(revision):
+ deps_content = ReadRemoteCrFile('DEPS', revision)
+ return ParseDepsDict(deps_content)
+
+
+def ParseCommitPosition(commit_message):
+ for line in reversed(commit_message.splitlines()):
+ m = COMMIT_POSITION_RE.match(line.strip())
+ if m:
+ return m.group(1)
+ logging.error('Failed to parse commit position id from:\n%s\n',
+ commit_message)
+ sys.exit(-1)
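For reference, the footer format this matches (digits illustrative):

    # 'Cr-Commit-Position: refs/heads/master@{#123456}' -> returns '123456'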
+
+
+def _RunCommand(command, working_dir=None, ignore_exit_code=False,
+ extra_env=None):
+ """Runs a command and returns the output from that command.
+
+ If the command fails (exit code != 0), the function will exit the process.
+
+ Returns:
+ A tuple containing the stdout and stderr outputs as strings.
+ """
+ working_dir = working_dir or CHECKOUT_SRC_DIR
+ logging.debug('CMD: %s CWD: %s', ' '.join(command), working_dir)
+ env = os.environ.copy()
+ if extra_env:
+ assert all(type(value) == str for value in extra_env.values())
+ logging.debug('extra env: %s', extra_env)
+ env.update(extra_env)
+ p = subprocess.Popen(command, stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE, env=env,
+ cwd=working_dir, universal_newlines=True)
+ std_output = p.stdout.read()
+ err_output = p.stderr.read()
+ p.wait()
+ p.stdout.close()
+ p.stderr.close()
+ if not ignore_exit_code and p.returncode != 0:
+ logging.error('Command failed: %s\n'
+ 'stdout:\n%s\n'
+ 'stderr:\n%s\n', ' '.join(command), std_output, err_output)
+ sys.exit(p.returncode)
+ return std_output, err_output
+
+
+def _GetBranches():
+ """Returns a tuple of active,branches.
+
+ The 'active' is the name of the currently active branch and 'branches' is a
+ list of all branches.
+ """
+ lines = _RunCommand(['git', 'branch'])[0].split('\n')
+ branches = []
+ active = ''
+ for line in lines:
+ if '*' in line:
+ # The assumption is that the first char will always be the '*'.
+ active = line[1:].strip()
+ branches.append(active)
+ else:
+ branch = line.strip()
+ if branch:
+ branches.append(branch)
+ return active, branches
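E.g. parsing 'git branch' output of the following shape (a sketch):

    #   * roll_chromium_revision
    #     master
    # -> ('roll_chromium_revision', ['roll_chromium_revision', 'master'])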
+
+
+def _ReadGitilesContent(url):
+ # Download and decode BASE64 content until
+ # https://code.google.com/p/gitiles/issues/detail?id=7 is fixed.
+ base64_content = ReadUrlContent(url + '?format=TEXT')
+ return base64.b64decode(base64_content[0])
+
+
+def ReadRemoteCrFile(path_below_src, revision):
+ """Reads a remote Chromium file of a specific revision. Returns a string."""
+ return _ReadGitilesContent(CHROMIUM_FILE_TEMPLATE % (revision,
+ path_below_src))
+
+
+def ReadRemoteCrCommit(revision):
+ """Reads a remote Chromium commit message. Returns a string."""
+ return _ReadGitilesContent(CHROMIUM_COMMIT_TEMPLATE % revision)
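Gitiles serves file contents base64-encoded when ?format=TEXT is appended,
hence the decode in _ReadGitilesContent. A sketch of the URL built here
('deadbeef' is an illustrative revision):

    # ReadRemoteCrFile('DEPS', 'deadbeef') fetches
    #   https://chromium.googlesource.com/chromium/src/+/deadbeef/DEPS?format=TEXT
    # and base64-decodes the response body.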
+
+
+def ReadUrlContent(url):
+ """Connect to a remote host and read the contents. Returns a list of lines."""
+ conn = urllib.urlopen(url)
+ try:
+ return conn.readlines()
+ except IOError as e:
+ logging.exception('Error connecting to %s. Error: %s', url, e)
+ raise
+ finally:
+ conn.close()
+
+
+def GetMatchingDepsEntries(depsentry_dict, dir_path):
+ """Gets all deps entries matching the provided path.
+
+ This list may contain more than one DepsEntry object.
+ Example: dir_path='src/testing' would give results containing both
+ 'src/testing/gtest' and 'src/testing/gmock' deps entries for Chromium's DEPS.
+ Example 2: dir_path='src/build' should return 'src/build' but not
+ 'src/buildtools'.
+
+ Returns:
+ A list of DepsEntry objects.
+ """
+ result = []
+ for path, depsentry in depsentry_dict.iteritems():
+ if path == dir_path:
+ result.append(depsentry)
+ else:
+ parts = path.split('/')
+ if all(part == parts[i]
+ for i, part in enumerate(dir_path.split('/'))):
+ result.append(depsentry)
+ return result
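The component-wise comparison matches whole directory components only; a
minimal sketch (mirroring the DEPS_ENTRIES unit test further down):

    entries = {'src/build': 'https://build.com',
               'src/buildtools': 'https://buildtools.com'}
    GetMatchingDepsEntries(entries, 'src/build')
    # -> ['https://build.com']; 'src/buildtools' fails on its second component.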
+
+
+def BuildDepsentryDict(deps_dict):
+ """Builds a dict of paths to DepsEntry objects from a raw parsed deps dict."""
+ result = {}
+ def AddDepsEntries(deps_subdict):
+ for path, deps_url in deps_subdict.iteritems():
+ if not result.has_key(path):
+ url, revision = deps_url.split('@') if deps_url else (None, None)
+ result[path] = DepsEntry(path, url, revision)
+
+ AddDepsEntries(deps_dict['deps'])
+  for deps_os in ['win', 'mac', 'unix', 'android', 'ios']:
+ AddDepsEntries(deps_dict.get('deps_os', {}).get(deps_os, {}))
+ return result
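Each 'url@revision' string becomes a DepsEntry. A sketch using the testdata
format ('64e38f0c' is a shortened, illustrative revision):

    deps_dict = {'deps': {
        'src/buildtools':
            'https://chromium.googlesource.com/chromium/buildtools.git@64e38f0c'}}
    BuildDepsentryDict(deps_dict)['src/buildtools']
    # -> DepsEntry(path='src/buildtools',
    #              url='https://chromium.googlesource.com/chromium/buildtools.git',
    #              revision='64e38f0c')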
+
+
+def CalculateChangedDeps(libyuv_deps, new_cr_deps):
+ """
+ Calculate changed deps entries based on entries defined in the libyuv DEPS
+ file:
+ - If a shared dependency with the Chromium DEPS file: roll it to the same
+ revision as Chromium (i.e. entry in the new_cr_deps dict)
+ - If it's a Chromium sub-directory, roll it to the HEAD revision (notice
+ this means it may be ahead of the chromium_revision, but generally these
+ should be close).
+ - If it's another DEPS entry (not shared with Chromium), roll it to HEAD
+ unless it's configured to be skipped.
+
+ Returns:
+ A list of ChangedDep objects representing the changed deps.
+ """
+ result = []
+ libyuv_entries = BuildDepsentryDict(libyuv_deps)
+ new_cr_entries = BuildDepsentryDict(new_cr_deps)
+ for path, libyuv_deps_entry in libyuv_entries.iteritems():
+ if path in DONT_AUTOROLL_THESE:
+ continue
+ cr_deps_entry = new_cr_entries.get(path)
+ if cr_deps_entry:
+ # Use the revision from Chromium's DEPS file.
+ new_rev = cr_deps_entry.revision
+ assert libyuv_deps_entry.url == cr_deps_entry.url, (
+ 'Libyuv DEPS entry %s has a different URL (%s) than Chromium (%s).' %
+ (path, libyuv_deps_entry.url, cr_deps_entry.url))
+ else:
+ # Use the HEAD of the deps repo.
+ stdout, _ = _RunCommand(['git', 'ls-remote', libyuv_deps_entry.url,
+ 'HEAD'])
+ new_rev = stdout.strip().split('\t')[0]
+
+ # Check if an update is necessary.
+ if libyuv_deps_entry.revision != new_rev:
+ logging.debug('Roll dependency %s to %s', path, new_rev)
+ result.append(ChangedDep(path, libyuv_deps_entry.url,
+ libyuv_deps_entry.revision, new_rev))
+ return sorted(result)
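Against the unit-test DEPS data further down, the three rules play out as
follows (a sketch, not script output):

    # src/build                  -> a Chromium sub-directory mirror, rolled to
    #                               'git ls-remote <url> HEAD'
    # src/buildtools             -> shared with Chromium's DEPS, pinned to the
    #                               revision found in new_cr_deps
    # src/third_party/gflags/src -> listed in DONT_AUTOROLL_THESE, skipped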
+
+
+def CalculateChangedClang(new_cr_rev):
+ def GetClangRev(lines):
+ for line in lines:
+ match = CLANG_REVISION_RE.match(line)
+ if match:
+ return match.group(1)
+ raise RollError('Could not parse Clang revision!')
+
+ with open(CLANG_UPDATE_SCRIPT_LOCAL_PATH, 'rb') as f:
+ current_lines = f.readlines()
+ current_rev = GetClangRev(current_lines)
+
+ new_clang_update_py = ReadRemoteCrFile(CLANG_UPDATE_SCRIPT_URL_PATH,
+ new_cr_rev).splitlines()
+ new_rev = GetClangRev(new_clang_update_py)
+ return ChangedDep(CLANG_UPDATE_SCRIPT_LOCAL_PATH, None, current_rev, new_rev)
+
+
+def GenerateCommitMessage(current_cr_rev, new_cr_rev, current_commit_pos,
+ new_commit_pos, changed_deps_list, clang_change):
+ current_cr_rev = current_cr_rev[0:10]
+ new_cr_rev = new_cr_rev[0:10]
+ rev_interval = '%s..%s' % (current_cr_rev, new_cr_rev)
+ git_number_interval = '%s:%s' % (current_commit_pos, new_commit_pos)
+
+ commit_msg = ['Roll chromium_revision %s (%s)\n' % (rev_interval,
+ git_number_interval)]
+ commit_msg.append('Change log: %s' % (CHROMIUM_LOG_TEMPLATE % rev_interval))
+ commit_msg.append('Full diff: %s\n' % (CHROMIUM_COMMIT_TEMPLATE %
+ rev_interval))
+  # The TBR field is normally left empty; in some custom cases specific
+  # engineers are added.
+ tbr_authors = ''
+ if changed_deps_list:
+ commit_msg.append('Changed dependencies:')
+
+ for c in changed_deps_list:
+ commit_msg.append('* %s: %s/+log/%s..%s' % (c.path, c.url,
+ c.current_rev[0:10],
+ c.new_rev[0:10]))
+ change_url = CHROMIUM_FILE_TEMPLATE % (rev_interval, 'DEPS')
+ commit_msg.append('DEPS diff: %s\n' % change_url)
+ else:
+ commit_msg.append('No dependencies changed.')
+
+ if clang_change.current_rev != clang_change.new_rev:
+ commit_msg.append('Clang version changed %s:%s' %
+ (clang_change.current_rev, clang_change.new_rev))
+ change_url = CHROMIUM_FILE_TEMPLATE % (rev_interval,
+ CLANG_UPDATE_SCRIPT_URL_PATH)
+ commit_msg.append('Details: %s\n' % change_url)
+ else:
+ commit_msg.append('No update to Clang.\n')
+
+ commit_msg.append('TBR=%s' % tbr_authors)
+ commit_msg.append('BUG=None')
+ return '\n'.join(commit_msg)
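For reference, the assembled message looks like this (revisions taken from the
unit-test data further down; the commit positions are illustrative):

    Roll chromium_revision 1b9c098a08..aaaaabbbbb (123456:123789)

    Change log: https://chromium.googlesource.com/chromium/src/+log/1b9c098a08..aaaaabbbbb
    Full diff: https://chromium.googlesource.com/chromium/src/+/1b9c098a08..aaaaabbbbb

    Changed dependencies:
    * src/buildtools: https://chromium.googlesource.com/chromium/buildtools.git/+log/64e38f0ceb..55ad626b08
    DEPS diff: https://chromium.googlesource.com/chromium/src/+/1b9c098a08..aaaaabbbbb/DEPS

    No update to Clang.

    TBR=
    BUG=None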
+
+
+def UpdateDepsFile(deps_filename, old_cr_revision, new_cr_revision,
+ changed_deps):
+ """Update the DEPS file with the new revision."""
+
+ # Update the chromium_revision variable.
+ with open(deps_filename, 'rb') as deps_file:
+ deps_content = deps_file.read()
+ deps_content = deps_content.replace(old_cr_revision, new_cr_revision)
+ with open(deps_filename, 'wb') as deps_file:
+ deps_file.write(deps_content)
+
+ # Update each individual DEPS entry.
+ for dep in changed_deps:
+ local_dep_dir = os.path.join(CHECKOUT_ROOT_DIR, dep.path)
+ if not os.path.isdir(local_dep_dir):
+ raise RollError(
+ 'Cannot find local directory %s. Either run\n'
+ 'gclient sync --deps=all\n'
+ 'or make sure the .gclient file for your solution contains all '
+ 'platforms in the target_os list, i.e.\n'
+ 'target_os = ["android", "unix", "mac", "ios", "win"];\n'
+ 'Then run "gclient sync" again.' % local_dep_dir)
+ _, stderr = _RunCommand(
+ ['roll-dep-svn', '--no-verify-revision', dep.path, dep.new_rev],
+ working_dir=CHECKOUT_SRC_DIR, ignore_exit_code=True)
+ if stderr:
+ logging.warning('roll-dep-svn: %s', stderr)
+
+
+def _IsTreeClean():
+ stdout, _ = _RunCommand(['git', 'status', '--porcelain'])
+ if len(stdout) == 0:
+ return True
+
+ logging.error('Dirty/unversioned files:\n%s', stdout)
+ return False
+
+
+def _EnsureUpdatedMasterBranch(dry_run):
+ current_branch = _RunCommand(
+ ['git', 'rev-parse', '--abbrev-ref', 'HEAD'])[0].splitlines()[0]
+ if current_branch != 'master':
+    logging.error('Please check out the master branch and re-run this script.')
+ if not dry_run:
+ sys.exit(-1)
+
+ logging.info('Updating master branch...')
+ _RunCommand(['git', 'pull'])
+
+
+def _CreateRollBranch(dry_run):
+ logging.info('Creating roll branch: %s', ROLL_BRANCH_NAME)
+ if not dry_run:
+ _RunCommand(['git', 'checkout', '-b', ROLL_BRANCH_NAME])
+
+
+def _RemovePreviousRollBranch(dry_run):
+ active_branch, branches = _GetBranches()
+ if active_branch == ROLL_BRANCH_NAME:
+ active_branch = 'master'
+ if ROLL_BRANCH_NAME in branches:
+ logging.info('Removing previous roll branch (%s)', ROLL_BRANCH_NAME)
+ if not dry_run:
+ _RunCommand(['git', 'checkout', active_branch])
+ _RunCommand(['git', 'branch', '-D', ROLL_BRANCH_NAME])
+
+
+def _LocalCommit(commit_msg, dry_run):
+ logging.info('Committing changes locally.')
+ if not dry_run:
+ _RunCommand(['git', 'add', '--update', '.'])
+ _RunCommand(['git', 'commit', '-m', commit_msg])
+
+
+def _UploadCL(dry_run, rietveld_email=None):
+ logging.info('Uploading CL...')
+ if not dry_run:
+ cmd = ['git', 'cl', 'upload', '-f']
+ if rietveld_email:
+ cmd.append('--email=%s' % rietveld_email)
+ _RunCommand(cmd, extra_env={'EDITOR': 'true'})
+
+
+def _SendToCQ(dry_run, skip_cq):
+ logging.info('Sending the CL to the CQ...')
+ if not dry_run and not skip_cq:
+ _RunCommand(['git', 'cl', 'set_commit'])
+ logging.info('Sent the CL to the CQ.')
+
+
+def main():
+ p = argparse.ArgumentParser()
+ p.add_argument('--clean', action='store_true', default=False,
+ help='Removes any previous local roll branch.')
+ p.add_argument('-r', '--revision',
+ help=('Chromium Git revision to roll to. Defaults to the '
+ 'Chromium HEAD revision if omitted.'))
+ p.add_argument('-u', '--rietveld-email',
+                 help=('E-mail address to use for creating the CL at Rietveld. '
+                       'If omitted, a previously cached one will be used or an '
+ 'error will be thrown during upload.'))
+ p.add_argument('--dry-run', action='store_true', default=False,
+ help=('Calculate changes and modify DEPS, but don\'t create '
+ 'any local branch, commit, upload CL or send any '
+ 'tryjobs.'))
+ p.add_argument('-i', '--ignore-unclean-workdir', action='store_true',
+ default=False,
+ help=('Ignore if the current branch is not master or if there '
+ 'are uncommitted changes (default: %(default)s).'))
+ p.add_argument('--skip-cq', action='store_true', default=False,
+ help='Skip sending the CL to the CQ (default: %(default)s)')
+ p.add_argument('-v', '--verbose', action='store_true', default=False,
+ help='Be extra verbose in printing of log messages.')
+ opts = p.parse_args()
+
+ if opts.verbose:
+ logging.basicConfig(level=logging.DEBUG)
+ else:
+ logging.basicConfig(level=logging.INFO)
+
+ if not opts.ignore_unclean_workdir and not _IsTreeClean():
+ logging.error('Please clean your local checkout first.')
+ return 1
+
+ if opts.clean:
+ _RemovePreviousRollBranch(opts.dry_run)
+
+ if not opts.ignore_unclean_workdir:
+ _EnsureUpdatedMasterBranch(opts.dry_run)
+
+ new_cr_rev = opts.revision
+ if not new_cr_rev:
+ stdout, _ = _RunCommand(['git', 'ls-remote', CHROMIUM_SRC_URL, 'HEAD'])
+ head_rev = stdout.strip().split('\t')[0]
+ logging.info('No revision specified. Using HEAD: %s', head_rev)
+ new_cr_rev = head_rev
+
+ deps_filename = os.path.join(CHECKOUT_SRC_DIR, 'DEPS')
+ libyuv_deps = ParseLocalDepsFile(deps_filename)
+ current_cr_rev = libyuv_deps['vars']['chromium_revision']
+
+ current_commit_pos = ParseCommitPosition(ReadRemoteCrCommit(current_cr_rev))
+ new_commit_pos = ParseCommitPosition(ReadRemoteCrCommit(new_cr_rev))
+
+ new_cr_deps = ParseRemoteCrDepsFile(new_cr_rev)
+ changed_deps = CalculateChangedDeps(libyuv_deps, new_cr_deps)
+ clang_change = CalculateChangedClang(new_cr_rev)
+ commit_msg = GenerateCommitMessage(current_cr_rev, new_cr_rev,
+ current_commit_pos, new_commit_pos,
+ changed_deps, clang_change)
+ logging.debug('Commit message:\n%s', commit_msg)
+
+ _CreateRollBranch(opts.dry_run)
+ UpdateDepsFile(deps_filename, current_cr_rev, new_cr_rev, changed_deps)
+ _LocalCommit(commit_msg, opts.dry_run)
+ _UploadCL(opts.dry_run, opts.rietveld_email)
+ _SendToCQ(opts.dry_run, opts.skip_cq)
+ return 0
+
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/files/tools_libyuv/autoroller/unittests/.DS_Store b/files/tools_libyuv/autoroller/unittests/.DS_Store
new file mode 100644
index 00000000..70369d69
--- /dev/null
+++ b/files/tools_libyuv/autoroller/unittests/.DS_Store
Binary files differ
diff --git a/files/tools_libyuv/autoroller/unittests/roll_deps_test.py b/files/tools_libyuv/autoroller/unittests/roll_deps_test.py
new file mode 100755
index 00000000..025e46e1
--- /dev/null
+++ b/files/tools_libyuv/autoroller/unittests/roll_deps_test.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python
+# Copyright 2017 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+import glob
+import os
+import shutil
+import sys
+import tempfile
+import unittest
+
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+PARENT_DIR = os.path.join(SCRIPT_DIR, os.pardir)
+sys.path.append(PARENT_DIR)
+import roll_deps
+from roll_deps import CalculateChangedDeps, GetMatchingDepsEntries, \
+ ParseDepsDict, ParseLocalDepsFile, UpdateDepsFile
+
+
+TEST_DATA_VARS = {
+ 'chromium_git': 'https://chromium.googlesource.com',
+ 'chromium_revision': '1b9c098a08e40114e44b6c1ec33ddf95c40b901d',
+}
+
+DEPS_ENTRIES = {
+ 'src/build': 'https://build.com',
+ 'src/buildtools': 'https://buildtools.com',
+ 'src/testing/gtest': 'https://gtest.com',
+ 'src/testing/gmock': 'https://gmock.com',
+}
+
+BUILD_OLD_REV = '52f7afeca991d96d68cf0507e20dbdd5b845691f'
+BUILD_NEW_REV = 'HEAD'
+BUILDTOOLS_OLD_REV = '64e38f0cebdde27aa0cfb405f330063582f9ac76'
+BUILDTOOLS_NEW_REV = '55ad626b08ef971fd82a62b7abb325359542952b'
+
+
+class TestError(Exception):
+ pass
+
+
+class FakeCmd(object):
+ def __init__(self):
+ self.expectations = []
+
+ def add_expectation(self, *args, **kwargs):
+ returns = kwargs.pop('_returns', None)
+ self.expectations.append((args, kwargs, returns))
+
+ def __call__(self, *args, **kwargs):
+ if not self.expectations:
+ raise TestError('Got unexpected\n%s\n%s' % (args, kwargs))
+ exp_args, exp_kwargs, exp_returns = self.expectations.pop(0)
+ if args != exp_args or kwargs != exp_kwargs:
+ message = 'Expected:\n args: %s\n kwargs: %s\n' % (exp_args, exp_kwargs)
+ message += 'Got:\n args: %s\n kwargs: %s\n' % (args, kwargs)
+ raise TestError(message)
+ return exp_returns
+
+
+class TestRollChromiumRevision(unittest.TestCase):
+ def setUp(self):
+ self._output_dir = tempfile.mkdtemp()
+ for test_file in glob.glob(os.path.join(SCRIPT_DIR, 'testdata', '*')):
+ shutil.copy(test_file, self._output_dir)
+ self._libyuv_depsfile = os.path.join(self._output_dir, 'DEPS')
+ self._old_cr_depsfile = os.path.join(self._output_dir, 'DEPS.chromium.old')
+ self._new_cr_depsfile = os.path.join(self._output_dir, 'DEPS.chromium.new')
+
+ self.fake = FakeCmd()
+ self.old_RunCommand = getattr(roll_deps, '_RunCommand')
+ setattr(roll_deps, '_RunCommand', self.fake)
+
+ def tearDown(self):
+ shutil.rmtree(self._output_dir, ignore_errors=True)
+ self.assertEqual(self.fake.expectations, [])
+ setattr(roll_deps, '_RunCommand', self.old_RunCommand)
+
+ def testUpdateDepsFile(self):
+ new_rev = 'aaaaabbbbbcccccdddddeeeeefffff0000011111'
+
+ current_rev = TEST_DATA_VARS['chromium_revision']
+ UpdateDepsFile(self._libyuv_depsfile, current_rev, new_rev, [])
+ with open(self._libyuv_depsfile) as deps_file:
+ deps_contents = deps_file.read()
+ self.assertTrue(new_rev in deps_contents,
+ 'Failed to find %s in\n%s' % (new_rev, deps_contents))
+
+ def testParseDepsDict(self):
+ with open(self._libyuv_depsfile) as deps_file:
+ deps_contents = deps_file.read()
+ local_scope = ParseDepsDict(deps_contents)
+ vars_dict = local_scope['vars']
+
+ def assertVar(variable_name):
+ self.assertEquals(vars_dict[variable_name], TEST_DATA_VARS[variable_name])
+ assertVar('chromium_git')
+ assertVar('chromium_revision')
+ self.assertEquals(len(local_scope['deps']), 3)
+
+ def testGetMatchingDepsEntriesReturnsPathInSimpleCase(self):
+ entries = GetMatchingDepsEntries(DEPS_ENTRIES, 'src/testing/gtest')
+ self.assertEquals(len(entries), 1)
+ self.assertEquals(entries[0], DEPS_ENTRIES['src/testing/gtest'])
+
+ def testGetMatchingDepsEntriesHandlesSimilarStartingPaths(self):
+ entries = GetMatchingDepsEntries(DEPS_ENTRIES, 'src/testing')
+ self.assertEquals(len(entries), 2)
+
+ def testGetMatchingDepsEntriesHandlesTwoPathsWithIdenticalFirstParts(self):
+ entries = GetMatchingDepsEntries(DEPS_ENTRIES, 'src/build')
+ self.assertEquals(len(entries), 1)
+ self.assertEquals(entries[0], DEPS_ENTRIES['src/build'])
+
+ def testCalculateChangedDeps(self):
+ _SetupGitLsRemoteCall(self.fake,
+ 'https://chromium.googlesource.com/chromium/src/build', BUILD_NEW_REV)
+ libyuv_deps = ParseLocalDepsFile(self._libyuv_depsfile)
+ new_cr_deps = ParseLocalDepsFile(self._new_cr_depsfile)
+ changed_deps = CalculateChangedDeps(libyuv_deps, new_cr_deps)
+ self.assertEquals(len(changed_deps), 2)
+ self.assertEquals(changed_deps[0].path, 'src/build')
+ self.assertEquals(changed_deps[0].current_rev, BUILD_OLD_REV)
+ self.assertEquals(changed_deps[0].new_rev, BUILD_NEW_REV)
+
+ self.assertEquals(changed_deps[1].path, 'src/buildtools')
+ self.assertEquals(changed_deps[1].current_rev, BUILDTOOLS_OLD_REV)
+ self.assertEquals(changed_deps[1].new_rev, BUILDTOOLS_NEW_REV)
+
+
+def _SetupGitLsRemoteCall(cmd_fake, url, revision):
+ cmd = ['git', 'ls-remote', url, revision]
+ cmd_fake.add_expectation(cmd, _returns=(revision, None))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/files/tools_libyuv/autoroller/unittests/testdata/DEPS b/files/tools_libyuv/autoroller/unittests/testdata/DEPS
new file mode 100644
index 00000000..9fbb48a7
--- /dev/null
+++ b/files/tools_libyuv/autoroller/unittests/testdata/DEPS
@@ -0,0 +1,20 @@
+# DEPS file for unit tests.
+
+vars = {
+ 'chromium_git': 'https://chromium.googlesource.com',
+ 'chromium_revision': '1b9c098a08e40114e44b6c1ec33ddf95c40b901d',
+}
+
+deps = {
+ # Entry that is a directory in Chromium, so we're using a Git subtree mirror for it.
+ 'src/build':
+ Var('chromium_git') + '/chromium/src/build' + '@' + '52f7afeca991d96d68cf0507e20dbdd5b845691f',
+
+ # Entry that's also a DEPS entry in the Chromium DEPS file.
+ 'src/buildtools':
+ Var('chromium_git') + '/chromium/buildtools.git' + '@' + '64e38f0cebdde27aa0cfb405f330063582f9ac76',
+
+ # Entry only present in libyuv, not Chromium.
+ 'src/third_party/gflags/src':
+ Var('chromium_git') + '/external/github.com/gflags/gflags@03bebcb065c83beff83d50ae025a55a4bf94dfca',
+}
diff --git a/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new b/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new
new file mode 100644
index 00000000..d53083ce
--- /dev/null
+++ b/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new
@@ -0,0 +1,13 @@
+# DEPS file for unit tests.
+
+vars = {
+ 'chromium_git': 'https://chromium.googlesource.com',
+
+ # This is updated compared to the DEPS.chromium.old file.
+ 'buildtools_revision': '55ad626b08ef971fd82a62b7abb325359542952b',
+}
+
+deps = {
+ 'src/buildtools':
+ Var('chromium_git') + '/chromium/buildtools.git' + '@' + Var('buildtools_revision'),
+}
diff --git a/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old b/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old
new file mode 100644
index 00000000..dd6ddaec
--- /dev/null
+++ b/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old
@@ -0,0 +1,13 @@
+# DEPS file for unit tests.
+
+vars = {
+ 'chromium_git': 'https://chromium.googlesource.com',
+
+  # This is an older revision than the one in the DEPS.chromium.new file.
+ 'buildtools_revision': '64e38f0cebdde27aa0cfb405f330063582f9ac76',
+}
+
+deps = {
+ 'src/buildtools':
+ Var('chromium_git') + '/chromium/buildtools.git' + '@' + Var('buildtools_revision'),
+}
diff --git a/files/tools_libyuv/get_landmines.py b/files/tools_libyuv/get_landmines.py
new file mode 100755
index 00000000..3dc78bb9
--- /dev/null
+++ b/files/tools_libyuv/get_landmines.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+# Copyright 2016 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+"""
+This file emits the list of reasons why a particular build needs to be clobbered
+(or a list of 'landmines').
+"""
+
+import os
+import sys
+
+script_dir = os.path.dirname(os.path.realpath(__file__))
+checkout_root = os.path.abspath(os.path.join(script_dir, os.pardir))
+sys.path.insert(0, os.path.join(checkout_root, 'build'))
+import landmine_utils
+
+
+distributor = landmine_utils.distributor
+gyp_defines = landmine_utils.gyp_defines
+gyp_msvs_version = landmine_utils.gyp_msvs_version
+platform = landmine_utils.platform
+
+
+def print_landmines():
+ """
+ ALL LANDMINES ARE EMITTED FROM HERE.
+ """
+ # DO NOT add landmines as part of a regular CL. Landmines are a last-effort
+ # bandaid fix if a CL that got landed has a build dependency bug and all bots
+ # need to be cleaned up. If you're writing a new CL that causes build
+ # dependency problems, fix the dependency problems instead of adding a
+ # landmine.
+ # See the Chromium version in src/build/get_landmines.py for usage examples.
+ print 'Clobber to remove GYP artifacts after switching bots to GN.'
+ print 'Another try to remove GYP artifacts after switching bots to GN.'
+
+
+def main():
+ print_landmines()
+ return 0
+
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/files/tools_libyuv/msan/OWNERS b/files/tools_libyuv/msan/OWNERS
new file mode 100644
index 00000000..60351e7e
--- /dev/null
+++ b/files/tools_libyuv/msan/OWNERS
@@ -0,0 +1,3 @@
+pbos@chromium.org
+kjellander@chromium.org
+
diff --git a/files/tools_libyuv/msan/blacklist.txt b/files/tools_libyuv/msan/blacklist.txt
new file mode 100644
index 00000000..8b5e42a7
--- /dev/null
+++ b/files/tools_libyuv/msan/blacklist.txt
@@ -0,0 +1,9 @@
+# The rules in this file are only applied at compile time.
+# Because the Chrome buildsystem does not automatically touch the files
+# mentioned here, changing this file requires clobbering all MSan bots.
+#
+# Please think twice before you add or remove these rules.
+
+# This is a stripped down copy of Chromium's blacklist.txt, to enable
+# adding libyuv-specific blacklist entries.
+
diff --git a/files/tools_libyuv/ubsan/OWNERS b/files/tools_libyuv/ubsan/OWNERS
new file mode 100644
index 00000000..b608519a
--- /dev/null
+++ b/files/tools_libyuv/ubsan/OWNERS
@@ -0,0 +1,4 @@
+pbos@webrtc.org
+kjellander@webrtc.org
+fbarchard@chromium.org
+
diff --git a/files/tools_libyuv/ubsan/blacklist.txt b/files/tools_libyuv/ubsan/blacklist.txt
new file mode 100644
index 00000000..8bcb2907
--- /dev/null
+++ b/files/tools_libyuv/ubsan/blacklist.txt
@@ -0,0 +1,15 @@
+#############################################################################
+# UBSan blacklist.
+# Please think twice before you add or remove these rules.
+
+# This is a stripped down copy of Chromium's blacklist.txt, to enable
+# adding WebRTC-specific blacklist entries.
+
+#############################################################################
+# YASM does some funny things that UBsan doesn't like.
+# https://crbug.com/489901
+src:*/third_party/yasm/*
+
+#############################################################################
+# Ignore system libraries.
+src:*/usr/*
diff --git a/files/tools_libyuv/ubsan/vptr_blacklist.txt b/files/tools_libyuv/ubsan/vptr_blacklist.txt
new file mode 100644
index 00000000..8ed070c0
--- /dev/null
+++ b/files/tools_libyuv/ubsan/vptr_blacklist.txt
@@ -0,0 +1,21 @@
+#############################################################################
+# UBSan vptr blacklist.
+# Function and type based blacklisting use a mangled name, and it is especially
+# tricky to represent C++ types. For now, any possible changes by name manglings
+# are simply represented as wildcard expressions of regexp, and thus it might be
+# over-blacklisted.
+#
+# Please think twice before you add or remove these rules.
+#
+# This is a stripped down copy of Chromium's vptr_blacklist.txt, to enable
+# adding libyuv-specific blacklist entries.
+
+#############################################################################
+# Using raw pointer values.
+#
+# A raw pointer value (16) is used to infer the field offset by
+# GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET.
+
+# Example:
+# src:*/third_party/protobuf/src/google/protobuf/compiler/plugin.pb.cc
+
diff --git a/files/tools_libyuv/valgrind/libyuv_tests.bat b/files/tools_libyuv/valgrind/libyuv_tests.bat
new file mode 100644
index 00000000..e37f09eb
--- /dev/null
+++ b/files/tools_libyuv/valgrind/libyuv_tests.bat
@@ -0,0 +1,79 @@
+@echo off
+:: Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
+::
+:: Use of this source code is governed by a BSD-style license
+:: that can be found in the LICENSE file in the root of the source
+:: tree. An additional intellectual property rights grant can be found
+:: in the file PATENTS. All contributing project authors may
+:: be found in the AUTHORS file in the root of the source tree.
+
+:: This script is a copy of chrome_tests.bat with the following changes:
+:: - Invokes libyuv_tests.py instead of chrome_tests.py
+:: - Chromium's Valgrind scripts directory is added to the PYTHONPATH to make
+:: it possible to execute the Python scripts properly.
+
+:: TODO(timurrrr): batch files 'export' all the variables to the parent shell
+set THISDIR=%~dp0
+set TOOL_NAME="unknown"
+
+:: Get the tool name and put it into TOOL_NAME {{{1
+:: NB: SHIFT command doesn't modify %*
+:PARSE_ARGS_LOOP
+ if %1 == () GOTO:TOOLNAME_NOT_FOUND
+ if %1 == --tool GOTO:TOOLNAME_FOUND
+ SHIFT
+ goto :PARSE_ARGS_LOOP
+
+:TOOLNAME_NOT_FOUND
+echo "Please specify a tool (tsan or drmemory) by using --tool flag"
+exit /B 1
+
+:TOOLNAME_FOUND
+SHIFT
+set TOOL_NAME=%1
+:: }}}
+if "%TOOL_NAME%" == "drmemory" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "drmemory_light" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "drmemory_full" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "drmemory_pattern" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "tsan" GOTO :SETUP_TSAN
+echo "Unknown tool: `%TOOL_NAME%`! Only tsan and drmemory are supported."
+exit /B 1
+
+:SETUP_DRMEMORY
+if NOT "%DRMEMORY_COMMAND%"=="" GOTO :RUN_TESTS
+:: Set up DRMEMORY_COMMAND to invoke Dr. Memory {{{1
+set DRMEMORY_PATH=%THISDIR%..\..\third_party\drmemory
+set DRMEMORY_SFX=%DRMEMORY_PATH%\drmemory-windows-sfx.exe
+if EXIST %DRMEMORY_SFX% GOTO DRMEMORY_BINARY_OK
+echo "Can't find Dr. Memory executables."
+echo "See http://www.chromium.org/developers/how-tos/using-valgrind/dr-memory"
+echo "for the instructions on how to get them."
+exit /B 1
+
+:DRMEMORY_BINARY_OK
+%DRMEMORY_SFX% -o%DRMEMORY_PATH%\unpacked -y
+set DRMEMORY_COMMAND=%DRMEMORY_PATH%\unpacked\bin\drmemory.exe
+:: }}}
+goto :RUN_TESTS
+
+:SETUP_TSAN
+:: Set up PIN_COMMAND to invoke TSan {{{1
+set TSAN_PATH=%THISDIR%..\..\third_party\tsan
+set TSAN_SFX=%TSAN_PATH%\tsan-x86-windows-sfx.exe
+if EXIST %TSAN_SFX% GOTO TSAN_BINARY_OK
+echo "Can't find ThreadSanitizer executables."
+echo "See http://www.chromium.org/developers/how-tos/using-valgrind/threadsanitizer/threadsanitizer-on-windows"
+echo "for the instructions on how to get them."
+exit /B 1
+
+:TSAN_BINARY_OK
+%TSAN_SFX% -o%TSAN_PATH%\unpacked -y
+set PIN_COMMAND=%TSAN_PATH%\unpacked\tsan-x86-windows\tsan.bat
+:: }}}
+goto :RUN_TESTS
+
+:RUN_TESTS
+set PYTHONPATH=%THISDIR%..\python\google;%THISDIR%..\valgrind
+set RUNNING_ON_VALGRIND=yes
+python %THISDIR%libyuv_tests.py %*
diff --git a/files/tools_libyuv/valgrind/libyuv_tests.py b/files/tools_libyuv/valgrind/libyuv_tests.py
new file mode 100755
index 00000000..e780bd95
--- /dev/null
+++ b/files/tools_libyuv/valgrind/libyuv_tests.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python
+# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+"""Runs various libyuv tests through valgrind_test.py.
+
+This script inherits from Chrome's chrome_tests.py, but allows running any test
+instead of only the hard-coded ones. It uses the -t command-line flag to do
+this, and only supports specifying a single test per run.
+
+Suppression files:
+The Chrome valgrind directory we use as a DEPS dependency contains the following
+suppression files:
+ valgrind/memcheck/suppressions.txt
+ valgrind/memcheck/suppressions_mac.txt
+ valgrind/tsan/suppressions.txt
+ valgrind/tsan/suppressions_mac.txt
+ valgrind/tsan/suppressions_win32.txt
+Since they're referenced from the chrome_tests.py script, we have similar files
+below the directory of this script. When executing, this script sets up both
+Chrome's suppression files and our own, so we can easily maintain
+libyuv-specific suppressions in our own files.
+"""
+
+import logging
+import optparse
+import os
+import sys
+
+import logging_utils
+import path_utils
+
+import chrome_tests
+
+
+class LibyuvTest(chrome_tests.ChromeTests):
+ """Class that handles setup of suppressions for libyuv.
+
+ Everything else is inherited from chrome_tests.ChromeTests.
+ """
+
+ def _DefaultCommand(self, tool, exe=None, valgrind_test_args=None):
+ """Override command-building method so we can add more suppressions."""
+ cmd = chrome_tests.ChromeTests._DefaultCommand(self, tool, exe,
+ valgrind_test_args)
+    # When ChromeTests._DefaultCommand has executed, it has set up suppression
+    # files based on what's found in the memcheck/ or tsan/ subdirectories of
+    # this script's location. On Mac or Windows, additional platform-specific
+    # files have also been added.
+    # Since only the files located below this directory are added, we must also
+    # add the ones maintained by Chrome, located in ../../tools/valgrind.
+
+ # The idea is to look for --suppression arguments in the cmd list and add a
+ # modified copy of each suppression file, for the corresponding file in
+ # ../../tools/valgrind.
+ script_dir = path_utils.ScriptDir()
+ old_base, _ = os.path.split(script_dir)
+
+ checkout_src = os.path.abspath(os.path.join(script_dir, os.pardir,
+ os.pardir))
+ new_dir = os.path.join(checkout_src, 'tools', 'valgrind')
+ add_suppressions = []
+ for token in cmd:
+ if '--suppressions' in token:
+ add_suppressions.append(token.replace(script_dir, new_dir))
+ return add_suppressions + cmd
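E.g. a '--suppressions' token gains a twin pointing at the Chrome copy (a
sketch; the exact paths depend on the checkout layout):

    # token.replace(script_dir, new_dir) turns
    #   --suppressions=.../tools_libyuv/valgrind/memcheck/suppressions.txt
    # into
    #   --suppressions=.../tools/valgrind/memcheck/suppressions.txt
    # and both are passed, so Chrome's and libyuv's suppressions load together.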
+
+
+def main(_):
+ parser = optparse.OptionParser('usage: %prog -b <dir> -t <test> <test args>')
+ parser.disable_interspersed_args()
+ parser.add_option('-b', '--build-dir',
+ help=('Location of the compiler output. Can only be used '
+ 'when the test argument does not contain this path.'))
+ parser.add_option("--target", help="Debug or Release")
+ parser.add_option('-t', '--test', help='Test to run.')
+ parser.add_option('', '--baseline', action='store_true', default=False,
+ help='Generate baseline data instead of validating')
+ parser.add_option('', '--gtest_filter',
+ help='Additional arguments to --gtest_filter')
+ parser.add_option('', '--gtest_repeat',
+ help='Argument for --gtest_repeat')
+ parser.add_option("--gtest_shuffle", action="store_true", default=False,
+ help="Randomize tests' orders on every iteration.")
+ parser.add_option("--gtest_break_on_failure", action="store_true",
+ default=False,
+ help="Drop in to debugger on assertion failure. Also "
+ "useful for forcing tests to exit with a stack dump "
+ "on the first assertion failure when running with "
+ "--gtest_repeat=-1")
+ parser.add_option('-v', '--verbose', action='store_true', default=False,
+ help='Verbose output - enable debug log messages')
+ parser.add_option('', '--tool', dest='valgrind_tool', default='memcheck',
+ help='Specify a valgrind tool to run the tests under')
+ parser.add_option('', '--tool_flags', dest='valgrind_tool_flags', default='',
+ help='Specify custom flags for the selected valgrind tool')
+ parser.add_option('', '--keep_logs', action='store_true', default=False,
+ help=('Store memory tool logs in the <tool>.logs directory '
+ 'instead of /tmp.\nThis can be useful for tool '
+ 'developers/maintainers.\nPlease note that the <tool>'
+ '.logs directory will be clobbered on tool startup.'))
+ parser.add_option("--test-launcher-bot-mode", action="store_true",
+ help="run the tests with --test-launcher-bot-mode")
+ parser.add_option("--test-launcher-total-shards", type=int,
+ help="run the tests with --test-launcher-total-shards")
+ parser.add_option("--test-launcher-shard-index", type=int,
+ help="run the tests with --test-launcher-shard-index")
+ options, args = parser.parse_args()
+
+ if options.verbose:
+ logging_utils.config_root(logging.DEBUG)
+ else:
+ logging_utils.config_root()
+
+ if not options.test:
+ parser.error('--test not specified')
+
+ # Support build dir both with and without the target.
+ if (options.target and options.build_dir and
+ not options.build_dir.endswith(options.target)):
+ options.build_dir = os.path.join(options.build_dir, options.target)
+
+ # If --build_dir is provided, prepend it to the test executable if needed.
+ test_executable = options.test
+ if options.build_dir and not test_executable.startswith(options.build_dir):
+ test_executable = os.path.join(options.build_dir, test_executable)
+ args = [test_executable] + args
+
+ test = LibyuvTest(options, args, 'cmdline')
+ return test.Run()
+
+if __name__ == '__main__':
+ return_code = main(sys.argv)
+ sys.exit(return_code)
diff --git a/files/tools_libyuv/valgrind/libyuv_tests.sh b/files/tools_libyuv/valgrind/libyuv_tests.sh
new file mode 100755
index 00000000..975b5e3e
--- /dev/null
+++ b/files/tools_libyuv/valgrind/libyuv_tests.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# Set up some paths and re-direct the arguments to libyuv_tests.py
+
+# This script is a copy of the chrome_tests.sh wrapper script with the following
+# changes:
+# - The locate_valgrind.sh of Chromium's Valgrind scripts dir is used to locate
+#   the Valgrind framework install. If it fails, a fallback path is used instead
+#   (../../src/third_party/valgrind/linux_x64) and a warning message is shown.
+# - libyuv_tests.py is invoked instead of chrome_tests.py.
+# - Chromium's Valgrind scripts directory is added to the PYTHONPATH to make it
+# possible to execute the Python scripts properly.
+
+export THISDIR=`dirname $0`
+ARGV_COPY="$@"
+
+# We need to set CHROME_VALGRIND iff using Memcheck:
+# tools_libyuv/valgrind/libyuv_tests.sh --tool memcheck
+# or
+# tools_libyuv/valgrind/libyuv_tests.sh --tool=memcheck
+tool="memcheck" # Default to memcheck.
+while (( "$#" ))
+do
+ if [[ "$1" == "--tool" ]]
+ then
+ tool="$2"
+ shift
+ elif [[ "$1" =~ --tool=(.*) ]]
+ then
+ tool="${BASH_REMATCH[1]}"
+ fi
+ shift
+done
+
+NEEDS_VALGRIND=0
+
+case "$tool" in
+ "memcheck")
+ NEEDS_VALGRIND=1
+ ;;
+esac
+
+# For libyuv, we'll use the locate_valgrind.sh script in Chromium's Valgrind
+# scripts dir to locate the Valgrind framework install
+CHROME_VALGRIND_SCRIPTS=$THISDIR/../../tools/valgrind
+
+if [ "$NEEDS_VALGRIND" == "1" ]
+then
+ CHROME_VALGRIND=`sh $CHROME_VALGRIND_SCRIPTS/locate_valgrind.sh`
+ if [ "$CHROME_VALGRIND" = "" ]
+ then
+ CHROME_VALGRIND=../../src/third_party/valgrind/linux_x64
+ echo
+ echo "-------------------- WARNING ------------------------"
+ echo "locate_valgrind.sh failed."
+ echo "Using $CHROME_VALGRIND as a fallback location."
+ echo "This might be because:"
+ echo "1) This is a swarming bot"
+ echo "2) You haven't set up the valgrind binaries correctly."
+ echo "In this case, please make sure you have followed the instructions at"
+ echo "http://www.chromium.org/developers/how-tos/using-valgrind/get-valgrind"
+ echo "Notice: In the .gclient file, you need to add this for the 'libyuv'"
+ echo "solution since our directory structure is different from Chromium's:"
+ echo "\"custom_deps\": {"
+ echo " \"libyuv/third_party/valgrind\":"
+ echo " \"https://chromium.googlesource.com/chromium/deps/valgrind/binaries\","
+ echo "},"
+ echo "-----------------------------------------------------"
+ echo
+ fi
+ echo "Using valgrind binaries from ${CHROME_VALGRIND}"
+
+ PATH="${CHROME_VALGRIND}/bin:$PATH"
+  # We need to set these variables to override the default lib paths hard-coded
+  # into the Valgrind binary.
+ export VALGRIND_LIB="$CHROME_VALGRIND/lib/valgrind"
+ export VALGRIND_LIB_INNER="$CHROME_VALGRIND/lib/valgrind"
+
+ # Clean up some /tmp directories that might be stale due to interrupted
+ # chrome_tests.py execution.
+ # FYI:
+ # -mtime +1 <- only print files modified more than 24h ago,
+ # -print0/-0 are needed to handle possible newlines in the filenames.
+ echo "Cleanup /tmp from Valgrind stuff"
+ find /tmp -maxdepth 1 \(\
+ -name "vgdb-pipe-*" -or -name "vg_logs_*" -or -name "valgrind.*" \
+ \) -mtime +1 -print0 | xargs -0 rm -rf
+fi
+
+# Add Chrome's Valgrind scripts dir to the PYTHONPATH since it contains
+# the scripts that are needed for this script to run.
+PYTHONPATH=$THISDIR/../../tools/python/google:$CHROME_VALGRIND_SCRIPTS python \
+ "$THISDIR/libyuv_tests.py" $ARGV_COPY
diff --git a/files/tools_libyuv/valgrind/memcheck/OWNERS b/files/tools_libyuv/valgrind/memcheck/OWNERS
new file mode 100644
index 00000000..72e8ffc0
--- /dev/null
+++ b/files/tools_libyuv/valgrind/memcheck/OWNERS
@@ -0,0 +1 @@
+*
diff --git a/files/tools_libyuv/valgrind/memcheck/PRESUBMIT.py b/files/tools_libyuv/valgrind/memcheck/PRESUBMIT.py
new file mode 100644
index 00000000..03329214
--- /dev/null
+++ b/files/tools_libyuv/valgrind/memcheck/PRESUBMIT.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python
+# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+"""
+Copied from Chrome's src/tools/valgrind/memcheck/PRESUBMIT.py
+
+See http://dev.chromium.org/developers/how-tos/depottools/presubmit-scripts
+for more details on the presubmit API built into gcl.
+"""
+
+import os
+import re
+import sys
+
+def CheckChange(input_api, output_api):
+ """Checks the memcheck suppressions files for bad data."""
+
+ # Add the path to the Chrome valgrind dir to the import path:
+ tools_vg_path = os.path.join(input_api.PresubmitLocalPath(), '..', '..', '..',
+ 'tools', 'valgrind')
+ sys.path.append(tools_vg_path)
+ import suppressions
+
+ sup_regex = re.compile('suppressions.*\.txt$')
+ suppressions = {}
+ errors = []
+ check_for_memcheck = False
+ # skip_next_line has 3 possible values:
+ # - False: don't skip the next line.
+ # - 'skip_suppression_name': the next line is a suppression name, skip.
+ # - 'skip_param': the next line is a system call parameter error, skip.
+ skip_next_line = False
+ for f in filter(lambda x: sup_regex.search(x.LocalPath()),
+ input_api.AffectedFiles()):
+ for line, line_num in zip(f.NewContents(),
+ xrange(1, len(f.NewContents()) + 1)):
+ line = line.lstrip()
+ if line.startswith('#') or not line:
+ continue
+
+ if skip_next_line:
+ if skip_next_line == 'skip_suppression_name':
+ if 'insert_a_suppression_name_here' in line:
+ errors.append('"insert_a_suppression_name_here" is not a valid '
+ 'suppression name')
+ if suppressions.has_key(line):
+        if f.LocalPath() == suppressions[line][0].LocalPath():
+ errors.append('suppression with name "%s" at %s line %s '
+ 'has already been defined at line %s' %
+ (line, f.LocalPath(), line_num,
+ suppressions[line][1]))
+ else:
+ errors.append('suppression with name "%s" at %s line %s '
+ 'has already been defined at %s line %s' %
+ (line, f.LocalPath(), line_num,
+                         suppressions[line][0].LocalPath(), suppressions[line][1]))
+ else:
+ suppressions[line] = (f, line_num)
+        check_for_memcheck = True
+ skip_next_line = False
+ continue
+ if check_for_memcheck:
+ if not line.startswith('Memcheck:'):
+ errors.append('"%s" should be "Memcheck:..." in %s line %s' %
+ (line, f.LocalPath(), line_num))
+          check_for_memcheck = False
+ if line == '{':
+ skip_next_line = 'skip_suppression_name'
+ continue
+ if line == "Memcheck:Param":
+ skip_next_line = 'skip_param'
+ continue
+
+ if (line.startswith('fun:') or line.startswith('obj:') or
+ line.startswith('Memcheck:') or line == '}' or
+ line == '...'):
+ continue
+ errors.append('"%s" is probably wrong: %s line %s' % (line, f.LocalPath(),
+ line_num))
+ if errors:
+ return [output_api.PresubmitError('\n'.join(errors))]
+ return []
+
+def CheckChangeOnUpload(input_api, output_api):
+ return CheckChange(input_api, output_api)
+
+def CheckChangeOnCommit(input_api, output_api):
+ return CheckChange(input_api, output_api)
+
+def GetPreferredTrySlaves():
+ # We don't have any memcheck slaves yet, so there's no use for this method.
+ # When we have, the slave name(s) should be put into this list.
+ return []
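For reference, the memcheck suppression shape this presubmit validates (a
generic valgrind example; the name and object pattern are illustrative):

    {
       bug_123_illustrative_name
       Memcheck:Leak
       fun:malloc
       ...
       obj:*/libexample.so
    }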
diff --git a/files/tools_libyuv/valgrind/memcheck/suppressions.txt b/files/tools_libyuv/valgrind/memcheck/suppressions.txt
new file mode 100644
index 00000000..3ad0c8cc
--- /dev/null
+++ b/files/tools_libyuv/valgrind/memcheck/suppressions.txt
@@ -0,0 +1,5 @@
+# This file is used in addition to the one already maintained in Chrome.
+# It acts as a placeholder for future additions to this project.
+# It must exist for the Python wrapper script to work properly.
+
+
diff --git a/files/tools_libyuv/valgrind/memcheck/suppressions_mac.txt b/files/tools_libyuv/valgrind/memcheck/suppressions_mac.txt
new file mode 100644
index 00000000..3ad0c8cc
--- /dev/null
+++ b/files/tools_libyuv/valgrind/memcheck/suppressions_mac.txt
@@ -0,0 +1,5 @@
+# This file is used in addition to the one already maintained in Chrome.
+# It acts as a placeholder for future additions to this project.
+# It must exist for the Python wrapper script to work properly.
+
+
diff --git a/files/tools_libyuv/valgrind/memcheck/suppressions_win32.txt b/files/tools_libyuv/valgrind/memcheck/suppressions_win32.txt
new file mode 100644
index 00000000..3ad0c8cc
--- /dev/null
+++ b/files/tools_libyuv/valgrind/memcheck/suppressions_win32.txt
@@ -0,0 +1,5 @@
+# This file is used in addition to the one already maintained in Chrome.
+# It acts as a placeholder for future additions to this project.
+# It must exist for the Python wrapper script to work properly.
+
+
diff --git a/files/unit_test/color_test.cc b/files/unit_test/color_test.cc
index 36041d99..0aa7a54a 100644
--- a/files/unit_test/color_test.cc
+++ b/files/unit_test/color_test.cc
@@ -10,13 +10,13 @@
#include <stdlib.h>
+#include "../unit_test/unit_test.h"
#include "libyuv/basic_types.h"
#include "libyuv/convert.h"
#include "libyuv/convert_argb.h"
#include "libyuv/convert_from.h"
#include "libyuv/convert_from_argb.h"
#include "libyuv/cpu_id.h"
-#include "../unit_test/unit_test.h"
namespace libyuv {
@@ -38,110 +38,103 @@ namespace libyuv {
#define TESTCS(TESTNAME, YUVTOARGB, ARGBTOYUV, HS1, HS, HN, DIFF) \
TEST_F(LibYUVColorTest, TESTNAME) { \
- const int kPixels = benchmark_width_ * benchmark_height_; \
- const int kHalfPixels = ((benchmark_width_ + 1) / 2) * \
- ((benchmark_height_ + HS1) / HS); \
- align_buffer_page_end(orig_y, kPixels); \
- align_buffer_page_end(orig_u, kHalfPixels); \
- align_buffer_page_end(orig_v, kHalfPixels); \
- align_buffer_page_end(orig_pixels, kPixels * 4); \
- align_buffer_page_end(temp_y, kPixels); \
- align_buffer_page_end(temp_u, kHalfPixels); \
- align_buffer_page_end(temp_v, kHalfPixels); \
- align_buffer_page_end(dst_pixels_opt, kPixels * 4); \
- align_buffer_page_end(dst_pixels_c, kPixels * 4); \
+ const int kPixels = benchmark_width_ * benchmark_height_; \
+ const int kHalfPixels = \
+ ((benchmark_width_ + 1) / 2) * ((benchmark_height_ + HS1) / HS); \
+ align_buffer_page_end(orig_y, kPixels); \
+ align_buffer_page_end(orig_u, kHalfPixels); \
+ align_buffer_page_end(orig_v, kHalfPixels); \
+ align_buffer_page_end(orig_pixels, kPixels * 4); \
+ align_buffer_page_end(temp_y, kPixels); \
+ align_buffer_page_end(temp_u, kHalfPixels); \
+ align_buffer_page_end(temp_v, kHalfPixels); \
+ align_buffer_page_end(dst_pixels_opt, kPixels * 4); \
+ align_buffer_page_end(dst_pixels_c, kPixels * 4); \
\
- MemRandomize(orig_pixels, kPixels * 4); \
- MemRandomize(orig_y, kPixels); \
- MemRandomize(orig_u, kHalfPixels); \
- MemRandomize(orig_v, kHalfPixels); \
- MemRandomize(temp_y, kPixels); \
- MemRandomize(temp_u, kHalfPixels); \
- MemRandomize(temp_v, kHalfPixels); \
- MemRandomize(dst_pixels_opt, kPixels * 4); \
- MemRandomize(dst_pixels_c, kPixels * 4); \
+ MemRandomize(orig_pixels, kPixels * 4); \
+ MemRandomize(orig_y, kPixels); \
+ MemRandomize(orig_u, kHalfPixels); \
+ MemRandomize(orig_v, kHalfPixels); \
+ MemRandomize(temp_y, kPixels); \
+ MemRandomize(temp_u, kHalfPixels); \
+ MemRandomize(temp_v, kHalfPixels); \
+ MemRandomize(dst_pixels_opt, kPixels * 4); \
+ MemRandomize(dst_pixels_c, kPixels * 4); \
\
- /* The test is overall for color conversion matrix being reversible, so */ \
- /* this initializes the pixel with 2x2 blocks to eliminate subsampling. */ \
- uint8* p = orig_y; \
- for (int y = 0; y < benchmark_height_ - HS1; y += HS) { \
- for (int x = 0; x < benchmark_width_ - 1; x += 2) { \
- uint8 r = static_cast<uint8>(fastrand()); \
- p[0] = r; \
- p[1] = r; \
- p[HN] = r; \
- p[HN + 1] = r; \
- p += 2; \
- } \
- if (benchmark_width_ & 1) { \
- uint8 r = static_cast<uint8>(fastrand()); \
- p[0] = r; \
- p[HN] = r; \
- p += 1; \
+ /* The test is overall for color conversion matrix being reversible, so */ \
+ /* this initializes the pixel with 2x2 blocks to eliminate subsampling. */ \
+ uint8* p = orig_y; \
+ for (int y = 0; y < benchmark_height_ - HS1; y += HS) { \
+ for (int x = 0; x < benchmark_width_ - 1; x += 2) { \
+ uint8 r = static_cast<uint8>(fastrand()); \
+ p[0] = r; \
+ p[1] = r; \
+ p[HN] = r; \
+ p[HN + 1] = r; \
+ p += 2; \
+ } \
+ if (benchmark_width_ & 1) { \
+ uint8 r = static_cast<uint8>(fastrand()); \
+ p[0] = r; \
+ p[HN] = r; \
+ p += 1; \
+ } \
+ p += HN; \
} \
- p += HN; \
- } \
- if ((benchmark_height_ & 1) && HS == 2) { \
- for (int x = 0; x < benchmark_width_ - 1; x += 2) { \
- uint8 r = static_cast<uint8>(fastrand()); \
- p[0] = r; \
- p[1] = r; \
- p += 2; \
+ if ((benchmark_height_ & 1) && HS == 2) { \
+ for (int x = 0; x < benchmark_width_ - 1; x += 2) { \
+ uint8 r = static_cast<uint8>(fastrand()); \
+ p[0] = r; \
+ p[1] = r; \
+ p += 2; \
+ } \
+ if (benchmark_width_ & 1) { \
+ uint8 r = static_cast<uint8>(fastrand()); \
+ p[0] = r; \
+ p += 1; \
+ } \
} \
- if (benchmark_width_ & 1) { \
- uint8 r = static_cast<uint8>(fastrand()); \
- p[0] = r; \
- p += 1; \
- } \
- } \
- /* Start with YUV converted to ARGB. */ \
- YUVTOARGB(orig_y, benchmark_width_, \
- orig_u, (benchmark_width_ + 1) / 2, \
- orig_v, (benchmark_width_ + 1) / 2, \
- orig_pixels, benchmark_width_ * 4, \
- benchmark_width_, benchmark_height_); \
+ /* Start with YUV converted to ARGB. */ \
+ YUVTOARGB(orig_y, benchmark_width_, orig_u, (benchmark_width_ + 1) / 2, \
+ orig_v, (benchmark_width_ + 1) / 2, orig_pixels, \
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_); \
\
- ARGBTOYUV(orig_pixels, benchmark_width_ * 4, \
- temp_y, benchmark_width_, \
- temp_u, (benchmark_width_ + 1) / 2, \
- temp_v, (benchmark_width_ + 1) / 2, \
- benchmark_width_, benchmark_height_); \
+ ARGBTOYUV(orig_pixels, benchmark_width_ * 4, temp_y, benchmark_width_, \
+ temp_u, (benchmark_width_ + 1) / 2, temp_v, \
+ (benchmark_width_ + 1) / 2, benchmark_width_, \
+ benchmark_height_); \
\
- MaskCpuFlags(disable_cpu_flags_); \
- YUVTOARGB(temp_y, benchmark_width_, \
- temp_u, (benchmark_width_ + 1) / 2, \
- temp_v, (benchmark_width_ + 1) / 2, \
- dst_pixels_c, benchmark_width_ * 4, \
- benchmark_width_, benchmark_height_); \
- MaskCpuFlags(benchmark_cpu_info_); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ YUVTOARGB(temp_y, benchmark_width_, temp_u, (benchmark_width_ + 1) / 2, \
+ temp_v, (benchmark_width_ + 1) / 2, dst_pixels_c, \
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_); \
+ MaskCpuFlags(benchmark_cpu_info_); \
\
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- YUVTOARGB(temp_y, benchmark_width_, \
- temp_u, (benchmark_width_ + 1) / 2, \
- temp_v, (benchmark_width_ + 1) / 2, \
- dst_pixels_opt, benchmark_width_ * 4, \
- benchmark_width_, benchmark_height_); \
- } \
- /* Test C and SIMD match. */ \
- for (int i = 0; i < kPixels * 4; ++i) { \
- EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \
- } \
- /* Test SIMD is close to original. */ \
- for (int i = 0; i < kPixels * 4; ++i) { \
- EXPECT_NEAR(static_cast<int>(orig_pixels[i]), \
- static_cast<int>(dst_pixels_opt[i]), DIFF); \
- } \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ YUVTOARGB(temp_y, benchmark_width_, temp_u, (benchmark_width_ + 1) / 2, \
+ temp_v, (benchmark_width_ + 1) / 2, dst_pixels_opt, \
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_); \
+ } \
+ /* Test C and SIMD match. */ \
+ for (int i = 0; i < kPixels * 4; ++i) { \
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \
+ } \
+ /* Test SIMD is close to original. */ \
+ for (int i = 0; i < kPixels * 4; ++i) { \
+ EXPECT_NEAR(static_cast<int>(orig_pixels[i]), \
+ static_cast<int>(dst_pixels_opt[i]), DIFF); \
+ } \
\
- free_aligned_buffer_page_end(orig_pixels); \
- free_aligned_buffer_page_end(orig_y); \
- free_aligned_buffer_page_end(orig_u); \
- free_aligned_buffer_page_end(orig_v); \
- free_aligned_buffer_page_end(temp_y); \
- free_aligned_buffer_page_end(temp_u); \
- free_aligned_buffer_page_end(temp_v); \
- free_aligned_buffer_page_end(dst_pixels_opt); \
- free_aligned_buffer_page_end(dst_pixels_c); \
-} \
+ free_aligned_buffer_page_end(orig_pixels); \
+ free_aligned_buffer_page_end(orig_y); \
+ free_aligned_buffer_page_end(orig_u); \
+ free_aligned_buffer_page_end(orig_v); \
+ free_aligned_buffer_page_end(temp_y); \
+ free_aligned_buffer_page_end(temp_u); \
+ free_aligned_buffer_page_end(temp_v); \
+ free_aligned_buffer_page_end(dst_pixels_opt); \
+ free_aligned_buffer_page_end(dst_pixels_c); \
+ }
TESTCS(TestI420, I420ToARGB, ARGBToI420, 1, 2, benchmark_width_, ERROR_FULL)
TESTCS(TestI422, I422ToARGB, ARGBToI422, 0, 1, 0, ERROR_FULL)
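// Why the fill loop at the top of TESTCS writes one random byte to each
// 2x2 block (p[0], p[1], p[HN], p[HN + 1]): 2x2 chroma subsampling then
// averages four equal values, which is lossless, so the YUV->ARGB->YUV
// round trip stays within the DIFF checked by EXPECT_NEAR above. A
// minimal sketch, assuming the converter uses a rounded 2x2 average:
#include <cassert>
#include <cstdint>
static uint8_t Average2x2(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return static_cast<uint8_t>((a + b + c + d + 2) >> 2);  // rounded mean
}
static void BlockFillIsLossless() {
  for (int r = 0; r < 256; ++r) {
    assert(Average2x2(r, r, r, r) == r);  // (4r + 2) >> 2 == r for all r
  }
}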
@@ -163,11 +156,8 @@ static void YUVToRGB(int y, int u, int v, int* r, int* g, int* b) {
memset(orig_v, v, kHalfPixels);
/* YUV converted to ARGB. */
- I422ToARGB(orig_y, kWidth,
- orig_u, (kWidth + 1) / 2,
- orig_v, (kWidth + 1) / 2,
- orig_pixels, kWidth * 4,
- kWidth, kHeight);
+ I422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2,
+ orig_pixels, kWidth * 4, kWidth, kHeight);
*b = orig_pixels[0];
*g = orig_pixels[1];
@@ -189,11 +179,8 @@ static void YUVJToRGB(int y, int u, int v, int* r, int* g, int* b) {
memset(orig_v, v, kHalfPixels);
/* YUV converted to ARGB. */
- J422ToARGB(orig_y, kWidth,
- orig_u, (kWidth + 1) / 2,
- orig_v, (kWidth + 1) / 2,
- orig_pixels, kWidth * 4,
- kWidth, kHeight);
+ J422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2,
+ orig_pixels, kWidth * 4, kWidth, kHeight);
*b = orig_pixels[0];
*g = orig_pixels[1];
@@ -248,7 +235,7 @@ static void YJToRGB(int y, int* r, int* g, int* b) {
#if defined(CLAMPMETHOD_IF)
static int RoundToByte(float f) {
- int i = ROUND(f);
+ int i = ROUND(f);
if (i < 0) {
i = 0;
}
@@ -259,52 +246,61 @@ static int RoundToByte(float f) {
}
#elif defined(CLAMPMETHOD_TABLE)
static const unsigned char clamptable[811] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
- 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
- 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
- 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
- 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
- 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103,
- 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
- 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133,
- 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148,
- 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163,
- 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178,
- 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193,
- 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208,
- 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
- 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238,
- 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253,
- 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
-};
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
+ 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
+ 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
+ 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
+ 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
+ 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128,
+ 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
+ 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158,
+ 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173,
+ 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188,
+ 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203,
+ 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218,
+ 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
+ 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248,
+ 249, 250, 251, 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255};
static int RoundToByte(float f) {
return clamptable[ROUND(f) + 276];
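// The 811-entry table covers rounded inputs in [-276, 534]; the +276 bias
// above maps that range onto indices 0..810. Entries 0..276 are 0, entries
// 277..531 count 1..255, and the remainder saturate at 255. A minimal
// sketch of how such a table could be generated (bounds taken from the
// table size and bias above):
#include <stdio.h>
int main(void) {
  for (int v = -276; v <= 534; ++v) {
    int clamped = v < 0 ? 0 : (v > 255 ? 255 : v);
    printf("%d,%c", clamped, (v + 276) % 15 == 14 ? '\n' : ' ');
  }
  return 0;
}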
@@ -317,7 +313,7 @@ static int RoundToByte(float f) {
#elif defined(CLAMPMETHOD_MASK)
static int RoundToByte(float f) {
int i = ROUND(f);
- i = ((-(i) >> 31) & (i)); // clamp to 0.
+ i = ((-(i) >> 31) & (i)); // clamp to 0.
return (((255 - (i)) >> 31) | (i)) & 255; // clamp to 255.
}
#endif
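// Worked check of the branchless clamp above, assuming arithmetic
// (sign-extending) right shift of negative ints, which the masking trick
// relies on: for i <= 0, -(i) >> 31 is 0, so the AND yields 0; for i > 0
// it is all ones, so the AND keeps i. Likewise (255 - i) >> 31 is all
// ones only when i > 255, forcing the OR/AND pair to 255.
#include <assert.h>
static int ClampToByte(int i) {
  i = ((-(i) >> 31) & (i));                  // clamp to 0.
  return (((255 - (i)) >> 31) | (i)) & 255;  // clamp to 255.
}
static void ClampCheck(void) {
  assert(ClampToByte(-276) == 0);
  assert(ClampToByte(0) == 0);
  assert(ClampToByte(128) == 128);
  assert(ClampToByte(534) == 255);
}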
@@ -433,7 +429,6 @@ TEST_F(LibYUVColorTest, TestGreyYUV) {
EXPECT_EQ(130, g1);
EXPECT_EQ(130, b1);
-
for (int y = 0; y < 256; ++y) {
YUVToRGBReference(y, 128, 128, &r0, &g0, &b0);
YUVToRGB(y, 128, 128, &r1, &g1, &b1);
@@ -477,7 +472,17 @@ static void PrintHistogram(int rh[256], int gh[256], int bh[256]) {
}
TEST_F(LibYUVColorTest, TestFullYUV) {
- int rh[256] = { 0, }, gh[256] = { 0, }, bh[256] = { 0, };
+ int rh[256] =
+ {
+ 0,
+ },
+ gh[256] =
+ {
+ 0,
+ },
+ bh[256] = {
+ 0,
+ };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; ++y2) {
@@ -498,7 +503,17 @@ TEST_F(LibYUVColorTest, TestFullYUV) {
}
TEST_F(LibYUVColorTest, TestFullYUVJ) {
- int rh[256] = { 0, }, gh[256] = { 0, }, bh[256] = { 0, };
+ int rh[256] =
+ {
+ 0,
+ },
+ gh[256] =
+ {
+ 0,
+ },
+ bh[256] = {
+ 0,
+ };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; ++y2) {
diff --git a/files/unit_test/compare_test.cc b/files/unit_test/compare_test.cc
index a8ce671d..13f74705 100644
--- a/files/unit_test/compare_test.cc
+++ b/files/unit_test/compare_test.cc
@@ -36,7 +36,8 @@ TEST_F(LibYUVBaseTest, Djb2_Test) {
align_buffer_page_end(src_a, kMaxTest);
align_buffer_page_end(src_b, kMaxTest);
- const char* fox = "The quick brown fox jumps over the lazy dog"
+ const char* fox =
+ "The quick brown fox jumps over the lazy dog"
" and feels as if he were in the seventh heaven of typography"
" together with Hermann Zapf";
uint32 foxhash = HashDjb2(reinterpret_cast<const uint8*>(fox), 131, 5381);
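// A reference for the hash this test pins down, assuming libyuv's
// HashDjb2 is the classic djb2 recurrence h = h * 33 + byte with the
// seed 5381 that the call above passes:
#include <stdint.h>
static uint32_t HashDjb2Ref(const uint8_t* src, int count, uint32_t seed) {
  uint32_t hash = seed;
  for (int i = 0; i < count; ++i) {
    hash = hash * 33u + src[i];  // 33 * h == (h << 5) + h
  }
  return hash;
}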
@@ -155,21 +156,21 @@ TEST_F(LibYUVBaseTest, BenchmarkARGBDetect_Opt) {
}
src_a[0] = 0;
- fourcc = ARGBDetect(src_a, benchmark_width_ * 4,
- benchmark_width_, benchmark_height_);
- EXPECT_EQ(libyuv::FOURCC_BGRA, fourcc);
+ fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_);
+ EXPECT_EQ(static_cast<uint32>(libyuv::FOURCC_BGRA), fourcc);
src_a[0] = 255;
src_a[3] = 0;
- fourcc = ARGBDetect(src_a, benchmark_width_ * 4,
- benchmark_width_, benchmark_height_);
- EXPECT_EQ(libyuv::FOURCC_ARGB, fourcc);
+ fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_);
+ EXPECT_EQ(static_cast<uint32>(libyuv::FOURCC_ARGB), fourcc);
src_a[3] = 255;
for (int i = 0; i < benchmark_iterations_; ++i) {
- fourcc = ARGBDetect(src_a, benchmark_width_ * 4,
- benchmark_width_, benchmark_height_);
+ fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_);
}
- EXPECT_EQ(0, fourcc);
+ EXPECT_EQ(0u, fourcc);
free_aligned_buffer_page_end(src_a);
}
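// ARGBDetect() guesses the byte order from which byte position stays
// fully opaque, which is why zeroing byte 0 versus byte 3 above flips the
// detected FOURCC, and why a frame of all 255s is ambiguous. A usage
// sketch mirroring the benchmark loop (pixels, width, height are
// placeholders for an ARGB-style buffer):
static uint32 DetectLayout(const uint8* pixels, int width, int height) {
  uint32 fourcc = ARGBDetect(pixels, width * 4, width, height);
  // 0 means every candidate alpha byte was opaque, so the layout cannot
  // be inferred, as with the fully-opaque frames above.
  return fourcc;
}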
@@ -183,21 +184,21 @@ TEST_F(LibYUVBaseTest, BenchmarkARGBDetect_Unaligned) {
}
src_a[0 + 1] = 0;
- fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4,
- benchmark_width_, benchmark_height_);
- EXPECT_EQ(libyuv::FOURCC_BGRA, fourcc);
+ fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_);
+ EXPECT_EQ(static_cast<uint32>(libyuv::FOURCC_BGRA), fourcc);
src_a[0 + 1] = 255;
src_a[3 + 1] = 0;
- fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4,
- benchmark_width_, benchmark_height_);
- EXPECT_EQ(libyuv::FOURCC_ARGB, fourcc);
+ fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_);
+ EXPECT_EQ(static_cast<uint32>(libyuv::FOURCC_ARGB), fourcc);
src_a[3 + 1] = 255;
for (int i = 0; i < benchmark_iterations_; ++i) {
- fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4,
- benchmark_width_, benchmark_height_);
+ fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_);
}
- EXPECT_EQ(0, fourcc);
+ EXPECT_EQ(0u, fourcc);
free_aligned_buffer_page_end(src_a);
}
@@ -220,13 +221,14 @@ TEST_F(LibYUVBaseTest, BenchmarkSumSquareError_Opt) {
memset(src_a, 0, kMaxWidth);
memset(src_b, 0, kMaxWidth);
- int count = benchmark_iterations_ *
- ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth);
+ int count =
+ benchmark_iterations_ *
+ ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth);
for (int i = 0; i < count; ++i) {
h1 = ComputeSumSquareError(src_a, src_b, kMaxWidth);
}
- EXPECT_EQ(0, h1);
+ EXPECT_EQ(0u, h1);
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_page_end(src_b);
@@ -242,18 +244,18 @@ TEST_F(LibYUVBaseTest, SumSquareError) {
uint64 err;
err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
- EXPECT_EQ(0, err);
+ EXPECT_EQ(0u, err);
memset(src_a, 1, kMaxWidth);
err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
- EXPECT_EQ(err, kMaxWidth);
+ EXPECT_EQ(static_cast<int>(err), kMaxWidth);
memset(src_a, 190, kMaxWidth);
memset(src_b, 193, kMaxWidth);
err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
- EXPECT_EQ(kMaxWidth * 3 * 3, err);
+ EXPECT_EQ(static_cast<int>(err), kMaxWidth * 3 * 3);
for (int i = 0; i < kMaxWidth; ++i) {
src_a[i] = (fastrand() & 0xff);
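// The expectations above follow from the definition sum((a[i] - b[i])^2):
// identical buffers give 0, all-1 vs all-0 gives kMaxWidth * 1, and 190 vs
// 193 gives kMaxWidth * 3 * 3. A minimal reference for the quantity
// ComputeSumSquareError returns:
#include <stdint.h>
static uint64_t SumSquareErrorRef(const uint8_t* a, const uint8_t* b, int n) {
  uint64_t sse = 0;
  for (int i = 0; i < n; ++i) {
    int d = a[i] - b[i];
    sse += (uint64_t)(d * d);  // per-byte squared difference
  }
  return sse;
}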
@@ -284,8 +286,7 @@ TEST_F(LibYUVBaseTest, BenchmarkPsnr_Opt) {
double opt_time = get_time();
for (int i = 0; i < benchmark_iterations_; ++i)
- CalcFramePsnr(src_a, benchmark_width_,
- src_b, benchmark_width_,
+ CalcFramePsnr(src_a, benchmark_width_, src_b, benchmark_width_,
benchmark_width_, benchmark_height_);
opt_time = (get_time() - opt_time) / benchmark_iterations_;
@@ -309,8 +310,7 @@ TEST_F(LibYUVBaseTest, BenchmarkPsnr_Unaligned) {
double opt_time = get_time();
for (int i = 0; i < benchmark_iterations_; ++i)
- CalcFramePsnr(src_a + 1, benchmark_width_,
- src_b, benchmark_width_,
+ CalcFramePsnr(src_a + 1, benchmark_width_, src_b, benchmark_width_,
benchmark_width_, benchmark_height_);
opt_time = (get_time() - opt_time) / benchmark_iterations_;
@@ -335,24 +335,24 @@ TEST_F(LibYUVBaseTest, Psnr) {
double err;
err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
- src_b + kSrcStride * b + b, kSrcStride,
- kSrcWidth, kSrcHeight);
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
EXPECT_EQ(err, kMaxPsnr);
memset(src_a, 255, kSrcPlaneSize);
err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
- src_b + kSrcStride * b + b, kSrcStride,
- kSrcWidth, kSrcHeight);
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
EXPECT_EQ(err, 0.0);
memset(src_a, 1, kSrcPlaneSize);
err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
- src_b + kSrcStride * b + b, kSrcStride,
- kSrcWidth, kSrcHeight);
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
EXPECT_GT(err, 48.0);
EXPECT_LT(err, 49.0);
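// Why 48 < err < 49 here: with src_a all 1s and src_b all 0s the MSE is
// exactly 1, and PSNR = 20 * log10(255 / sqrt(MSE)) = 20 * log10(255)
// ~= 48.13 dB. A minimal sketch, assuming CalcFramePsnr computes the
// standard formula:
#include <math.h>
static double PsnrFromMse(double mse) {
  return 20.0 * log10(255.0 / sqrt(mse));  // PsnrFromMse(1.0) ~= 48.1308
}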
@@ -362,8 +362,8 @@ TEST_F(LibYUVBaseTest, Psnr) {
}
err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
- src_b + kSrcStride * b + b, kSrcStride,
- kSrcWidth, kSrcHeight);
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
EXPECT_GT(err, 2.0);
if (kSrcWidth * kSrcHeight >= 256) {
@@ -384,14 +384,14 @@ TEST_F(LibYUVBaseTest, Psnr) {
double c_err, opt_err;
c_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
- src_b + kSrcStride * b + b, kSrcStride,
- kSrcWidth, kSrcHeight);
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
MaskCpuFlags(benchmark_cpu_info_);
opt_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
- src_b + kSrcStride * b + b, kSrcStride,
- kSrcWidth, kSrcHeight);
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
EXPECT_EQ(opt_err, c_err);
@@ -411,8 +411,7 @@ TEST_F(LibYUVBaseTest, DISABLED_BenchmarkSsim_Opt) {
double opt_time = get_time();
for (int i = 0; i < benchmark_iterations_; ++i)
- CalcFrameSsim(src_a, benchmark_width_,
- src_b, benchmark_width_,
+ CalcFrameSsim(src_a, benchmark_width_, src_b, benchmark_width_,
benchmark_width_, benchmark_height_);
opt_time = (get_time() - opt_time) / benchmark_iterations_;
@@ -435,14 +434,14 @@ TEST_F(LibYUVBaseTest, Ssim) {
memset(src_a, 0, kSrcPlaneSize);
memset(src_b, 0, kSrcPlaneSize);
- if (kSrcWidth <=8 || kSrcHeight <= 8) {
+ if (kSrcWidth <= 8 || kSrcHeight <= 8) {
printf("warning - Ssim size too small. Testing function executes.\n");
}
double err;
err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
- src_b + kSrcStride * b + b, kSrcStride,
- kSrcWidth, kSrcHeight);
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
if (kSrcWidth > 8 && kSrcHeight > 8) {
EXPECT_EQ(err, 1.0);
@@ -451,8 +450,8 @@ TEST_F(LibYUVBaseTest, Ssim) {
memset(src_a, 255, kSrcPlaneSize);
err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
- src_b + kSrcStride * b + b, kSrcStride,
- kSrcWidth, kSrcHeight);
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
if (kSrcWidth > 8 && kSrcHeight > 8) {
EXPECT_LT(err, 0.0001);
@@ -461,8 +460,8 @@ TEST_F(LibYUVBaseTest, Ssim) {
memset(src_a, 1, kSrcPlaneSize);
err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
- src_b + kSrcStride * b + b, kSrcStride,
- kSrcWidth, kSrcHeight);
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
if (kSrcWidth > 8 && kSrcHeight > 8) {
EXPECT_GT(err, 0.0001);
@@ -474,8 +473,8 @@ TEST_F(LibYUVBaseTest, Ssim) {
}
err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
- src_b + kSrcStride * b + b, kSrcStride,
- kSrcWidth, kSrcHeight);
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
if (kSrcWidth > 8 && kSrcHeight > 8) {
EXPECT_GT(err, 0.0);
@@ -493,14 +492,14 @@ TEST_F(LibYUVBaseTest, Ssim) {
double c_err, opt_err;
c_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
- src_b + kSrcStride * b + b, kSrcStride,
- kSrcWidth, kSrcHeight);
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
MaskCpuFlags(benchmark_cpu_info_);
opt_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
- src_b + kSrcStride * b + b, kSrcStride,
- kSrcWidth, kSrcHeight);
+ src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
+ kSrcHeight);
if (kSrcWidth > 8 && kSrcHeight > 8) {
EXPECT_EQ(opt_err, c_err);
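// SSIM is 1.0 for identical frames and falls toward 0 as they diverge; for
// all-255 vs all-0 the per-window score is kC1 / (255 * 255 + kC1) ~= 1e-4,
// matching the EXPECT_LT(err, 0.0001) above. A per-window sketch, assuming
// the standard constants from k1 = 0.01, k2 = 0.03 over an 8-bit range:
static double SsimWindow(double mean_x, double mean_y, double var_x,
                         double var_y, double cov_xy) {
  const double kC1 = (0.01 * 255) * (0.01 * 255);  // 6.5025
  const double kC2 = (0.03 * 255) * (0.03 * 255);  // 58.5225
  return ((2 * mean_x * mean_y + kC1) * (2 * cov_xy + kC2)) /
         ((mean_x * mean_x + mean_y * mean_y + kC1) * (var_x + var_y + kC2));
}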
diff --git a/files/unit_test/convert_test.cc b/files/unit_test/convert_test.cc
index 56a2bfd8..0f1c7430 100644
--- a/files/unit_test/convert_test.cc
+++ b/files/unit_test/convert_test.cc
@@ -21,470 +21,546 @@
#ifdef HAVE_JPEG
#include "libyuv/mjpeg_decoder.h"
#endif
+#include "../unit_test/unit_test.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
#include "libyuv/video_common.h"
-#include "../unit_test/unit_test.h"
namespace libyuv {
-#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a))
-
-#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
-TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- align_buffer_page_end(src_y, kWidth * kHeight + OFF); \
- align_buffer_page_end(src_u, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
- align_buffer_page_end(src_v, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
- align_buffer_page_end(dst_y_c, kWidth * kHeight); \
- align_buffer_page_end(dst_u_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_y_opt, kWidth * kHeight); \
- align_buffer_page_end(dst_u_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
- src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (fastrand() & 0xff); \
- src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (fastrand() & 0xff); \
- } \
- } \
- memset(dst_y_c, 1, kWidth * kHeight); \
- memset(dst_u_c, 2, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_c, 3, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 101, kWidth * kHeight); \
- memset(dst_u_opt, 102, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_opt, 103, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- MaskCpuFlags(disable_cpu_flags_); \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_u + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_c, kWidth, \
- dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_u + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_opt, kWidth, \
- dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_EQ(0, max_diff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_u_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_u_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 3); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_v_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_v_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 3); \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_u_c); \
- free_aligned_buffer_page_end(dst_v_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_u_opt); \
- free_aligned_buffer_page_end(dst_v_opt); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
-}
+#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
+
+#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+ OFF); \
+ align_buffer_page_end(src_v, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+ OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+ src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+ (fastrand() & 0xff); \
+ src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+ (fastrand() & 0xff); \
+ } \
+ } \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_u_c, 2, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_c, 3, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_u_opt, 102, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_opt, 103, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \
+ dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \
+ dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_opt, \
+ SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
+ static_cast<int>(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_EQ(0, max_diff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = abs( \
+ static_cast<int>(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast<int>( \
+ dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 3); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = abs( \
+ static_cast<int>(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast<int>( \
+ dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 3); \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_u_c); \
+ free_aligned_buffer_page_end(dst_v_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_u_opt); \
+ free_aligned_buffer_page_end(dst_v_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ }
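// SUBSAMPLE(v, a) is a ceiling division, so odd dimensions still get a
// full chroma row or column; e.g. a 1279-wide I420 image has 640 chroma
// columns, not 639:
static_assert(SUBSAMPLE(1279, 2) == 640, "SUBSAMPLE rounds up");
static_assert(SUBSAMPLE(1280, 2) == 640, "exact multiples are unchanged");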
-#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, _Any, +, 0) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Unaligned, +, 1) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Invert, -, 0) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Opt, +, 0)
+#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0)
TESTPLANARTOP(I420, 2, 2, I420, 2, 2)
TESTPLANARTOP(I422, 2, 1, I420, 2, 2)
TESTPLANARTOP(I444, 1, 1, I420, 2, 2)
-TESTPLANARTOP(I411, 4, 1, I420, 2, 2)
TESTPLANARTOP(I420, 2, 2, I422, 2, 1)
TESTPLANARTOP(I420, 2, 2, I444, 1, 1)
-TESTPLANARTOP(I420, 2, 2, I411, 4, 1)
TESTPLANARTOP(I420, 2, 2, I420Mirror, 2, 2)
TESTPLANARTOP(I422, 2, 1, I422, 2, 1)
TESTPLANARTOP(I444, 1, 1, I444, 1, 1)
-#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
-TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- align_buffer_page_end(src_y, kWidth * kHeight + OFF); \
- align_buffer_page_end(src_u, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
- align_buffer_page_end(src_v, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
- align_buffer_page_end(dst_y_c, kWidth * kHeight); \
- align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_y_opt, kWidth * kHeight); \
- align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
- src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (fastrand() & 0xff); \
- src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (fastrand() & 0xff); \
- } \
- } \
- memset(dst_y_c, 1, kWidth * kHeight); \
- memset(dst_uv_c, 2, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 101, kWidth * kHeight); \
- memset(dst_uv_opt, 102, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- MaskCpuFlags(disable_cpu_flags_); \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_u + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_c, kWidth, \
- dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_u + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_opt, kWidth, \
- dst_uv_opt, \
- SUBSAMPLE(kWidth * 2, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_uv_c[i * \
- SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_uv_opt[i * \
- SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_uv_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_uv_opt); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
-}
+// Test Android 420 to I420
+#define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ W1280, N, NEG, OFF, PN, OFF_U, OFF_V) \
+ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##_##PN##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kSizeUV = \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_uv, \
+ kSizeUV*((PIXEL_STRIDE == 3) ? 3 : 2) + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ uint8* src_u = src_uv + OFF_U; \
+ uint8* src_v = src_uv + (PIXEL_STRIDE == 1 ? kSizeUV : OFF_V); \
+ int src_stride_uv = SUBSAMPLE(kWidth, SUBSAMP_X) * PIXEL_STRIDE; \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+ src_u[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \
+ (fastrand() & 0xff); \
+ src_v[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \
+ (fastrand() & 0xff); \
+ } \
+ } \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_u_c, 2, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_c, 3, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_u_opt, 102, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_opt, 103, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, dst_y_c, \
+ kWidth, dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, \
+ dst_y_opt, kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
+ static_cast<int>(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_EQ(0, max_diff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = abs( \
+ static_cast<int>(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast<int>( \
+ dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 3); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = abs( \
+ static_cast<int>(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast<int>( \
+ dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 3); \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_u_c); \
+ free_aligned_buffer_page_end(dst_v_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_u_opt); \
+ free_aligned_buffer_page_end(dst_v_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_uv); \
+ }
+
+#define TESTAPLANARTOP(SRC_FMT_PLANAR, PN, PIXEL_STRIDE, OFF_U, OFF_V, \
+ SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, \
+ SUBSAMP_Y) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, \
+ _Any, +, 0, PN, OFF_U, OFF_V) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, \
+ _Unaligned, +, 1, PN, OFF_U, OFF_V) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, \
+ -, 0, PN, OFF_U, OFF_V) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, \
+ 0, PN, OFF_U, OFF_V)
+
+TESTAPLANARTOP(Android420, I420, 1, 0, 0, 2, 2, I420, 2, 2)
+TESTAPLANARTOP(Android420, NV12, 2, 0, 1, 2, 2, I420, 2, 2)
+TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2)
+
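// Android420 carries U and V with an explicit pixel stride, so one entry
// point covers the three layouts instantiated above: I420 (stride 1,
// separate planes), NV12 (stride 2, V = U + 1) and NV21 (stride 2,
// U = V + 1). An addressing sketch matching the fill loop in
// TESTAPLANARTOPI (the helper name is hypothetical):
#include <stdint.h>
static inline const uint8_t* ChromaSample(const uint8_t* plane,
                                          int row_stride, int pixel_stride,
                                          int i, int j) {
  return plane + i * row_stride + j * pixel_stride;
}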
+#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+ OFF); \
+ align_buffer_page_end(src_v, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+ OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+ src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+ (fastrand() & 0xff); \
+ src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+ (fastrand() & 0xff); \
+ } \
+ } \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_uv_c, 2, \
+ SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_uv_opt, 102, \
+ SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \
+ dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \
+ dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
+ static_cast<int>(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 1); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>( \
+ dst_uv_c[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \
+ static_cast<int>( \
+ dst_uv_opt[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 1); \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_uv_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_uv_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ }
-#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, _Any, +, 0) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Unaligned, +, 1) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Invert, -, 0) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Opt, +, 0)
+#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0)
TESTPLANARTOBP(I420, 2, 2, NV12, 2, 2)
TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2)
#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
-TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- align_buffer_page_end(src_y, kWidth * kHeight + OFF); \
- align_buffer_page_end(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
- align_buffer_page_end(dst_y_c, kWidth * kHeight); \
- align_buffer_page_end(dst_u_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_y_opt, kWidth * kHeight); \
- align_buffer_page_end(dst_u_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
- for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
- src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (fastrand() & 0xff); \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \
+ DOY) \
+ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+ OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+ src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+ (fastrand() & 0xff); \
+ } \
} \
- } \
- memset(dst_y_c, 1, kWidth * kHeight); \
- memset(dst_u_c, 2, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_c, 3, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 101, kWidth * kHeight); \
- memset(dst_u_opt, 102, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_opt, 103, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- MaskCpuFlags(disable_cpu_flags_); \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_uv + OFF, \
- 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_c, kWidth, \
- dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_uv + OFF, \
- 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_opt, kWidth, \
- dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_u_c, 2, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_c, 3, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_u_opt, 102, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_opt, 103, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_uv + OFF, \
+ 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_c : NULL, kWidth, \
+ dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_uv + OFF, \
+ 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_opt : NULL, \
+ kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_opt, \
+ SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ if (DOY) { \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
+ static_cast<int>(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
} \
+ EXPECT_LE(max_diff, 1); \
} \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_u_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_u_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = abs( \
+ static_cast<int>(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast<int>( \
+ dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
} \
} \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_v_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_v_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
+ EXPECT_LE(max_diff, 1); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = abs( \
+ static_cast<int>(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast<int>( \
+ dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
} \
} \
- } \
- EXPECT_LE(max_diff, 1); \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_u_c); \
- free_aligned_buffer_page_end(dst_v_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_u_opt); \
- free_aligned_buffer_page_end(dst_v_opt); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_uv); \
-}
+ EXPECT_LE(max_diff, 1); \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_u_c); \
+ free_aligned_buffer_page_end(dst_v_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_u_opt); \
+ free_aligned_buffer_page_end(dst_v_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_uv); \
+ }
-#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, _Any, +, 0) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Unaligned, +, 1) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Invert, -, 0) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Opt, +, 0)
+#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, 1) \
+ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, \
+ 1) \
+ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1) \
+ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1) \
+ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0)
TESTBIPLANARTOP(NV12, 2, 2, I420, 2, 2)
TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
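// The new _NullY variant passes DOY = 0, substituting NULL for the
// destination Y plane so the UV deinterleave path is exercised and
// verified on its own. A usage sketch, assuming NV12ToI420 accepts a
// NULL dst_y as the test implies (the wrapper name is hypothetical):
static int ConvertUvOnly(const uint8* src_y, int src_stride_y,
                         const uint8* src_uv, int src_stride_uv,
                         uint8* dst_u, int dst_stride_u,
                         uint8* dst_v, int dst_stride_v,
                         int width, int height) {
  return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv,
                    NULL, width,  // NULL dst_y: convert chroma only
                    dst_u, dst_stride_u, dst_v, dst_stride_v,
                    width, height);
}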
-#define ALIGNINT(V, ALIGN) (((V) + (ALIGN) - 1) / (ALIGN) * (ALIGN))
+#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN))
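// ALIGNINT(V, ALIGN) rounds V up to the next multiple of ALIGN; the
// destination strides below use it so every row starts on an ALIGN-byte
// boundary, e.g. ALIGNINT(1279, 16) == 1280:
static_assert(ALIGNINT(1279, 16) == 1280, "ALIGNINT rounds up");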
#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \
-TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
- const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
- align_buffer_page_end(src_y, kWidth * kHeight + OFF); \
- align_buffer_page_end(src_u, kSizeUV + OFF); \
- align_buffer_page_end(src_v, kSizeUV + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF); \
- align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF); \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- src_y[i + OFF] = (fastrand() & 0xff); \
- } \
- for (int i = 0; i < kSizeUV; ++i) { \
- src_u[i + OFF] = (fastrand() & 0xff); \
- src_v[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
- memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
- src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, \
- dst_argb_c + OFF, kStrideB, \
- kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
- src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, \
- dst_argb_opt + OFF, kStrideB, \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, kSizeUV + OFF); \
+ align_buffer_page_end(src_v, kSizeUV + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (fastrand() & 0xff); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (fastrand() & 0xff); \
+ src_v[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, dst_argb_c + OFF, kStrideB, \
kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
- align_buffer_page_end(dst_argb32_c, kWidth * BPP_C * kHeight); \
- align_buffer_page_end(dst_argb32_opt, kWidth * BPP_C * kHeight); \
- memset(dst_argb32_c, 2, kWidth * BPP_C * kHeight); \
- memset(dst_argb32_opt, 102, kWidth * BPP_C * kHeight); \
- FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, \
- dst_argb32_c, kWidth * BPP_C , \
- kWidth, kHeight); \
- FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, \
- dst_argb32_opt, kWidth * BPP_C , \
- kWidth, kHeight); \
- for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb32_c[i]) - \
- static_cast<int>(dst_argb32_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, dst_argb_opt + OFF, \
+ kStrideB, kWidth, NEG kHeight); \
} \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- free_aligned_buffer_page_end(dst_argb32_c); \
- free_aligned_buffer_page_end(dst_argb32_opt); \
-}
+ int max_diff = 0; \
+ /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
+ align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight); \
+ align_buffer_page_end(dst_argb32_opt, kWidth* BPP_C* kHeight); \
+ memset(dst_argb32_c, 2, kWidth* BPP_C* kHeight); \
+ memset(dst_argb32_opt, 102, kWidth* BPP_C* kHeight); \
+ FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, dst_argb32_c, kWidth * BPP_C, \
+ kWidth, kHeight); \
+ FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, dst_argb32_opt, \
+ kWidth * BPP_C, kWidth, kHeight); \
+ for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
+ int abs_diff = abs(static_cast<int>(dst_argb32_c[i]) - \
+ static_cast<int>(dst_argb32_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ free_aligned_buffer_page_end(dst_argb32_c); \
+ free_aligned_buffer_page_end(dst_argb32_opt); \
+ }
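
The in-macro comment above notes that both outputs are re-converted to ARGB before comparison: packed formats such as RGB565 cannot be compared byte-for-byte against 8-bit channels. A minimal sketch of the packing this works around (hypothetical helpers, not part of the test):

  // Pack 8-bit channels into RGB565: 5 bits red, 6 bits green, 5 bits blue.
  static uint16 Pack565(uint8 r, uint8 g, uint8 b) {
    return (uint16)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
  }
  // Expand a 5-bit channel back to 8 bits by replicating the high bits.
  static uint8 Expand5(uint8 v5) { return (uint8)((v5 << 3) | (v5 >> 2)); }

Only after this expansion do the C and optimized paths produce directly comparable bytes, so the DIFF tolerance is applied to the 32-bit form.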
#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
YALIGN, DIFF, FMT_C, BPP_C) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, BPP_C) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, BPP_C) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, BPP_C) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, \
+ BPP_C) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
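
Each TESTPLANARTOB expansion below produces four gtest cases: _Any uses benchmark_width_ - 4 to exercise widths that are not a multiple of the SIMD step, _Unaligned shifts every pointer by OFF = 1 byte, _Invert passes a negative height (libyuv's convention for a vertically flipped image), and _Opt runs the aligned fast path. For example, the first invocation expands roughly to (abbreviated sketch):

  TEST_F(LibYUVConvertTest, I420ToARGB_Opt) {
    // ...allocate I420 planes, run I420ToARGB once with SIMD disabled and
    // benchmark_iterations_ times with it enabled, then compare the outputs.
  }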
TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4)
TESTPLANARTOB(J420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4)
@@ -507,7 +583,6 @@ TESTPLANARTOB(H422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4)
TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1, 2, ARGB, 4)
TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4)
TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1, 2, ARGB, 4)
-TESTPLANARTOB(I411, 4, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
TESTPLANARTOB(J444, 1, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1, 2, ARGB, 4)
@@ -519,247 +594,275 @@ TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1, 0, ARGB, 4)
TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1, 0, ARGB, 4)
#define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, W1280, DIFF, N, NEG, OFF, ATTEN) \
-TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
- const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
- align_buffer_page_end(src_y, kWidth * kHeight + OFF); \
- align_buffer_page_end(src_u, kSizeUV + OFF); \
- align_buffer_page_end(src_v, kSizeUV + OFF); \
- align_buffer_page_end(src_a, kWidth * kHeight + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF); \
- align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF); \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- src_y[i + OFF] = (fastrand() & 0xff); \
- src_a[i + OFF] = (fastrand() & 0xff); \
- } \
- for (int i = 0; i < kSizeUV; ++i) { \
- src_u[i + OFF] = (fastrand() & 0xff); \
- src_v[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
- memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
- src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, \
- src_a + OFF, kWidth, \
- dst_argb_c + OFF, kStrideB, \
- kWidth, NEG kHeight, ATTEN); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
- src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, \
- src_a + OFF, kWidth, \
- dst_argb_opt + OFF, kStrideB, \
- kWidth, NEG kHeight, ATTEN); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb_c[i + OFF]) - \
- static_cast<int>(dst_argb_opt[i + OFF])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
+ YALIGN, W1280, DIFF, N, NEG, OFF, ATTEN) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, kSizeUV + OFF); \
+ align_buffer_page_end(src_v, kSizeUV + OFF); \
+ align_buffer_page_end(src_a, kWidth* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (fastrand() & 0xff); \
+ src_a[i + OFF] = (fastrand() & 0xff); \
} \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- free_aligned_buffer_page_end(src_a); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
-}
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (fastrand() & 0xff); \
+ src_v[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, src_a + OFF, kWidth, \
+ dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight, \
+ ATTEN); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, src_a + OFF, kWidth, \
+ dst_argb_opt + OFF, kStrideB, kWidth, NEG kHeight, \
+ ATTEN); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
+ int abs_diff = abs(static_cast<int>(dst_argb_c[i + OFF]) - \
+ static_cast<int>(dst_argb_opt[i + OFF])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(src_a); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
-#define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, DIFF) \
- TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0) \
- TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 0) \
- TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0) \
- TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0) \
- TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Premult, +, 0, 1)
+#define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, DIFF) \
+ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0) \
+ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 0) \
+ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0) \
+ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0) \
+ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Premult, +, 0, 1)
TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2)
TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2)
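
TESTQPLANARTOBI differs from the three-plane macro by threading a fourth plane, src_a, and a trailing ATTEN flag through to the converter; the _Premult variant sets ATTEN = 1 to request alpha premultiplication. A hedged sketch of the underlying call for the first invocation above (signature as in convert_argb.h at this revision):

  I420AlphaToARGB(src_y, y_stride, src_u, uv_stride, src_v, uv_stride,
                  src_a, a_stride, dst_argb, dst_stride,
                  width, height, 1 /* attenuate */);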
#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
W1280, DIFF, N, NEG, OFF) \
-TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStrideB = kWidth * BPP_B; \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- align_buffer_page_end(src_y, kWidth * kHeight + OFF); \
- align_buffer_page_end(src_uv, \
- kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideB * kHeight); \
- align_buffer_page_end(dst_argb_opt, kStrideB * kHeight); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < kStrideUV * 2; ++j) { \
- src_uv[i * kStrideUV * 2 + j + OFF] = (fastrand() & 0xff); \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kStrideB = kWidth * BPP_B; \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_uv, \
+ kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < kStrideUV * 2; ++j) { \
+ src_uv[i * kStrideUV * 2 + j + OFF] = (fastrand() & 0xff); \
+ } \
} \
- } \
- memset(dst_argb_c, 1, kStrideB * kHeight); \
- memset(dst_argb_opt, 101, kStrideB * kHeight); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
- src_uv + OFF, kStrideUV * 2, \
- dst_argb_c, kWidth * BPP_B, \
- kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
- src_uv + OFF, kStrideUV * 2, \
- dst_argb_opt, kWidth * BPP_B, \
- kWidth, NEG kHeight); \
- } \
- /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
- align_buffer_page_end(dst_argb32_c, kWidth * 4 * kHeight); \
- align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight); \
- memset(dst_argb32_c, 2, kWidth * 4 * kHeight); \
- memset(dst_argb32_opt, 102, kWidth * 4 * kHeight); \
- FMT_B##ToARGB(dst_argb_c, kStrideB, \
- dst_argb32_c, kWidth * 4, \
- kWidth, kHeight); \
- FMT_B##ToARGB(dst_argb_opt, kStrideB, \
- dst_argb32_opt, kWidth * 4, \
- kWidth, kHeight); \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth * 4; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb32_c[i * kWidth * 4 + j]) - \
- static_cast<int>(dst_argb32_opt[i * kWidth * 4 + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
+ memset(dst_argb_c, 1, kStrideB* kHeight); \
+ memset(dst_argb_opt, 101, kStrideB* kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \
+ dst_argb_c, kWidth * BPP_B, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \
+ dst_argb_opt, kWidth * BPP_B, kWidth, \
+ NEG kHeight); \
+ } \
+ /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
+ align_buffer_page_end(dst_argb32_c, kWidth * 4 * kHeight); \
+ align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight); \
+ memset(dst_argb32_c, 2, kWidth * 4 * kHeight); \
+ memset(dst_argb32_opt, 102, kWidth * 4 * kHeight); \
+ FMT_B##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth, \
+ kHeight); \
+ FMT_B##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \
+ kHeight); \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth * 4; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_argb32_c[i * kWidth * 4 + j]) - \
+ static_cast<int>(dst_argb32_opt[i * kWidth * 4 + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
} \
} \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_uv); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- free_aligned_buffer_page_end(dst_argb32_c); \
- free_aligned_buffer_page_end(dst_argb32_opt); \
-}
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_uv); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ free_aligned_buffer_page_end(dst_argb32_c); \
+ free_aligned_buffer_page_end(dst_argb32_opt); \
+ }
-#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, DIFF) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_, DIFF, _Invert, -, 0) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_, DIFF, _Opt, +, 0)
+#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, DIFF) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ benchmark_width_ - 4, DIFF, _Any, +, 0) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ benchmark_width_, DIFF, _Unaligned, +, 1) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ benchmark_width_, DIFF, _Invert, -, 0) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ benchmark_width_, DIFF, _Opt, +, 0)
TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4, 2)
TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4, 2)
TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9)
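
The biplanar macros allocate a single interleaved UV plane of kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 bytes instead of separate U and V planes. For NV12 at 2x2 subsampling this is the usual sizing (sketch, assuming even dimensions):

  const int y_size = width * height;
  const int uv_size = (width / 2) * (height / 2) * 2;  // interleaved U,V pairs
  // total NV12 sample = y_size + uv_size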
+#ifdef DO_THREE_PLANES
+// Do 3 allocations for YUV. Conventional but slower.
+#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ W1280, DIFF, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \
+ align_buffer_page_end(src_argb, kStride* kHeight + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_c, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_c, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_opt, \
+ kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_opt, \
+ kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_u_c, 2, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_c, 3, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_u_opt, 102, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_opt, 103, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kStride; ++j) \
+ src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_u_c, \
+ kStrideUV, dst_v_c, kStrideUV, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
+ dst_u_opt, kStrideUV, dst_v_opt, kStrideUV, \
+ kWidth, NEG kHeight); \
+ } \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ EXPECT_NEAR(static_cast<int>(dst_y_c[i * kWidth + j]), \
+ static_cast<int>(dst_y_opt[i * kWidth + j]), DIFF); \
+ } \
+ } \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < kStrideUV; ++j) { \
+ EXPECT_NEAR(static_cast<int>(dst_u_c[i * kStrideUV + j]), \
+ static_cast<int>(dst_u_opt[i * kStrideUV + j]), DIFF); \
+ } \
+ } \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < kStrideUV; ++j) { \
+ EXPECT_NEAR(static_cast<int>(dst_v_c[i * kStrideUV + j]), \
+ static_cast<int>(dst_v_opt[i * kStrideUV + j]), DIFF); \
+ } \
+ } \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_u_c); \
+ free_aligned_buffer_page_end(dst_v_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_u_opt); \
+ free_aligned_buffer_page_end(dst_v_opt); \
+ free_aligned_buffer_page_end(src_argb); \
+ }
+#else
#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
W1280, DIFF, N, NEG, OFF) \
-TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kStride = \
- (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \
- align_buffer_page_end(src_argb, kStride * kHeight + OFF); \
- align_buffer_page_end(dst_y_c, kWidth * kHeight); \
- align_buffer_page_end(dst_u_c, \
- kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_c, \
- kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_y_opt, kWidth * kHeight); \
- align_buffer_page_end(dst_u_opt, \
- kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_opt, \
- kStrideUV * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_c, 1, kWidth * kHeight); \
- memset(dst_u_c, 2, \
- kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_c, 3, \
- kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 101, kWidth * kHeight); \
- memset(dst_u_opt, 102, \
- kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_opt, 103, \
- kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kStride; ++j) \
- src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
- dst_y_c, kWidth, \
- dst_u_c, kStrideUV, \
- dst_v_c, kStrideUV, \
- kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
- dst_y_opt, kWidth, \
- dst_u_opt, kStrideUV, \
- dst_v_opt, kStrideUV, \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \
+ align_buffer_page_end(src_argb, kStride* kHeight + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_c, \
+ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_opt, \
+ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kStride; ++j) \
+ src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \
+ kStrideUV * 2, dst_uv_c + kStrideUV, kStrideUV * 2, \
kWidth, NEG kHeight); \
- } \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_y_c[i * kWidth + j]), \
- static_cast<int>(dst_y_opt[i * kWidth + j]), DIFF); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
+ dst_uv_opt, kStrideUV * 2, dst_uv_opt + kStrideUV, \
+ kStrideUV * 2, kWidth, NEG kHeight); \
} \
- } \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < kStrideUV; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_u_c[i * kStrideUV + j]), \
- static_cast<int>(dst_u_opt[i * kStrideUV + j]), DIFF); \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ EXPECT_NEAR(static_cast<int>(dst_y_c[i * kWidth + j]), \
+ static_cast<int>(dst_y_opt[i * kWidth + j]), DIFF); \
+ } \
} \
- } \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < kStrideUV; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_v_c[i * \
- kStrideUV + j]), \
- static_cast<int>(dst_v_opt[i * \
- kStrideUV + j]), DIFF); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; ++i) { \
+ for (int j = 0; j < kStrideUV; ++j) { \
+ EXPECT_NEAR(static_cast<int>(dst_uv_c[i * kStrideUV + j]), \
+ static_cast<int>(dst_uv_opt[i * kStrideUV + j]), DIFF); \
+ } \
} \
- } \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_u_c); \
- free_aligned_buffer_page_end(dst_v_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_u_opt); \
- free_aligned_buffer_page_end(dst_v_opt); \
- free_aligned_buffer_page_end(src_argb); \
-}
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_uv_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_uv_opt); \
+ free_aligned_buffer_page_end(src_argb); \
+ }
+#endif
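
In the default (non-DO_THREE_PLANES) variant above, U and V share one allocation: the converter is handed dst_uv_c with a stride of kStrideUV * 2 for U and dst_uv_c + kStrideUV with the same stride for V, so the two planes occupy alternating half-rows of a single buffer and one allocation plus one memset replaces three. A sketch of the aliasing (names hypothetical):

  uint8* dst_u = dst_uv;              // first half of each shared row
  uint8* dst_v = dst_uv + kStrideUV;  // second half of each shared row
  // both planes use stride kStrideUV * 2, skipping each other's bytes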
-#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- DIFF) \
- TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Invert, -, 0) \
- TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Opt, +, 0)
+#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ DIFF) \
+ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_ - 4, DIFF, _Any, +, 0) \
+ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, DIFF, _Unaligned, +, 1) \
+ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, DIFF, _Invert, -, 0) \
+ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, DIFF, _Opt, +, 0)
TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4)
-#if defined(__arm__) || defined (__aarch64__)
+#if defined(__arm__) || defined(__aarch64__)
// The arm version subsamples by summing 4 pixels, then multiplying by a matrix
// with 4x-smaller coefficients that are rounded to the nearest integer.
TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 4)
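
Because of that reordering, ARGB-to-J420 on arm is allowed a tolerance of DIFF = 4 rather than an exact match. A hedged sketch of the two evaluation orders for one 2x2 block, with a hypothetical coefficient c (not libyuv's actual constants):

  // C path: average the four pixels first, then scale by the full coefficient.
  static int SubsampleC(int p0, int p1, int p2, int p3, int c) {
    return (((p0 + p1 + p2 + p3 + 2) >> 2) * c) >> 8;
  }
  // arm path: sum first, then scale by a coefficient pre-divided by 4 and
  // rounded to nearest; the two roundings can disagree by a few units.
  static int SubsampleArm(int p0, int p1, int p2, int p3, int c) {
    return ((p0 + p1 + p2 + p3) * ((c + 2) >> 2)) >> 8;
  }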
@@ -777,7 +880,6 @@ TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5)
// TODO(fbarchard): Make 1555 neon work the same as the C code; reduce DIFF to 9.
TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2, 15)
TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2, 17)
-TESTATOPLANAR(ARGB, 4, 1, I411, 4, 1, 4)
TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 2)
TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 2)
TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 2)
@@ -787,183 +889,173 @@ TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1, 2)
TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2)
TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2)
-#define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
-TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStride = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- align_buffer_page_end(src_argb, kStride * kHeight + OFF); \
- align_buffer_page_end(dst_y_c, kWidth * kHeight); \
- align_buffer_page_end(dst_uv_c, \
- kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_y_opt, kWidth * kHeight); \
- align_buffer_page_end(dst_uv_opt, \
- kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kStride; ++j) \
- src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \
- memset(dst_y_c, 1, kWidth * kHeight); \
- memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 101, kWidth * kHeight); \
- memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
- dst_y_c, kWidth, dst_uv_c, kStrideUV * 2, \
- kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
- dst_y_opt, kWidth, \
- dst_uv_opt, kStrideUV * 2, kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 4); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < kStrideUV * 2; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_uv_c[i * kStrideUV * 2 + j]) - \
- static_cast<int>(dst_uv_opt[i * kStrideUV * 2 + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 4); \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_uv_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_uv_opt); \
- free_aligned_buffer_page_end(src_argb); \
-}
+#define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, \
+ SUBSAMP_Y, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kStride = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ align_buffer_page_end(src_argb, kStride* kHeight + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_c, \
+ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_opt, \
+ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kStride; ++j) \
+ src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \
+ kStrideUV * 2, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
+ dst_uv_opt, kStrideUV * 2, kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
+ static_cast<int>(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 4); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < kStrideUV * 2; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_uv_c[i * kStrideUV * 2 + j]) - \
+ static_cast<int>(dst_uv_opt[i * kStrideUV * 2 + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 4); \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_uv_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_uv_opt); \
+ free_aligned_buffer_page_end(src_argb); \
+ }
#define TESTATOBIPLANAR(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, _Any, +, 0) \
- TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Unaligned, +, 1) \
- TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Invert, -, 0) \
- TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Opt, +, 0)
+ TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_ - 4, _Any, +, 0) \
+ TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 1) \
+ TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0) \
+ TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
TESTATOBIPLANAR(ARGB, 1, 4, NV12, 2, 2)
TESTATOBIPLANAR(ARGB, 1, 4, NV21, 2, 2)
TESTATOBIPLANAR(YUY2, 2, 4, NV12, 2, 2)
TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
-#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \
- W1280, DIFF, N, NEG, OFF) \
-TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
- const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
- const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
- const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
- align_buffer_page_end(src_argb, kStrideA * kHeightA + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \
- align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \
- for (int i = 0; i < kStrideA * kHeightA; ++i) { \
- src_argb[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c, 1, kStrideB * kHeightB); \
- memset(dst_argb_opt, 101, kStrideB * kHeightB); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_A##To##FMT_B(src_argb + OFF, kStrideA, \
- dst_argb_c, kStrideB, \
- kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_B(src_argb + OFF, kStrideA, \
- dst_argb_opt, kStrideB, \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_argb); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
-}
+#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, W1280, DIFF, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
+ const int kStrideA = \
+ (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = \
+ (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \
+ for (int i = 0; i < kStrideA * kHeightA; ++i) { \
+ src_argb[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c, 1, kStrideB* kHeightB); \
+ memset(dst_argb_opt, 101, kStrideB* kHeightB); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_B(src_argb + OFF, kStrideA, dst_argb_c, kStrideB, kWidth, \
+ NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_B(src_argb + OFF, kStrideA, dst_argb_opt, kStrideB, \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kStrideB * kHeightB; ++i) { \
+ int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
+ static_cast<int>(dst_argb_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_page_end(src_argb); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
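
kStrideA and kStrideB use the round-up-to-a-multiple idiom (x + n - 1) / n * n. For instance, with kWidth = 101, BPP_A = 2 and STRIDE_A = 4: (101 * 2 + 3) / 4 * 4 = 205 / 4 * 4 = 51 * 4 = 204, so each row is padded to the next 4-byte boundary.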
-#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \
-TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) { \
- for (int times = 0; times < benchmark_iterations_; ++times) { \
- const int kWidth = (fastrand() & 63) + 1; \
- const int kHeight = (fastrand() & 31) + 1; \
- const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
- const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
- const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;\
- const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;\
- align_buffer_page_end(src_argb, kStrideA * kHeightA); \
- align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \
- align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \
- for (int i = 0; i < kStrideA * kHeightA; ++i) { \
- src_argb[i] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c, 123, kStrideB * kHeightB); \
- memset(dst_argb_opt, 123, kStrideB * kHeightB); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_A##To##FMT_B(src_argb, kStrideA, \
- dst_argb_c, kStrideB, \
- kWidth, kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- FMT_A##To##FMT_B(src_argb, kStrideA, \
- dst_argb_opt, kStrideB, \
- kWidth, kHeight); \
- int max_diff = 0; \
- for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_argb); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- } \
-}
+#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, \
+ STRIDE_B, HEIGHT_B, DIFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) { \
+ for (int times = 0; times < benchmark_iterations_; ++times) { \
+ const int kWidth = (fastrand() & 63) + 1; \
+ const int kHeight = (fastrand() & 31) + 1; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
+ const int kStrideA = \
+ (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = \
+ (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_page_end(src_argb, kStrideA* kHeightA); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \
+ for (int i = 0; i < kStrideA * kHeightA; ++i) { \
+ src_argb[i] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c, 123, kStrideB* kHeightB); \
+ memset(dst_argb_opt, 123, kStrideB* kHeightB); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_c, kStrideB, kWidth, \
+ kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_opt, kStrideB, kWidth, \
+ kHeight); \
+ int max_diff = 0; \
+ for (int i = 0; i < kStrideB * kHeightB; ++i) { \
+ int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
+ static_cast<int>(dst_argb_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_page_end(src_argb); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ } \
+ }
-#define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \
- benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \
- benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \
- benchmark_width_, DIFF, _Invert, -, 0) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \
- benchmark_width_, DIFF, _Opt, +, 0) \
- TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)
+#define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, DIFF) \
+ TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_ - 4, DIFF, _Any, +, 0) \
+ TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_, DIFF, _Unaligned, +, 1) \
+ TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_, DIFF, _Invert, -, 0) \
+ TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_, DIFF, _Opt, +, 0) \
+ TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, DIFF)
TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0)
TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0)
@@ -989,6 +1081,7 @@ TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1, 0)
TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1, 0)
TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1, 4)
TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1, 4)
+TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1, 0)
TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1, 0)
TESTATOB(J400, 1, 1, 1, ARGB, 4, 4, 1, 0)
TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1, 0)
@@ -996,159 +1089,146 @@ TESTATOB(J400, 1, 1, 1, J400, 1, 1, 1, 0)
TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0)
TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
-#define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \
- W1280, DIFF, N, NEG, OFF) \
-TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
- const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
- const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
- const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
- align_buffer_page_end(src_argb, kStrideA * kHeightA + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \
- align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \
- for (int i = 0; i < kStrideA * kHeightA; ++i) { \
- src_argb[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c, 1, kStrideB * kHeightB); \
- memset(dst_argb_opt, 101, kStrideB * kHeightB); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, \
- dst_argb_c, kStrideB, \
- NULL, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, \
- dst_argb_opt, kStrideB, \
- NULL, kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
+#define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, W1280, DIFF, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
+ const int kStrideA = \
+ (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = \
+ (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \
+ for (int i = 0; i < kStrideA * kHeightA; ++i) { \
+ src_argb[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c, 1, kStrideB* kHeightB); \
+ memset(dst_argb_opt, 101, kStrideB* kHeightB); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_c, kStrideB, \
+ NULL, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_opt, \
+ kStrideB, NULL, kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kStrideB * kHeightB; ++i) { \
+ int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
+ static_cast<int>(dst_argb_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_page_end(src_argb); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#define TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, \
+ STRIDE_B, HEIGHT_B, DIFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither_Random) { \
+ for (int times = 0; times < benchmark_iterations_; ++times) { \
+ const int kWidth = (fastrand() & 63) + 1; \
+ const int kHeight = (fastrand() & 31) + 1; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
+ const int kStrideA = \
+ (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = \
+ (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_page_end(src_argb, kStrideA* kHeightA); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \
+ for (int i = 0; i < kStrideA * kHeightA; ++i) { \
+ src_argb[i] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c, 123, kStrideB* kHeightB); \
+ memset(dst_argb_opt, 123, kStrideB* kHeightB); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_c, kStrideB, NULL, \
+ kWidth, kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_opt, kStrideB, \
+ NULL, kWidth, kHeight); \
+ int max_diff = 0; \
+ for (int i = 0; i < kStrideB * kHeightB; ++i) { \
+ int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
+ static_cast<int>(dst_argb_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_page_end(src_argb); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
} \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_argb); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
-}
+ }
+
+#define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, DIFF) \
+ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_ - 4, DIFF, _Any, +, 0) \
+ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_, DIFF, _Unaligned, +, 1) \
+ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_, DIFF, _Invert, -, 0) \
+ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_, DIFF, _Opt, +, 0) \
+ TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, DIFF)
+
+TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
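
The Dither variants pass NULL for the dither table; as I read this revision, NULL selects the library's built-in default 4x4 table, so the C and optimized paths see identical dithering and DIFF = 0 is expected. A hedged usage sketch with an explicit table (values hypothetical):

  static const uint8 kDither4x4[16] = {0, 4, 1, 5, 6, 2, 7, 3,
                                       1, 5, 0, 4, 7, 3, 6, 2};
  ARGBToRGB565Dither(src_argb, src_stride, dst_rgb565, dst_stride,
                     kDither4x4, width, height);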
-#define TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \
-TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither_Random) { \
- for (int times = 0; times < benchmark_iterations_; ++times) { \
- const int kWidth = (fastrand() & 63) + 1; \
- const int kHeight = (fastrand() & 31) + 1; \
+#define TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_ATOB##_Symetric##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
- const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
- const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;\
- const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;\
- align_buffer_page_end(src_argb, kStrideA * kHeightA); \
- align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \
- align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \
+ const int kStrideA = \
+ (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideA* kHeightA); \
+ align_buffer_page_end(dst_argb_opt, kStrideA* kHeightA); \
for (int i = 0; i < kStrideA * kHeightA; ++i) { \
- src_argb[i] = (fastrand() & 0xff); \
+ src_argb[i + OFF] = (fastrand() & 0xff); \
} \
- memset(dst_argb_c, 123, kStrideB * kHeightB); \
- memset(dst_argb_opt, 123, kStrideB * kHeightB); \
+ memset(dst_argb_c, 1, kStrideA* kHeightA); \
+ memset(dst_argb_opt, 101, kStrideA* kHeightA); \
MaskCpuFlags(disable_cpu_flags_); \
- FMT_A##To##FMT_B##Dither(src_argb, kStrideA, \
- dst_argb_c, kStrideB, \
- NULL, kWidth, kHeight); \
+ FMT_ATOB(src_argb + OFF, kStrideA, dst_argb_c, kStrideA, kWidth, \
+ NEG kHeight); \
MaskCpuFlags(benchmark_cpu_info_); \
- FMT_A##To##FMT_B##Dither(src_argb, kStrideA, \
- dst_argb_opt, kStrideB, \
- NULL, kWidth, kHeight); \
- int max_diff = 0; \
- for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_ATOB(src_argb + OFF, kStrideA, dst_argb_opt, kStrideA, kWidth, \
+ NEG kHeight); \
+ } \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_ATOB(dst_argb_c, kStrideA, dst_argb_c, kStrideA, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ FMT_ATOB(dst_argb_opt, kStrideA, dst_argb_opt, kStrideA, kWidth, \
+ NEG kHeight); \
+ for (int i = 0; i < kStrideA * kHeightA; ++i) { \
+ EXPECT_EQ(src_argb[i + OFF], dst_argb_opt[i]); \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
} \
- EXPECT_LE(max_diff, DIFF); \
free_aligned_buffer_page_end(src_argb); \
free_aligned_buffer_page_end(dst_argb_c); \
free_aligned_buffer_page_end(dst_argb_opt); \
- } \
-}
-
-#define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \
- TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \
- benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \
- benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \
- benchmark_width_, DIFF, _Invert, -, 0) \
- TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \
- benchmark_width_, DIFF, _Opt, +, 0) \
- TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)
-
-TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
-
-#define TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, \
- W1280, N, NEG, OFF) \
-TEST_F(LibYUVConvertTest, FMT_ATOB##_Symetric##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
- const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
- align_buffer_page_end(src_argb, kStrideA * kHeightA + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideA * kHeightA); \
- align_buffer_page_end(dst_argb_opt, kStrideA * kHeightA); \
- for (int i = 0; i < kStrideA * kHeightA; ++i) { \
- src_argb[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c, 1, kStrideA * kHeightA); \
- memset(dst_argb_opt, 101, kStrideA * kHeightA); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_ATOB(src_argb + OFF, kStrideA, \
- dst_argb_c, kStrideA, \
- kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_ATOB(src_argb + OFF, kStrideA, \
- dst_argb_opt, kStrideA, \
- kWidth, NEG kHeight); \
- } \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_ATOB(dst_argb_c, kStrideA, \
- dst_argb_c, kStrideA, \
- kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- FMT_ATOB(dst_argb_opt, kStrideA, \
- dst_argb_opt, kStrideA, \
- kWidth, NEG kHeight); \
- for (int i = 0; i < kStrideA * kHeightA; ++i) { \
- EXPECT_EQ(src_argb[i + OFF], dst_argb_opt[i]); \
- EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
- } \
- free_aligned_buffer_page_end(src_argb); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
-}
+ }
#define TESTSYM(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A) \
- TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, \
- benchmark_width_ - 4, _Any, +, 0) \
- TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, \
- benchmark_width_, _Unaligned, +, 1) \
- TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, \
- benchmark_width_, _Opt, +, 0)
+ TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_ - 4, _Any, +, \
+ 0) \
+ TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, _Unaligned, \
+ +, 1) \
+ TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, _Opt, +, 0)
TESTSYM(ARGBToARGB, 4, 4, 1)
TESTSYM(ARGBToBGRA, 4, 4, 1)
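
TESTSYM verifies that a conversion is its own inverse: the macro runs FMT_ATOB once into a destination, then a second time in place, and expects the result to equal the original source (with the C and optimized paths agreeing byte-for-byte). Conceptually, for the second case above:

  ARGBToBGRA(src, stride, tmp, stride, width, height);  // swap channel order
  ARGBToBGRA(tmp, stride, tmp, stride, width, height);  // swap back in place
  // tmp now matches src exactly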
@@ -1174,8 +1254,9 @@ TEST_F(LibYUVConvertTest, Test565) {
TEST_F(LibYUVConvertTest, ValidateJpeg) {
const int kOff = 10;
const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
- benchmark_width_ * benchmark_height_ : kMinJpeg;
+ const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
+ ? benchmark_width_ * benchmark_height_
+ : kMinJpeg;
const int kSize = kImageSize + kOff;
align_buffer_page_end(orig_pixels, kSize);
@@ -1201,8 +1282,9 @@ TEST_F(LibYUVConvertTest, ValidateJpeg) {
TEST_F(LibYUVConvertTest, ValidateJpegLarge) {
const int kOff = 10;
const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
- benchmark_width_ * benchmark_height_ : kMinJpeg;
+ const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
+ ? benchmark_width_ * benchmark_height_
+ : kMinJpeg;
const int kSize = kImageSize + kOff;
const int kMultiple = 10;
const int kBufSize = kImageSize * kMultiple + kOff;
@@ -1226,8 +1308,9 @@ TEST_F(LibYUVConvertTest, ValidateJpegLarge) {
TEST_F(LibYUVConvertTest, InvalidateJpeg) {
const int kOff = 10;
const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
- benchmark_width_ * benchmark_height_ : kMinJpeg;
+ const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
+ ? benchmark_width_ * benchmark_height_
+ : kMinJpeg;
const int kSize = kImageSize + kOff;
align_buffer_page_end(orig_pixels, kSize);
@@ -1280,17 +1363,16 @@ TEST_F(LibYUVConvertTest, FuzzJpeg) {
TEST_F(LibYUVConvertTest, MJPGToI420) {
const int kOff = 10;
const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
- benchmark_width_ * benchmark_height_ : kMinJpeg;
+ const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
+ ? benchmark_width_ * benchmark_height_
+ : kMinJpeg;
const int kSize = kImageSize + kOff;
align_buffer_page_end(orig_pixels, kSize);
align_buffer_page_end(dst_y_opt, benchmark_width_ * benchmark_height_);
- align_buffer_page_end(dst_u_opt,
- SUBSAMPLE(benchmark_width_, 2) *
- SUBSAMPLE(benchmark_height_, 2));
- align_buffer_page_end(dst_v_opt,
- SUBSAMPLE(benchmark_width_, 2) *
- SUBSAMPLE(benchmark_height_, 2));
+ align_buffer_page_end(dst_u_opt, SUBSAMPLE(benchmark_width_, 2) *
+ SUBSAMPLE(benchmark_height_, 2));
+ align_buffer_page_end(dst_v_opt, SUBSAMPLE(benchmark_width_, 2) *
+ SUBSAMPLE(benchmark_height_, 2));
// SOI and EOI markers to make the MJPG appear valid.
memset(orig_pixels, 0, kSize);
@@ -1300,12 +1382,11 @@ TEST_F(LibYUVConvertTest, MJPGToI420) {
orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
for (int times = 0; times < benchmark_iterations_; ++times) {
- int ret = MJPGToI420(orig_pixels, kSize,
- dst_y_opt, benchmark_width_,
- dst_u_opt, SUBSAMPLE(benchmark_width_, 2),
- dst_v_opt, SUBSAMPLE(benchmark_width_, 2),
- benchmark_width_, benchmark_height_,
- benchmark_width_, benchmark_height_);
+ int ret =
+ MJPGToI420(orig_pixels, kSize, dst_y_opt, benchmark_width_, dst_u_opt,
+ SUBSAMPLE(benchmark_width_, 2), dst_v_opt,
+ SUBSAMPLE(benchmark_width_, 2), benchmark_width_,
+ benchmark_height_, benchmark_width_, benchmark_height_);
// Expect failure because image is not really valid.
EXPECT_EQ(1, ret);
}
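
The buffer is only framed to look like a JPEG: a real stream starts with the SOI marker 0xFF 0xD8 and ends with EOI 0xFF 0xD9, and the visible setup writes the trailing EOI byte (the SOI bytes sit in the elided lines above, presumably something like orig_pixels[0] = 0xff; orig_pixels[1] = 0xd8;). Decoding still fails on the zeroed payload, hence the expected return of 1.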
@@ -1319,8 +1400,9 @@ TEST_F(LibYUVConvertTest, MJPGToI420) {
TEST_F(LibYUVConvertTest, MJPGToARGB) {
const int kOff = 10;
const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
- benchmark_width_ * benchmark_height_ : kMinJpeg;
+ const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
+ ? benchmark_width_ * benchmark_height_
+ : kMinJpeg;
const int kSize = kImageSize + kOff;
align_buffer_page_end(orig_pixels, kSize);
align_buffer_page_end(dst_argb_opt, benchmark_width_ * benchmark_height_ * 4);
@@ -1333,10 +1415,9 @@ TEST_F(LibYUVConvertTest, MJPGToARGB) {
orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
for (int times = 0; times < benchmark_iterations_; ++times) {
- int ret = MJPGToARGB(orig_pixels, kSize,
- dst_argb_opt, benchmark_width_ * 4,
- benchmark_width_, benchmark_height_,
- benchmark_width_, benchmark_height_);
+ int ret = MJPGToARGB(orig_pixels, kSize, dst_argb_opt, benchmark_width_ * 4,
+ benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_);
// Expect failure because image is not really valid.
EXPECT_EQ(1, ret);
}
@@ -1353,66 +1434,53 @@ TEST_F(LibYUVConvertTest, NV12Crop) {
const int kWidth = benchmark_width_;
const int kHeight = benchmark_height_;
const int crop_y =
- ((benchmark_height_ - (benchmark_height_ * 360 / 480)) / 2 + 1) & ~1;
+ ((benchmark_height_ - (benchmark_height_ * 360 / 480)) / 2 + 1) & ~1;
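  // Note: "(x + 1) & ~1" rounds x up to the next even value, keeping the crop
  // offset aligned with the 2x2 chroma subsampling of NV12.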
const int kDestWidth = benchmark_width_;
const int kDestHeight = benchmark_height_ - crop_y * 2;
const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);
- const int sample_size = kWidth * kHeight +
- kStrideUV *
- SUBSAMPLE(kHeight, SUBSAMP_Y) * 2;
+ const int sample_size =
+ kWidth * kHeight + kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2;
align_buffer_page_end(src_y, sample_size);
uint8* src_uv = src_y + kWidth * kHeight;
align_buffer_page_end(dst_y, kDestWidth * kDestHeight);
- align_buffer_page_end(dst_u,
- SUBSAMPLE(kDestWidth, SUBSAMP_X) *
- SUBSAMPLE(kDestHeight, SUBSAMP_Y));
- align_buffer_page_end(dst_v,
- SUBSAMPLE(kDestWidth, SUBSAMP_X) *
- SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
align_buffer_page_end(dst_y_2, kDestWidth * kDestHeight);
- align_buffer_page_end(dst_u_2,
- SUBSAMPLE(kDestWidth, SUBSAMP_X) *
- SUBSAMPLE(kDestHeight, SUBSAMP_Y));
- align_buffer_page_end(dst_v_2,
- SUBSAMPLE(kDestWidth, SUBSAMP_X) *
- SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ align_buffer_page_end(dst_u_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ align_buffer_page_end(dst_v_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
for (int i = 0; i < kHeight * kWidth; ++i) {
src_y[i] = (fastrand() & 0xff);
}
- for (int i = 0; i < (SUBSAMPLE(kHeight, SUBSAMP_Y) *
- kStrideUV) * 2; ++i) {
+ for (int i = 0; i < (SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideUV) * 2; ++i) {
src_uv[i] = (fastrand() & 0xff);
}
memset(dst_y, 1, kDestWidth * kDestHeight);
- memset(dst_u, 2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
- SUBSAMPLE(kDestHeight, SUBSAMP_Y));
- memset(dst_v, 3, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
- SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ memset(dst_u, 2,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ memset(dst_v, 3,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
memset(dst_y_2, 1, kDestWidth * kDestHeight);
- memset(dst_u_2, 2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
- SUBSAMPLE(kDestHeight, SUBSAMP_Y));
- memset(dst_v_2, 3, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
- SUBSAMPLE(kDestHeight, SUBSAMP_Y));
-
- ConvertToI420(src_y, sample_size,
- dst_y_2, kDestWidth,
- dst_u_2, SUBSAMPLE(kDestWidth, SUBSAMP_X),
- dst_v_2, SUBSAMPLE(kDestWidth, SUBSAMP_X),
- 0, crop_y,
- kWidth, kHeight,
- kDestWidth, kDestHeight,
- libyuv::kRotate0, libyuv::FOURCC_NV12);
+ memset(dst_u_2, 2,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ memset(dst_v_2, 3,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+ ConvertToI420(src_y, sample_size, dst_y_2, kDestWidth, dst_u_2,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v_2,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X), 0, crop_y, kWidth, kHeight,
+ kDestWidth, kDestHeight, libyuv::kRotate0, libyuv::FOURCC_NV12);
NV12ToI420(src_y + crop_y * kWidth, kWidth,
- src_uv + (crop_y / 2) * kStrideUV * 2,
- kStrideUV * 2,
- dst_y, kDestWidth,
- dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X),
- dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X),
- kDestWidth, kDestHeight);
+ src_uv + (crop_y / 2) * kStrideUV * 2, kStrideUV * 2, dst_y,
+ kDestWidth, dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X), kDestWidth, kDestHeight);
for (int i = 0; i < kDestHeight; ++i) {
for (int j = 0; j < kDestWidth; ++j) {
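// The crop equivalence exercised by NV12Crop is plain pointer arithmetic:
// cropping crop_y rows off an NV12 frame moves the Y plane down crop_y rows
// of kWidth bytes and the interleaved UV plane down crop_y / 2 rows of
// kStrideUV * 2 bytes (UV is 2x2 subsampled, two bytes per sample pair).
// Sketch of the two offsets, not a libyuv API:
static const uint8* CroppedY(const uint8* y, int width, int crop_y) {
  return y + crop_y * width;  // One byte per Y sample.
}
static const uint8* CroppedUV(const uint8* uv, int stride_uv, int crop_y) {
  return uv + (crop_y / 2) * stride_uv * 2;  // Interleaved UV rows.
}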
@@ -1452,10 +1520,7 @@ TEST_F(LibYUVConvertTest, TestYToARGB) {
for (int i = 0; i < 32; ++i) {
printf("%2d %d: %d <-> %d,%d,%d,%d\n", i, y[i], expectedg[i],
- argb[i * 4 + 0],
- argb[i * 4 + 1],
- argb[i * 4 + 2],
- argb[i * 4 + 3]);
+ argb[i * 4 + 0], argb[i * 4 + 1], argb[i * 4 + 2], argb[i * 4 + 3]);
}
for (int i = 0; i < 32; ++i) {
EXPECT_EQ(expectedg[i], argb[i * 4 + 0]);
@@ -1463,10 +1528,7 @@ TEST_F(LibYUVConvertTest, TestYToARGB) {
}
static const uint8 kNoDither4x4[16] = {
- 0, 0, 0, 0,
- 0, 0, 0, 0,
- 0, 0, 0, 0,
- 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
TEST_F(LibYUVConvertTest, TestNoDither) {
@@ -1477,12 +1539,11 @@ TEST_F(LibYUVConvertTest, TestNoDither) {
MemRandomize(src_argb, benchmark_width_ * benchmark_height_ * 4);
MemRandomize(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
MemRandomize(dst_rgb565dither, benchmark_width_ * benchmark_height_ * 2);
- ARGBToRGB565(src_argb, benchmark_width_ * 4,
- dst_rgb565, benchmark_width_ * 2,
+ ARGBToRGB565(src_argb, benchmark_width_ * 4, dst_rgb565, benchmark_width_ * 2,
benchmark_width_, benchmark_height_);
- ARGBToRGB565Dither(src_argb, benchmark_width_ * 4,
- dst_rgb565dither, benchmark_width_ * 2,
- kNoDither4x4, benchmark_width_, benchmark_height_);
+ ARGBToRGB565Dither(src_argb, benchmark_width_ * 4, dst_rgb565dither,
+ benchmark_width_ * 2, kNoDither4x4, benchmark_width_,
+ benchmark_height_);
for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) {
EXPECT_EQ(dst_rgb565[i], dst_rgb565dither[i]);
}
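// The dither table is a 4x4 matrix of per-pixel biases added before the
// 888-to-565 truncation, so an all-zero table must reproduce plain
// ARGBToRGB565() exactly; that is the byte-for-byte check above. Reference
// for one pixel (a sketch that omits the clamp the real row functions apply
// before shifting; uint16 is libyuv's typedef):
static uint16 Pack565(int b, int g, int r, int dither) {
  b = (b + dither) >> 3;  // 8 -> 5 bits.
  g = (g + dither) >> 2;  // 8 -> 6 bits.
  r = (r + dither) >> 3;  // 8 -> 5 bits.
  return static_cast<uint16>((r << 11) | (g << 5) | b);
}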
@@ -1494,10 +1555,7 @@ TEST_F(LibYUVConvertTest, TestNoDither) {
// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
static const uint8 kDither565_4x4[16] = {
- 0, 4, 1, 5,
- 6, 2, 7, 3,
- 1, 5, 0, 4,
- 7, 3, 6, 2,
+ 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
};
TEST_F(LibYUVConvertTest, TestDither) {
@@ -1513,18 +1571,15 @@ TEST_F(LibYUVConvertTest, TestDither) {
MemRandomize(dst_rgb565dither, benchmark_width_ * benchmark_height_ * 2);
MemRandomize(dst_argb, benchmark_width_ * benchmark_height_ * 4);
MemRandomize(dst_argbdither, benchmark_width_ * benchmark_height_ * 4);
- ARGBToRGB565(src_argb, benchmark_width_ * 4,
- dst_rgb565, benchmark_width_ * 2,
+ ARGBToRGB565(src_argb, benchmark_width_ * 4, dst_rgb565, benchmark_width_ * 2,
benchmark_width_, benchmark_height_);
- ARGBToRGB565Dither(src_argb, benchmark_width_ * 4,
- dst_rgb565dither, benchmark_width_ * 2,
- kDither565_4x4, benchmark_width_, benchmark_height_);
- RGB565ToARGB(dst_rgb565, benchmark_width_ * 2,
- dst_argb, benchmark_width_ * 4,
- benchmark_width_, benchmark_height_);
- RGB565ToARGB(dst_rgb565dither, benchmark_width_ * 2,
- dst_argbdither, benchmark_width_ * 4,
+ ARGBToRGB565Dither(src_argb, benchmark_width_ * 4, dst_rgb565dither,
+ benchmark_width_ * 2, kDither565_4x4, benchmark_width_,
+ benchmark_height_);
+ RGB565ToARGB(dst_rgb565, benchmark_width_ * 2, dst_argb, benchmark_width_ * 4,
benchmark_width_, benchmark_height_);
+ RGB565ToARGB(dst_rgb565dither, benchmark_width_ * 2, dst_argbdither,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) {
EXPECT_NEAR(dst_argb[i], dst_argbdither[i], 9);
@@ -1537,218 +1592,197 @@ TEST_F(LibYUVConvertTest, TestDither) {
}
#define TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \
-TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##Dither##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
- const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
- align_buffer_page_end(src_y, kWidth * kHeight + OFF); \
- align_buffer_page_end(src_u, kSizeUV + OFF); \
- align_buffer_page_end(src_v, kSizeUV + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF); \
- align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF); \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- src_y[i + OFF] = (fastrand() & 0xff); \
- } \
- for (int i = 0; i < kSizeUV; ++i) { \
- src_u[i + OFF] = (fastrand() & 0xff); \
- src_v[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
- memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_PLANAR##To##FMT_B##Dither(src_y + OFF, kWidth, \
- src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, \
- dst_argb_c + OFF, kStrideB, \
- NULL, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B##Dither(src_y + OFF, kWidth, \
- src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, \
- dst_argb_opt + OFF, kStrideB, \
- NULL, kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
- align_buffer_page_end(dst_argb32_c, kWidth * BPP_C * kHeight); \
- align_buffer_page_end(dst_argb32_opt, kWidth * BPP_C * kHeight); \
- memset(dst_argb32_c, 2, kWidth * BPP_C * kHeight); \
- memset(dst_argb32_opt, 102, kWidth * BPP_C * kHeight); \
- FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, \
- dst_argb32_c, kWidth * BPP_C , \
- kWidth, kHeight); \
- FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, \
- dst_argb32_opt, kWidth * BPP_C , \
- kWidth, kHeight); \
- for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb32_c[i]) - \
- static_cast<int>(dst_argb32_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
+ YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##Dither##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, kSizeUV + OFF); \
+ align_buffer_page_end(src_v, kSizeUV + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (fastrand() & 0xff); \
} \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- free_aligned_buffer_page_end(dst_argb32_c); \
- free_aligned_buffer_page_end(dst_argb32_opt); \
-}
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (fastrand() & 0xff); \
+ src_v[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_PLANAR##To##FMT_B##Dither(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, dst_argb_c + OFF, \
+ kStrideB, NULL, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B##Dither( \
+ src_y + OFF, kWidth, src_u + OFF, kStrideUV, src_v + OFF, kStrideUV, \
+ dst_argb_opt + OFF, kStrideB, NULL, kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
+ align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight); \
+ align_buffer_page_end(dst_argb32_opt, kWidth* BPP_C* kHeight); \
+ memset(dst_argb32_c, 2, kWidth* BPP_C* kHeight); \
+ memset(dst_argb32_opt, 102, kWidth* BPP_C* kHeight); \
+ FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, dst_argb32_c, kWidth * BPP_C, \
+ kWidth, kHeight); \
+ FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, dst_argb32_opt, \
+ kWidth * BPP_C, kWidth, kHeight); \
+ for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
+ int abs_diff = abs(static_cast<int>(dst_argb32_c[i]) - \
+ static_cast<int>(dst_argb32_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ free_aligned_buffer_page_end(dst_argb32_c); \
+ free_aligned_buffer_page_end(dst_argb32_opt); \
+ }
#define TESTPLANARTOBD(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, DIFF, FMT_C, BPP_C) \
- TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, BPP_C) \
- TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, BPP_C) \
- TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \
- TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
+ YALIGN, DIFF, FMT_C, BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, \
+ BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, \
+ BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, 9, ARGB, 4)
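// TESTPLANARTOBD fans each format pair out into the usual four variants
// (_Any with an odd width, _Unaligned with a one-byte offset, _Invert with a
// negative height, _Opt), and because RGB565 produced by different SIMD
// paths need not be bit-exact, the macro widens both results back to ARGB
// and bounds the largest per-byte delta by DIFF (9 above). The comparison it
// performs, in isolation (sketch):
#include <stdlib.h>  // abs
static int MaxAbsDiff(const uint8* a, const uint8* b, int n) {
  int max_diff = 0;
  for (int i = 0; i < n; ++i) {
    int d = abs(static_cast<int>(a[i]) - static_cast<int>(b[i]));
    if (d > max_diff) {
      max_diff = d;
    }
  }
  return max_diff;
}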
-#define TESTPTOB(NAME, UYVYTOI420, UYVYTONV12) \
-TEST_F(LibYUVConvertTest, NAME) { \
- const int kWidth = benchmark_width_; \
- const int kHeight = benchmark_height_; \
- \
- align_buffer_page_end(orig_uyvy, \
- 4 * SUBSAMPLE(kWidth, 2) * kHeight); \
- align_buffer_page_end(orig_y, kWidth * kHeight); \
- align_buffer_page_end(orig_u, \
- SUBSAMPLE(kWidth, 2) * \
- SUBSAMPLE(kHeight, 2)); \
- align_buffer_page_end(orig_v, \
- SUBSAMPLE(kWidth, 2) * \
- SUBSAMPLE(kHeight, 2)); \
- \
- align_buffer_page_end(dst_y_orig, kWidth * kHeight); \
- align_buffer_page_end(dst_uv_orig, 2 * \
- SUBSAMPLE(kWidth, 2) * \
- SUBSAMPLE(kHeight, 2)); \
- \
- align_buffer_page_end(dst_y, kWidth * kHeight); \
- align_buffer_page_end(dst_uv, 2 * \
- SUBSAMPLE(kWidth, 2) * \
- SUBSAMPLE(kHeight, 2)); \
- \
- MemRandomize(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \
- \
- /* Convert UYVY to NV12 in 2 steps for reference */ \
- libyuv::UYVYTOI420(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), \
- orig_y, kWidth, \
- orig_u, SUBSAMPLE(kWidth, 2), \
- orig_v, SUBSAMPLE(kWidth, 2), \
- kWidth, kHeight); \
- libyuv::I420ToNV12(orig_y, kWidth, \
- orig_u, SUBSAMPLE(kWidth, 2), \
- orig_v, SUBSAMPLE(kWidth, 2), \
- dst_y_orig, kWidth, \
- dst_uv_orig, 2 * SUBSAMPLE(kWidth, 2), \
- kWidth, kHeight); \
- \
- /* Convert to NV12 */ \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- libyuv::UYVYTONV12(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), \
- dst_y, kWidth, \
- dst_uv, 2 * SUBSAMPLE(kWidth, 2), \
- kWidth, kHeight); \
- } \
- \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- EXPECT_EQ(orig_y[i], dst_y[i]); \
- } \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- EXPECT_EQ(dst_y_orig[i], dst_y[i]); \
- } \
- for (int i = 0; i < 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2); ++i) { \
- EXPECT_EQ(dst_uv_orig[i], dst_uv[i]); \
- } \
- \
- free_aligned_buffer_page_end(orig_uyvy); \
- free_aligned_buffer_page_end(orig_y); \
- free_aligned_buffer_page_end(orig_u); \
- free_aligned_buffer_page_end(orig_v); \
- free_aligned_buffer_page_end(dst_y_orig); \
- free_aligned_buffer_page_end(dst_uv_orig); \
- free_aligned_buffer_page_end(dst_y); \
- free_aligned_buffer_page_end(dst_uv); \
-}
+#define TESTPTOB(NAME, UYVYTOI420, UYVYTONV12) \
+ TEST_F(LibYUVConvertTest, NAME) { \
+ const int kWidth = benchmark_width_; \
+ const int kHeight = benchmark_height_; \
+ \
+ align_buffer_page_end(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \
+ align_buffer_page_end(orig_y, kWidth* kHeight); \
+ align_buffer_page_end(orig_u, \
+ SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
+ align_buffer_page_end(orig_v, \
+ SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
+ \
+ align_buffer_page_end(dst_y_orig, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_orig, \
+ 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
+ \
+ align_buffer_page_end(dst_y, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv, \
+ 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
+ \
+ MemRandomize(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \
+ \
+ /* Convert UYVY to NV12 in 2 steps for reference */ \
+ libyuv::UYVYTOI420(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), orig_y, kWidth, \
+ orig_u, SUBSAMPLE(kWidth, 2), orig_v, \
+ SUBSAMPLE(kWidth, 2), kWidth, kHeight); \
+ libyuv::I420ToNV12(orig_y, kWidth, orig_u, SUBSAMPLE(kWidth, 2), orig_v, \
+ SUBSAMPLE(kWidth, 2), dst_y_orig, kWidth, dst_uv_orig, \
+ 2 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); \
+ \
+ /* Convert to NV12 */ \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ libyuv::UYVYTONV12(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), dst_y, kWidth, \
+ dst_uv, 2 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); \
+ } \
+ \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ EXPECT_EQ(orig_y[i], dst_y[i]); \
+ } \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ EXPECT_EQ(dst_y_orig[i], dst_y[i]); \
+ } \
+ for (int i = 0; i < 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2); \
+ ++i) { \
+ EXPECT_EQ(dst_uv_orig[i], dst_uv[i]); \
+ } \
+ \
+ free_aligned_buffer_page_end(orig_uyvy); \
+ free_aligned_buffer_page_end(orig_y); \
+ free_aligned_buffer_page_end(orig_u); \
+ free_aligned_buffer_page_end(orig_v); \
+ free_aligned_buffer_page_end(dst_y_orig); \
+ free_aligned_buffer_page_end(dst_uv_orig); \
+ free_aligned_buffer_page_end(dst_y); \
+ free_aligned_buffer_page_end(dst_uv); \
+ }
TESTPTOB(TestYUY2ToNV12, YUY2ToI420, YUY2ToNV12)
TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12)
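// TESTPTOB validates a fused conversion against a two-step reference: for
// TestUYVYToNV12 that is UYVY -> I420 -> NV12 versus UYVYToNV12 directly,
// with every Y and interleaved UV byte required to match exactly. The
// reference path written out (a sketch; strides assume an unpadded frame,
// as in the test):
#include "libyuv/convert.h"       // UYVYToI420
#include "libyuv/convert_from.h"  // I420ToNV12
static void ReferenceUyvyToNv12(const uint8* uyvy, int w, int h,
                                uint8* tmp_y, uint8* tmp_u, uint8* tmp_v,
                                uint8* dst_y, uint8* dst_uv) {
  const int halfw = (w + 1) / 2;
  libyuv::UYVYToI420(uyvy, 4 * halfw, tmp_y, w, tmp_u, halfw, tmp_v, halfw,
                     w, h);
  libyuv::I420ToNV12(tmp_y, w, tmp_u, halfw, tmp_v, halfw, dst_y, w, dst_uv,
                     2 * halfw, w, h);
}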
-#define TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- W1280, N, NEG, OFF, FMT_C, BPP_C) \
-TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
- align_buffer_page_end(src_y, kWidth * kHeight + OFF); \
- align_buffer_page_end(src_u, kSizeUV + OFF); \
- align_buffer_page_end(src_v, kSizeUV + OFF); \
- align_buffer_page_end(dst_argb_b, kStrideB * kHeight + OFF); \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- src_y[i + OFF] = (fastrand() & 0xff); \
- } \
- for (int i = 0; i < kSizeUV; ++i) { \
- src_u[i + OFF] = (fastrand() & 0xff); \
- src_v[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
- src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, \
- dst_argb_b + OFF, kStrideB, \
- kWidth, NEG kHeight); \
- } \
- /* Convert to a 3rd format in 1 step and 2 steps and compare */ \
- const int kStrideC = kWidth * BPP_C; \
- align_buffer_page_end(dst_argb_c, kStrideC * kHeight + OFF); \
- align_buffer_page_end(dst_argb_bc, kStrideC * kHeight + OFF); \
- memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \
- memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \
- FMT_PLANAR##To##FMT_C(src_y + OFF, kWidth, \
- src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, \
- dst_argb_c + OFF, kStrideC, \
- kWidth, NEG kHeight); \
- /* Convert B to C */ \
- FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, \
- dst_argb_bc + OFF, kStrideC, \
- kWidth, kHeight); \
- for (int i = 0; i < kStrideC * kHeight; ++i) { \
- EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \
- } \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- free_aligned_buffer_page_end(dst_argb_b); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_bc); \
-}
+#define TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ W1280, N, NEG, OFF, FMT_C, BPP_C) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, kSizeUV + OFF); \
+ align_buffer_page_end(src_v, kSizeUV + OFF); \
+ align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (fastrand() & 0xff); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (fastrand() & 0xff); \
+ src_v[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, dst_argb_b + OFF, \
+ kStrideB, kWidth, NEG kHeight); \
+ } \
+ /* Convert to a 3rd format in 1 step and 2 steps and compare */ \
+ const int kStrideC = kWidth * BPP_C; \
+ align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \
+ memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \
+ memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \
+ FMT_PLANAR##To##FMT_C(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, dst_argb_c + OFF, kStrideC, \
+ kWidth, NEG kHeight); \
+ /* Convert B to C */ \
+ FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, kStrideC, \
+ kWidth, kHeight); \
+ for (int i = 0; i < kStrideC * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(dst_argb_b); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_bc); \
+ }
-#define TESTPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- FMT_C, BPP_C) \
- TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C) \
- TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C) \
- TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_, _Invert, -, 0, FMT_C, BPP_C) \
- TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_, _Opt, +, 0, FMT_C, BPP_C)
+#define TESTPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ FMT_C, BPP_C) \
+ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C) \
+ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C) \
+ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Invert, -, 0, FMT_C, BPP_C) \
+ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Opt, +, 0, FMT_C, BPP_C)
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ABGR, 4)
TESTPLANARTOE(J420, 2, 2, ARGB, 1, 4, ARGB, 4)
@@ -1774,7 +1808,6 @@ TESTPLANARTOE(H422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, BGRA, 1, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, RGBA, 1, 4, ARGB, 4)
-TESTPLANARTOE(I411, 4, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(I444, 1, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(J444, 1, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(I444, 1, 1, ABGR, 1, 4, ARGB, 4)
@@ -1784,78 +1817,107 @@ TESTPLANARTOE(I422, 2, 1, YUY2, 2, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4)
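// TESTPLANARTOE asserts that planar -> B -> C equals the direct planar -> C
// path byte for byte; e.g. I420ToARGB followed by ARGBToABGR must match
// I420ToABGR, which holds because the R/B byte swap commutes with the YUV
// conversion. The swap itself (sketch; libyuv ARGB is little-endian word
// order, i.e. bytes B,G,R,A in memory, while ABGR is bytes R,G,B,A):
static bool SameArgbAbgr(const uint8 argb[4], const uint8 abgr[4]) {
  return argb[0] == abgr[2] && argb[1] == abgr[1] && argb[2] == abgr[0] &&
         argb[3] == abgr[3];
}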
#define TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- W1280, N, NEG, OFF, FMT_C, BPP_C, ATTEN) \
-TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \
- const int kSizeUV = \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y); \
- align_buffer_page_end(src_y, kWidth * kHeight + OFF); \
- align_buffer_page_end(src_u, kSizeUV + OFF); \
- align_buffer_page_end(src_v, kSizeUV + OFF); \
- align_buffer_page_end(src_a, kWidth * kHeight + OFF); \
- align_buffer_page_end(dst_argb_b, kStrideB * kHeight + OFF); \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- src_y[i + OFF] = (fastrand() & 0xff); \
- src_a[i + OFF] = (fastrand() & 0xff); \
- } \
- for (int i = 0; i < kSizeUV; ++i) { \
- src_u[i + OFF] = (fastrand() & 0xff); \
- src_v[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
- src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
- src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
- src_a + OFF, kWidth, \
- dst_argb_b + OFF, kStrideB, \
- kWidth, NEG kHeight, ATTEN); \
- } \
- int max_diff = 0; \
- /* Convert to a 3rd format in 1 step and 2 steps and compare */ \
- const int kStrideC = kWidth * BPP_C; \
- align_buffer_page_end(dst_argb_c, kStrideC * kHeight + OFF); \
- align_buffer_page_end(dst_argb_bc, kStrideC * kHeight + OFF); \
- memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \
- memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \
- FMT_PLANAR##To##FMT_C(src_y + OFF, kWidth, \
- src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
- src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
- src_a + OFF, kWidth, \
- dst_argb_c + OFF, kStrideC, \
- kWidth, NEG kHeight, ATTEN); \
- /* Convert B to C */ \
- FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, \
- dst_argb_bc + OFF, kStrideC, \
- kWidth, kHeight); \
- for (int i = 0; i < kStrideC * kHeight; ++i) { \
- EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \
- } \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- free_aligned_buffer_page_end(src_a); \
- free_aligned_buffer_page_end(dst_argb_b); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_bc); \
-}
+ W1280, N, NEG, OFF, FMT_C, BPP_C, ATTEN) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \
+ const int kSizeUV = \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, kSizeUV + OFF); \
+ align_buffer_page_end(src_v, kSizeUV + OFF); \
+ align_buffer_page_end(src_a, kWidth* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (fastrand() & 0xff); \
+ src_a[i + OFF] = (fastrand() & 0xff); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (fastrand() & 0xff); \
+ src_v[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \
+ dst_argb_b + OFF, kStrideB, kWidth, NEG kHeight, ATTEN); \
+ } \
+ /* Convert to a 3rd format in 1 step and 2 steps and compare */ \
+ const int kStrideC = kWidth * BPP_C; \
+ align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \
+ memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \
+ memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \
+ FMT_PLANAR##To##FMT_C( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \
+ dst_argb_c + OFF, kStrideC, kWidth, NEG kHeight, ATTEN); \
+ /* Convert B to C */ \
+ FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, kStrideC, \
+ kWidth, kHeight); \
+ for (int i = 0; i < kStrideC * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(src_a); \
+ free_aligned_buffer_page_end(dst_argb_b); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_bc); \
+ }
-#define TESTQPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- FMT_C, BPP_C) \
- TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C, 0) \
- TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C, 0) \
- TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_, _Invert, -, 0, FMT_C, BPP_C, 0) \
- TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_, _Opt, +, 0, FMT_C, BPP_C, 0) \
- TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_, _Premult, +, 0, FMT_C, BPP_C, 1)
+#define TESTQPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ FMT_C, BPP_C) \
+ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C, 0) \
+ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C, 0) \
+ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Invert, -, 0, FMT_C, BPP_C, 0) \
+ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Opt, +, 0, FMT_C, BPP_C, 0) \
+ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Premult, +, 0, FMT_C, BPP_C, 1)
TESTQPLANARTOE(I420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TEST_F(LibYUVConvertTest, RotateWithARGBSource) {
+ // 2x2 frames
+ uint32_t src[4];
+ uint32_t dst[4];
+ // Some random input.
+ src[0] = 0x11000000;
+ src[1] = 0x00450000;
+ src[2] = 0x00009f00;
+ src[3] = 0x000000ff;
+ // Zeros in the destination.
+ dst[0] = 0x00000000;
+ dst[1] = 0x00000000;
+ dst[2] = 0x00000000;
+ dst[3] = 0x00000000;
+
+ int r = ConvertToARGB(reinterpret_cast<uint8_t*>(src),
+ 16, // input size
+ reinterpret_cast<uint8_t*>(dst),
+ 8, // destination stride
+ 0, // crop_x
+ 0, // crop_y
+ 2, // width
+ 2, // height
+ 2, // crop width
+ 2, // crop height
+ kRotate90, FOURCC_ARGB);
+
+ EXPECT_EQ(r, 0);
+ // 90-degree rotation, no format conversion.
+ EXPECT_EQ(dst[0], src[2]);
+ EXPECT_EQ(dst[1], src[0]);
+ EXPECT_EQ(dst[2], src[3]);
+ EXPECT_EQ(dst[3], src[1]);
+}
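// Why the destination comes out in that order: kRotate90 rotates clockwise,
// mapping source (x, y) to destination (height - 1 - y, x), so the left
// column of the source becomes the top row of the destination. Index form
// for the row-major 2x2 frame above (a sketch, not a libyuv API):
static int Rotate90Index(int x, int y, int height, int dst_width) {
  return x * dst_width + (height - 1 - y);
}
// Rotate90Index(0, 1, 2, 2) == 0, matching EXPECT_EQ(dst[0], src[2]).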
+
} // namespace libyuv
diff --git a/files/unit_test/cpu_test.cc b/files/unit_test/cpu_test.cc
index 0cd06f9b..048ed31a 100644
--- a/files/unit_test/cpu_test.cc
+++ b/files/unit_test/cpu_test.cc
@@ -11,10 +11,10 @@
#include <stdlib.h>
#include <string.h>
+#include "../unit_test/unit_test.h"
#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
#include "libyuv/version.h"
-#include "../unit_test/unit_test.h"
namespace libyuv {
@@ -45,10 +45,14 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
printf("Has FMA3 %x\n", has_fma3);
int has_avx3 = TestCpuFlag(kCpuHasAVX3);
printf("Has AVX3 %x\n", has_avx3);
+ int has_f16c = TestCpuFlag(kCpuHasF16C);
+ printf("Has F16C %x\n", has_f16c);
int has_mips = TestCpuFlag(kCpuHasMIPS);
printf("Has MIPS %x\n", has_mips);
int has_dspr2 = TestCpuFlag(kCpuHasDSPR2);
printf("Has DSPR2 %x\n", has_dspr2);
+ int has_msa = TestCpuFlag(kCpuHasMSA);
+ printf("Has MSA %x\n", has_msa);
}
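// The two new printouts follow the existing pattern: TestCpuFlag() returns
// nonzero when the corresponding feature bit is present in the detected (or
// masked) CPU info. Typical runtime dispatch on the new flags (sketch):
#include "libyuv/cpu_id.h"
static bool CanUseMsa() {
  return libyuv::TestCpuFlag(libyuv::kCpuHasMSA) != 0;  // MIPS SIMD Arch.
}
static bool CanUseF16c() {
  return libyuv::TestCpuFlag(libyuv::kCpuHasF16C) != 0;  // x86 half-float.
}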
TEST_F(LibYUVBaseTest, TestCpuCompilerEnabled) {
@@ -62,19 +66,20 @@ TEST_F(LibYUVBaseTest, TestCpuCompilerEnabled) {
printf("x64 build\n");
#endif
#ifdef _MSC_VER
-printf("_MSC_VER %d\n", _MSC_VER);
+ printf("_MSC_VER %d\n", _MSC_VER);
#endif
-#if !defined(LIBYUV_DISABLE_X86) && (defined(GCC_HAS_AVX2) || \
- defined(CLANG_HAS_AVX2) || defined(VISUALC_HAS_AVX2))
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(GCC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
+ defined(VISUALC_HAS_AVX2))
printf("Has AVX2 1\n");
#else
printf("Has AVX2 0\n");
- // If compiler does not support AVX2, the following function not expected:
+// If the compiler does not support AVX2, the following function is not expected:
#endif
}
-#if defined(__i386__) || defined(__x86_64__) || \
- defined(_M_IX86) || defined(_M_X64)
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
+ defined(_M_X64)
TEST_F(LibYUVBaseTest, TestCpuId) {
int has_x86 = TestCpuFlag(kCpuHasX86);
if (has_x86) {
@@ -96,7 +101,7 @@ TEST_F(LibYUVBaseTest, TestCpuId) {
cpu_info[3] = 0;
printf("Cpu Vendor: %s %x %x %x\n", reinterpret_cast<char*>(&cpu_info[0]),
cpu_info[0], cpu_info[1], cpu_info[2]);
- EXPECT_EQ(12, strlen(reinterpret_cast<char*>(&cpu_info[0])));
+ EXPECT_EQ(12u, strlen(reinterpret_cast<char*>(&cpu_info[0])));
// CPU Family and Model
// 3:0 - Stepping
@@ -108,8 +113,8 @@ TEST_F(LibYUVBaseTest, TestCpuId) {
CpuId(1, 0, cpu_info);
int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
- printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,
- model, model);
+ printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, model,
+ model);
}
}
#endif
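// Worked example of the family/model decode above, for eax = 0x000306a9
// (a well-known Ivy Bridge signature): stepping = eax & 0xf = 9;
//   family = ((eax >> 8) & 0x0f) | ((eax >> 16) & 0xff0) = 0x6 | 0x0 = 6
//   model  = ((eax >> 4) & 0x0f) | ((eax >> 12) & 0xf0) = 0xa | 0x30 = 0x3a
// i.e. the extended model nibble (eax bits 19:16, here 0x3) supplies the
// high half of the model, giving model 58.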
diff --git a/files/unit_test/math_test.cc b/files/unit_test/math_test.cc
index 19af9f6b..2b4b57b1 100644
--- a/files/unit_test/math_test.cc
+++ b/files/unit_test/math_test.cc
@@ -12,11 +12,11 @@
#include <string.h>
#include <time.h>
+#include "../unit_test/unit_test.h"
#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
#include "libyuv/scale.h"
#include "libyuv/scale_row.h"
-#include "../unit_test/unit_test.h"
namespace libyuv {
diff --git a/files/unit_test/planar_test.cc b/files/unit_test/planar_test.cc
index bc0eebb5..2d53cc02 100644
--- a/files/unit_test/planar_test.cc
+++ b/files/unit_test/planar_test.cc
@@ -90,11 +90,11 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
EXPECT_EQ(0, atten_pixels[0 * 4 + 3]);
EXPECT_EQ(64, atten_pixels[128 * 4 + 0]);
EXPECT_EQ(32, atten_pixels[128 * 4 + 1]);
- EXPECT_EQ(21, atten_pixels[128 * 4 + 2]);
+ EXPECT_EQ(21, atten_pixels[128 * 4 + 2]);
EXPECT_EQ(128, atten_pixels[128 * 4 + 3]);
EXPECT_NEAR(255, atten_pixels[255 * 4 + 0], 1);
EXPECT_NEAR(127, atten_pixels[255 * 4 + 1], 1);
- EXPECT_NEAR(85, atten_pixels[255 * 4 + 2], 1);
+ EXPECT_NEAR(85, atten_pixels[255 * 4 + 2], 1);
EXPECT_EQ(255, atten_pixels[255 * 4 + 3]);
free_aligned_buffer_page_end(atten2_pixels);
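// ARGBAttenuate premultiplies each color channel by the pixel's alpha,
// roughly value * alpha / 255 (exact rounding differs between the C and
// SIMD paths, hence EXPECT_NEAR). With alpha 128 that is about a halving:
// 128 -> 64, 64 -> 32, 42 -> 21, which is what the three EXPECT_EQs for
// pixel 128 above check. One plausible reference formula (a sketch; not the
// library's exact rounding):
static inline int AttenuateByte(int value, int alpha) {
  return value * alpha / 255;
}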
@@ -103,9 +103,13 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
free_aligned_buffer_page_end(orig_pixels);
}
-static int TestAttenuateI(int width, int height, int benchmark_iterations,
- int disable_cpu_flags, int benchmark_cpu_info,
- int invert, int off) {
+static int TestAttenuateI(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off) {
if (width < 1) {
width = 1;
}
@@ -121,20 +125,17 @@ static int TestAttenuateI(int width, int height, int benchmark_iterations,
memset(dst_argb_opt, 0, kStride * height);
MaskCpuFlags(disable_cpu_flags);
- ARGBAttenuate(src_argb + off, kStride,
- dst_argb_c, kStride,
- width, invert * height);
+ ARGBAttenuate(src_argb + off, kStride, dst_argb_c, kStride, width,
+ invert * height);
MaskCpuFlags(benchmark_cpu_info);
for (int i = 0; i < benchmark_iterations; ++i) {
- ARGBAttenuate(src_argb + off, kStride,
- dst_argb_opt, kStride,
- width, invert * height);
+ ARGBAttenuate(src_argb + off, kStride, dst_argb_opt, kStride, width,
+ invert * height);
}
int max_diff = 0;
for (int i = 0; i < kStride * height; ++i) {
- int abs_diff =
- abs(static_cast<int>(dst_argb_c[i]) -
- static_cast<int>(dst_argb_opt[i]));
+ int abs_diff = abs(static_cast<int>(dst_argb_c[i]) -
+ static_cast<int>(dst_argb_opt[i]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
@@ -147,39 +148,39 @@ static int TestAttenuateI(int width, int height, int benchmark_iterations,
TEST_F(LibYUVPlanarTest, ARGBAttenuate_Any) {
int max_diff = TestAttenuateI(benchmark_width_ - 1, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- +1, 0);
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 0);
EXPECT_LE(max_diff, 2);
}
TEST_F(LibYUVPlanarTest, ARGBAttenuate_Unaligned) {
- int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- +1, 1);
+ int max_diff =
+ TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
EXPECT_LE(max_diff, 2);
}
TEST_F(LibYUVPlanarTest, ARGBAttenuate_Invert) {
- int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- -1, 0);
+ int max_diff =
+ TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
EXPECT_LE(max_diff, 2);
}
TEST_F(LibYUVPlanarTest, ARGBAttenuate_Opt) {
- int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- +1, 0);
+ int max_diff =
+ TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
EXPECT_LE(max_diff, 2);
}
-static int TestUnattenuateI(int width, int height, int benchmark_iterations,
- int disable_cpu_flags, int benchmark_cpu_info,
- int invert, int off) {
+static int TestUnattenuateI(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off) {
if (width < 1) {
width = 1;
}
@@ -191,27 +192,23 @@ static int TestUnattenuateI(int width, int height, int benchmark_iterations,
for (int i = 0; i < kStride * height; ++i) {
src_argb[i + off] = (fastrand() & 0xff);
}
- ARGBAttenuate(src_argb + off, kStride,
- src_argb + off, kStride,
- width, height);
+ ARGBAttenuate(src_argb + off, kStride, src_argb + off, kStride, width,
+ height);
memset(dst_argb_c, 0, kStride * height);
memset(dst_argb_opt, 0, kStride * height);
MaskCpuFlags(disable_cpu_flags);
- ARGBUnattenuate(src_argb + off, kStride,
- dst_argb_c, kStride,
- width, invert * height);
+ ARGBUnattenuate(src_argb + off, kStride, dst_argb_c, kStride, width,
+ invert * height);
MaskCpuFlags(benchmark_cpu_info);
for (int i = 0; i < benchmark_iterations; ++i) {
- ARGBUnattenuate(src_argb + off, kStride,
- dst_argb_opt, kStride,
- width, invert * height);
+ ARGBUnattenuate(src_argb + off, kStride, dst_argb_opt, kStride, width,
+ invert * height);
}
int max_diff = 0;
for (int i = 0; i < kStride * height; ++i) {
- int abs_diff =
- abs(static_cast<int>(dst_argb_c[i]) -
- static_cast<int>(dst_argb_opt[i]));
+ int abs_diff = abs(static_cast<int>(dst_argb_c[i]) -
+ static_cast<int>(dst_argb_opt[i]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
@@ -224,33 +221,29 @@ static int TestUnattenuateI(int width, int height, int benchmark_iterations,
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Any) {
int max_diff = TestUnattenuateI(benchmark_width_ - 1, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- +1, 0);
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 0);
EXPECT_LE(max_diff, 2);
}
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Unaligned) {
int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- +1, 1);
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 1);
EXPECT_LE(max_diff, 2);
}
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Invert) {
int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- -1, 0);
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, -1, 0);
EXPECT_LE(max_diff, 2);
}
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Opt) {
int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- +1, 0);
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 0);
EXPECT_LE(max_diff, 2);
}
@@ -268,8 +261,7 @@ TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) {
}
ARGBComputeCumulativeSum(&orig_pixels[0][0][0], 16 * 4,
- &added_pixels[0][0][0], 16 * 4,
- 16, 16);
+ &added_pixels[0][0][0], 16 * 4, 16, 16);
for (int y = 0; y < 16; ++y) {
for (int x = 0; x < 16; ++x) {
@@ -503,10 +495,8 @@ TEST_F(LibYUVPlanarTest, TestARGBColorMatrix) {
// Matrix for Sepia.
SIMD_ALIGNED(static const int8 kRGBToSepia[]) = {
- 17 / 2, 68 / 2, 35 / 2, 0,
- 22 / 2, 88 / 2, 45 / 2, 0,
- 24 / 2, 98 / 2, 50 / 2, 0,
- 0, 0, 0, 64, // Copy alpha.
+ 17 / 2, 68 / 2, 35 / 2, 0, 22 / 2, 88 / 2, 45 / 2, 0,
+ 24 / 2, 98 / 2, 50 / 2, 0, 0, 0, 0, 64, // Copy alpha.
};
memset(orig_pixels, 0, sizeof(orig_pixels));
@@ -579,10 +569,8 @@ TEST_F(LibYUVPlanarTest, TestRGBColorMatrix) {
// Matrix for Sepia.
SIMD_ALIGNED(static const int8 kRGBToSepia[]) = {
- 17, 68, 35, 0,
- 22, 88, 45, 0,
- 24, 98, 50, 0,
- 0, 0, 0, 0, // Unused but makes matrix 16 bytes.
+ 17, 68, 35, 0, 22, 88, 45, 0,
+ 24, 98, 50, 0, 0, 0, 0, 0, // Unused but makes matrix 16 bytes.
};
memset(orig_pixels, 0, sizeof(orig_pixels));
@@ -642,10 +630,7 @@ TEST_F(LibYUVPlanarTest, TestARGBColorTable) {
// Table for ARGB color lookup.
static const uint8 kARGBTable[256 * 4] = {
- 1u, 2u, 3u, 4u,
- 5u, 6u, 7u, 8u,
- 9u, 10u, 11u, 12u,
- 13u, 14u, 15u, 16u,
+ 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u, 16u,
};
orig_pixels[0][0] = 0u;
@@ -701,10 +686,7 @@ TEST_F(LibYUVPlanarTest, TestRGBColorTable) {
// Table for RGB color lookup.
static const uint8 kARGBTable[256 * 4] = {
- 1u, 2u, 3u, 4u,
- 5u, 6u, 7u, 8u,
- 9u, 10u, 11u, 12u,
- 13u, 14u, 15u, 16u,
+ 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u, 16u,
};
orig_pixels[0][0] = 0u;
@@ -762,8 +744,8 @@ TEST_F(LibYUVPlanarTest, TestARGBQuantize) {
orig_pixels[i][2] = i / 3;
orig_pixels[i][3] = i;
}
- ARGBQuantize(&orig_pixels[0][0], 0,
- (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 1280, 1);
+ ARGBQuantize(&orig_pixels[0][0], 0, (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0,
+ 1280, 1);
for (int i = 0; i < 1280; ++i) {
EXPECT_EQ((i / 8 * 8 + 8 / 2) & 255, orig_pixels[i][0]);
@@ -772,8 +754,8 @@ TEST_F(LibYUVPlanarTest, TestARGBQuantize) {
EXPECT_EQ(i & 255, orig_pixels[i][3]);
}
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
- ARGBQuantize(&orig_pixels[0][0], 0,
- (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 1280, 1);
+ ARGBQuantize(&orig_pixels[0][0], 0, (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0,
+ 1280, 1);
}
}
@@ -1020,48 +1002,45 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane) {
}
}
-#define TESTTERP(FMT_A, BPP_A, STRIDE_A, \
- FMT_B, BPP_B, STRIDE_B, \
- W1280, TERP, N, NEG, OFF) \
-TEST_F(LibYUVPlanarTest, ARGBInterpolate##TERP##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
- const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
- align_buffer_page_end(src_argb_a, kStrideA * kHeight + OFF); \
- align_buffer_page_end(src_argb_b, kStrideA * kHeight + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideB * kHeight); \
- align_buffer_page_end(dst_argb_opt, kStrideB * kHeight); \
- for (int i = 0; i < kStrideA * kHeight; ++i) { \
- src_argb_a[i + OFF] = (fastrand() & 0xff); \
- src_argb_b[i + OFF] = (fastrand() & 0xff); \
- } \
- MaskCpuFlags(disable_cpu_flags_); \
- ARGBInterpolate(src_argb_a + OFF, kStrideA, \
- src_argb_b + OFF, kStrideA, \
- dst_argb_c, kStrideB, \
- kWidth, NEG kHeight, TERP); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- ARGBInterpolate(src_argb_a + OFF, kStrideA, \
- src_argb_b + OFF, kStrideA, \
- dst_argb_opt, kStrideB, \
- kWidth, NEG kHeight, TERP); \
- } \
- for (int i = 0; i < kStrideB * kHeight; ++i) { \
- EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
- } \
- free_aligned_buffer_page_end(src_argb_a); \
- free_aligned_buffer_page_end(src_argb_b); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
-}
-
-#define TESTINTERPOLATE(TERP) \
- TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_ - 1, TERP, _Any, +, 0) \
- TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Unaligned, +, 1) \
- TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Invert, -, 0) \
- TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Opt, +, 0)
+#define TESTTERP(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, STRIDE_B, W1280, TERP, \
+ N, NEG, OFF) \
+ TEST_F(LibYUVPlanarTest, ARGBInterpolate##TERP##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kStrideA = \
+ (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = \
+ (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_page_end(src_argb_a, kStrideA* kHeight + OFF); \
+ align_buffer_page_end(src_argb_b, kStrideA* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight); \
+ for (int i = 0; i < kStrideA * kHeight; ++i) { \
+ src_argb_a[i + OFF] = (fastrand() & 0xff); \
+ src_argb_b[i + OFF] = (fastrand() & 0xff); \
+ } \
+ MaskCpuFlags(disable_cpu_flags_); \
+ ARGBInterpolate(src_argb_a + OFF, kStrideA, src_argb_b + OFF, kStrideA, \
+ dst_argb_c, kStrideB, kWidth, NEG kHeight, TERP); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ ARGBInterpolate(src_argb_a + OFF, kStrideA, src_argb_b + OFF, kStrideA, \
+ dst_argb_opt, kStrideB, kWidth, NEG kHeight, TERP); \
+ } \
+ for (int i = 0; i < kStrideB * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_argb_a); \
+ free_aligned_buffer_page_end(src_argb_b); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#define TESTINTERPOLATE(TERP) \
+ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_ - 1, TERP, _Any, +, 0) \
+ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Unaligned, +, 1) \
+ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Invert, -, 0) \
+ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Opt, +, 0)
TESTINTERPOLATE(0)
TESTINTERPOLATE(64)
@@ -1069,9 +1048,13 @@ TESTINTERPOLATE(128)
TESTINTERPOLATE(192)
TESTINTERPOLATE(255)
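// The TERP argument is the interpolation fraction passed through to
// ARGBInterpolate: per the row functions' fraction convention, 0 copies the
// first source, 128 averages the two, and 255 is 255/256 of the way to the
// second. Per-byte reference (a sketch; the optimized rows may round
// slightly differently):
static inline uint8 InterpolateByte(uint8 s0, uint8 s1, int fraction) {
  return static_cast<uint8>((s0 * (256 - fraction) + s1 * fraction) >> 8);
}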
-static int TestBlend(int width, int height, int benchmark_iterations,
- int disable_cpu_flags, int benchmark_cpu_info,
- int invert, int off) {
+static int TestBlend(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off) {
if (width < 1) {
width = 1;
}
@@ -1093,22 +1076,17 @@ static int TestBlend(int width, int height, int benchmark_iterations,
memset(dst_argb_opt, 255, kStride * height);
MaskCpuFlags(disable_cpu_flags);
- ARGBBlend(src_argb_a + off, kStride,
- src_argb_b + off, kStride,
- dst_argb_c, kStride,
- width, invert * height);
+ ARGBBlend(src_argb_a + off, kStride, src_argb_b + off, kStride, dst_argb_c,
+ kStride, width, invert * height);
MaskCpuFlags(benchmark_cpu_info);
for (int i = 0; i < benchmark_iterations; ++i) {
- ARGBBlend(src_argb_a + off, kStride,
- src_argb_b + off, kStride,
- dst_argb_opt, kStride,
- width, invert * height);
+ ARGBBlend(src_argb_a + off, kStride, src_argb_b + off, kStride,
+ dst_argb_opt, kStride, width, invert * height);
}
int max_diff = 0;
for (int i = 0; i < kStride * height; ++i) {
- int abs_diff =
- abs(static_cast<int>(dst_argb_c[i]) -
- static_cast<int>(dst_argb_opt[i]));
+ int abs_diff = abs(static_cast<int>(dst_argb_c[i]) -
+ static_cast<int>(dst_argb_opt[i]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
@@ -1121,36 +1099,40 @@ static int TestBlend(int width, int height, int benchmark_iterations,
}
TEST_F(LibYUVPlanarTest, ARGBBlend_Any) {
- int max_diff = TestBlend(benchmark_width_ - 4, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ int max_diff =
+ TestBlend(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlend_Unaligned) {
- int max_diff = TestBlend(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+ int max_diff =
+ TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlend_Invert) {
- int max_diff = TestBlend(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+ int max_diff =
+ TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlend_Opt) {
- int max_diff = TestBlend(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ int max_diff =
+ TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
EXPECT_LE(max_diff, 1);
}
-static void TestBlendPlane(int width, int height, int benchmark_iterations,
- int disable_cpu_flags, int benchmark_cpu_info,
- int invert, int off) {
+static void TestBlendPlane(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off) {
if (width < 1) {
width = 1;
}
@@ -1170,21 +1152,15 @@ static void TestBlendPlane(int width, int height, int benchmark_iterations,
src_argb_b[i + off] = 255 - (i & 255);
}
memset(src_argb_alpha + off, 255, width);
- BlendPlane(src_argb_a + off, width,
- src_argb_b + off, width,
- src_argb_alpha + off, width,
- dst_argb_opt + off, width,
- width, 1);
+ BlendPlane(src_argb_a + off, width, src_argb_b + off, width,
+ src_argb_alpha + off, width, dst_argb_opt + off, width, width, 1);
for (int i = 0; i < width; ++i) {
EXPECT_EQ(src_argb_a[i + off], dst_argb_opt[i + off]);
}
// Test destination is maintained exactly if alpha is 0.
memset(src_argb_alpha + off, 0, width);
- BlendPlane(src_argb_a + off, width,
- src_argb_b + off, width,
- src_argb_alpha + off, width,
- dst_argb_opt + off, width,
- width, 1);
+ BlendPlane(src_argb_a + off, width, src_argb_b + off, width,
+ src_argb_alpha + off, width, dst_argb_opt + off, width, width, 1);
for (int i = 0; i < width; ++i) {
EXPECT_EQ(src_argb_b[i + off], dst_argb_opt[i + off]);
}
@@ -1195,18 +1171,14 @@ static void TestBlendPlane(int width, int height, int benchmark_iterations,
}
MaskCpuFlags(disable_cpu_flags);
- BlendPlane(src_argb_a + off, width,
- src_argb_b + off, width,
- src_argb_alpha + off, width,
- dst_argb_c + off, width,
- width, height);
+ BlendPlane(src_argb_a + off, width, src_argb_b + off, width,
+ src_argb_alpha + off, width, dst_argb_c + off, width, width,
+ invert * height);
MaskCpuFlags(benchmark_cpu_info);
for (int i = 0; i < benchmark_iterations; ++i) {
- BlendPlane(src_argb_a + off, width,
- src_argb_b + off, width,
- src_argb_alpha + off, width,
- dst_argb_opt + off, width,
- width, height);
+ BlendPlane(src_argb_a + off, width, src_argb_b + off, width,
+ src_argb_alpha + off, width, dst_argb_opt + off, width, width,
+ invert * height);
}
for (int i = 0; i < kStride * height; ++i) {
EXPECT_EQ(dst_argb_c[i + off], dst_argb_opt[i + off]);
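// The two single-row calls above pin down BlendPlane's contract at the
// alpha extremes: alpha 255 returns the first source plane unchanged and
// alpha 0 returns the second. A plausible per-pixel reference (a sketch;
// the library's exact rounding may differ):
static inline uint8 BlendByte(uint8 fg, uint8 bg, uint8 alpha) {
  return static_cast<uint8>((fg * alpha + bg * (255 - alpha) + 127) / 255);
}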
@@ -1236,11 +1208,15 @@ TEST_F(LibYUVPlanarTest, BlendPlane_Invert) {
disable_cpu_flags_, benchmark_cpu_info_, -1, 1);
}
-#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a))
+#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
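// SUBSAMPLE is ceiling division, used throughout these tests to size chroma
// planes for odd dimensions: SUBSAMPLE(5, 2) = (5 + 2 - 1) / 2 = 3, so a
// 5x5 I420 frame carries 3x3 U and V planes, while SUBSAMPLE(4, 2) = 2.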
-static void TestI420Blend(int width, int height, int benchmark_iterations,
- int disable_cpu_flags, int benchmark_cpu_info,
- int invert, int off) {
+static void TestI420Blend(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off) {
width = ((width) > 0) ? (width) : 1;
const int kStrideUV = SUBSAMPLE(width, 2);
const int kSizeUV = kStrideUV * SUBSAMPLE(height, 2);
@@ -1273,30 +1249,18 @@ static void TestI420Blend(int width, int height, int benchmark_iterations,
memset(dst_v_opt, 255, kSizeUV + off);
MaskCpuFlags(disable_cpu_flags);
- I420Blend(src_y0 + off, width,
- src_u0 + off, kStrideUV,
- src_v0 + off, kStrideUV,
- src_y1 + off, width,
- src_u1 + off, kStrideUV,
- src_v1 + off, kStrideUV,
- src_a + off, width,
- dst_y_c + off, width,
- dst_u_c + off, kStrideUV,
- dst_v_c + off, kStrideUV,
- width, height);
+ I420Blend(src_y0 + off, width, src_u0 + off, kStrideUV, src_v0 + off,
+ kStrideUV, src_y1 + off, width, src_u1 + off, kStrideUV,
+ src_v1 + off, kStrideUV, src_a + off, width, dst_y_c + off, width,
+ dst_u_c + off, kStrideUV, dst_v_c + off, kStrideUV, width,
+ invert * height);
MaskCpuFlags(benchmark_cpu_info);
for (int i = 0; i < benchmark_iterations; ++i) {
- I420Blend(src_y0 + off, width,
- src_u0 + off, kStrideUV,
- src_v0 + off, kStrideUV,
- src_y1 + off, width,
- src_u1 + off, kStrideUV,
- src_v1 + off, kStrideUV,
- src_a + off, width,
- dst_y_opt + off, width,
- dst_u_opt + off, kStrideUV,
- dst_v_opt + off, kStrideUV,
- width, height);
+ I420Blend(src_y0 + off, width, src_u0 + off, kStrideUV, src_v0 + off,
+ kStrideUV, src_y1 + off, width, src_u1 + off, kStrideUV,
+ src_v1 + off, kStrideUV, src_a + off, width, dst_y_opt + off,
+ width, dst_u_opt + off, kStrideUV, dst_v_opt + off, kStrideUV,
+ width, invert * height);
}
for (int i = 0; i < width * height; ++i) {
EXPECT_EQ(dst_y_c[i + off], dst_y_opt[i + off]);
@@ -1323,21 +1287,21 @@ static void TestI420Blend(int width, int height, int benchmark_iterations,
TEST_F(LibYUVPlanarTest, I420Blend_Opt) {
TestI420Blend(benchmark_width_, benchmark_height_, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
}
TEST_F(LibYUVPlanarTest, I420Blend_Unaligned) {
TestI420Blend(benchmark_width_, benchmark_height_, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
}
// TODO(fbarchard): DISABLED because _Any uses C. Avoid C and re-enable.
TEST_F(LibYUVPlanarTest, DISABLED_I420Blend_Any) {
TestI420Blend(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
}
TEST_F(LibYUVPlanarTest, I420Blend_Invert) {
TestI420Blend(benchmark_width_, benchmark_height_, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
}
TEST_F(LibYUVPlanarTest, TestAffine) {
@@ -1350,10 +1314,10 @@ TEST_F(LibYUVPlanarTest, TestAffine) {
}
}
- float uv_step[4] = { 0.f, 0.f, 0.75f, 0.f };
+ float uv_step[4] = {0.f, 0.f, 0.75f, 0.f};
- ARGBAffineRow_C(&orig_pixels_0[0][0], 0, &interpolate_pixels_C[0][0],
- uv_step, 1280);
+ ARGBAffineRow_C(&orig_pixels_0[0][0], 0, &interpolate_pixels_C[0][0], uv_step,
+ 1280);
EXPECT_EQ(0u, interpolate_pixels_C[0][0]);
EXPECT_EQ(96u, interpolate_pixels_C[128][0]);
EXPECT_EQ(191u, interpolate_pixels_C[255][3]);
@@ -1411,19 +1375,15 @@ TEST_F(LibYUVPlanarTest, TestCopyPlane) {
// Disable all optimizations.
MaskCpuFlags(disable_cpu_flags_);
- double c_time = get_time();
for (j = 0; j < benchmark_iterations_; j++) {
CopyPlane(orig_y + y_off, y_st, dst_c + y_off, stride, yw, yh);
}
- c_time = (get_time() - c_time) / benchmark_iterations_;
// Enable optimizations.
MaskCpuFlags(benchmark_cpu_info_);
- double opt_time = get_time();
for (j = 0; j < benchmark_iterations_; j++) {
CopyPlane(orig_y + y_off, y_st, dst_opt + y_off, stride, yw, yh);
}
- opt_time = (get_time() - opt_time) / benchmark_iterations_;
for (i = 0; i < y_plane_size; ++i) {
if (dst_c[i] != dst_opt[i])
@@ -1437,9 +1397,13 @@ TEST_F(LibYUVPlanarTest, TestCopyPlane) {
EXPECT_EQ(0, err);
}
-static int TestMultiply(int width, int height, int benchmark_iterations,
- int disable_cpu_flags, int benchmark_cpu_info,
- int invert, int off) {
+static int TestMultiply(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off) {
if (width < 1) {
width = 1;
}
@@ -1457,22 +1421,17 @@ static int TestMultiply(int width, int height, int benchmark_iterations,
memset(dst_argb_opt, 0, kStride * height);
MaskCpuFlags(disable_cpu_flags);
- ARGBMultiply(src_argb_a + off, kStride,
- src_argb_b + off, kStride,
- dst_argb_c, kStride,
- width, invert * height);
+ ARGBMultiply(src_argb_a + off, kStride, src_argb_b + off, kStride, dst_argb_c,
+ kStride, width, invert * height);
MaskCpuFlags(benchmark_cpu_info);
for (int i = 0; i < benchmark_iterations; ++i) {
- ARGBMultiply(src_argb_a + off, kStride,
- src_argb_b + off, kStride,
- dst_argb_opt, kStride,
- width, invert * height);
+ ARGBMultiply(src_argb_a + off, kStride, src_argb_b + off, kStride,
+ dst_argb_opt, kStride, width, invert * height);
}
int max_diff = 0;
for (int i = 0; i < kStride * height; ++i) {
- int abs_diff =
- abs(static_cast<int>(dst_argb_c[i]) -
- static_cast<int>(dst_argb_opt[i]));
+ int abs_diff = abs(static_cast<int>(dst_argb_c[i]) -
+ static_cast<int>(dst_argb_opt[i]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
@@ -1486,35 +1445,39 @@ static int TestMultiply(int width, int height, int benchmark_iterations,
TEST_F(LibYUVPlanarTest, ARGBMultiply_Any) {
int max_diff = TestMultiply(benchmark_width_ - 1, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 0);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBMultiply_Unaligned) {
- int max_diff = TestMultiply(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+ int max_diff =
+ TestMultiply(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBMultiply_Invert) {
- int max_diff = TestMultiply(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+ int max_diff =
+ TestMultiply(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBMultiply_Opt) {
- int max_diff = TestMultiply(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ int max_diff =
+ TestMultiply(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
EXPECT_LE(max_diff, 1);
}
-static int TestAdd(int width, int height, int benchmark_iterations,
- int disable_cpu_flags, int benchmark_cpu_info,
- int invert, int off) {
+static int TestAdd(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off) {
if (width < 1) {
width = 1;
}
@@ -1532,22 +1495,17 @@ static int TestAdd(int width, int height, int benchmark_iterations,
memset(dst_argb_opt, 0, kStride * height);
MaskCpuFlags(disable_cpu_flags);
- ARGBAdd(src_argb_a + off, kStride,
- src_argb_b + off, kStride,
- dst_argb_c, kStride,
- width, invert * height);
+ ARGBAdd(src_argb_a + off, kStride, src_argb_b + off, kStride, dst_argb_c,
+ kStride, width, invert * height);
MaskCpuFlags(benchmark_cpu_info);
for (int i = 0; i < benchmark_iterations; ++i) {
- ARGBAdd(src_argb_a + off, kStride,
- src_argb_b + off, kStride,
- dst_argb_opt, kStride,
- width, invert * height);
+ ARGBAdd(src_argb_a + off, kStride, src_argb_b + off, kStride, dst_argb_opt,
+ kStride, width, invert * height);
}
int max_diff = 0;
for (int i = 0; i < kStride * height; ++i) {
- int abs_diff =
- abs(static_cast<int>(dst_argb_c[i]) -
- static_cast<int>(dst_argb_opt[i]));
+ int abs_diff = abs(static_cast<int>(dst_argb_c[i]) -
+ static_cast<int>(dst_argb_opt[i]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
@@ -1560,36 +1518,40 @@ static int TestAdd(int width, int height, int benchmark_iterations,
}
TEST_F(LibYUVPlanarTest, ARGBAdd_Any) {
- int max_diff = TestAdd(benchmark_width_ - 1, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ int max_diff =
+ TestAdd(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBAdd_Unaligned) {
- int max_diff = TestAdd(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+ int max_diff =
+ TestAdd(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBAdd_Invert) {
- int max_diff = TestAdd(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+ int max_diff =
+ TestAdd(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBAdd_Opt) {
- int max_diff = TestAdd(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ int max_diff =
+ TestAdd(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
EXPECT_LE(max_diff, 1);
}
-static int TestSubtract(int width, int height, int benchmark_iterations,
- int disable_cpu_flags, int benchmark_cpu_info,
- int invert, int off) {
+static int TestSubtract(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off) {
if (width < 1) {
width = 1;
}
@@ -1607,22 +1569,17 @@ static int TestSubtract(int width, int height, int benchmark_iterations,
memset(dst_argb_opt, 0, kStride * height);
MaskCpuFlags(disable_cpu_flags);
- ARGBSubtract(src_argb_a + off, kStride,
- src_argb_b + off, kStride,
- dst_argb_c, kStride,
- width, invert * height);
+ ARGBSubtract(src_argb_a + off, kStride, src_argb_b + off, kStride, dst_argb_c,
+ kStride, width, invert * height);
MaskCpuFlags(benchmark_cpu_info);
for (int i = 0; i < benchmark_iterations; ++i) {
- ARGBSubtract(src_argb_a + off, kStride,
- src_argb_b + off, kStride,
- dst_argb_opt, kStride,
- width, invert * height);
+ ARGBSubtract(src_argb_a + off, kStride, src_argb_b + off, kStride,
+ dst_argb_opt, kStride, width, invert * height);
}
int max_diff = 0;
for (int i = 0; i < kStride * height; ++i) {
- int abs_diff =
- abs(static_cast<int>(dst_argb_c[i]) -
- static_cast<int>(dst_argb_opt[i]));
+ int abs_diff = abs(static_cast<int>(dst_argb_c[i]) -
+ static_cast<int>(dst_argb_opt[i]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
@@ -1636,35 +1593,39 @@ static int TestSubtract(int width, int height, int benchmark_iterations,
TEST_F(LibYUVPlanarTest, ARGBSubtract_Any) {
int max_diff = TestSubtract(benchmark_width_ - 1, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 0);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBSubtract_Unaligned) {
- int max_diff = TestSubtract(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+ int max_diff =
+ TestSubtract(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBSubtract_Invert) {
- int max_diff = TestSubtract(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+ int max_diff =
+ TestSubtract(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBSubtract_Opt) {
- int max_diff = TestSubtract(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ int max_diff =
+ TestSubtract(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
EXPECT_LE(max_diff, 1);
}
-static int TestSobel(int width, int height, int benchmark_iterations,
- int disable_cpu_flags, int benchmark_cpu_info,
- int invert, int off) {
+static int TestSobel(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off) {
if (width < 1) {
width = 1;
}
@@ -1681,20 +1642,17 @@ static int TestSobel(int width, int height, int benchmark_iterations,
memset(dst_argb_opt, 0, kStride * height);
MaskCpuFlags(disable_cpu_flags);
- ARGBSobel(src_argb_a + off, kStride,
- dst_argb_c, kStride,
- width, invert * height);
+ ARGBSobel(src_argb_a + off, kStride, dst_argb_c, kStride, width,
+ invert * height);
MaskCpuFlags(benchmark_cpu_info);
for (int i = 0; i < benchmark_iterations; ++i) {
- ARGBSobel(src_argb_a + off, kStride,
- dst_argb_opt, kStride,
- width, invert * height);
+ ARGBSobel(src_argb_a + off, kStride, dst_argb_opt, kStride, width,
+ invert * height);
}
int max_diff = 0;
for (int i = 0; i < kStride * height; ++i) {
- int abs_diff =
- abs(static_cast<int>(dst_argb_c[i]) -
- static_cast<int>(dst_argb_opt[i]));
+ int abs_diff = abs(static_cast<int>(dst_argb_c[i]) -
+ static_cast<int>(dst_argb_opt[i]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
@@ -1706,36 +1664,40 @@ static int TestSobel(int width, int height, int benchmark_iterations,
}
TEST_F(LibYUVPlanarTest, ARGBSobel_Any) {
- int max_diff = TestSobel(benchmark_width_ - 1, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ int max_diff =
+ TestSobel(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
EXPECT_EQ(0, max_diff);
}
TEST_F(LibYUVPlanarTest, ARGBSobel_Unaligned) {
- int max_diff = TestSobel(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+ int max_diff =
+ TestSobel(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
EXPECT_EQ(0, max_diff);
}
TEST_F(LibYUVPlanarTest, ARGBSobel_Invert) {
- int max_diff = TestSobel(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+ int max_diff =
+ TestSobel(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
EXPECT_EQ(0, max_diff);
}
TEST_F(LibYUVPlanarTest, ARGBSobel_Opt) {
- int max_diff = TestSobel(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ int max_diff =
+ TestSobel(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
EXPECT_EQ(0, max_diff);
}
-static int TestSobelToPlane(int width, int height, int benchmark_iterations,
- int disable_cpu_flags, int benchmark_cpu_info,
- int invert, int off) {
+static int TestSobelToPlane(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off) {
if (width < 1) {
width = 1;
}
@@ -1754,20 +1716,17 @@ static int TestSobelToPlane(int width, int height, int benchmark_iterations,
memset(dst_argb_opt, 0, kDstStride * height);
MaskCpuFlags(disable_cpu_flags);
- ARGBSobelToPlane(src_argb_a + off, kSrcStride,
- dst_argb_c, kDstStride,
- width, invert * height);
+ ARGBSobelToPlane(src_argb_a + off, kSrcStride, dst_argb_c, kDstStride, width,
+ invert * height);
MaskCpuFlags(benchmark_cpu_info);
for (int i = 0; i < benchmark_iterations; ++i) {
- ARGBSobelToPlane(src_argb_a + off, kSrcStride,
- dst_argb_opt, kDstStride,
+ ARGBSobelToPlane(src_argb_a + off, kSrcStride, dst_argb_opt, kDstStride,
width, invert * height);
}
int max_diff = 0;
for (int i = 0; i < kDstStride * height; ++i) {
- int abs_diff =
- abs(static_cast<int>(dst_argb_c[i]) -
- static_cast<int>(dst_argb_opt[i]));
+ int abs_diff = abs(static_cast<int>(dst_argb_c[i]) -
+ static_cast<int>(dst_argb_opt[i]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
@@ -1780,39 +1739,39 @@ static int TestSobelToPlane(int width, int height, int benchmark_iterations,
TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Any) {
int max_diff = TestSobelToPlane(benchmark_width_ - 1, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- +1, 0);
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 0);
EXPECT_EQ(0, max_diff);
}
TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Unaligned) {
int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- +1, 1);
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 1);
EXPECT_EQ(0, max_diff);
}
TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Invert) {
int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- -1, 0);
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, -1, 0);
EXPECT_EQ(0, max_diff);
}
TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Opt) {
int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- +1, 0);
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 0);
EXPECT_EQ(0, max_diff);
}
-static int TestSobelXY(int width, int height, int benchmark_iterations,
- int disable_cpu_flags, int benchmark_cpu_info,
- int invert, int off) {
+static int TestSobelXY(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off) {
if (width < 1) {
width = 1;
}
@@ -1829,20 +1788,17 @@ static int TestSobelXY(int width, int height, int benchmark_iterations,
memset(dst_argb_opt, 0, kStride * height);
MaskCpuFlags(disable_cpu_flags);
- ARGBSobelXY(src_argb_a + off, kStride,
- dst_argb_c, kStride,
- width, invert * height);
+ ARGBSobelXY(src_argb_a + off, kStride, dst_argb_c, kStride, width,
+ invert * height);
MaskCpuFlags(benchmark_cpu_info);
for (int i = 0; i < benchmark_iterations; ++i) {
- ARGBSobelXY(src_argb_a + off, kStride,
- dst_argb_opt, kStride,
- width, invert * height);
+ ARGBSobelXY(src_argb_a + off, kStride, dst_argb_opt, kStride, width,
+ invert * height);
}
int max_diff = 0;
for (int i = 0; i < kStride * height; ++i) {
- int abs_diff =
- abs(static_cast<int>(dst_argb_c[i]) -
- static_cast<int>(dst_argb_opt[i]));
+ int abs_diff = abs(static_cast<int>(dst_argb_c[i]) -
+ static_cast<int>(dst_argb_opt[i]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
@@ -1855,35 +1811,40 @@ static int TestSobelXY(int width, int height, int benchmark_iterations,
TEST_F(LibYUVPlanarTest, ARGBSobelXY_Any) {
int max_diff = TestSobelXY(benchmark_width_ - 1, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 0);
EXPECT_EQ(0, max_diff);
}
TEST_F(LibYUVPlanarTest, ARGBSobelXY_Unaligned) {
- int max_diff = TestSobelXY(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+ int max_diff =
+ TestSobelXY(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
EXPECT_EQ(0, max_diff);
}
TEST_F(LibYUVPlanarTest, ARGBSobelXY_Invert) {
- int max_diff = TestSobelXY(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+ int max_diff =
+ TestSobelXY(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
EXPECT_EQ(0, max_diff);
}
TEST_F(LibYUVPlanarTest, ARGBSobelXY_Opt) {
- int max_diff = TestSobelXY(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ int max_diff =
+ TestSobelXY(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
EXPECT_EQ(0, max_diff);
}
-static int TestBlur(int width, int height, int benchmark_iterations,
- int disable_cpu_flags, int benchmark_cpu_info,
- int invert, int off, int radius) {
+static int TestBlur(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off,
+ int radius) {
if (width < 1) {
width = 1;
}
@@ -1901,22 +1862,19 @@ static int TestBlur(int width, int height, int benchmark_iterations,
memset(dst_argb_opt, 0, kStride * height);
MaskCpuFlags(disable_cpu_flags);
- ARGBBlur(src_argb_a + off, kStride,
- dst_argb_c, kStride,
- reinterpret_cast<int32*>(dst_cumsum), width * 4,
- width, invert * height, radius);
+ ARGBBlur(src_argb_a + off, kStride, dst_argb_c, kStride,
+ reinterpret_cast<int32*>(dst_cumsum), width * 4, width,
+ invert * height, radius);
MaskCpuFlags(benchmark_cpu_info);
for (int i = 0; i < benchmark_iterations; ++i) {
- ARGBBlur(src_argb_a + off, kStride,
- dst_argb_opt, kStride,
- reinterpret_cast<int32*>(dst_cumsum), width * 4,
- width, invert * height, radius);
+ ARGBBlur(src_argb_a + off, kStride, dst_argb_opt, kStride,
+ reinterpret_cast<int32*>(dst_cumsum), width * 4, width,
+ invert * height, radius);
}
int max_diff = 0;
for (int i = 0; i < kStride * height; ++i) {
- int abs_diff =
- abs(static_cast<int>(dst_argb_c[i]) -
- static_cast<int>(dst_argb_opt[i]));
+ int abs_diff = abs(static_cast<int>(dst_argb_c[i]) -
+ static_cast<int>(dst_argb_opt[i]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
@@ -1930,67 +1888,59 @@ static int TestBlur(int width, int height, int benchmark_iterations,
static const int kBlurSize = 55;
TEST_F(LibYUVPlanarTest, ARGBBlur_Any) {
- int max_diff = TestBlur(benchmark_width_ - 1, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- +1, 0, kBlurSize);
+ int max_diff =
+ TestBlur(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSize);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlur_Unaligned) {
- int max_diff = TestBlur(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- +1, 1, kBlurSize);
+ int max_diff =
+ TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1, kBlurSize);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlur_Invert) {
- int max_diff = TestBlur(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- -1, 0, kBlurSize);
+ int max_diff =
+ TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0, kBlurSize);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlur_Opt) {
- int max_diff = TestBlur(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- +1, 0, kBlurSize);
+ int max_diff =
+ TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSize);
EXPECT_LE(max_diff, 1);
}
static const int kBlurSmallSize = 5;
TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Any) {
- int max_diff = TestBlur(benchmark_width_ - 1, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- +1, 0, kBlurSmallSize);
+ int max_diff =
+ TestBlur(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSmallSize);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Unaligned) {
- int max_diff = TestBlur(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- +1, 1, kBlurSmallSize);
+ int max_diff =
+ TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1, kBlurSmallSize);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Invert) {
- int max_diff = TestBlur(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- -1, 0, kBlurSmallSize);
+ int max_diff =
+ TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0, kBlurSmallSize);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Opt) {
- int max_diff = TestBlur(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- +1, 0, kBlurSmallSize);
+ int max_diff =
+ TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSmallSize);
EXPECT_LE(max_diff, 1);
}
@@ -2001,10 +1951,10 @@ TEST_F(LibYUVPlanarTest, TestARGBPolynomial) {
memset(orig_pixels, 0, sizeof(orig_pixels));
SIMD_ALIGNED(static const float kWarmifyPolynomial[16]) = {
- 0.94230f, -3.03300f, -2.92500f, 0.f, // C0
- 0.584500f, 1.112000f, 1.535000f, 1.f, // C1 x
- 0.001313f, -0.002503f, -0.004496f, 0.f, // C2 x * x
- 0.0f, 0.000006965f, 0.000008781f, 0.f, // C3 x * x * x
+ 0.94230f, -3.03300f, -2.92500f, 0.f, // C0
+ 0.584500f, 1.112000f, 1.535000f, 1.f, // C1 x
+ 0.001313f, -0.002503f, -0.004496f, 0.f, // C2 x * x
+ 0.0f, 0.000006965f, 0.000008781f, 0.f, // C3 x * x * x
};
// Test blue
@@ -2081,6 +2031,139 @@ TEST_F(LibYUVPlanarTest, TestARGBPolynomial) {
}
}
+int TestHalfFloatPlane(int benchmark_width,
+ int benchmark_height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ float scale,
+ int mask) {
+ int i, j;
+ const int y_plane_size = benchmark_width * benchmark_height * 2;
+
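+  // One page-aligned allocation holds the 16 bit source plane followed by
+  // the optimized and C destination planes.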
+ align_buffer_page_end(orig_y, y_plane_size * 3);
+ uint8* dst_opt = orig_y + y_plane_size;
+ uint8* dst_c = orig_y + y_plane_size * 2;
+
+ MemRandomize(orig_y, y_plane_size);
+ memset(dst_c, 0, y_plane_size);
+ memset(dst_opt, 1, y_plane_size);
+
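+  // Constrain the random 16 bit samples to the bit depth under test.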
+ for (i = 0; i < y_plane_size / 2; ++i) {
+ reinterpret_cast<uint16*>(orig_y)[i] &= mask;
+ }
+
+ // Disable all optimizations.
+ MaskCpuFlags(disable_cpu_flags);
+ for (j = 0; j < benchmark_iterations; j++) {
+ HalfFloatPlane(reinterpret_cast<uint16*>(orig_y), benchmark_width * 2,
+ reinterpret_cast<uint16*>(dst_c), benchmark_width * 2, scale,
+ benchmark_width, benchmark_height);
+ }
+
+ // Enable optimizations.
+ MaskCpuFlags(benchmark_cpu_info);
+ for (j = 0; j < benchmark_iterations; j++) {
+ HalfFloatPlane(reinterpret_cast<uint16*>(orig_y), benchmark_width * 2,
+ reinterpret_cast<uint16*>(dst_opt), benchmark_width * 2,
+ scale, benchmark_width, benchmark_height);
+ }
+
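+  // Track the largest absolute difference between the C and optimized
+  // 16 bit outputs.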
+ int max_diff = 0;
+ for (i = 0; i < y_plane_size / 2; ++i) {
+ int abs_diff = abs(static_cast<int>(reinterpret_cast<uint16*>(dst_c)[i]) -
+ static_cast<int>(reinterpret_cast<uint16*>(dst_opt)[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_page_end(orig_y);
+ return max_diff;
+}
+
+#if defined(__arm__)
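+// Bit 24 of the ARM FPSCR is the flush-to-zero (FZ) bit; setting it makes
+// denormal results flush to zero, presumably so the denormal test below sees
+// consistent behavior between the C and optimized paths.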
+static void EnableFlushDenormalToZero(void) {
+ uint32_t cw;
+ __asm__ __volatile__(
+ "vmrs %0, fpscr \n"
+ "orr %0, %0, #0x1000000 \n"
+ "vmsr fpscr, %0 \n"
+ : "=r"(cw)::"memory");
+}
+#endif
+
+// A 5 bit exponent with a bias of 15 will underflow to a denormal if the
+// scale makes the biased exponent negative: 15 - log2(65536) = -1. This
+// shouldn't normally happen, since scale is 1/(1<<bits) where bits is 9, 10
+// or 12.
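+// Worked example (informal): with scale = 1.0f / 65536.0f an input of 1 maps
+// to 2^-16, below the smallest normal half float (2^-14), so the result is a
+// denormal; with scale = 1.0f / 4096.0f the smallest nonzero result is 2^-12,
+// which stays normal.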
+
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_denormal) {
+// 32 bit ARM rounding in the denormal case is off by 1 compared to C.
+#if defined(__arm__)
+ EnableFlushDenormalToZero();
+#endif
+ int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, 1.0f / 65536.0f, 65535);
+ EXPECT_EQ(0, diff);
+}
+
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_One) {
+ int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, 1.0f, 65535);
+ EXPECT_LE(diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_Opt) {
+ int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, 1.0f / 4096.0f, 65535);
+ EXPECT_EQ(0, diff);
+}
+
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_Opt) {
+ int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, 1.0f / 1024.0f, 1023);
+ EXPECT_EQ(0, diff);
+}
+
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_9bit_Opt) {
+ int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, 1.0f / 512.0f, 511);
+ EXPECT_EQ(0, diff);
+}
+
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) {
+ int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, 1.0f / 4096.0f, 4095);
+ EXPECT_EQ(0, diff);
+}
+
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Offby1) {
+ int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, 1.0f / 4095.0f, 4095);
+ EXPECT_EQ(0, diff);
+}
+
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_One) {
+ int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, 1.0f, 2047);
+ EXPECT_EQ(0, diff);
+}
+
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_12bit_One) {
+ int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, 1.0f, 4095);
+ EXPECT_LE(diff, 1);
+}
+
TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) {
SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]);
@@ -2170,15 +2253,13 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyAlpha) {
memcpy(dst_pixels_c, dst_pixels_opt, kSize);
MaskCpuFlags(disable_cpu_flags_);
- ARGBCopyAlpha(orig_pixels, benchmark_width_ * 4,
- dst_pixels_c, benchmark_width_ * 4,
- benchmark_width_, benchmark_height_);
+ ARGBCopyAlpha(orig_pixels, benchmark_width_ * 4, dst_pixels_c,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
- ARGBCopyAlpha(orig_pixels, benchmark_width_ * 4,
- dst_pixels_opt, benchmark_width_ * 4,
- benchmark_width_, benchmark_height_);
+ ARGBCopyAlpha(orig_pixels, benchmark_width_ * 4, dst_pixels_opt,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
}
for (int i = 0; i < kSize; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
@@ -2200,15 +2281,13 @@ TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) {
memcpy(dst_pixels_c, dst_pixels_opt, kPixels);
MaskCpuFlags(disable_cpu_flags_);
- ARGBExtractAlpha(src_pixels, benchmark_width_ * 4,
- dst_pixels_c, benchmark_width_,
- benchmark_width_, benchmark_height_);
+ ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_c,
+ benchmark_width_, benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
- ARGBExtractAlpha(src_pixels, benchmark_width_ * 4,
- dst_pixels_opt, benchmark_width_,
- benchmark_width_, benchmark_height_);
+ ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_opt,
+ benchmark_width_, benchmark_width_, benchmark_height_);
}
for (int i = 0; i < kPixels; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
@@ -2230,15 +2309,13 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) {
memcpy(dst_pixels_c, dst_pixels_opt, kPixels * 4);
MaskCpuFlags(disable_cpu_flags_);
- ARGBCopyYToAlpha(orig_pixels, benchmark_width_,
- dst_pixels_c, benchmark_width_ * 4,
- benchmark_width_, benchmark_height_);
+ ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_c,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
- ARGBCopyYToAlpha(orig_pixels, benchmark_width_,
- dst_pixels_opt, benchmark_width_ * 4,
- benchmark_width_, benchmark_height_);
+ ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_opt,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
}
for (int i = 0; i < kPixels * 4; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
@@ -2249,9 +2326,14 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) {
free_aligned_buffer_page_end(orig_pixels);
}
-static int TestARGBRect(int width, int height, int benchmark_iterations,
- int disable_cpu_flags, int benchmark_cpu_info,
- int invert, int off, int bpp) {
+static int TestARGBRect(int width,
+ int height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ int invert,
+ int off,
+ int bpp) {
if (width < 1) {
width = 1;
}
@@ -2282,9 +2364,8 @@ static int TestARGBRect(int width, int height, int benchmark_iterations,
}
int max_diff = 0;
for (int i = 0; i < kStride * height; ++i) {
- int abs_diff =
- abs(static_cast<int>(dst_argb_c[i + off]) -
- static_cast<int>(dst_argb_opt[i + off]));
+ int abs_diff = abs(static_cast<int>(dst_argb_c[i + off]) -
+ static_cast<int>(dst_argb_opt[i + off]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
@@ -2296,66 +2377,145 @@ static int TestARGBRect(int width, int height, int benchmark_iterations,
TEST_F(LibYUVPlanarTest, ARGBRect_Any) {
int max_diff = TestARGBRect(benchmark_width_ - 1, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- +1, 0, 4);
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 0, 4);
EXPECT_EQ(0, max_diff);
}
TEST_F(LibYUVPlanarTest, ARGBRect_Unaligned) {
- int max_diff = TestARGBRect(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- +1, 1, 4);
+ int max_diff =
+ TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1, 4);
EXPECT_EQ(0, max_diff);
}
TEST_F(LibYUVPlanarTest, ARGBRect_Invert) {
- int max_diff = TestARGBRect(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- -1, 0, 4);
+ int max_diff =
+ TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0, 4);
EXPECT_EQ(0, max_diff);
}
TEST_F(LibYUVPlanarTest, ARGBRect_Opt) {
- int max_diff = TestARGBRect(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- +1, 0, 4);
+ int max_diff =
+ TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 4);
EXPECT_EQ(0, max_diff);
}
TEST_F(LibYUVPlanarTest, SetPlane_Any) {
int max_diff = TestARGBRect(benchmark_width_ - 1, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- +1, 0, 1);
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, +1, 0, 1);
EXPECT_EQ(0, max_diff);
}
TEST_F(LibYUVPlanarTest, SetPlane_Unaligned) {
- int max_diff = TestARGBRect(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- +1, 1, 1);
+ int max_diff =
+ TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1, 1);
EXPECT_EQ(0, max_diff);
}
TEST_F(LibYUVPlanarTest, SetPlane_Invert) {
- int max_diff = TestARGBRect(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- -1, 0, 1);
+ int max_diff =
+ TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0, 1);
EXPECT_EQ(0, max_diff);
}
TEST_F(LibYUVPlanarTest, SetPlane_Opt) {
- int max_diff = TestARGBRect(benchmark_width_, benchmark_height_,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_,
- +1, 0, 1);
+ int max_diff =
+ TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1);
EXPECT_EQ(0, max_diff);
}
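+// Round-trip check: SplitUVPlane derives planar U and V from random
+// interleaved data, a C MergeUVPlane builds the reference frame, and the
+// optimized MergeUVPlane runs in the timed loop for a byte-for-byte compare.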
+TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels, kPixels * 2);
+ align_buffer_page_end(tmp_pixels_u, kPixels);
+ align_buffer_page_end(tmp_pixels_v, kPixels);
+ align_buffer_page_end(dst_pixels_opt, kPixels * 2);
+ align_buffer_page_end(dst_pixels_c, kPixels * 2);
+
+ MemRandomize(src_pixels, kPixels * 2);
+ MemRandomize(tmp_pixels_u, kPixels);
+ MemRandomize(tmp_pixels_v, kPixels);
+ MemRandomize(dst_pixels_opt, kPixels * 2);
+ MemRandomize(dst_pixels_c, kPixels * 2);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_,
+ tmp_pixels_v, benchmark_width_, benchmark_width_,
+ benchmark_height_);
+ MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_,
+ dst_pixels_c, benchmark_width_ * 2, benchmark_width_,
+ benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_,
+ tmp_pixels_v, benchmark_width_, benchmark_width_,
+ benchmark_height_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_,
+ dst_pixels_opt, benchmark_width_ * 2, benchmark_width_,
+ benchmark_height_);
+ }
+
+ for (int i = 0; i < kPixels * 2; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(tmp_pixels_u);
+ free_aligned_buffer_page_end(tmp_pixels_v);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
+
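+// Mirror of MergeUVPlane_Opt above: the optimized SplitUVPlane runs in the
+// timed loop and a single MergeUVPlane reassembles its output for comparison
+// against the C round trip.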
+TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels, kPixels * 2);
+ align_buffer_page_end(tmp_pixels_u, kPixels);
+ align_buffer_page_end(tmp_pixels_v, kPixels);
+ align_buffer_page_end(dst_pixels_opt, kPixels * 2);
+ align_buffer_page_end(dst_pixels_c, kPixels * 2);
+
+ MemRandomize(src_pixels, kPixels * 2);
+ MemRandomize(tmp_pixels_u, kPixels);
+ MemRandomize(tmp_pixels_v, kPixels);
+ MemRandomize(dst_pixels_opt, kPixels * 2);
+ MemRandomize(dst_pixels_c, kPixels * 2);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_,
+ tmp_pixels_v, benchmark_width_, benchmark_width_,
+ benchmark_height_);
+ MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_,
+ dst_pixels_c, benchmark_width_ * 2, benchmark_width_,
+ benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u,
+ benchmark_width_, tmp_pixels_v, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ }
+ MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_,
+ dst_pixels_opt, benchmark_width_ * 2, benchmark_width_,
+ benchmark_height_);
+
+ for (int i = 0; i < kPixels * 2; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(tmp_pixels_u);
+ free_aligned_buffer_page_end(tmp_pixels_v);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
+
} // namespace libyuv
diff --git a/files/unit_test/rotate_argb_test.cc b/files/unit_test/rotate_argb_test.cc
index 9c83c356..d2003895 100644
--- a/files/unit_test/rotate_argb_test.cc
+++ b/files/unit_test/rotate_argb_test.cc
@@ -10,14 +10,16 @@
#include <stdlib.h>
+#include "../unit_test/unit_test.h"
#include "libyuv/cpu_id.h"
#include "libyuv/rotate_argb.h"
-#include "../unit_test/unit_test.h"
namespace libyuv {
-void TestRotateBpp(int src_width, int src_height,
- int dst_width, int dst_height,
+void TestRotateBpp(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
libyuv::RotationMode mode,
int benchmark_iterations,
int disable_cpu_flags,
@@ -51,26 +53,22 @@ void TestRotateBpp(int src_width, int src_height,
if (kBpp == 1) {
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
- RotatePlane(src_argb, src_stride_argb,
- dst_argb_c, dst_stride_argb,
+ RotatePlane(src_argb, src_stride_argb, dst_argb_c, dst_stride_argb,
src_width, src_height, mode);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
for (int i = 0; i < benchmark_iterations; ++i) {
- RotatePlane(src_argb, src_stride_argb,
- dst_argb_opt, dst_stride_argb,
+ RotatePlane(src_argb, src_stride_argb, dst_argb_opt, dst_stride_argb,
src_width, src_height, mode);
}
} else if (kBpp == 4) {
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
- ARGBRotate(src_argb, src_stride_argb,
- dst_argb_c, dst_stride_argb,
+ ARGBRotate(src_argb, src_stride_argb, dst_argb_c, dst_stride_argb,
src_width, src_height, mode);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
for (int i = 0; i < benchmark_iterations; ++i) {
- ARGBRotate(src_argb, src_stride_argb,
- dst_argb_opt, dst_stride_argb,
+ ARGBRotate(src_argb, src_stride_argb, dst_argb_opt, dst_stride_argb,
src_width, src_height, mode);
}
}
@@ -85,112 +83,104 @@ void TestRotateBpp(int src_width, int src_height,
free_aligned_buffer_page_end(src_argb);
}
-static void ARGBTestRotate(int src_width, int src_height,
- int dst_width, int dst_height,
+static void ARGBTestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
libyuv::RotationMode mode,
int benchmark_iterations,
int disable_cpu_flags,
int benchmark_cpu_info) {
- TestRotateBpp(src_width, src_height,
- dst_width, dst_height,
- mode, benchmark_iterations,
- disable_cpu_flags, benchmark_cpu_info, 4);
+ TestRotateBpp(src_width, src_height, dst_width, dst_height, mode,
+ benchmark_iterations, disable_cpu_flags, benchmark_cpu_info, 4);
}
TEST_F(LibYUVRotateTest, ARGBRotate0_Opt) {
- ARGBTestRotate(benchmark_width_, benchmark_height_,
- benchmark_width_, benchmark_height_,
- kRotate0, benchmark_iterations_,
+ ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, ARGBRotate90_Opt) {
- ARGBTestRotate(benchmark_width_, benchmark_height_,
- benchmark_height_, benchmark_width_,
- kRotate90, benchmark_iterations_,
+ ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, ARGBRotate180_Opt) {
- ARGBTestRotate(benchmark_width_, benchmark_height_,
- benchmark_width_, benchmark_height_,
- kRotate180, benchmark_iterations_,
+ ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, ARGBRotate270_Opt) {
- ARGBTestRotate(benchmark_width_, benchmark_height_,
- benchmark_height_, benchmark_width_,
- kRotate270, benchmark_iterations_,
+ ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
-static void TestRotatePlane(int src_width, int src_height,
- int dst_width, int dst_height,
+static void TestRotatePlane(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
libyuv::RotationMode mode,
int benchmark_iterations,
int disable_cpu_flags,
int benchmark_cpu_info) {
- TestRotateBpp(src_width, src_height,
- dst_width, dst_height,
- mode, benchmark_iterations,
- disable_cpu_flags, benchmark_cpu_info, 1);
+ TestRotateBpp(src_width, src_height, dst_width, dst_height, mode,
+ benchmark_iterations, disable_cpu_flags, benchmark_cpu_info, 1);
}
TEST_F(LibYUVRotateTest, RotatePlane0_Opt) {
- TestRotatePlane(benchmark_width_, benchmark_height_,
- benchmark_width_, benchmark_height_,
- kRotate0, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
+ TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, RotatePlane90_Opt) {
- TestRotatePlane(benchmark_width_, benchmark_height_,
- benchmark_height_, benchmark_width_,
- kRotate90, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
+ TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, RotatePlane180_Opt) {
- TestRotatePlane(benchmark_width_, benchmark_height_,
- benchmark_width_, benchmark_height_,
- kRotate180, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
+ TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, RotatePlane270_Opt) {
- TestRotatePlane(benchmark_width_, benchmark_height_,
- benchmark_height_, benchmark_width_,
- kRotate270, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
+ TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane0_Odd) {
TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_width_ - 3, benchmark_height_ - 1,
- kRotate0, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
+ benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane90_Odd) {
TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_height_ - 1, benchmark_width_ - 3,
- kRotate90, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
+ benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane180_Odd) {
TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_width_ - 3, benchmark_height_ - 1,
- kRotate180, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
+ benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane270_Odd) {
TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_height_ - 1, benchmark_width_ - 3,
- kRotate270, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
+ benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
}
} // namespace libyuv
diff --git a/files/unit_test/rotate_test.cc b/files/unit_test/rotate_test.cc
index 07e2f73a..d04b96e9 100644
--- a/files/unit_test/rotate_test.cc
+++ b/files/unit_test/rotate_test.cc
@@ -10,17 +10,20 @@
#include <stdlib.h>
+#include "../unit_test/unit_test.h"
#include "libyuv/cpu_id.h"
#include "libyuv/rotate.h"
-#include "../unit_test/unit_test.h"
namespace libyuv {
-static void I420TestRotate(int src_width, int src_height,
- int dst_width, int dst_height,
+static void I420TestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
libyuv::RotationMode mode,
int benchmark_iterations,
- int disable_cpu_flags, int benchmark_cpu_info) {
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
if (src_width < 1) {
src_width = 1;
}
@@ -50,26 +53,21 @@ static void I420TestRotate(int src_width, int src_height,
memset(dst_i420_opt, 3, dst_i420_size);
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
- I420Rotate(src_i420, src_width,
- src_i420 + src_i420_y_size, (src_width + 1) / 2,
- src_i420 + src_i420_y_size + src_i420_uv_size, (src_width + 1) / 2,
- dst_i420_c, dst_width,
+ I420Rotate(src_i420, src_width, src_i420 + src_i420_y_size,
+ (src_width + 1) / 2, src_i420 + src_i420_y_size + src_i420_uv_size,
+ (src_width + 1) / 2, dst_i420_c, dst_width,
dst_i420_c + dst_i420_y_size, (dst_width + 1) / 2,
dst_i420_c + dst_i420_y_size + dst_i420_uv_size,
- (dst_width + 1) / 2,
- src_width, src_height, mode);
+ (dst_width + 1) / 2, src_width, src_height, mode);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
for (int i = 0; i < benchmark_iterations; ++i) {
- I420Rotate(src_i420, src_width,
- src_i420 + src_i420_y_size, (src_width + 1) / 2,
- src_i420 + src_i420_y_size + src_i420_uv_size,
- (src_width + 1) / 2,
- dst_i420_opt, dst_width,
- dst_i420_opt + dst_i420_y_size, (dst_width + 1) / 2,
- dst_i420_opt + dst_i420_y_size + dst_i420_uv_size,
- (dst_width + 1) / 2,
- src_width, src_height, mode);
+ I420Rotate(
+ src_i420, src_width, src_i420 + src_i420_y_size, (src_width + 1) / 2,
+ src_i420 + src_i420_y_size + src_i420_uv_size, (src_width + 1) / 2,
+ dst_i420_opt, dst_width, dst_i420_opt + dst_i420_y_size,
+ (dst_width + 1) / 2, dst_i420_opt + dst_i420_y_size + dst_i420_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
}
// Rotation should be exact.
@@ -83,30 +81,26 @@ static void I420TestRotate(int src_width, int src_height,
}
TEST_F(LibYUVRotateTest, I420Rotate0_Opt) {
- I420TestRotate(benchmark_width_, benchmark_height_,
- benchmark_width_, benchmark_height_,
- kRotate0, benchmark_iterations_,
+ I420TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, I420Rotate90_Opt) {
- I420TestRotate(benchmark_width_, benchmark_height_,
- benchmark_height_, benchmark_width_,
- kRotate90, benchmark_iterations_,
+ I420TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, I420Rotate180_Opt) {
- I420TestRotate(benchmark_width_, benchmark_height_,
- benchmark_width_, benchmark_height_,
- kRotate180, benchmark_iterations_,
+ I420TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, I420Rotate270_Opt) {
- I420TestRotate(benchmark_width_, benchmark_height_,
- benchmark_height_, benchmark_width_,
- kRotate270, benchmark_iterations_,
+ I420TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
@@ -115,37 +109,40 @@ TEST_F(LibYUVRotateTest, I420Rotate270_Opt) {
// tested by passing an odd width command line or environment variable.
TEST_F(LibYUVRotateTest, DISABLED_I420Rotate0_Odd) {
I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_width_ - 3, benchmark_height_ - 1,
- kRotate0, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
+ benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_I420Rotate90_Odd) {
I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_height_ - 1, benchmark_width_ - 3,
- kRotate90, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
+ benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_I420Rotate180_Odd) {
I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_width_ - 3, benchmark_height_ - 1,
- kRotate180, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
+ benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_I420Rotate270_Odd) {
I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_height_ - 1, benchmark_width_ - 3,
- kRotate270, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
+ benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
}
-static void NV12TestRotate(int src_width, int src_height,
- int dst_width, int dst_height,
+static void NV12TestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
libyuv::RotationMode mode,
int benchmark_iterations,
- int disable_cpu_flags, int benchmark_cpu_info) {
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
if (src_width < 1) {
src_width = 1;
}
@@ -176,23 +173,19 @@ static void NV12TestRotate(int src_width, int src_height,
memset(dst_i420_opt, 3, dst_i420_size);
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
- NV12ToI420Rotate(src_nv12, src_width,
- src_nv12 + src_nv12_y_size, (src_width + 1) & ~1,
- dst_i420_c, dst_width,
+ NV12ToI420Rotate(src_nv12, src_width, src_nv12 + src_nv12_y_size,
+ (src_width + 1) & ~1, dst_i420_c, dst_width,
dst_i420_c + dst_i420_y_size, (dst_width + 1) / 2,
dst_i420_c + dst_i420_y_size + dst_i420_uv_size,
- (dst_width + 1) / 2,
- src_width, src_height, mode);
+ (dst_width + 1) / 2, src_width, src_height, mode);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
for (int i = 0; i < benchmark_iterations; ++i) {
- NV12ToI420Rotate(src_nv12, src_width,
- src_nv12 + src_nv12_y_size, (src_width + 1) & ~1,
- dst_i420_opt, dst_width,
+ NV12ToI420Rotate(src_nv12, src_width, src_nv12 + src_nv12_y_size,
+ (src_width + 1) & ~1, dst_i420_opt, dst_width,
dst_i420_opt + dst_i420_y_size, (dst_width + 1) / 2,
dst_i420_opt + dst_i420_y_size + dst_i420_uv_size,
- (dst_width + 1) / 2,
- src_width, src_height, mode);
+ (dst_width + 1) / 2, src_width, src_height, mode);
}
// Rotation should be exact.
@@ -206,91 +199,79 @@ static void NV12TestRotate(int src_width, int src_height,
}
TEST_F(LibYUVRotateTest, NV12Rotate0_Opt) {
- NV12TestRotate(benchmark_width_, benchmark_height_,
- benchmark_width_, benchmark_height_,
- kRotate0, benchmark_iterations_,
+ NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, NV12Rotate90_Opt) {
- NV12TestRotate(benchmark_width_, benchmark_height_,
- benchmark_height_, benchmark_width_,
- kRotate90, benchmark_iterations_,
+ NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, NV12Rotate180_Opt) {
- NV12TestRotate(benchmark_width_, benchmark_height_,
- benchmark_width_, benchmark_height_,
- kRotate180, benchmark_iterations_,
+ NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, NV12Rotate270_Opt) {
- NV12TestRotate(benchmark_width_, benchmark_height_,
- benchmark_height_, benchmark_width_,
- kRotate270, benchmark_iterations_,
+ NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate0_Odd) {
NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_width_ - 3, benchmark_height_ - 1,
- kRotate0, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
+ benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate90_Odd) {
NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_height_ - 1, benchmark_width_ - 3,
- kRotate90, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
+ benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate180_Odd) {
NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_width_ - 3, benchmark_height_ - 1,
- kRotate180, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
+ benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate270_Odd) {
NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_height_ - 1, benchmark_width_ - 3,
- kRotate270, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
+ benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, NV12Rotate0_Invert) {
- NV12TestRotate(benchmark_width_, -benchmark_height_,
- benchmark_width_, benchmark_height_,
- kRotate0, benchmark_iterations_,
+ NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, NV12Rotate90_Invert) {
- NV12TestRotate(benchmark_width_, -benchmark_height_,
- benchmark_height_, benchmark_width_,
- kRotate90, benchmark_iterations_,
+ NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, NV12Rotate180_Invert) {
- NV12TestRotate(benchmark_width_, -benchmark_height_,
- benchmark_width_, benchmark_height_,
- kRotate180, benchmark_iterations_,
+ NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) {
- NV12TestRotate(benchmark_width_, -benchmark_height_,
- benchmark_height_, benchmark_width_,
- kRotate270, benchmark_iterations_,
+ NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_);
}
-
-
-
-
} // namespace libyuv
diff --git a/files/unit_test/scale_argb_test.cc b/files/unit_test/scale_argb_test.cc
index f99782f7..d11aec20 100644
--- a/files/unit_test/scale_argb_test.cc
+++ b/files/unit_test/scale_argb_test.cc
@@ -11,11 +11,11 @@
#include <stdlib.h>
#include <time.h>
+#include "../unit_test/unit_test.h"
#include "libyuv/convert_argb.h"
#include "libyuv/cpu_id.h"
#include "libyuv/scale_argb.h"
#include "libyuv/video_common.h"
-#include "../unit_test/unit_test.h"
namespace libyuv {
@@ -23,18 +23,22 @@ namespace libyuv {
#define FILELINESTR(file, line) file ":" STRINGIZE(line)
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
-static int ARGBTestFilter(int src_width, int src_height,
- int dst_width, int dst_height,
- FilterMode f, int benchmark_iterations,
- int disable_cpu_flags, int benchmark_cpu_info) {
+static int ARGBTestFilter(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
int i, j;
const int b = 0; // 128 to test for padding/stride.
- int64 src_argb_plane_size = (Abs(src_width) + b * 2) *
- (Abs(src_height) + b * 2) * 4LL;
+ int64 src_argb_plane_size =
+ (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2) * 4LL;
int src_stride_argb = (b * 2 + Abs(src_width)) * 4;
align_buffer_page_end(src_argb, src_argb_plane_size);
@@ -59,21 +63,18 @@ static int ARGBTestFilter(int src_width, int src_height,
// Warm up both versions for consistent benchmarks.
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
- src_width, src_height,
- dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
- dst_width, dst_height, f);
+ src_width, src_height, dst_argb_c + (dst_stride_argb * b) + b * 4,
+ dst_stride_argb, dst_width, dst_height, f);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
- src_width, src_height,
- dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
- dst_width, dst_height, f);
+ src_width, src_height, dst_argb_opt + (dst_stride_argb * b) + b * 4,
+ dst_stride_argb, dst_width, dst_height, f);
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
double c_time = get_time();
ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
- src_width, src_height,
- dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
- dst_width, dst_height, f);
+ src_width, src_height, dst_argb_c + (dst_stride_argb * b) + b * 4,
+ dst_stride_argb, dst_width, dst_height, f);
c_time = (get_time() - c_time);
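The sequence above is the harness pattern used throughout these tests: mask the CPU flags to force the C path, time it, restore the flags, time the optimized path, then diff the two outputs. A condensed sketch (sw/sh/dw/dh and the buffer names are placeholders for the arguments used above; per the flag docs, 1 forces C and -1 allows SIMD):

MaskCpuFlags(1);                 // C reference code only
double t = get_time();
ARGBScale(src, src_stride, sw, sh, dst_c, dst_stride, dw, dh, f);
double c_us = (get_time() - t) * 1e6;
MaskCpuFlags(-1);                // all detected SIMD paths
t = get_time();
ARGBScale(src, src_stride, sw, sh, dst_opt, dst_stride, dw, dh, f);
double opt_us = (get_time() - t) * 1e6;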
@@ -88,8 +89,8 @@ static int ARGBTestFilter(int src_width, int src_height,
opt_time = (get_time() - opt_time) / benchmark_iterations;
// Report performance of C vs OPT
- printf("filter %d - %8d us C - %8d us OPT\n",
- f, static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
+ printf("filter %d - %8d us C - %8d us OPT\n", f,
+ static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
// C version may be a little off from the optimized. Order of
// operations may introduce rounding somewhere. So do a difference
@@ -115,10 +116,14 @@ static int ARGBTestFilter(int src_width, int src_height,
static const int kTileX = 8;
static const int kTileY = 8;
-static int TileARGBScale(const uint8* src_argb, int src_stride_argb,
- int src_width, int src_height,
- uint8* dst_argb, int dst_stride_argb,
- int dst_width, int dst_height,
+static int TileARGBScale(const uint8* src_argb,
+ int src_stride_argb,
+ int src_width,
+ int src_height,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int dst_width,
+ int dst_height,
FilterMode filtering) {
for (int y = 0; y < dst_height; y += kTileY) {
for (int x = 0; x < dst_width; x += kTileX) {
@@ -130,11 +135,9 @@ static int TileARGBScale(const uint8* src_argb, int src_stride_argb,
if (y + clip_height > dst_height) {
clip_height = dst_height - y;
}
- int r = ARGBScaleClip(src_argb, src_stride_argb,
- src_width, src_height,
- dst_argb, dst_stride_argb,
- dst_width, dst_height,
- x, y, clip_width, clip_height, filtering);
+ int r = ARGBScaleClip(src_argb, src_stride_argb, src_width, src_height,
+ dst_argb, dst_stride_argb, dst_width, dst_height, x,
+ y, clip_width, clip_height, filtering);
if (r) {
return r;
}
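ARGBScaleClip samples the source using the full destination geometry but writes only the clip rectangle, so assembling the output a tile at a time should agree with a single full ARGBScale to within the per-filter max_diff bounds used below. The edge-tile clamp from the loop above, restated:

// Edge tiles are clamped so the clip rectangle never leaves the destination:
int clip_width = kTileX, clip_height = kTileY;
if (x + clip_width > dst_width) clip_width = dst_width - x;
if (y + clip_height > dst_height) clip_height = dst_height - y;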
@@ -143,16 +146,19 @@ static int TileARGBScale(const uint8* src_argb, int src_stride_argb,
return 0;
}
-static int ARGBClipTestFilter(int src_width, int src_height,
- int dst_width, int dst_height,
- FilterMode f, int benchmark_iterations) {
+static int ARGBClipTestFilter(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
const int b = 128;
- int64 src_argb_plane_size = (Abs(src_width) + b * 2) *
- (Abs(src_height) + b * 2) * 4;
+ int64 src_argb_plane_size =
+ (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2) * 4;
int src_stride_argb = (b * 2 + Abs(src_width)) * 4;
align_buffer_page_end(src_argb, src_argb_plane_size);
@@ -184,9 +190,8 @@ static int ARGBClipTestFilter(int src_width, int src_height,
// Do full image, no clipping.
double c_time = get_time();
ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
- src_width, src_height,
- dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
- dst_width, dst_height, f);
+ src_width, src_height, dst_argb_c + (dst_stride_argb * b) + b * 4,
+ dst_stride_argb, dst_width, dst_height, f);
c_time = (get_time() - c_time);
// Do tiled image, clipping scale to a tile at a time.
@@ -200,8 +205,8 @@ static int ARGBClipTestFilter(int src_width, int src_height,
opt_time = (get_time() - opt_time) / benchmark_iterations;
// Report performance of Full vs Tiled.
- printf("filter %d - %8d us Full - %8d us Tiled\n",
- f, static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
+ printf("filter %d - %8d us Full - %8d us Tiled\n", f,
+ static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
// Compare full scaled image vs tiled image.
int max_diff = 0;
@@ -226,32 +231,30 @@ static int ARGBClipTestFilter(int src_width, int src_height,
#define DX(x, nom, denom) static_cast<int>((Abs(x) / nom) * nom)
#define SX(x, nom, denom) static_cast<int>((x / nom) * denom)
-#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
- TEST_F(LibYUVScaleTest, ARGBScaleDownBy##name##_##filter) { \
- int diff = ARGBTestFilter(SX(benchmark_width_, nom, denom), \
- SX(benchmark_height_, nom, denom), \
- DX(benchmark_width_, nom, denom), \
- DX(benchmark_height_, nom, denom), \
- kFilter##filter, benchmark_iterations_, \
- disable_cpu_flags_, benchmark_cpu_info_); \
- EXPECT_LE(diff, max_diff); \
- } \
- TEST_F(LibYUVScaleTest, ARGBScaleDownClipBy##name##_##filter) { \
- int diff = ARGBClipTestFilter(SX(benchmark_width_, nom, denom), \
- SX(benchmark_height_, nom, denom), \
- DX(benchmark_width_, nom, denom), \
- DX(benchmark_height_, nom, denom), \
- kFilter##filter, benchmark_iterations_); \
- EXPECT_LE(diff, max_diff); \
- }
+#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
+ TEST_F(LibYUVScaleTest, ARGBScaleDownBy##name##_##filter) { \
+ int diff = ARGBTestFilter( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, ARGBScaleDownClipBy##name##_##filter) { \
+ int diff = ARGBClipTestFilter( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_); \
+ EXPECT_LE(diff, max_diff); \
+ }
// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
// filtered modes use different fixed-point implementations on SSSE3, Neon and C.
-#define TEST_FACTOR(name, nom, denom) \
- TEST_FACTOR1(name, None, nom, denom, 0) \
- TEST_FACTOR1(name, Linear, nom, denom, 3) \
- TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
- TEST_FACTOR1(name, Box, nom, denom, 3)
+#define TEST_FACTOR(name, nom, denom) \
+ TEST_FACTOR1(name, None, nom, denom, 0) \
+ TEST_FACTOR1(name, Linear, nom, denom, 3) \
+ TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
+ TEST_FACTOR1(name, Box, nom, denom, 3)
TEST_FACTOR(2, 1, 2)
TEST_FACTOR(4, 1, 4)
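Each TEST_FACTOR line expands, via token pasting, into eight gtest cases (four filters, plain and clipped). A hypothetical sanity check of the SX/DX arithmetic for TEST_FACTOR(2, 1, 2) at a 1280-wide image (not part of this CL):

TEST_F(LibYUVScaleTest, DISABLED_FactorMacroSanity) {
  EXPECT_EQ(2560, SX(1280, 1, 2));  // source is 2x the benchmark width
  EXPECT_EQ(1280, DX(1280, 1, 2));  // destination is the benchmark width
}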
@@ -265,39 +268,37 @@ TEST_FACTOR(3, 1, 3)
#undef DX
#define TEST_SCALETO1(name, width, height, filter, max_diff) \
- TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
- int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, \
- width, height, \
- kFilter##filter, benchmark_iterations_, \
- disable_cpu_flags_, benchmark_cpu_info_); \
- EXPECT_LE(diff, max_diff); \
- } \
- TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
- int diff = ARGBTestFilter(width, height, \
- Abs(benchmark_width_), Abs(benchmark_height_), \
- kFilter##filter, benchmark_iterations_, \
- disable_cpu_flags_, benchmark_cpu_info_); \
- EXPECT_LE(diff, max_diff); \
- } \
- TEST_F(LibYUVScaleTest, name##ClipTo##width##x##height##_##filter) { \
- int diff = ARGBClipTestFilter(benchmark_width_, benchmark_height_, \
- width, height, \
- kFilter##filter, benchmark_iterations_); \
- EXPECT_LE(diff, max_diff); \
- } \
- TEST_F(LibYUVScaleTest, name##ClipFrom##width##x##height##_##filter) { \
- int diff = ARGBClipTestFilter(width, height, \
- Abs(benchmark_width_), \
- Abs(benchmark_height_), \
- kFilter##filter, benchmark_iterations_); \
- EXPECT_LE(diff, max_diff); \
- }
+ TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
+ int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, width, \
+ height, kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
+ int diff = ARGBTestFilter(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, name##ClipTo##width##x##height##_##filter) { \
+ int diff = \
+ ARGBClipTestFilter(benchmark_width_, benchmark_height_, width, height, \
+ kFilter##filter, benchmark_iterations_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, name##ClipFrom##width##x##height##_##filter) { \
+ int diff = ARGBClipTestFilter(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_); \
+ EXPECT_LE(diff, max_diff); \
+ }
// Test scale to a specified size with 3 filters (Box is not tested here).
-#define TEST_SCALETO(name, width, height) \
- TEST_SCALETO1(name, width, height, None, 0) \
- TEST_SCALETO1(name, width, height, Linear, 3) \
- TEST_SCALETO1(name, width, height, Bilinear, 3)
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(name, width, height, None, 0) \
+ TEST_SCALETO1(name, width, height, Linear, 3) \
+ TEST_SCALETO1(name, width, height, Bilinear, 3)
TEST_SCALETO(ARGBScale, 1, 1)
TEST_SCALETO(ARGBScale, 320, 240)
@@ -310,31 +311,33 @@ TEST_SCALETO(ARGBScale, 1280, 720)
// Scale with YUV conversion to ARGB and clipping.
LIBYUV_API
-int YUVToARGBScaleReference2(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint32 src_fourcc,
- int src_width, int src_height,
- uint8* dst_argb, int dst_stride_argb,
- uint32 dst_fourcc,
- int dst_width, int dst_height,
- int clip_x, int clip_y,
- int clip_width, int clip_height,
+int YUVToARGBScaleReference2(const uint8* src_y,
+ int src_stride_y,
+ const uint8* src_u,
+ int src_stride_u,
+ const uint8* src_v,
+ int src_stride_v,
+ uint32 /* src_fourcc */, // TODO: Add support.
+ int src_width,
+ int src_height,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ uint32 /* dst_fourcc */, // TODO: Add support.
+ int dst_width,
+ int dst_height,
+ int clip_x,
+ int clip_y,
+ int clip_width,
+ int clip_height,
enum FilterMode filtering) {
uint8* argb_buffer = static_cast<uint8*>(malloc(src_width * src_height * 4));
int r;
- I420ToARGB(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- argb_buffer, src_width * 4,
- src_width, src_height);
-
- r = ARGBScaleClip(argb_buffer, src_width * 4,
- src_width, src_height,
- dst_argb, dst_stride_argb,
- dst_width, dst_height,
- clip_x, clip_y, clip_width, clip_height,
- filtering);
+ I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ argb_buffer, src_width * 4, src_width, src_height);
+
+ r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb,
+ dst_stride_argb, dst_width, dst_height, clip_x, clip_y,
+ clip_width, clip_height, filtering);
free(argb_buffer);
return r;
}
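The commented-out parameter names above (uint32 /* src_fourcc */) are the idiom this change uses to keep a signature stable while satisfying the re-enabled unused-parameter warning. Two equivalent forms, as a hypothetical illustration (Scale2x/Scale2xAlt are made-up names):

// Form used in this CL: drop the name, keep the type, so -Wunused-parameter
// has nothing to flag.
static int Scale2x(int v, int /* reserved */) {
  return v * 2;
}
// Alternative: keep the name and discard it explicitly.
static int Scale2xAlt(int v, int reserved) {
  (void)reserved;
  return v * 2;
}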
@@ -360,13 +363,15 @@ static void FillRamp(uint8* buf, int width, int height, int v, int dx, int dy) {
}
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
-static int YUVToARGBTestFilter(int src_width, int src_height,
- int dst_width, int dst_height,
- FilterMode f, int benchmark_iterations,
- int disable_cpu_flags, int benchmark_cpu_info) {
+static int YUVToARGBTestFilter(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations) {
int64 src_y_plane_size = Abs(src_width) * Abs(src_height);
- int64 src_uv_plane_size = ((Abs(src_width) + 1) / 2) *
- ((Abs(src_height) + 1) / 2);
+ int64 src_uv_plane_size =
+ ((Abs(src_width) + 1) / 2) * ((Abs(src_height) + 1) / 2);
int src_stride_y = Abs(src_width);
int src_stride_uv = (Abs(src_width) + 1) / 2;
@@ -374,8 +379,8 @@ static int YUVToARGBTestFilter(int src_width, int src_height,
align_buffer_page_end(src_u, src_uv_plane_size);
align_buffer_page_end(src_v, src_uv_plane_size);
- int64 dst_argb_plane_size = (dst_width) * (dst_height) * 4LL;
- int dst_stride_argb = (dst_width) * 4;
+ int64 dst_argb_plane_size = (dst_width) * (dst_height)*4LL;
+ int dst_stride_argb = (dst_width)*4;
align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
if (!dst_argb_c || !dst_argb_opt || !src_y || !src_u || !src_v) {
@@ -390,28 +395,18 @@ static int YUVToARGBTestFilter(int src_width, int src_height,
memset(dst_argb_c, 2, dst_argb_plane_size);
memset(dst_argb_opt, 3, dst_argb_plane_size);
- YUVToARGBScaleReference2(src_y, src_stride_y,
- src_u, src_stride_uv,
- src_v, src_stride_uv,
- libyuv::FOURCC_I420,
- src_width, src_height,
- dst_argb_c, dst_stride_argb,
- libyuv::FOURCC_I420,
- dst_width, dst_height,
- 0, 0, dst_width, dst_height,
- f);
+ YUVToARGBScaleReference2(src_y, src_stride_y, src_u, src_stride_uv, src_v,
+ src_stride_uv, libyuv::FOURCC_I420, src_width,
+ src_height, dst_argb_c, dst_stride_argb,
+ libyuv::FOURCC_I420, dst_width, dst_height, 0, 0,
+ dst_width, dst_height, f);
for (int i = 0; i < benchmark_iterations; ++i) {
- YUVToARGBScaleClip(src_y, src_stride_y,
- src_u, src_stride_uv,
- src_v, src_stride_uv,
- libyuv::FOURCC_I420,
- src_width, src_height,
- dst_argb_opt, dst_stride_argb,
- libyuv::FOURCC_I420,
- dst_width, dst_height,
- 0, 0, dst_width, dst_height,
- f);
+ YUVToARGBScaleClip(src_y, src_stride_y, src_u, src_stride_uv, src_v,
+ src_stride_uv, libyuv::FOURCC_I420, src_width,
+ src_height, dst_argb_opt, dst_stride_argb,
+ libyuv::FOURCC_I420, dst_width, dst_height, 0, 0,
+ dst_width, dst_height, f);
}
int max_diff = 0;
for (int i = 0; i < dst_height; ++i) {
@@ -419,9 +414,7 @@ static int YUVToARGBTestFilter(int src_width, int src_height,
int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] -
dst_argb_opt[(i * dst_stride_argb) + j]);
if (abs_diff > max_diff) {
- printf("error %d at %d,%d c %d opt %d",
- abs_diff,
- j, i,
+ printf("error %d at %d,%d c %d opt %d", abs_diff, j, i,
dst_argb_c[(i * dst_stride_argb) + j],
dst_argb_opt[(i * dst_stride_argb) + j]);
EXPECT_LE(abs_diff, 40);
@@ -439,24 +432,18 @@ static int YUVToARGBTestFilter(int src_width, int src_height,
}
TEST_F(LibYUVScaleTest, YUVToRGBScaleUp) {
- int diff = YUVToARGBTestFilter(benchmark_width_, benchmark_height_,
- benchmark_width_ * 3 / 2,
- benchmark_height_ * 3 / 2,
- libyuv::kFilterBilinear,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
+ int diff =
+ YUVToARGBTestFilter(benchmark_width_, benchmark_height_,
+ benchmark_width_ * 3 / 2, benchmark_height_ * 3 / 2,
+ libyuv::kFilterBilinear, benchmark_iterations_);
EXPECT_LE(diff, 10);
}
TEST_F(LibYUVScaleTest, YUVToRGBScaleDown) {
- int diff = YUVToARGBTestFilter(benchmark_width_ * 3 / 2,
- benchmark_height_ * 3 / 2,
- benchmark_width_, benchmark_height_,
- libyuv::kFilterBilinear,
- benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
+ int diff = YUVToARGBTestFilter(
+ benchmark_width_ * 3 / 2, benchmark_height_ * 3 / 2, benchmark_width_,
+ benchmark_height_, libyuv::kFilterBilinear, benchmark_iterations_);
EXPECT_LE(diff, 10);
}
-
} // namespace libyuv
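Note that YUVToARGBTestFilter no longer threads the CPU-flag arguments through; it compares the two-pass reference (convert, then clip-scale) against the one-pass YUVToARGBScaleClip. The one-pass call, mirroring the benchmark loop above (buffer and size names as in that function):

YUVToARGBScaleClip(src_y, src_stride_y, src_u, src_stride_uv, src_v,
                   src_stride_uv, libyuv::FOURCC_I420, src_width, src_height,
                   dst_argb_opt, dst_stride_argb, libyuv::FOURCC_I420,
                   dst_width, dst_height, 0, 0, dst_width, dst_height, f);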
diff --git a/files/unit_test/scale_test.cc b/files/unit_test/scale_test.cc
index f40443e2..0b4ec30b 100644
--- a/files/unit_test/scale_test.cc
+++ b/files/unit_test/scale_test.cc
@@ -11,9 +11,9 @@
#include <stdlib.h>
#include <time.h>
+#include "../unit_test/unit_test.h"
#include "libyuv/cpu_id.h"
#include "libyuv/scale.h"
-#include "../unit_test/unit_test.h"
#define STRINGIZE(line) #line
#define FILELINESTR(file, line) file ":" STRINGIZE(line)
@@ -21,10 +21,14 @@
namespace libyuv {
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
-static int TestFilter(int src_width, int src_height,
- int dst_width, int dst_height,
- FilterMode f, int benchmark_iterations,
- int disable_cpu_flags, int benchmark_cpu_info) {
+static int TestFilter(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
@@ -41,9 +45,8 @@ static int TestFilter(int src_width, int src_height,
int src_stride_uv = b * 2 + src_width_uv;
align_buffer_page_end(src_y, src_y_plane_size)
- align_buffer_page_end(src_u, src_uv_plane_size)
- align_buffer_page_end(src_v, src_uv_plane_size)
- if (!src_y || !src_u || !src_v) {
+ align_buffer_page_end(src_u, src_uv_plane_size) align_buffer_page_end(
+ src_v, src_uv_plane_size) if (!src_y || !src_u || !src_v) {
printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
return 0;
}
@@ -61,13 +64,15 @@ static int TestFilter(int src_width, int src_height,
int dst_stride_uv = b * 2 + dst_width_uv;
align_buffer_page_end(dst_y_c, dst_y_plane_size)
- align_buffer_page_end(dst_u_c, dst_uv_plane_size)
- align_buffer_page_end(dst_v_c, dst_uv_plane_size)
- align_buffer_page_end(dst_y_opt, dst_y_plane_size)
- align_buffer_page_end(dst_u_opt, dst_uv_plane_size)
- align_buffer_page_end(dst_v_opt, dst_uv_plane_size)
- if (!dst_y_c || !dst_u_c || !dst_v_c ||
- !dst_y_opt|| !dst_u_opt|| !dst_v_opt) {
+ align_buffer_page_end(dst_u_c, dst_uv_plane_size)
+ align_buffer_page_end(dst_v_c, dst_uv_plane_size)
+ align_buffer_page_end(dst_y_opt, dst_y_plane_size)
+ align_buffer_page_end(dst_u_opt, dst_uv_plane_size)
+ align_buffer_page_end(
+ dst_v_opt,
+ dst_uv_plane_size) if (!dst_y_c || !dst_u_c ||
+ !dst_v_c || !dst_y_opt ||
+ !dst_u_opt || !dst_v_opt) {
printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
return 0;
}
@@ -76,12 +81,11 @@ static int TestFilter(int src_width, int src_height,
double c_time = get_time();
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
src_u + (src_stride_uv * b) + b, src_stride_uv,
- src_v + (src_stride_uv * b) + b, src_stride_uv,
- src_width, src_height,
- dst_y_c + (dst_stride_y * b) + b, dst_stride_y,
+ src_v + (src_stride_uv * b) + b, src_stride_uv, src_width,
+ src_height, dst_y_c + (dst_stride_y * b) + b, dst_stride_y,
dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv,
- dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv,
- dst_width, dst_height, f);
+ dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv, dst_width,
+ dst_height, f);
c_time = (get_time() - c_time);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
@@ -89,19 +93,16 @@ static int TestFilter(int src_width, int src_height,
for (i = 0; i < benchmark_iterations; ++i) {
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
src_u + (src_stride_uv * b) + b, src_stride_uv,
- src_v + (src_stride_uv * b) + b, src_stride_uv,
- src_width, src_height,
- dst_y_opt + (dst_stride_y * b) + b, dst_stride_y,
+ src_v + (src_stride_uv * b) + b, src_stride_uv, src_width,
+ src_height, dst_y_opt + (dst_stride_y * b) + b, dst_stride_y,
dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv,
- dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv,
- dst_width, dst_height, f);
+ dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv, dst_width,
+ dst_height, f);
}
opt_time = (get_time() - opt_time) / benchmark_iterations;
// Report performance of C vs OPT
- printf("filter %d - %8d us C - %8d us OPT\n",
- f,
- static_cast<int>(c_time * 1e6),
- static_cast<int>(opt_time * 1e6));
+ printf("filter %d - %8d us C - %8d us OPT\n", f,
+ static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
// C version may be a little off from the optimized. Order of
// operations may introduce rounding somewhere. So do a difference
@@ -133,25 +134,27 @@ static int TestFilter(int src_width, int src_height,
}
}
- free_aligned_buffer_page_end(dst_y_c)
- free_aligned_buffer_page_end(dst_u_c)
- free_aligned_buffer_page_end(dst_v_c)
- free_aligned_buffer_page_end(dst_y_opt)
- free_aligned_buffer_page_end(dst_u_opt)
- free_aligned_buffer_page_end(dst_v_opt)
+ free_aligned_buffer_page_end(dst_y_c) free_aligned_buffer_page_end(dst_u_c)
+ free_aligned_buffer_page_end(dst_v_c)
+ free_aligned_buffer_page_end(dst_y_opt)
+ free_aligned_buffer_page_end(dst_u_opt)
+ free_aligned_buffer_page_end(dst_v_opt)
- free_aligned_buffer_page_end(src_y)
- free_aligned_buffer_page_end(src_u)
- free_aligned_buffer_page_end(src_v)
+ free_aligned_buffer_page_end(src_y)
+ free_aligned_buffer_page_end(src_u)
+ free_aligned_buffer_page_end(src_v)
- return max_diff;
+ return max_diff;
}
// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference.
// 0 = exact.
-static int TestFilter_16(int src_width, int src_height,
- int dst_width, int dst_height,
- FilterMode f, int benchmark_iterations) {
+static int TestFilter_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
@@ -161,20 +164,18 @@ static int TestFilter_16(int src_width, int src_height,
int src_width_uv = (Abs(src_width) + 1) >> 1;
int src_height_uv = (Abs(src_height) + 1) >> 1;
- int64 src_y_plane_size = (Abs(src_width) + b * 2) *
- (Abs(src_height) + b * 2);
+ int64 src_y_plane_size = (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2);
int64 src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2);
int src_stride_y = b * 2 + Abs(src_width);
int src_stride_uv = b * 2 + src_width_uv;
- align_buffer_page_end(src_y, src_y_plane_size)
- align_buffer_page_end(src_u, src_uv_plane_size)
- align_buffer_page_end(src_v, src_uv_plane_size)
- align_buffer_page_end(src_y_16, src_y_plane_size * 2)
- align_buffer_page_end(src_u_16, src_uv_plane_size * 2)
- align_buffer_page_end(src_v_16, src_uv_plane_size * 2)
- uint16* p_src_y_16 = reinterpret_cast<uint16*>(src_y_16);
+ align_buffer_page_end(src_y, src_y_plane_size) align_buffer_page_end(
+ src_u, src_uv_plane_size) align_buffer_page_end(src_v, src_uv_plane_size)
+ align_buffer_page_end(src_y_16, src_y_plane_size * 2)
+ align_buffer_page_end(src_u_16, src_uv_plane_size * 2)
+ align_buffer_page_end(src_v_16, src_uv_plane_size * 2)
+ uint16* p_src_y_16 = reinterpret_cast<uint16*>(src_y_16);
uint16* p_src_u_16 = reinterpret_cast<uint16*>(src_u_16);
uint16* p_src_v_16 = reinterpret_cast<uint16*>(src_v_16);
@@ -205,34 +206,33 @@ static int TestFilter_16(int src_width, int src_height,
int dst_stride_uv = b * 2 + dst_width_uv;
align_buffer_page_end(dst_y_8, dst_y_plane_size)
- align_buffer_page_end(dst_u_8, dst_uv_plane_size)
- align_buffer_page_end(dst_v_8, dst_uv_plane_size)
- align_buffer_page_end(dst_y_16, dst_y_plane_size * 2)
- align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2)
- align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2)
-
- uint16* p_dst_y_16 = reinterpret_cast<uint16*>(dst_y_16);
+ align_buffer_page_end(dst_u_8, dst_uv_plane_size)
+ align_buffer_page_end(dst_v_8, dst_uv_plane_size)
+ align_buffer_page_end(dst_y_16, dst_y_plane_size * 2)
+ align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2)
+ align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2)
+
+ uint16* p_dst_y_16 =
+ reinterpret_cast<uint16*>(dst_y_16);
uint16* p_dst_u_16 = reinterpret_cast<uint16*>(dst_u_16);
uint16* p_dst_v_16 = reinterpret_cast<uint16*>(dst_v_16);
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
src_u + (src_stride_uv * b) + b, src_stride_uv,
- src_v + (src_stride_uv * b) + b, src_stride_uv,
- src_width, src_height,
- dst_y_8 + (dst_stride_y * b) + b, dst_stride_y,
+ src_v + (src_stride_uv * b) + b, src_stride_uv, src_width,
+ src_height, dst_y_8 + (dst_stride_y * b) + b, dst_stride_y,
dst_u_8 + (dst_stride_uv * b) + b, dst_stride_uv,
- dst_v_8 + (dst_stride_uv * b) + b, dst_stride_uv,
- dst_width, dst_height, f);
+ dst_v_8 + (dst_stride_uv * b) + b, dst_stride_uv, dst_width,
+ dst_height, f);
for (i = 0; i < benchmark_iterations; ++i) {
I420Scale_16(p_src_y_16 + (src_stride_y * b) + b, src_stride_y,
p_src_u_16 + (src_stride_uv * b) + b, src_stride_uv,
- p_src_v_16 + (src_stride_uv * b) + b, src_stride_uv,
- src_width, src_height,
- p_dst_y_16 + (dst_stride_y * b) + b, dst_stride_y,
+ p_src_v_16 + (src_stride_uv * b) + b, src_stride_uv, src_width,
+ src_height, p_dst_y_16 + (dst_stride_y * b) + b, dst_stride_y,
p_dst_u_16 + (dst_stride_uv * b) + b, dst_stride_uv,
- p_dst_v_16 + (dst_stride_uv * b) + b, dst_stride_uv,
- dst_width, dst_height, f);
+ p_dst_v_16 + (dst_stride_uv * b) + b, dst_stride_uv, dst_width,
+ dst_height, f);
}
// Expect an exact match
@@ -262,21 +262,20 @@ static int TestFilter_16(int src_width, int src_height,
}
}
- free_aligned_buffer_page_end(dst_y_8)
- free_aligned_buffer_page_end(dst_u_8)
- free_aligned_buffer_page_end(dst_v_8)
- free_aligned_buffer_page_end(dst_y_16)
- free_aligned_buffer_page_end(dst_u_16)
- free_aligned_buffer_page_end(dst_v_16)
-
- free_aligned_buffer_page_end(src_y)
- free_aligned_buffer_page_end(src_u)
- free_aligned_buffer_page_end(src_v)
- free_aligned_buffer_page_end(src_y_16)
- free_aligned_buffer_page_end(src_u_16)
- free_aligned_buffer_page_end(src_v_16)
-
- return max_diff;
+ free_aligned_buffer_page_end(dst_y_8) free_aligned_buffer_page_end(dst_u_8)
+ free_aligned_buffer_page_end(dst_v_8)
+ free_aligned_buffer_page_end(dst_y_16)
+ free_aligned_buffer_page_end(dst_u_16)
+ free_aligned_buffer_page_end(dst_v_16)
+
+ free_aligned_buffer_page_end(src_y)
+ free_aligned_buffer_page_end(src_u)
+ free_aligned_buffer_page_end(src_v)
+ free_aligned_buffer_page_end(src_y_16)
+ free_aligned_buffer_page_end(src_u_16)
+ free_aligned_buffer_page_end(src_v_16)
+
+ return max_diff;
}
// The following adjustments in dimensions ensure the scale factor will be
@@ -285,32 +284,30 @@ static int TestFilter_16(int src_width, int src_height,
#define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2)
#define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2)
-#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
- TEST_F(LibYUVScaleTest, ScaleDownBy##name##_##filter) { \
- int diff = TestFilter(SX(benchmark_width_, nom, denom), \
- SX(benchmark_height_, nom, denom), \
- DX(benchmark_width_, nom, denom), \
- DX(benchmark_height_, nom, denom), \
- kFilter##filter, benchmark_iterations_, \
- disable_cpu_flags_, benchmark_cpu_info_); \
- EXPECT_LE(diff, max_diff); \
- } \
- TEST_F(LibYUVScaleTest, DISABLED_ScaleDownBy##name##_##filter##_16) { \
- int diff = TestFilter_16(SX(benchmark_width_, nom, denom), \
- SX(benchmark_height_, nom, denom), \
- DX(benchmark_width_, nom, denom), \
- DX(benchmark_height_, nom, denom), \
- kFilter##filter, benchmark_iterations_); \
- EXPECT_LE(diff, max_diff); \
- }
+#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
+ TEST_F(LibYUVScaleTest, ScaleDownBy##name##_##filter) { \
+ int diff = TestFilter( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, DISABLED_ScaleDownBy##name##_##filter##_16) { \
+ int diff = TestFilter_16( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_); \
+ EXPECT_LE(diff, max_diff); \
+ }
// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
// filtered modes use different fixed-point implementations on SSSE3, Neon and C.
-#define TEST_FACTOR(name, nom, denom, boxdiff) \
- TEST_FACTOR1(name, None, nom, denom, 0) \
- TEST_FACTOR1(name, Linear, nom, denom, 3) \
- TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
- TEST_FACTOR1(name, Box, nom, denom, boxdiff)
+#define TEST_FACTOR(name, nom, denom, boxdiff) \
+ TEST_FACTOR1(name, None, nom, denom, 0) \
+ TEST_FACTOR1(name, Linear, nom, denom, 3) \
+ TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
+ TEST_FACTOR1(name, Box, nom, denom, boxdiff)
TEST_FACTOR(2, 1, 2, 0)
TEST_FACTOR(4, 1, 4, 0)
@@ -323,42 +320,40 @@ TEST_FACTOR(3, 1, 3, 0)
#undef SX
#undef DX
-#define TEST_SCALETO1(name, width, height, filter, max_diff) \
- TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
- int diff = TestFilter(benchmark_width_, benchmark_height_, \
- width, height, \
- kFilter##filter, benchmark_iterations_, \
- disable_cpu_flags_, benchmark_cpu_info_); \
- EXPECT_LE(diff, max_diff); \
- } \
- TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
- int diff = TestFilter(width, height, \
- Abs(benchmark_width_), Abs(benchmark_height_), \
- kFilter##filter, benchmark_iterations_, \
- disable_cpu_flags_, benchmark_cpu_info_); \
- EXPECT_LE(diff, max_diff); \
- } \
- TEST_F(LibYUVScaleTest, \
- DISABLED_##name##To##width##x##height##_##filter##_16) { \
- int diff = TestFilter_16(benchmark_width_, benchmark_height_, \
- width, height, \
- kFilter##filter, benchmark_iterations_); \
- EXPECT_LE(diff, max_diff); \
- } \
- TEST_F(LibYUVScaleTest, \
- DISABLED_##name##From##width##x##height##_##filter##_16) { \
- int diff = TestFilter_16(width, height, \
- Abs(benchmark_width_), Abs(benchmark_height_), \
- kFilter##filter, benchmark_iterations_); \
- EXPECT_LE(diff, max_diff); \
- }
+#define TEST_SCALETO1(name, width, height, filter, max_diff) \
+ TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
+ int diff = TestFilter(benchmark_width_, benchmark_height_, width, height, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
+ int diff = TestFilter(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##name##To##width##x##height##_##filter##_16) { \
+ int diff = TestFilter_16(benchmark_width_, benchmark_height_, width, \
+ height, kFilter##filter, benchmark_iterations_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##name##From##width##x##height##_##filter##_16) { \
+ int diff = TestFilter_16(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_); \
+ EXPECT_LE(diff, max_diff); \
+ }
// Test scale to a specified size with all 4 filters.
-#define TEST_SCALETO(name, width, height) \
- TEST_SCALETO1(name, width, height, None, 0) \
- TEST_SCALETO1(name, width, height, Linear, 0) \
- TEST_SCALETO1(name, width, height, Bilinear, 0) \
- TEST_SCALETO1(name, width, height, Box, 0)
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(name, width, height, None, 0) \
+ TEST_SCALETO1(name, width, height, Linear, 0) \
+ TEST_SCALETO1(name, width, height, Bilinear, 0) \
+ TEST_SCALETO1(name, width, height, Box, 0)
TEST_SCALETO(Scale, 1, 1)
TEST_SCALETO(Scale, 320, 240)
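The oddly wrapped align_buffer_page_end / free_aligned_buffer_page_end runs above are clang-format output: these call sites omit trailing semicolons (the macro body supplies its own), so the formatter treats consecutive invocations as one long statement and reflows them. Terminating each call site would restore conventional formatting; a hypothetical cleanup, not part of this change:

align_buffer_page_end(src_y, src_y_plane_size);
align_buffer_page_end(src_u, src_uv_plane_size);
align_buffer_page_end(src_v, src_uv_plane_size);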
diff --git a/files/unit_test/unit_test.cc b/files/unit_test/unit_test.cc
index e75510fd..7f8bcf8f 100644
--- a/files/unit_test/unit_test.cc
+++ b/files/unit_test/unit_test.cc
@@ -25,18 +25,21 @@ unsigned int fastrand_seed = 0xfb;
DEFINE_int32(libyuv_width, 0, "width of test image.");
DEFINE_int32(libyuv_height, 0, "height of test image.");
DEFINE_int32(libyuv_repeat, 0, "number of times to repeat test.");
-DEFINE_int32(libyuv_flags, 0,
- "cpu flags for reference code. 1 = C, -1 = SIMD");
-DEFINE_int32(libyuv_cpu_info, 0,
+DEFINE_int32(libyuv_flags, 0, "cpu flags for reference code. 1 = C, -1 = SIMD");
+DEFINE_int32(libyuv_cpu_info,
+ 0,
"cpu flags for benchmark code. 1 = C, -1 = SIMD");
// For quicker unittests, default is 128 x 72. But when benchmarking,
// default to 720p. The flags above allow a size to be specified.
// Set flags to -1 for benchmarking to avoid slower C code.
-LibYUVConvertTest::LibYUVConvertTest() :
- benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
- benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
+LibYUVConvertTest::LibYUVConvertTest()
+ : benchmark_iterations_(BENCHMARK_ITERATIONS),
+ benchmark_width_(128),
+ benchmark_height_(72),
+ disable_cpu_flags_(1),
+ benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
@@ -76,19 +79,26 @@ LibYUVConvertTest::LibYUVConvertTest() :
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
- benchmark_pixels_div256_ = static_cast<int>((
- static_cast<double>(Abs(benchmark_width_)) *
- static_cast<double>(Abs(benchmark_height_)) *
- static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
- benchmark_pixels_div1280_ = static_cast<int>((
- static_cast<double>(Abs(benchmark_width_)) *
- static_cast<double>(Abs(benchmark_height_)) *
- static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
+ benchmark_pixels_div256_ =
+ static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
+ static_cast<double>(Abs(benchmark_height_)) *
+ static_cast<double>(benchmark_iterations_) +
+ 255.0) /
+ 256.0);
+ benchmark_pixels_div1280_ =
+ static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
+ static_cast<double>(Abs(benchmark_height_)) *
+ static_cast<double>(benchmark_iterations_) +
+ 1279.0) /
+ 1280.0);
}
-LibYUVColorTest::LibYUVColorTest() :
- benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
- benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
+LibYUVColorTest::LibYUVColorTest()
+ : benchmark_iterations_(BENCHMARK_ITERATIONS),
+ benchmark_width_(128),
+ benchmark_height_(72),
+ disable_cpu_flags_(1),
+ benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
@@ -128,19 +138,26 @@ LibYUVColorTest::LibYUVColorTest() :
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
- benchmark_pixels_div256_ = static_cast<int>((
- static_cast<double>(Abs(benchmark_width_)) *
- static_cast<double>(Abs(benchmark_height_)) *
- static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
- benchmark_pixels_div1280_ = static_cast<int>((
- static_cast<double>(Abs(benchmark_width_)) *
- static_cast<double>(Abs(benchmark_height_)) *
- static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
+ benchmark_pixels_div256_ =
+ static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
+ static_cast<double>(Abs(benchmark_height_)) *
+ static_cast<double>(benchmark_iterations_) +
+ 255.0) /
+ 256.0);
+ benchmark_pixels_div1280_ =
+ static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
+ static_cast<double>(Abs(benchmark_height_)) *
+ static_cast<double>(benchmark_iterations_) +
+ 1279.0) /
+ 1280.0);
}
-LibYUVScaleTest::LibYUVScaleTest() :
- benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
- benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
+LibYUVScaleTest::LibYUVScaleTest()
+ : benchmark_iterations_(BENCHMARK_ITERATIONS),
+ benchmark_width_(128),
+ benchmark_height_(72),
+ disable_cpu_flags_(1),
+ benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
@@ -180,19 +197,26 @@ LibYUVScaleTest::LibYUVScaleTest() :
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
- benchmark_pixels_div256_ = static_cast<int>((
- static_cast<double>(Abs(benchmark_width_)) *
- static_cast<double>(Abs(benchmark_height_)) *
- static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
- benchmark_pixels_div1280_ = static_cast<int>((
- static_cast<double>(Abs(benchmark_width_)) *
- static_cast<double>(Abs(benchmark_height_)) *
- static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
+ benchmark_pixels_div256_ =
+ static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
+ static_cast<double>(Abs(benchmark_height_)) *
+ static_cast<double>(benchmark_iterations_) +
+ 255.0) /
+ 256.0);
+ benchmark_pixels_div1280_ =
+ static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
+ static_cast<double>(Abs(benchmark_height_)) *
+ static_cast<double>(benchmark_iterations_) +
+ 1279.0) /
+ 1280.0);
}
-LibYUVRotateTest::LibYUVRotateTest() :
- benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
- benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
+LibYUVRotateTest::LibYUVRotateTest()
+ : benchmark_iterations_(BENCHMARK_ITERATIONS),
+ benchmark_width_(128),
+ benchmark_height_(72),
+ disable_cpu_flags_(1),
+ benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
@@ -232,19 +256,26 @@ LibYUVRotateTest::LibYUVRotateTest() :
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
- benchmark_pixels_div256_ = static_cast<int>((
- static_cast<double>(Abs(benchmark_width_)) *
- static_cast<double>(Abs(benchmark_height_)) *
- static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
- benchmark_pixels_div1280_ = static_cast<int>((
- static_cast<double>(Abs(benchmark_width_)) *
- static_cast<double>(Abs(benchmark_height_)) *
- static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
+ benchmark_pixels_div256_ =
+ static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
+ static_cast<double>(Abs(benchmark_height_)) *
+ static_cast<double>(benchmark_iterations_) +
+ 255.0) /
+ 256.0);
+ benchmark_pixels_div1280_ =
+ static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
+ static_cast<double>(Abs(benchmark_height_)) *
+ static_cast<double>(benchmark_iterations_) +
+ 1279.0) /
+ 1280.0);
}
-LibYUVPlanarTest::LibYUVPlanarTest() :
- benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
- benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
+LibYUVPlanarTest::LibYUVPlanarTest()
+ : benchmark_iterations_(BENCHMARK_ITERATIONS),
+ benchmark_width_(128),
+ benchmark_height_(72),
+ disable_cpu_flags_(1),
+ benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
@@ -284,19 +315,26 @@ LibYUVPlanarTest::LibYUVPlanarTest() :
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
- benchmark_pixels_div256_ = static_cast<int>((
- static_cast<double>(Abs(benchmark_width_)) *
- static_cast<double>(Abs(benchmark_height_)) *
- static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
- benchmark_pixels_div1280_ = static_cast<int>((
- static_cast<double>(Abs(benchmark_width_)) *
- static_cast<double>(Abs(benchmark_height_)) *
- static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
+ benchmark_pixels_div256_ =
+ static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
+ static_cast<double>(Abs(benchmark_height_)) *
+ static_cast<double>(benchmark_iterations_) +
+ 255.0) /
+ 256.0);
+ benchmark_pixels_div1280_ =
+ static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
+ static_cast<double>(Abs(benchmark_height_)) *
+ static_cast<double>(benchmark_iterations_) +
+ 1279.0) /
+ 1280.0);
}
-LibYUVBaseTest::LibYUVBaseTest() :
- benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
- benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
+LibYUVBaseTest::LibYUVBaseTest()
+ : benchmark_iterations_(BENCHMARK_ITERATIONS),
+ benchmark_width_(128),
+ benchmark_height_(72),
+ disable_cpu_flags_(1),
+ benchmark_cpu_info_(-1) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
@@ -336,14 +374,18 @@ LibYUVBaseTest::LibYUVBaseTest() :
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
- benchmark_pixels_div256_ = static_cast<int>((
- static_cast<double>(Abs(benchmark_width_)) *
- static_cast<double>(Abs(benchmark_height_)) *
- static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
- benchmark_pixels_div1280_ = static_cast<int>((
- static_cast<double>(Abs(benchmark_width_)) *
- static_cast<double>(Abs(benchmark_height_)) *
- static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
+ benchmark_pixels_div256_ =
+ static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
+ static_cast<double>(Abs(benchmark_height_)) *
+ static_cast<double>(benchmark_iterations_) +
+ 255.0) /
+ 256.0);
+ benchmark_pixels_div1280_ =
+ static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
+ static_cast<double>(Abs(benchmark_height_)) *
+ static_cast<double>(benchmark_iterations_) +
+ 1279.0) /
+ 1280.0);
}
int main(int argc, char** argv) {
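The benchmark_pixels_div256_ / div1280_ blocks repeated in each constructor are ceiling divisions done in floating point, so a partial unit of work still counts as one. A worked check with the defaults above (128 x 72, single iteration):

int div256 = static_cast<int>((128.0 * 72.0 * 1.0 + 255.0) / 256.0);     // 36
int div1280 = static_cast<int>((128.0 * 72.0 * 1.0 + 1279.0) / 1280.0);  // 8
// 9216 / 256 = 36 exactly; 9216 / 1280 = 7.2, which rounds up to 8.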
diff --git a/files/unit_test/unit_test.h b/files/unit_test/unit_test.h
index f2c4bef0..f7d60a76 100644
--- a/files/unit_test/unit_test.h
+++ b/files/unit_test/unit_test.h
@@ -14,8 +14,8 @@
#ifdef WIN32
#include <windows.h>
#else
-#include <sys/time.h>
#include <sys/resource.h>
+#include <sys/time.h>
#endif
#include <gtest/gtest.h>
@@ -54,8 +54,10 @@ static __inline int Abs(int v) {
static const int kMaxWidth = 32768;
static const int kMaxHeight = 32768;
-static inline bool SizeValid(int src_width, int src_height,
- int dst_width, int dst_height) {
+static inline bool SizeValid(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height) {
if (src_width > kMaxWidth || src_height > kMaxHeight ||
dst_width > kMaxWidth || dst_height > kMaxHeight) {
printf("Warning - size too large to test. Skipping\n");
@@ -64,15 +66,16 @@ static inline bool SizeValid(int src_width, int src_height,
return true;
}
-#define align_buffer_page_end(var, size) \
- uint8* var; \
- uint8* var##_mem; \
- var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095 + 63) & ~4095)); \
- var = (uint8*)((intptr_t)(var##_mem + (((size) + 4095 + 63) & ~4095) - \
- (size)) & ~63);
+#define align_buffer_page_end(var, size) \
+ uint8* var; \
+ uint8* var##_mem; \
+ var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095 + 63) & ~4095)); \
+ var = (uint8*)((intptr_t)(var##_mem + (((size) + 4095 + 63) & ~4095) - \
+ (size)) & \
+ ~63);
#define free_aligned_buffer_page_end(var) \
- free(var##_mem); \
+ free(var##_mem); \
var = 0;
#ifdef WIN32
@@ -122,78 +125,78 @@ class LibYUVColorTest : public ::testing::Test {
protected:
LibYUVColorTest();
- int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
- int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
- int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
+ int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
+ int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
+ int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
- int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
- int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
+ int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
+ int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
class LibYUVConvertTest : public ::testing::Test {
protected:
LibYUVConvertTest();
- int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
- int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
- int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
+ int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
+ int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
+ int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
- int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
- int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
+ int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
+ int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
class LibYUVScaleTest : public ::testing::Test {
protected:
LibYUVScaleTest();
- int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
- int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
- int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
+ int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
+ int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
+ int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
- int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
- int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
+ int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
+ int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
class LibYUVRotateTest : public ::testing::Test {
protected:
LibYUVRotateTest();
- int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
- int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
- int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
+ int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
+ int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
+ int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
- int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
- int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
+ int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
+ int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
class LibYUVPlanarTest : public ::testing::Test {
protected:
LibYUVPlanarTest();
- int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
- int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
- int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
+ int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
+ int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
+ int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
- int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
- int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
+ int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
+ int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
class LibYUVBaseTest : public ::testing::Test {
protected:
LibYUVBaseTest();
- int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
- int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
- int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
+ int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
+ int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
+ int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
- int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
- int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
+ int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
+ int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
};
#endif // UNIT_TEST_UNIT_TEST_H_ NOLINT
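align_buffer_page_end rounds the allocation up to whole 4096-byte pages, then positions var, 64-byte aligned, so the usable region ends just shy of the allocation's end, which appears intended to push SIMD overreads toward the end of the mapped region. A standalone check of the arithmetic (illustrative only, with a pretend page-aligned base):

#include <assert.h>
#include <stdint.h>
int main() {
  const intptr_t size = 1000;
  const intptr_t alloc = (size + 4095 + 63) & ~4095;  // rounds up to 4096
  const intptr_t mem = 0x10000;                       // pretend malloc result
  const intptr_t var = (mem + alloc - size) & ~63;    // mem + 3072
  assert(alloc == 4096);
  assert(var % 64 == 0);              // aligned for SIMD loads
  assert(var + size <= mem + alloc);  // usable region ends near the page end
  return 0;
}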
diff --git a/files/unit_test/video_common_test.cc b/files/unit_test/video_common_test.cc
index ac97d0f3..f16b6772 100644
--- a/files/unit_test/video_common_test.cc
+++ b/files/unit_test/video_common_test.cc
@@ -11,26 +11,23 @@
#include <stdlib.h>
#include <string.h>
-#include "libyuv/video_common.h"
#include "../unit_test/unit_test.h"
+#include "libyuv/video_common.h"
namespace libyuv {
// Tests FourCC codes in video common, which are used for ConvertToI420().
static bool TestValidChar(uint32 onecc) {
- if ((onecc >= '0' && onecc <= '9') ||
- (onecc >= 'A' && onecc <= 'Z') ||
- (onecc >= 'a' && onecc <= 'z') ||
- (onecc == ' ') || (onecc == 0xff)) {
+ if ((onecc >= '0' && onecc <= '9') || (onecc >= 'A' && onecc <= 'Z') ||
+ (onecc >= 'a' && onecc <= 'z') || (onecc == ' ') || (onecc == 0xff)) {
return true;
}
return false;
}
static bool TestValidFourCC(uint32 fourcc, int bpp) {
- if (!TestValidChar(fourcc & 0xff) ||
- !TestValidChar((fourcc >> 8) & 0xff) ||
+ if (!TestValidChar(fourcc & 0xff) || !TestValidChar((fourcc >> 8) & 0xff) ||
!TestValidChar((fourcc >> 16) & 0xff) ||
!TestValidChar((fourcc >> 24) & 0xff)) {
return false;
@@ -42,23 +39,23 @@ static bool TestValidFourCC(uint32 fourcc, int bpp) {
}
TEST_F(LibYUVBaseTest, TestCanonicalFourCC) {
- EXPECT_EQ(FOURCC_I420, CanonicalFourCC(FOURCC_IYUV));
- EXPECT_EQ(FOURCC_I420, CanonicalFourCC(FOURCC_YU12));
- EXPECT_EQ(FOURCC_I422, CanonicalFourCC(FOURCC_YU16));
- EXPECT_EQ(FOURCC_I444, CanonicalFourCC(FOURCC_YU24));
- EXPECT_EQ(FOURCC_YUY2, CanonicalFourCC(FOURCC_YUYV));
- EXPECT_EQ(FOURCC_YUY2, CanonicalFourCC(FOURCC_YUVS));
- EXPECT_EQ(FOURCC_UYVY, CanonicalFourCC(FOURCC_HDYC));
- EXPECT_EQ(FOURCC_UYVY, CanonicalFourCC(FOURCC_2VUY));
- EXPECT_EQ(FOURCC_MJPG, CanonicalFourCC(FOURCC_JPEG));
- EXPECT_EQ(FOURCC_MJPG, CanonicalFourCC(FOURCC_DMB1));
- EXPECT_EQ(FOURCC_RAW, CanonicalFourCC(FOURCC_RGB3));
- EXPECT_EQ(FOURCC_24BG, CanonicalFourCC(FOURCC_BGR3));
- EXPECT_EQ(FOURCC_BGRA, CanonicalFourCC(FOURCC_CM32));
- EXPECT_EQ(FOURCC_RAW, CanonicalFourCC(FOURCC_CM24));
- EXPECT_EQ(FOURCC_RGBO, CanonicalFourCC(FOURCC_L555));
- EXPECT_EQ(FOURCC_RGBP, CanonicalFourCC(FOURCC_L565));
- EXPECT_EQ(FOURCC_RGBO, CanonicalFourCC(FOURCC_5551));
+ EXPECT_EQ(static_cast<uint32>(FOURCC_I420), CanonicalFourCC(FOURCC_IYUV));
+ EXPECT_EQ(static_cast<uint32>(FOURCC_I420), CanonicalFourCC(FOURCC_YU12));
+ EXPECT_EQ(static_cast<uint32>(FOURCC_I422), CanonicalFourCC(FOURCC_YU16));
+ EXPECT_EQ(static_cast<uint32>(FOURCC_I444), CanonicalFourCC(FOURCC_YU24));
+ EXPECT_EQ(static_cast<uint32>(FOURCC_YUY2), CanonicalFourCC(FOURCC_YUYV));
+ EXPECT_EQ(static_cast<uint32>(FOURCC_YUY2), CanonicalFourCC(FOURCC_YUVS));
+ EXPECT_EQ(static_cast<uint32>(FOURCC_UYVY), CanonicalFourCC(FOURCC_HDYC));
+ EXPECT_EQ(static_cast<uint32>(FOURCC_UYVY), CanonicalFourCC(FOURCC_2VUY));
+ EXPECT_EQ(static_cast<uint32>(FOURCC_MJPG), CanonicalFourCC(FOURCC_JPEG));
+ EXPECT_EQ(static_cast<uint32>(FOURCC_MJPG), CanonicalFourCC(FOURCC_DMB1));
+ EXPECT_EQ(static_cast<uint32>(FOURCC_RAW), CanonicalFourCC(FOURCC_RGB3));
+ EXPECT_EQ(static_cast<uint32>(FOURCC_24BG), CanonicalFourCC(FOURCC_BGR3));
+ EXPECT_EQ(static_cast<uint32>(FOURCC_BGRA), CanonicalFourCC(FOURCC_CM32));
+ EXPECT_EQ(static_cast<uint32>(FOURCC_RAW), CanonicalFourCC(FOURCC_CM24));
+ EXPECT_EQ(static_cast<uint32>(FOURCC_RGBO), CanonicalFourCC(FOURCC_L555));
+ EXPECT_EQ(static_cast<uint32>(FOURCC_RGBP), CanonicalFourCC(FOURCC_L565));
+ EXPECT_EQ(static_cast<uint32>(FOURCC_RGBO), CanonicalFourCC(FOURCC_5551));
}
TEST_F(LibYUVBaseTest, TestFourCC) {
@@ -66,7 +63,6 @@ TEST_F(LibYUVBaseTest, TestFourCC) {
EXPECT_TRUE(TestValidFourCC(FOURCC_I420, FOURCC_BPP_I420));
EXPECT_TRUE(TestValidFourCC(FOURCC_I422, FOURCC_BPP_I422));
EXPECT_TRUE(TestValidFourCC(FOURCC_I444, FOURCC_BPP_I444));
- EXPECT_TRUE(TestValidFourCC(FOURCC_I411, FOURCC_BPP_I411));
EXPECT_TRUE(TestValidFourCC(FOURCC_I400, FOURCC_BPP_I400));
EXPECT_TRUE(TestValidFourCC(FOURCC_NV21, FOURCC_BPP_NV21));
EXPECT_TRUE(TestValidFourCC(FOURCC_NV12, FOURCC_BPP_NV12));
@@ -78,7 +74,7 @@ TEST_F(LibYUVBaseTest, TestFourCC) {
EXPECT_TRUE(TestValidFourCC(FOURCC_BGRA, FOURCC_BPP_BGRA));
EXPECT_TRUE(TestValidFourCC(FOURCC_ABGR, FOURCC_BPP_ABGR));
EXPECT_TRUE(TestValidFourCC(FOURCC_24BG, FOURCC_BPP_24BG));
- EXPECT_TRUE(TestValidFourCC(FOURCC_RAW, FOURCC_BPP_RAW));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_RAW, FOURCC_BPP_RAW));
EXPECT_TRUE(TestValidFourCC(FOURCC_RGBA, FOURCC_BPP_RGBA));
EXPECT_TRUE(TestValidFourCC(FOURCC_RGBP, FOURCC_BPP_RGBP));
EXPECT_TRUE(TestValidFourCC(FOURCC_RGBO, FOURCC_BPP_RGBO));
@@ -101,7 +97,7 @@ TEST_F(LibYUVBaseTest, TestFourCC) {
EXPECT_TRUE(TestValidFourCC(FOURCC_RGB3, FOURCC_BPP_RGB3));
EXPECT_TRUE(TestValidFourCC(FOURCC_BGR3, FOURCC_BPP_BGR3));
EXPECT_TRUE(TestValidFourCC(FOURCC_H264, FOURCC_BPP_H264));
- EXPECT_TRUE(TestValidFourCC(FOURCC_ANY, FOURCC_BPP_ANY));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_ANY, FOURCC_BPP_ANY));
}
} // namespace libyuv
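TestValidFourCC pulls the code apart byte-by-byte because FourCC values pack four ASCII characters little-endian, matching libyuv's FOURCC macro. For example:

// The 'I420' code, built the way libyuv's FOURCC macro packs it:
uint32 fourcc = static_cast<uint32>('I') | (static_cast<uint32>('4') << 8) |
                (static_cast<uint32>('2') << 16) |
                (static_cast<uint32>('0') << 24);
// (fourcc & 0xff) == 'I', ((fourcc >> 8) & 0xff) == '4', and so on.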
diff --git a/files/util/compare.cc b/files/util/compare.cc
index c36c0fa5..ef0beefa 100644
--- a/files/util/compare.cc
+++ b/files/util/compare.cc
@@ -39,10 +39,12 @@ int main(int argc, char** argv) {
int amt2 = 0;
do {
amt1 = static_cast<int>(fread(buf1, 1, kBlockSize, fin1));
- if (amt1 > 0) hash1 = libyuv::HashDjb2(buf1, amt1, hash1);
+ if (amt1 > 0)
+ hash1 = libyuv::HashDjb2(buf1, amt1, hash1);
if (fin2) {
amt2 = static_cast<int>(fread(buf2, 1, kBlockSize, fin2));
- if (amt2 > 0) hash2 = libyuv::HashDjb2(buf2, amt2, hash2);
+ if (amt2 > 0)
+ hash2 = libyuv::HashDjb2(buf2, amt2, hash2);
int amt_min = (amt1 < amt2) ? amt1 : amt2;
size_min += amt_min;
sum_square_err += libyuv::ComputeSumSquareError(buf1, buf2, amt_min);
@@ -52,8 +54,8 @@ int main(int argc, char** argv) {
printf("hash1 %x", hash1);
if (fin2) {
printf(", hash2 %x", hash2);
- double mse = static_cast<double>(sum_square_err) /
- static_cast<double>(size_min);
+ double mse =
+ static_cast<double>(sum_square_err) / static_cast<double>(size_min);
printf(", mse %.2f", mse);
double psnr = libyuv::SumSquareErrorToPsnr(sum_square_err, size_min);
printf(", psnr %.2f\n", psnr);
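HashDjb2 chains across blocks: the running hash is passed back in as the seed, so hashing a file in kBlockSize chunks gives the same value as hashing it in one call (5381 is the conventional djb2 starting seed). A sketch with placeholder lengths amt_a and amt_b:

uint32 hash = 5381;
hash = libyuv::HashDjb2(buf, amt_a, hash);
hash = libyuv::HashDjb2(buf + amt_a, amt_b, hash);
// equals libyuv::HashDjb2(buf, amt_a + amt_b, 5381)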
diff --git a/files/util/convert.cc b/files/util/convert.cc
index 5f071416..acaf43ad 100644
--- a/files/util/convert.cc
+++ b/files/util/convert.cc
@@ -29,13 +29,13 @@ bool verbose = false;
bool attenuate = false;
bool unattenuate = false;
int image_width = 0, image_height = 0; // original width and height
-int dst_width = 0, dst_height = 0; // new width and height
+int dst_width = 0, dst_height = 0; // new width and height
int fileindex_org = 0; // Index into argv of the original file name.
int fileindex_rec = 0; // Index into argv of the reconstructed file name.
-int num_rec = 0; // Number of reconstructed images.
-int num_skip_org = 0; // Number of frames to skip in original.
-int num_frames = 0; // Number of frames to convert.
-int filter = 1; // Bilinear filter for scaling.
+int num_rec = 0; // Number of reconstructed images.
+int num_skip_org = 0; // Number of frames to skip in original.
+int num_frames = 0; // Number of frames to convert.
+int filter = 1; // Bilinear filter for scaling.
static __inline uint32 Abs(int32 v) {
return v >= 0 ? v : -v;
@@ -48,8 +48,8 @@ bool ExtractResolutionFromFilename(const char* name,
// Isolate the .width_height. section of the filename by searching for a
// dot or underscore followed by a digit.
for (int i = 0; name[i]; ++i) {
- if ((name[i] == '.' || name[i] == '_') &&
- name[i + 1] >= '0' && name[i + 1] <= '9') {
+ if ((name[i] == '.' || name[i] == '_') && name[i + 1] >= '0' &&
+ name[i + 1] <= '9') {
int n = sscanf(name + i + 1, "%dx%d", width_ptr, height_ptr); // NOLINT
if (2 == n) {
return true;
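For example, the parser above accepts the resolution token named in the help text below:

int w = 0, h = 0;
bool ok = ExtractResolutionFromFilename("name.1920x800_24Hz_P420.yuv", &w, &h);
// ok == true, w == 1920, h == 800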
@@ -59,13 +59,14 @@ bool ExtractResolutionFromFilename(const char* name,
return false;
}
-void PrintHelp(const char * program) {
+void PrintHelp(const char* program) {
printf("%s [-options] src_argb.raw dst_yuv.raw\n", program);
- printf(" -s <width> <height> .... specify source resolution. "
- "Optional if name contains\n"
- " resolution (ie. "
- "name.1920x800_24Hz_P420.yuv)\n"
- " Negative value mirrors.\n");
+ printf(
+ " -s <width> <height> .... specify source resolution. "
+ "Optional if name contains\n"
+      " resolution (e.g. "
+ "name.1920x800_24Hz_P420.yuv)\n"
+ " Negative value mirrors.\n");
printf(" -d <width> <height> .... specify destination resolution.\n");
printf(" -f <filter> ............ 0 = point, 1 = bilinear (default).\n");
  printf(" -skip <src_argb> ....... Number of frames to skip in src_argb\n");
@@ -78,7 +79,8 @@ void PrintHelp(const char * program) {
}
void ParseOptions(int argc, const char* argv[]) {
- if (argc <= 1) PrintHelp(argv[0]);
+ if (argc <= 1)
+ PrintHelp(argv[0]);
for (int c = 1; c < argc; ++c) {
if (!strcmp(argv[c], "-v")) {
verbose = true;
@@ -89,17 +91,17 @@ void ParseOptions(int argc, const char* argv[]) {
} else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
PrintHelp(argv[0]);
} else if (!strcmp(argv[c], "-s") && c + 2 < argc) {
- image_width = atoi(argv[++c]); // NOLINT
- image_height = atoi(argv[++c]); // NOLINT
+ image_width = atoi(argv[++c]); // NOLINT
+ image_height = atoi(argv[++c]); // NOLINT
} else if (!strcmp(argv[c], "-d") && c + 2 < argc) {
- dst_width = atoi(argv[++c]); // NOLINT
- dst_height = atoi(argv[++c]); // NOLINT
+ dst_width = atoi(argv[++c]); // NOLINT
+ dst_height = atoi(argv[++c]); // NOLINT
} else if (!strcmp(argv[c], "-skip") && c + 1 < argc) {
- num_skip_org = atoi(argv[++c]); // NOLINT
+ num_skip_org = atoi(argv[++c]); // NOLINT
} else if (!strcmp(argv[c], "-frames") && c + 1 < argc) {
- num_frames = atoi(argv[++c]); // NOLINT
+ num_frames = atoi(argv[++c]); // NOLINT
} else if (!strcmp(argv[c], "-f") && c + 1 < argc) {
- filter = atoi(argv[++c]); // NOLINT
+ filter = atoi(argv[++c]); // NOLINT
} else if (argv[c][0] == '-') {
fprintf(stderr, "Unknown option. %s\n", argv[c]);
} else if (fileindex_org == 0) {
@@ -127,11 +129,9 @@ void ParseOptions(int argc, const char* argv[]) {
int org_width, org_height;
int rec_width, rec_height;
bool org_res_avail = ExtractResolutionFromFilename(argv[fileindex_org],
- &org_width,
- &org_height);
+ &org_width, &org_height);
bool rec_res_avail = ExtractResolutionFromFilename(argv[fileindex_rec],
- &rec_width,
- &rec_height);
+ &rec_width, &rec_height);
if (image_width == 0 || image_height == 0) {
if (org_res_avail) {
image_width = org_width;
@@ -158,10 +158,14 @@ void ParseOptions(int argc, const char* argv[]) {
static const int kTileX = 32;
static const int kTileY = 32;
-static int TileARGBScale(const uint8* src_argb, int src_stride_argb,
- int src_width, int src_height,
- uint8* dst_argb, int dst_stride_argb,
- int dst_width, int dst_height,
+static int TileARGBScale(const uint8* src_argb,
+ int src_stride_argb,
+ int src_width,
+ int src_height,
+ uint8* dst_argb,
+ int dst_stride_argb,
+ int dst_width,
+ int dst_height,
libyuv::FilterMode filtering) {
for (int y = 0; y < dst_height; y += kTileY) {
for (int x = 0; x < dst_width; x += kTileX) {
@@ -173,11 +177,10 @@ static int TileARGBScale(const uint8* src_argb, int src_stride_argb,
if (y + clip_height > dst_height) {
clip_height = dst_height - y;
}
- int r = libyuv::ARGBScaleClip(src_argb, src_stride_argb,
- src_width, src_height,
- dst_argb, dst_stride_argb,
- dst_width, dst_height,
- x, y, clip_width, clip_height, filtering);
+ int r = libyuv::ARGBScaleClip(src_argb, src_stride_argb, src_width,
+ src_height, dst_argb, dst_stride_argb,
+ dst_width, dst_height, x, y, clip_width,
+ clip_height, filtering);
if (r) {
return r;
}
@@ -197,8 +200,8 @@ int main(int argc, const char* argv[]) {
}
// Open all files to convert to
- FILE** file_rec = new FILE* [num_rec];
- memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT
+ FILE** file_rec = new FILE*[num_rec];
+ memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT
for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
file_rec[cur_rec] = fopen(argv[fileindex_rec + cur_rec], "wb");
if (file_rec[cur_rec] == NULL) {
@@ -222,8 +225,8 @@ int main(int argc, const char* argv[]) {
// Input is YUV
if (org_is_yuv) {
const int y_size = Abs(image_width) * Abs(image_height);
- const int uv_size = ((Abs(image_width) + 1) / 2) *
- ((Abs(image_height) + 1) / 2);
+ const int uv_size =
+ ((Abs(image_width) + 1) / 2) * ((Abs(image_height) + 1) / 2);
org_size = y_size + 2 * uv_size; // YUV original.
}
@@ -233,8 +236,8 @@ int main(int argc, const char* argv[]) {
const size_t total_size = y_size + 2 * uv_size;
#if defined(_MSC_VER)
_fseeki64(file_org,
- static_cast<__int64>(num_skip_org) *
- static_cast<__int64>(org_size), SEEK_SET);
+ static_cast<__int64>(num_skip_org) * static_cast<__int64>(org_size),
+ SEEK_SET);
#else
fseek(file_org, num_skip_org * total_size, SEEK_SET);
#endif
@@ -256,18 +259,18 @@ int main(int argc, const char* argv[]) {
}
if (verbose) {
- printf("Size: %dx%d to %dx%d\n", image_width, image_height,
- dst_width, dst_height);
+ printf("Size: %dx%d to %dx%d\n", image_width, image_height, dst_width,
+ dst_height);
}
int number_of_frames;
- for (number_of_frames = 0; ; ++number_of_frames) {
+ for (number_of_frames = 0;; ++number_of_frames) {
if (num_frames && number_of_frames >= num_frames)
break;
// Load original YUV or ARGB frame.
- size_t bytes_org = fread(ch_org, sizeof(uint8),
- static_cast<size_t>(org_size), file_org);
+ size_t bytes_org =
+ fread(ch_org, sizeof(uint8), static_cast<size_t>(org_size), file_org);
if (bytes_org < static_cast<size_t>(org_size))
break;
@@ -290,22 +293,17 @@ int main(int argc, const char* argv[]) {
int half_src_height = (src_height + 1) / 2;
int half_dst_width = (dst_width + 1) / 2;
int half_dst_height = (dst_height + 1) / 2;
- I420Scale(ch_org, src_width,
- ch_org + src_width * src_height, half_src_width,
- ch_org + src_width * src_height +
- half_src_width * half_src_height, half_src_width,
- image_width, image_height,
- ch_rec, dst_width,
- ch_rec + dst_width * dst_height, half_dst_width,
- ch_rec + dst_width * dst_height +
- half_dst_width * half_dst_height, half_dst_width,
- dst_width, dst_height,
- static_cast<libyuv::FilterMode>(filter));
+ I420Scale(
+ ch_org, src_width, ch_org + src_width * src_height, half_src_width,
+ ch_org + src_width * src_height + half_src_width * half_src_height,
+ half_src_width, image_width, image_height, ch_rec, dst_width,
+ ch_rec + dst_width * dst_height, half_dst_width,
+ ch_rec + dst_width * dst_height + half_dst_width * half_dst_height,
+ half_dst_width, dst_width, dst_height,
+ static_cast<libyuv::FilterMode>(filter));
} else {
- TileARGBScale(ch_org, Abs(image_width) * 4,
- image_width, image_height,
- ch_dst, dst_width * 4,
- dst_width, dst_height,
+ TileARGBScale(ch_org, Abs(image_width) * 4, image_width, image_height,
+ ch_dst, dst_width * 4, dst_width, dst_height,
static_cast<libyuv::FilterMode>(filter));
}
bool rec_is_yuv = strstr(argv[fileindex_rec + cur_rec], "_P420.") != NULL;
@@ -321,25 +319,24 @@ int main(int argc, const char* argv[]) {
if (!org_is_yuv && rec_is_yuv) {
int half_width = (dst_width + 1) / 2;
int half_height = (dst_height + 1) / 2;
- libyuv::ARGBToI420(ch_dst, dst_width * 4,
- ch_rec, dst_width,
- ch_rec + dst_width * dst_height, half_width,
- ch_rec + dst_width * dst_height +
- half_width * half_height, half_width,
- dst_width, dst_height);
+ libyuv::ARGBToI420(
+ ch_dst, dst_width * 4, ch_rec, dst_width,
+ ch_rec + dst_width * dst_height, half_width,
+ ch_rec + dst_width * dst_height + half_width * half_height,
+ half_width, dst_width, dst_height);
}
// Output YUV or ARGB frame.
if (rec_is_yuv) {
- size_t bytes_rec = fwrite(ch_rec, sizeof(uint8),
- static_cast<size_t>(total_size),
- file_rec[cur_rec]);
+ size_t bytes_rec =
+ fwrite(ch_rec, sizeof(uint8), static_cast<size_t>(total_size),
+ file_rec[cur_rec]);
if (bytes_rec < static_cast<size_t>(total_size))
break;
} else {
- size_t bytes_rec = fwrite(ch_dst, sizeof(uint8),
- static_cast<size_t>(dst_size),
- file_rec[cur_rec]);
+ size_t bytes_rec =
+ fwrite(ch_dst, sizeof(uint8), static_cast<size_t>(dst_size),
+ file_rec[cur_rec]);
if (bytes_rec < static_cast<size_t>(dst_size))
break;
}
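
The TileARGBScale hunks above are formatting-only: the function walks the
destination in kTileX x kTileY blocks and calls ARGBScaleClip once per block,
which computes the full scale mapping but writes only the requested clip
rectangle. A minimal sketch of one such call, assuming ARGB buffers
(4 bytes per pixel) are already in memory; names are illustrative:

  #include "libyuv/scale_argb.h"

  // Scale src into dst, but write only the tile at (x, y) of size
  // tile_w x tile_h, in destination coordinates.
  static int ScaleOneTile(const uint8* src, int src_stride, int src_w,
                          int src_h, uint8* dst, int dst_stride, int dst_w,
                          int dst_h, int x, int y, int tile_w, int tile_h) {
    return libyuv::ARGBScaleClip(src, src_stride, src_w, src_h, dst,
                                 dst_stride, dst_w, dst_h, x, y, tile_w,
                                 tile_h, libyuv::kFilterBilinear);
  }
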
diff --git a/files/util/cpuid.c b/files/util/cpuid.c
index 94e245b1..9716f115 100644
--- a/files/util/cpuid.c
+++ b/files/util/cpuid.c
@@ -79,6 +79,7 @@ int main(int argc, const char* argv[]) {
int has_avx3 = TestCpuFlag(kCpuHasAVX3);
int has_erms = TestCpuFlag(kCpuHasERMS);
int has_fma3 = TestCpuFlag(kCpuHasFMA3);
+ int has_f16c = TestCpuFlag(kCpuHasF16C);
printf("Has SSE2 %x\n", has_sse2);
printf("Has SSSE3 %x\n", has_ssse3);
printf("Has SSE4.1 %x\n", has_sse41);
@@ -88,6 +89,7 @@ int main(int argc, const char* argv[]) {
printf("Has AVX3 %x\n", has_avx3);
printf("Has ERMS %x\n", has_erms);
printf("Has FMA3 %x\n", has_fma3);
+ printf("Has F16C %x\n", has_f16c);
}
return 0;
}
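
The two added lines report the new F16C flag (the x86 half-float conversion
extension) alongside the existing ones. A minimal sketch of the same check in
isolation:

  #include <stdio.h>
  #include "libyuv/cpu_id.h"

  int main(void) {
    // TestCpuFlag returns nonzero when the CPU supports the feature.
    if (TestCpuFlag(kCpuHasF16C)) {
      printf("Has F16C: half <-> float conversions available.\n");
    }
    return 0;
  }
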
diff --git a/files/util/psnr.cc b/files/util/psnr.cc
index 52b04bd5..27f876c0 100644
--- a/files/util/psnr.cc
+++ b/files/util/psnr.cc
@@ -27,7 +27,7 @@ typedef unsigned __int64 uint64;
#else // COMPILER_MSVC
#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long uint64; // NOLINT
-#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long long uint64; // NOLINT
#endif // __LP64__
#endif // _MSC_VER
@@ -39,85 +39,81 @@ typedef unsigned long long uint64; // NOLINT
!defined(__aarch64__)
#define HAS_SUMSQUAREERROR_NEON
static uint32 SumSquareError_NEON(const uint8* src_a,
- const uint8* src_b, int count) {
+ const uint8* src_b,
+ int count) {
volatile uint32 sse;
- asm volatile (
- "vmov.u8 q7, #0 \n"
- "vmov.u8 q9, #0 \n"
- "vmov.u8 q8, #0 \n"
- "vmov.u8 q10, #0 \n"
+ asm volatile(
+ "vmov.u8 q7, #0 \n"
+ "vmov.u8 q9, #0 \n"
+ "vmov.u8 q8, #0 \n"
+ "vmov.u8 q10, #0 \n"
- "1: \n"
- "vld1.u8 {q0}, [%0]! \n"
- "vld1.u8 {q1}, [%1]! \n"
- "vsubl.u8 q2, d0, d2 \n"
- "vsubl.u8 q3, d1, d3 \n"
- "vmlal.s16 q7, d4, d4 \n"
- "vmlal.s16 q8, d6, d6 \n"
- "vmlal.s16 q8, d5, d5 \n"
- "vmlal.s16 q10, d7, d7 \n"
- "subs %2, %2, #16 \n"
- "bhi 1b \n"
+ "1: \n"
+ "vld1.u8 {q0}, [%0]! \n"
+ "vld1.u8 {q1}, [%1]! \n"
+ "vsubl.u8 q2, d0, d2 \n"
+ "vsubl.u8 q3, d1, d3 \n"
+ "vmlal.s16 q7, d4, d4 \n"
+ "vmlal.s16 q8, d6, d6 \n"
+ "vmlal.s16 q8, d5, d5 \n"
+ "vmlal.s16 q10, d7, d7 \n"
+ "subs %2, %2, #16 \n"
+ "bhi 1b \n"
- "vadd.u32 q7, q7, q8 \n"
- "vadd.u32 q9, q9, q10 \n"
- "vadd.u32 q10, q7, q9 \n"
- "vpaddl.u32 q1, q10 \n"
- "vadd.u64 d0, d2, d3 \n"
- "vmov.32 %3, d0[0] \n"
- : "+r"(src_a),
- "+r"(src_b),
- "+r"(count),
- "=r"(sse)
- :
- : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
+ "vadd.u32 q7, q7, q8 \n"
+ "vadd.u32 q9, q9, q10 \n"
+ "vadd.u32 q10, q7, q9 \n"
+ "vpaddl.u32 q1, q10 \n"
+ "vadd.u64 d0, d2, d3 \n"
+ "vmov.32 %3, d0[0] \n"
+ : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
return sse;
}
#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SUMSQUAREERROR_NEON
static uint32 SumSquareError_NEON(const uint8* src_a,
- const uint8* src_b, int count) {
+ const uint8* src_b,
+ int count) {
volatile uint32 sse;
- asm volatile (
- "eor v16.16b, v16.16b, v16.16b \n"
- "eor v18.16b, v18.16b, v18.16b \n"
- "eor v17.16b, v17.16b, v17.16b \n"
- "eor v19.16b, v19.16b, v19.16b \n"
+ asm volatile(
+ "eor v16.16b, v16.16b, v16.16b \n"
+ "eor v18.16b, v18.16b, v18.16b \n"
+ "eor v17.16b, v17.16b, v17.16b \n"
+ "eor v19.16b, v19.16b, v19.16b \n"
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n"
- "ld1 {v1.16b}, [%1], #16 \n"
- "subs %w2, %w2, #16 \n"
- "usubl v2.8h, v0.8b, v1.8b \n"
- "usubl2 v3.8h, v0.16b, v1.16b \n"
- "smlal v16.4s, v2.4h, v2.4h \n"
- "smlal v17.4s, v3.4h, v3.4h \n"
- "smlal2 v18.4s, v2.8h, v2.8h \n"
- "smlal2 v19.4s, v3.8h, v3.8h \n"
- "b.gt 1b \n"
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n"
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "subs %w2, %w2, #16 \n"
+ "usubl v2.8h, v0.8b, v1.8b \n"
+ "usubl2 v3.8h, v0.16b, v1.16b \n"
+ "smlal v16.4s, v2.4h, v2.4h \n"
+ "smlal v17.4s, v3.4h, v3.4h \n"
+ "smlal2 v18.4s, v2.8h, v2.8h \n"
+ "smlal2 v19.4s, v3.8h, v3.8h \n"
+ "b.gt 1b \n"
- "add v16.4s, v16.4s, v17.4s \n"
- "add v18.4s, v18.4s, v19.4s \n"
- "add v19.4s, v16.4s, v18.4s \n"
- "addv s0, v19.4s \n"
- "fmov %w3, s0 \n"
- : "+r"(src_a),
- "+r"(src_b),
- "+r"(count),
- "=r"(sse)
- :
- : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+ "add v16.4s, v16.4s, v17.4s \n"
+ "add v18.4s, v18.4s, v19.4s \n"
+ "add v19.4s, v16.4s, v18.4s \n"
+ "addv s0, v19.4s \n"
+ "fmov %w3, s0 \n"
+ : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
+ :
+ : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
return sse;
}
#elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#define HAS_SUMSQUAREERROR_SSE2
-__declspec(naked)
-static uint32 SumSquareError_SSE2(const uint8* /*src_a*/,
- const uint8* /*src_b*/, int /*count*/) {
+__declspec(naked) static uint32 SumSquareError_SSE2(const uint8* /*src_a*/,
+ const uint8* /*src_b*/,
+ int /*count*/) {
__asm {
- mov eax, [esp + 4] // src_a
- mov edx, [esp + 8] // src_b
- mov ecx, [esp + 12] // count
+ mov eax, [esp + 4] // src_a
+ mov edx, [esp + 8] // src_b
+ mov ecx, [esp + 12] // count
pxor xmm0, xmm0
pxor xmm5, xmm5
sub edx, eax
@@ -151,47 +147,49 @@ static uint32 SumSquareError_SSE2(const uint8* /*src_a*/,
#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2
static uint32 SumSquareError_SSE2(const uint8* src_a,
- const uint8* src_b, int count) {
+ const uint8* src_b,
+ int count) {
uint32 sse;
- asm volatile ( // NOLINT
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm5 \n"
- "sub %0,%1 \n"
+ asm volatile( // NOLINT
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
- "1: \n"
- "movdqu (%0),%%xmm1 \n"
- "movdqu (%0,%1,1),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu %%xmm1,%%xmm3 \n"
- "psubusb %%xmm2,%%xmm1 \n"
- "psubusb %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqu %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm2 \n"
- "pmaddwd %%xmm1,%%xmm1 \n"
- "pmaddwd %%xmm2,%%xmm2 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "ja 1b \n"
+ "1: \n"
+ "movdqu (%0),%%xmm1 \n"
+ "movdqu (%0,%1,1),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu %%xmm1,%%xmm3 \n"
+ "psubusb %%xmm2,%%xmm1 \n"
+ "psubusb %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm2 \n"
+ "pmaddwd %%xmm1,%%xmm1 \n"
+ "pmaddwd %%xmm2,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "ja 1b \n"
- "pshufd $0xee,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "pshufd $0x1,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "movd %%xmm0,%3 \n"
+ "pshufd $0xee,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "pshufd $0x1,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,%3 \n"
- : "+r"(src_a), // %0
- "+r"(src_b), // %1
- "+r"(count), // %2
- "=g"(sse) // %3
- :
- : "memory", "cc"
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=g"(sse) // %3
+ :
+ : "memory", "cc"
#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ ,
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
- ); // NOLINT
+ ); // NOLINT
return sse;
}
#endif // LIBYUV_DISABLE_X86 etc
@@ -199,20 +197,22 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
#if defined(HAS_SUMSQUAREERROR_SSE2)
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
static __inline void __cpuid(int cpu_info[4], int info_type) {
- asm volatile ( // NOLINT
- "mov %%ebx, %%edi \n"
- "cpuid \n"
- "xchg %%edi, %%ebx \n"
- : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
- : "a"(info_type));
+ asm volatile( // NOLINT
+ "mov %%ebx, %%edi \n"
+ "cpuid \n"
+ "xchg %%edi, %%ebx \n"
+ : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]),
+ "=d"(cpu_info[3])
+ : "a"(info_type));
}
// For gcc/clang but not clangcl.
-#elif (defined(__i386__) || defined(__x86_64__)) && !defined(_MSC_VER)
+#elif !defined(_MSC_VER) && (defined(__i386__) || defined(__x86_64__))
static __inline void __cpuid(int cpu_info[4], int info_type) {
- asm volatile ( // NOLINT
- "cpuid \n"
- : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
- : "a"(info_type));
+ asm volatile( // NOLINT
+ "cpuid \n"
+ : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]),
+ "=d"(cpu_info[3])
+ : "a"(info_type));
}
#endif
@@ -229,7 +229,8 @@ static int CpuHasSSE2() {
#endif // HAS_SUMSQUAREERROR_SSE2
static uint32 SumSquareError_C(const uint8* src_a,
- const uint8* src_b, int count) {
+ const uint8* src_b,
+ int count) {
uint32 sse = 0u;
for (int x = 0; x < count; ++x) {
int diff = src_a[x] - src_b[x];
@@ -239,9 +240,10 @@ static uint32 SumSquareError_C(const uint8* src_a,
}
double ComputeSumSquareError(const uint8* src_a,
- const uint8* src_b, int count) {
- uint32 (*SumSquareError)(const uint8* src_a,
- const uint8* src_b, int count) = SumSquareError_C;
+ const uint8* src_b,
+ int count) {
+ uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
+ SumSquareError_C;
#if defined(HAS_SUMSQUAREERROR_NEON)
SumSquareError = SumSquareError_NEON;
#endif
@@ -253,7 +255,7 @@ double ComputeSumSquareError(const uint8* src_a,
const int kBlockSize = 1 << 15;
uint64 sse = 0;
#ifdef _OPENMP
-#pragma omp parallel for reduction(+: sse)
+#pragma omp parallel for reduction(+ : sse)
#endif
for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
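
ComputeSumSquareError above selects the widest available kernel (NEON, SSE2,
or the C fallback) through a function pointer, then accumulates over fixed
32 KiB blocks so the OpenMP reduction can parallelize cleanly. The block size
also bounds each uint32 partial sum: 32768 * 255 * 255 = 2130739200, which
still fits in 32 bits. A condensed sketch of the pattern, with the runtime
dispatch elided:

  typedef unsigned char uint8;
  typedef unsigned int uint32;
  typedef unsigned long long uint64;

  static uint32 SumSquareErrorBlock(const uint8* a, const uint8* b, int n) {
    uint32 sse = 0u;
    for (int i = 0; i < n; ++i) {
      const int diff = a[i] - b[i];
      sse += static_cast<uint32>(diff * diff);
    }
    return sse;
  }

  double ComputeSseSketch(const uint8* a, const uint8* b, int count) {
    // The real code swaps in SumSquareError_NEON / SumSquareError_SSE2 here.
    uint32 (*kernel)(const uint8*, const uint8*, int) = SumSquareErrorBlock;
    const int kBlockSize = 1 << 15;
    uint64 sse = 0;
    int i = 0;
    for (; i + kBlockSize <= count; i += kBlockSize)
      sse += kernel(a + i, b + i, kBlockSize);
    for (; i < count; ++i) {  // scalar remainder
      const int diff = a[i] - b[i];
      sse += static_cast<uint64>(diff * diff);
    }
    return static_cast<double>(sse);
  }
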
diff --git a/files/util/psnr_main.cc b/files/util/psnr_main.cc
index 0518ab84..4d930be4 100644
--- a/files/util/psnr_main.cc
+++ b/files/util/psnr_main.cc
@@ -71,8 +71,8 @@ bool ExtractResolutionFromFilename(const char* name,
// Isolate the .width_height. section of the filename by searching for a
// dot or underscore followed by a digit.
for (int i = 0; name[i]; ++i) {
- if ((name[i] == '.' || name[i] == '_') &&
- name[i + 1] >= '0' && name[i + 1] <= '9') {
+ if ((name[i] == '.' || name[i] == '_') && name[i + 1] >= '0' &&
+ name[i + 1] <= '9') {
int n = sscanf(name + i + 1, "%dx%d", width_ptr, height_ptr); // NOLINT
if (2 == n) {
return true;
@@ -88,7 +88,7 @@ bool ExtractResolutionFromFilename(const char* name,
return false;
}
fseek(file_org, 0, SEEK_END);
- size_t total_size = ftell(file_org);
+ size_t total_size = ftell(file_org);
fseek(file_org, 0, SEEK_SET);
uint8* const ch_org = new uint8[total_size];
memset(ch_org, 0, total_size);
@@ -109,8 +109,10 @@ bool ExtractResolutionFromFilename(const char* name,
// This can be useful when comparing codecs that are inconsistent about Y
uint8 ScaleY(uint8 y) {
int ny = (y - 16) * 256 / 224;
- if (ny < 0) ny = 0;
- if (ny > 255) ny = 255;
+ if (ny < 0)
+ ny = 0;
+ if (ny > 255)
+ ny = 255;
return static_cast<uint8>(ny);
}
@@ -119,16 +121,18 @@ double GetMSE(double sse, double size) {
return sse / size;
}
-void PrintHelp(const char * program) {
+void PrintHelp(const char* program) {
printf("%s [-options] org_seq rec_seq [rec_seq2.. etc]\n", program);
#ifdef HAVE_JPEG
printf("jpeg or raw YUV 420 supported.\n");
#endif
printf("options:\n");
- printf(" -s <width> <height> .... specify YUV size, mandatory if none of the "
- "sequences have the\n");
- printf(" resolution embedded in their filename (ie. "
- "name.1920x800_24Hz_P420.yuv)\n");
+ printf(
+ " -s <width> <height> .... specify YUV size, mandatory if none of the "
+ "sequences have the\n");
+ printf(
+ " resolution embedded in their filename (ie. "
+ "name.1920x800_24Hz_P420.yuv)\n");
printf(" -psnr .................. compute PSNR (default)\n");
printf(" -ssim .................. compute SSIM\n");
printf(" -mse ................... compute MSE\n");
@@ -146,7 +150,8 @@ void PrintHelp(const char * program) {
}
void ParseOptions(int argc, const char* argv[]) {
- if (argc <= 1) PrintHelp(argv[0]);
+ if (argc <= 1)
+ PrintHelp(argv[0]);
for (int c = 1; c < argc; ++c) {
if (!strcmp(argv[c], "-v")) {
verbose = true;
@@ -168,16 +173,16 @@ void ParseOptions(int argc, const char* argv[]) {
} else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
PrintHelp(argv[0]);
} else if (!strcmp(argv[c], "-s") && c + 2 < argc) {
- image_width = atoi(argv[++c]); // NOLINT
- image_height = atoi(argv[++c]); // NOLINT
+ image_width = atoi(argv[++c]); // NOLINT
+ image_height = atoi(argv[++c]); // NOLINT
} else if (!strcmp(argv[c], "-skip") && c + 2 < argc) {
- num_skip_org = atoi(argv[++c]); // NOLINT
- num_skip_rec = atoi(argv[++c]); // NOLINT
+ num_skip_org = atoi(argv[++c]); // NOLINT
+ num_skip_rec = atoi(argv[++c]); // NOLINT
} else if (!strcmp(argv[c], "-frames") && c + 1 < argc) {
- num_frames = atoi(argv[++c]); // NOLINT
+ num_frames = atoi(argv[++c]); // NOLINT
#ifdef _OPENMP
} else if (!strcmp(argv[c], "-t") && c + 1 < argc) {
- num_threads = atoi(argv[++c]); // NOLINT
+ num_threads = atoi(argv[++c]); // NOLINT
#endif
} else if (argv[c][0] == '-') {
fprintf(stderr, "Unknown option. %s\n", argv[c]);
@@ -206,11 +211,9 @@ void ParseOptions(int argc, const char* argv[]) {
int org_width, org_height;
int rec_width, rec_height;
bool org_res_avail = ExtractResolutionFromFilename(argv[fileindex_org],
- &org_width,
- &org_height);
+ &org_width, &org_height);
bool rec_res_avail = ExtractResolutionFromFilename(argv[fileindex_rec],
- &rec_width,
- &rec_height);
+ &rec_width, &rec_height);
if (org_res_avail) {
if (rec_res_avail) {
if ((org_width == rec_width) && (org_height == rec_height)) {
@@ -234,11 +237,15 @@ void ParseOptions(int argc, const char* argv[]) {
}
}
-bool UpdateMetrics(uint8* ch_org, uint8* ch_rec,
- const int y_size, const int uv_size, const size_t total_size,
+bool UpdateMetrics(uint8* ch_org,
+ uint8* ch_rec,
+ const int y_size,
+ const int uv_size,
+ const size_t total_size,
int number_of_frames,
metric* cur_distortion_psnr,
- metric* distorted_frame, bool do_psnr) {
+ metric* distorted_frame,
+ bool do_psnr) {
const int uv_offset = (do_swap_uv ? uv_size : 0);
const uint8* const u_org = ch_org + y_size + uv_offset;
const uint8* const u_rec = ch_rec + y_size;
@@ -247,11 +254,11 @@ bool UpdateMetrics(uint8* ch_org, uint8* ch_rec,
if (do_psnr) {
#ifdef HAVE_JPEG
double y_err = static_cast<double>(
- libyuv::ComputeSumSquareError(ch_org, ch_rec, y_size));
+ libyuv::ComputeSumSquareError(ch_org, ch_rec, y_size));
double u_err = static_cast<double>(
- libyuv::ComputeSumSquareError(u_org, u_rec, uv_size));
+ libyuv::ComputeSumSquareError(u_org, u_rec, uv_size));
double v_err = static_cast<double>(
- libyuv::ComputeSumSquareError(v_org, v_rec, uv_size));
+ libyuv::ComputeSumSquareError(v_org, v_rec, uv_size));
#else
double y_err = ComputeSumSquareError(ch_org, ch_rec, y_size);
double u_err = ComputeSumSquareError(u_org, u_rec, uv_size);
@@ -265,17 +272,17 @@ bool UpdateMetrics(uint8* ch_org, uint8* ch_rec,
distorted_frame->y = ComputePSNR(y_err, static_cast<double>(y_size));
distorted_frame->u = ComputePSNR(u_err, static_cast<double>(uv_size));
distorted_frame->v = ComputePSNR(v_err, static_cast<double>(uv_size));
- distorted_frame->all = ComputePSNR(total_err,
- static_cast<double>(total_size));
+ distorted_frame->all =
+ ComputePSNR(total_err, static_cast<double>(total_size));
} else {
distorted_frame->y = CalcSSIM(ch_org, ch_rec, image_width, image_height);
- distorted_frame->u = CalcSSIM(u_org, u_rec, (image_width + 1) / 2,
- (image_height + 1) / 2);
- distorted_frame->v = CalcSSIM(v_org, v_rec, (image_width + 1) / 2,
- (image_height + 1) / 2);
+ distorted_frame->u =
+ CalcSSIM(u_org, u_rec, (image_width + 1) / 2, (image_height + 1) / 2);
+ distorted_frame->v =
+ CalcSSIM(v_org, v_rec, (image_width + 1) / 2, (image_height + 1) / 2);
distorted_frame->all =
- (distorted_frame->y + distorted_frame->u + distorted_frame->v)
- / total_size;
+ (distorted_frame->y + distorted_frame->u + distorted_frame->v) /
+ total_size;
distorted_frame->y /= y_size;
distorted_frame->u /= uv_size;
distorted_frame->v /= uv_size;
@@ -330,8 +337,8 @@ int main(int argc, const char* argv[]) {
}
// Open all files to compare to
- FILE** file_rec = new FILE* [num_rec];
- memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT
+ FILE** file_rec = new FILE*[num_rec];
+ memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT
for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
file_rec[cur_rec] = fopen(argv[fileindex_rec + cur_rec], "rb");
if (file_rec[cur_rec] == NULL) {
@@ -347,20 +354,21 @@ int main(int argc, const char* argv[]) {
const int y_size = image_width * image_height;
const int uv_size = ((image_width + 1) / 2) * ((image_height + 1) / 2);
- const size_t total_size = y_size + 2 * uv_size; // NOLINT
+ const size_t total_size = y_size + 2 * uv_size; // NOLINT
#if defined(_MSC_VER)
- _fseeki64(file_org,
- static_cast<__int64>(num_skip_org) *
- static_cast<__int64>(total_size), SEEK_SET);
+ _fseeki64(
+ file_org,
+ static_cast<__int64>(num_skip_org) * static_cast<__int64>(total_size),
+ SEEK_SET);
#else
fseek(file_org, num_skip_org * total_size, SEEK_SET);
#endif
for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
#if defined(_MSC_VER)
- _fseeki64(file_rec[cur_rec],
- static_cast<__int64>(num_skip_rec) *
- static_cast<__int64>(total_size),
- SEEK_SET);
+ _fseeki64(
+ file_rec[cur_rec],
+ static_cast<__int64>(num_skip_rec) * static_cast<__int64>(total_size),
+ SEEK_SET);
#else
fseek(file_rec[cur_rec], num_skip_rec * total_size, SEEK_SET);
#endif
@@ -420,7 +428,7 @@ int main(int argc, const char* argv[]) {
}
int number_of_frames;
- for (number_of_frames = 0; ; ++number_of_frames) {
+ for (number_of_frames = 0;; ++number_of_frames) {
if (num_frames && number_of_frames >= num_frames)
break;
@@ -432,17 +440,11 @@ int main(int argc, const char* argv[]) {
memcpy(ch_jpeg, ch_org, bytes_org);
memset(ch_org, 0, total_size);
- if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_org,
- ch_org,
- image_width,
- ch_org + y_size,
- (image_width + 1) / 2,
+ if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_org, ch_org, image_width,
+ ch_org + y_size, (image_width + 1) / 2,
ch_org + y_size + uv_size,
- (image_width + 1) / 2,
- image_width,
- image_height,
- image_width,
- image_height)) {
+ (image_width + 1) / 2, image_width,
+ image_height, image_width, image_height)) {
delete[] ch_jpeg;
break;
}
@@ -453,8 +455,8 @@ int main(int argc, const char* argv[]) {
}
for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
- size_t bytes_rec = fread(ch_rec, sizeof(uint8),
- total_size, file_rec[cur_rec]);
+ size_t bytes_rec =
+ fread(ch_rec, sizeof(uint8), total_size, file_rec[cur_rec]);
if (bytes_rec < total_size) {
#ifdef HAVE_JPEG
// Try parsing file as a jpeg.
@@ -462,17 +464,11 @@ int main(int argc, const char* argv[]) {
memcpy(ch_jpeg, ch_rec, bytes_rec);
memset(ch_rec, 0, total_size);
- if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_rec,
- ch_rec,
- image_width,
- ch_rec + y_size,
- (image_width + 1) / 2,
+ if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_rec, ch_rec, image_width,
+ ch_rec + y_size, (image_width + 1) / 2,
ch_rec + y_size + uv_size,
- (image_width + 1) / 2,
- image_width,
- image_height,
- image_width,
- image_height)) {
+ (image_width + 1) / 2, image_width,
+ image_height, image_width, image_height)) {
delete[] ch_jpeg;
break;
}
@@ -488,10 +484,8 @@ int main(int argc, const char* argv[]) {
if (do_psnr) {
metric distorted_frame;
metric* cur_distortion_psnr = &distortion_psnr[cur_rec];
- bool ismin = UpdateMetrics(ch_org, ch_rec,
- y_size, uv_size, total_size,
- number_of_frames,
- cur_distortion_psnr,
+ bool ismin = UpdateMetrics(ch_org, ch_rec, y_size, uv_size, total_size,
+ number_of_frames, cur_distortion_psnr,
&distorted_frame, true);
if (verbose) {
printf("\t%10.6f", distorted_frame.y);
@@ -504,10 +498,8 @@ int main(int argc, const char* argv[]) {
if (do_ssim) {
metric distorted_frame;
metric* cur_distortion_ssim = &distortion_ssim[cur_rec];
- bool ismin = UpdateMetrics(ch_org, ch_rec,
- y_size, uv_size, total_size,
- number_of_frames,
- cur_distortion_ssim,
+ bool ismin = UpdateMetrics(ch_org, ch_rec, y_size, uv_size, total_size,
+ number_of_frames, cur_distortion_ssim,
&distorted_frame, false);
if (verbose) {
printf("\t%10.6f", distorted_frame.y);
@@ -543,24 +535,20 @@ int main(int argc, const char* argv[]) {
}
if (do_psnr) {
- const double global_psnr_y = ComputePSNR(
- cur_distortion_psnr->global_y,
- static_cast<double>(y_size) * number_of_frames);
- const double global_psnr_u = ComputePSNR(
- cur_distortion_psnr->global_u,
- static_cast<double>(uv_size) * number_of_frames);
- const double global_psnr_v = ComputePSNR(
- cur_distortion_psnr->global_v,
- static_cast<double>(uv_size) * number_of_frames);
- const double global_psnr_all = ComputePSNR(
- cur_distortion_psnr->global_all,
- static_cast<double>(total_size) * number_of_frames);
- printf("Global:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
- global_psnr_y,
- global_psnr_u,
- global_psnr_v,
- global_psnr_all,
- number_of_frames);
+ const double global_psnr_y =
+ ComputePSNR(cur_distortion_psnr->global_y,
+ static_cast<double>(y_size) * number_of_frames);
+ const double global_psnr_u =
+ ComputePSNR(cur_distortion_psnr->global_u,
+ static_cast<double>(uv_size) * number_of_frames);
+ const double global_psnr_v =
+ ComputePSNR(cur_distortion_psnr->global_v,
+ static_cast<double>(uv_size) * number_of_frames);
+ const double global_psnr_all =
+ ComputePSNR(cur_distortion_psnr->global_all,
+ static_cast<double>(total_size) * number_of_frames);
+ printf("Global:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", global_psnr_y,
+ global_psnr_u, global_psnr_v, global_psnr_all, number_of_frames);
if (show_name) {
printf("\t%s", argv[fileindex_rec + cur_rec]);
}
@@ -570,20 +558,14 @@ int main(int argc, const char* argv[]) {
if (!quiet) {
printf("Avg:");
if (do_psnr) {
- printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
- cur_distortion_psnr->y,
- cur_distortion_psnr->u,
- cur_distortion_psnr->v,
- cur_distortion_psnr->all,
- number_of_frames);
+ printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", cur_distortion_psnr->y,
+ cur_distortion_psnr->u, cur_distortion_psnr->v,
+ cur_distortion_psnr->all, number_of_frames);
}
if (do_ssim) {
- printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
- cur_distortion_ssim->y,
- cur_distortion_ssim->u,
- cur_distortion_ssim->v,
- cur_distortion_ssim->all,
- number_of_frames);
+ printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", cur_distortion_ssim->y,
+ cur_distortion_ssim->u, cur_distortion_ssim->v,
+ cur_distortion_ssim->all, number_of_frames);
}
if (show_name) {
printf("\t%s", argv[fileindex_rec + cur_rec]);
@@ -594,19 +576,15 @@ int main(int argc, const char* argv[]) {
printf("Min:");
if (do_psnr) {
printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
- cur_distortion_psnr->min_y,
- cur_distortion_psnr->min_u,
- cur_distortion_psnr->min_v,
- cur_distortion_psnr->min_all,
- cur_distortion_psnr->min_frame);
+ cur_distortion_psnr->min_y, cur_distortion_psnr->min_u,
+ cur_distortion_psnr->min_v, cur_distortion_psnr->min_all,
+ cur_distortion_psnr->min_frame);
}
if (do_ssim) {
printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
- cur_distortion_ssim->min_y,
- cur_distortion_ssim->min_u,
- cur_distortion_ssim->min_v,
- cur_distortion_ssim->min_all,
- cur_distortion_ssim->min_frame);
+ cur_distortion_ssim->min_y, cur_distortion_ssim->min_u,
+ cur_distortion_ssim->min_v, cur_distortion_ssim->min_all,
+ cur_distortion_ssim->min_frame);
}
if (show_name) {
printf("\t%s", argv[fileindex_rec + cur_rec]);
@@ -615,20 +593,20 @@ int main(int argc, const char* argv[]) {
}
if (do_mse) {
- double global_mse_y = GetMSE(cur_distortion_psnr->global_y,
- static_cast<double>(y_size) * number_of_frames);
- double global_mse_u = GetMSE(cur_distortion_psnr->global_u,
- static_cast<double>(uv_size) * number_of_frames);
- double global_mse_v = GetMSE(cur_distortion_psnr->global_v,
- static_cast<double>(uv_size) * number_of_frames);
- double global_mse_all = GetMSE(cur_distortion_psnr->global_all,
- static_cast<double>(total_size) * number_of_frames);
- printf("MSE:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
- global_mse_y,
- global_mse_u,
- global_mse_v,
- global_mse_all,
- number_of_frames);
+ double global_mse_y =
+ GetMSE(cur_distortion_psnr->global_y,
+ static_cast<double>(y_size) * number_of_frames);
+ double global_mse_u =
+ GetMSE(cur_distortion_psnr->global_u,
+ static_cast<double>(uv_size) * number_of_frames);
+ double global_mse_v =
+ GetMSE(cur_distortion_psnr->global_v,
+ static_cast<double>(uv_size) * number_of_frames);
+ double global_mse_all =
+ GetMSE(cur_distortion_psnr->global_all,
+ static_cast<double>(total_size) * number_of_frames);
+ printf("MSE:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", global_mse_y,
+ global_mse_u, global_mse_v, global_mse_all, number_of_frames);
if (show_name) {
printf("\t%s", argv[fileindex_rec + cur_rec]);
}
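
The plane sizes main() and UpdateMetrics pass around follow the I420 layout:
a full-resolution Y plane followed by two chroma planes at half width and
half height, each dimension rounded up. A worked example at 1920x1080:

  const int w = 1920, h = 1080;
  const int y_size = w * h;                           // 2073600
  const int uv_size = ((w + 1) / 2) * ((h + 1) / 2);  // 960 * 540 = 518400
  const size_t total_size = y_size + 2 * uv_size;     // 3110400 bytes/frame

Note that the Global row is computed as ComputePSNR over the summed error and
y_size (or uv_size, total_size) times number_of_frames, i.e. the PSNR of the
whole sequence, not an average of per-frame PSNR values.
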
diff --git a/files/util/ssim.cc b/files/util/ssim.cc
index 5a6399b7..43e725d8 100644
--- a/files/util/ssim.cc
+++ b/files/util/ssim.cc
@@ -16,11 +16,11 @@
extern "C" {
#endif
-typedef unsigned int uint32; // NOLINT
-typedef unsigned short uint16; // NOLINT
+typedef unsigned int uint32; // NOLINT
+typedef unsigned short uint16; // NOLINT
#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
- (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
+ (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
#define __SSE2__
#endif
#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
@@ -38,22 +38,29 @@ enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };
// The maximum value (11 x 11) must be less than 128 to avoid sign
// problems during the calls to _mm_mullo_epi16().
static const int K[KERNEL_SIZE] = {
- 1, 3, 7, 11, 7, 3, 1 // ~11 * exp(-0.3 * i * i)
+ 1, 3, 7, 11, 7, 3, 1 // ~11 * exp(-0.3 * i * i)
};
static const double kiW[KERNEL + 1 + 1] = {
- 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j]
- 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j]
- 1. / 1056., // 1 / sum(i:0..5, j..6) K[i]*K[j]
- 1. / 957., // 1 / sum(i:0..4, j..6) K[i]*K[j]
- 1. / 726., // 1 / sum(i:0..3, j..6) K[i]*K[j]
+ 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j]
+ 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j]
+ 1. / 1056., // 1 / sum(i:0..5, j..6) K[i]*K[j]
+ 1. / 957., // 1 / sum(i:0..4, j..6) K[i]*K[j]
+ 1. / 726., // 1 / sum(i:0..3, j..6) K[i]*K[j]
};
#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
-#define PWEIGHT(A, B) static_cast<uint16>(K[(A)] * K[(B)]) // weight product
-#define MAKE_WEIGHT(L) \
- { { { PWEIGHT(L, 0), PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), \
- PWEIGHT(L, 4), PWEIGHT(L, 5), PWEIGHT(L, 6), 0 } } }
+#define PWEIGHT(A, B) static_cast<uint16>(K[(A)] * K[(B)]) // weight product
+#define MAKE_WEIGHT(L) \
+ { \
+ { \
+ { \
+ PWEIGHT(L, 0) \
+ , PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), PWEIGHT(L, 4), \
+ PWEIGHT(L, 5), PWEIGHT(L, 6), 0 \
+ } \
+ } \
+ }
// We need this union trick to be able to initialize constant static __m128i
// values. We can't call _mm_set_epi16() for static compile-time initialization.
@@ -62,32 +69,36 @@ static const struct {
uint16 i16_[8];
__m128i m_;
} values_;
-} W0 = MAKE_WEIGHT(0),
- W1 = MAKE_WEIGHT(1),
- W2 = MAKE_WEIGHT(2),
+} W0 = MAKE_WEIGHT(0), W1 = MAKE_WEIGHT(1), W2 = MAKE_WEIGHT(2),
W3 = MAKE_WEIGHT(3);
- // ... the rest is symmetric.
+// ... the rest is symmetric.
#undef MAKE_WEIGHT
#undef PWEIGHT
#endif
// Common final expression for SSIM, once the weighted sums are known.
-static double FinalizeSSIM(double iw, double xm, double ym,
- double xxm, double xym, double yym) {
+static double FinalizeSSIM(double iw,
+ double xm,
+ double ym,
+ double xxm,
+ double xym,
+ double yym) {
const double iwx = xm * iw;
const double iwy = ym * iw;
double sxx = xxm * iw - iwx * iwx;
double syy = yym * iw - iwy * iwy;
// small errors are possible, due to rounding. Clamp to zero.
- if (sxx < 0.) sxx = 0.;
- if (syy < 0.) syy = 0.;
+ if (sxx < 0.)
+ sxx = 0.;
+ if (syy < 0.)
+ syy = 0.;
const double sxsy = sqrt(sxx * syy);
const double sxy = xym * iw - iwx * iwy;
static const double C11 = (0.01 * 0.01) * (255 * 255);
static const double C22 = (0.03 * 0.03) * (255 * 255);
static const double C33 = (0.015 * 0.015) * (255 * 255);
const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
- const double c = (2. * sxsy + C22) / (sxx + syy + C22);
+ const double c = (2. * sxsy + C22) / (sxx + syy + C22);
const double s = (sxy + C33) / (sxsy + C33);
return l * c * s;
}
@@ -98,15 +109,21 @@ static double FinalizeSSIM(double iw, double xm, double ym,
// Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1)
// with a diff of 255, squared. The maximum error is thus 0x4388241,
// which fits into 32 bits integers.
-double GetSSIM(const uint8 *org, const uint8 *rec,
- int xo, int yo, int W, int H, int stride) {
+double GetSSIM(const uint8* org,
+ const uint8* rec,
+ int xo,
+ int yo,
+ int W,
+ int H,
+ int stride) {
uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
org += (yo - KERNEL) * stride;
org += (xo - KERNEL);
rec += (yo - KERNEL) * stride;
rec += (xo - KERNEL);
for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {
- if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) continue;
+ if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H))
+ continue;
const int Wy = K[y_];
for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {
const int Wxy = Wy * K[x_];
@@ -114,8 +131,8 @@ double GetSSIM(const uint8 *org, const uint8 *rec,
const int org_x = org[x_];
const int rec_x = rec[x_];
ws += Wxy;
- xm += Wxy * org_x;
- ym += Wxy * rec_x;
+ xm += Wxy * org_x;
+ ym += Wxy * rec_x;
xxm += Wxy * org_x * org_x;
xym += Wxy * org_x * rec_x;
yym += Wxy * rec_x * rec_x;
@@ -125,8 +142,11 @@ double GetSSIM(const uint8 *org, const uint8 *rec,
return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
}
-double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
- int xo, int yo, int stride,
+double GetSSIMFullKernel(const uint8* org,
+ const uint8* rec,
+ int xo,
+ int yo,
+ int stride,
double area_weight) {
uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
@@ -161,8 +181,8 @@ double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
const int ll2 = rec[dy2 - x];
const int lr2 = rec[dy2 + x];
- xm += Wxy * (ul1 + ur1 + ll1 + lr1);
- ym += Wxy * (ul2 + ur2 + ll2 + lr2);
+ xm += Wxy * (ul1 + ur1 + ll1 + lr1);
+ ym += Wxy * (ul2 + ur2 + ll2 + lr2);
xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);
xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);
yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);
@@ -189,8 +209,8 @@ double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
const int l2 = rec[-y];
const int r2 = rec[y];
- xm += Wxy * (u1 + d1 + l1 + r1);
- ym += Wxy * (u2 + d2 + l2 + r2);
+ xm += Wxy * (u1 + d1 + l1 + r1);
+ ym += Wxy * (u2 + d2 + l2 + r2);
xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);
xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);
yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);
@@ -201,13 +221,13 @@ double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
const int s1 = org[0];
const int s2 = rec[0];
- xm += Wxy * s1;
- ym += Wxy * s2;
+ xm += Wxy * s1;
+ ym += Wxy * s2;
xxm += Wxy * s1 * s1;
xym += Wxy * s1 * s2;
yym += Wxy * s2 * s2;
-#else // __SSE2__
+#else // __SSE2__
org += (yo - KERNEL) * stride + (xo - KERNEL);
rec += (yo - KERNEL) * stride + (xo - KERNEL);
@@ -221,29 +241,31 @@ double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
// Read 8 pixels at line #L, and convert to 16bit, perform weighting
// and accumulate.
-#define LOAD_LINE_PAIR(L, WEIGHT) do { \
- const __m128i v0 = \
- _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L) * stride)); \
- const __m128i v1 = \
- _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L) * stride)); \
- const __m128i w0 = _mm_unpacklo_epi8(v0, zero); \
- const __m128i w1 = _mm_unpacklo_epi8(v1, zero); \
- const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_); \
- const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_); \
- x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero)); \
- y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero)); \
- x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero)); \
- y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero)); \
- xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0)); \
- xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1)); \
- yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1)); \
-} while (0)
-
-#define ADD_AND_STORE_FOUR_EPI32(M, OUT) do { \
- uint32 tmp[4]; \
- _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \
- (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0]; \
-} while (0)
+#define LOAD_LINE_PAIR(L, WEIGHT) \
+ do { \
+ const __m128i v0 = \
+ _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L)*stride)); \
+ const __m128i v1 = \
+ _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L)*stride)); \
+ const __m128i w0 = _mm_unpacklo_epi8(v0, zero); \
+ const __m128i w1 = _mm_unpacklo_epi8(v1, zero); \
+ const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_); \
+ const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_); \
+ x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero)); \
+ y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero)); \
+ x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero)); \
+ y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero)); \
+ xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0)); \
+ xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1)); \
+ yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1)); \
+ } while (0)
+
+#define ADD_AND_STORE_FOUR_EPI32(M, OUT) \
+ do { \
+ uint32 tmp[4]; \
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \
+ (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0]; \
+ } while (0)
LOAD_LINE_PAIR(0, W0);
LOAD_LINE_PAIR(1, W1);
@@ -266,10 +288,14 @@ double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);
}
-static int start_max(int x, int y) { return (x > y) ? x : y; }
+static int start_max(int x, int y) {
+ return (x > y) ? x : y;
+}
-double CalcSSIM(const uint8 *org, const uint8 *rec,
- const int image_width, const int image_height) {
+double CalcSSIM(const uint8* org,
+ const uint8* rec,
+ const int image_width,
+ const int image_height) {
double SSIM = 0.;
const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;
const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL;
@@ -284,7 +310,7 @@ double CalcSSIM(const uint8 *org, const uint8 *rec,
}
#ifdef _OPENMP
- #pragma omp parallel for reduction(+: SSIM)
+#pragma omp parallel for reduction(+ : SSIM)
#endif
for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {
for (int i = 0; i < KERNEL_X; ++i) {
@@ -302,8 +328,8 @@ double CalcSSIM(const uint8 *org, const uint8 *rec,
// NOTE: we could use a similar method for the left-most pixels too.
const int kScratchWidth = 8;
const int kScratchStride = kScratchWidth + KERNEL + 1;
- uint8 scratch_org[KERNEL_SIZE * kScratchStride] = { 0 };
- uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = { 0 };
+ uint8 scratch_org[KERNEL_SIZE * kScratchStride] = {0};
+ uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = {0};
for (int k = 0; k < KERNEL_SIZE; ++k) {
const int offset =
@@ -311,9 +337,9 @@ double CalcSSIM(const uint8 *org, const uint8 *rec,
memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);
memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);
}
- for (int k = 0; k <= KERNEL_X + 1; ++k) {
- SSIM += GetSSIMFullKernel(scratch_org, scratch_rec,
- KERNEL + k, KERNEL, kScratchStride, kiW[k]);
+ for (int k = 0; k <= KERNEL_X + 1; ++k) {
+ SSIM += GetSSIMFullKernel(scratch_org, scratch_rec, KERNEL + k, KERNEL,
+ kScratchStride, kiW[k]);
}
}
}
@@ -333,4 +359,3 @@ double CalcLSSIM(double ssim) {
#ifdef __cplusplus
} // extern "C"
#endif
-
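
Two reference points for the ssim.cc hunks above. FinalizeSSIM evaluates the
standard three-factor SSIM from weighted means and (co)variances; a sketch
with illustrative names:

  #include <math.h>

  static double SsimFromMoments(double mx, double my, double sxx, double syy,
                                double sxy) {
    const double C1 = (0.01 * 255) * (0.01 * 255);
    const double C2 = (0.03 * 255) * (0.03 * 255);
    const double C3 = (0.015 * 255) * (0.015 * 255);  // not the usual C2 / 2
    const double l = (2. * mx * my + C1) / (mx * mx + my * my + C1);  // luminance
    const double c = (2. * sqrt(sxx * syy) + C2) / (sxx + syy + C2);  // contrast
    const double s = (sxy + C3) / (sqrt(sxx * syy) + C3);             // structure
    return l * c * s;
  }

And the overflow note in GetSSIM checks out: the kernel K sums to
1 + 3 + 7 + 11 + 7 + 3 + 1 = 33, so a full window has total weight
33 * 33 = 1089 (hence kiW[0] = 1. / 1089.), and the largest possible
accumulator value is 1089 * 255 * 255 = 70812225 = 0x4388241, comfortably
inside uint32 range.
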
diff --git a/files/util/ssim.h b/files/util/ssim.h
index 430eb71c..4647f45d 100644
--- a/files/util/ssim.h
+++ b/files/util/ssim.h
@@ -10,7 +10,7 @@
// Get SSIM for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format
-#ifndef UTIL_SSIM_H_ // NOLINT
+#ifndef UTIL_SSIM_H_
#define UTIL_SSIM_H_
#include <math.h> // For log10()
@@ -24,8 +24,10 @@ typedef unsigned char uint8;
#define UINT8_TYPE_DEFINED
#endif
-double CalcSSIM(const uint8* org, const uint8* rec,
- const int image_width, const int image_height);
+double CalcSSIM(const uint8* org,
+ const uint8* rec,
+ const int image_width,
+ const int image_height);
double CalcLSSIM(double ssim);
@@ -33,4 +35,4 @@ double CalcLSSIM(double ssim);
} // extern "C"
#endif
-#endif // UTIL_SSIM_H_ // NOLINT
+#endif // UTIL_SSIM_H_
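
A minimal usage sketch for the prototypes above, comparing two equally sized
Y planes (buffer setup elided; CalcLSSIM maps an SSIM score onto a
logarithmic scale that spreads out values close to 1.0):

  #include <stdio.h>
  #include "ssim.h"

  void ReportSsim(const uint8* org, const uint8* rec, int width, int height) {
    const double ssim = CalcSSIM(org, rec, width, height);
    printf("SSIM %10.6f LSSIM %10.6f\n", ssim, CalcLSSIM(ssim));
  }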