author:    Frank Barchard <fbarchard@google.com>  2017-02-22 18:01:07 -0800
committer: Frank Barchard <fbarchard@google.com>  2017-03-06 09:54:15 -0800
commit:    b83bb38f0a92bedeb52baa31e515220927ef53bb (patch)
tree:      a31c9da19db3f909cad22293ad2964d1c41c953a
parent:    04676c9f110180a5ae1fa259a38fab17101c6b5b (diff)
download:  libyuv-b83bb38f0a92bedeb52baa31e515220927ef53bb.tar.gz
libyuv r1645 to fix android build warnings
r1602 built under Android.mk had unused-parameter build warnings, which
were disabled with -Wno-unused-parameter. This CL fixes the source and
re-enables the warning.
Bug: 35099807
Test: mm for libyuv builds cleanly.
Change-Id: If6b344ca39b2c321e277421cdeb817a5b1cc2514
132 files changed, 28557 insertions, 15381 deletions
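The warning fix removes `-Wno-unused-parameter` from `common_CFLAGS` in files/Android.mk (see the Android.mk hunk below), so each offending function had to be corrected at the source level instead. A minimal sketch of the usual pattern, using a hypothetical function rather than code from this CL:

    // Hypothetical C fallback: 'stride' is consumed only by the assembly
    // path, so it triggers -Wunused-parameter once the suppression is gone.
    #include <stdint.h>

    void CopyRow_C(const uint8_t* src, uint8_t* dst, int width, int stride) {
      (void)stride;  // Explicitly mark the parameter as intentionally unused.
      for (int i = 0; i < width; ++i) {
        dst[i] = src[i];
      }
    }

Typical fixes either drop the unused parameter from the signature entirely or, when the signature must stay compatible with the optimized variants, add a `(void)param;` cast as above.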
diff --git a/README.version b/README.version index af9d7c6e..6d6e0d4b 100644 --- a/README.version +++ b/README.version @@ -1,3 +1,3 @@ -Version: r1602 +Version: r1645 BugComponent: 42195 Owner: lajos diff --git a/files/Android.mk b/files/Android.mk index 217114ec..cc17bde2 100644 --- a/files/Android.mk +++ b/files/Android.mk @@ -1,4 +1,4 @@ -# This is the Android makefile for libyuv for both platform and NDK. +# This is the Android makefile for libyuv for NDK. LOCAL_PATH:= $(call my-dir) include $(CLEAR_VARS) @@ -8,45 +8,48 @@ LOCAL_CPP_EXTENSION := .cc LOCAL_SRC_FILES := \ source/compare.cc \ source/compare_common.cc \ - source/compare_neon64.cc \ source/compare_gcc.cc \ + source/compare_neon.cc \ + source/compare_neon64.cc \ source/convert.cc \ source/convert_argb.cc \ source/convert_from.cc \ source/convert_from_argb.cc \ + source/convert_jpeg.cc \ source/convert_to_argb.cc \ source/convert_to_i420.cc \ source/cpu_id.cc \ + source/mjpeg_decoder.cc \ + source/mjpeg_validate.cc \ source/planar_functions.cc \ source/rotate.cc \ source/rotate_any.cc \ source/rotate_argb.cc \ source/rotate_common.cc \ - source/rotate_mips.cc \ - source/rotate_neon64.cc \ + source/rotate_dspr2.cc \ source/rotate_gcc.cc \ + source/rotate_msa.cc \ + source/rotate_neon.cc \ + source/rotate_neon64.cc \ source/row_any.cc \ source/row_common.cc \ - source/row_mips.cc \ + source/row_dspr2.cc \ + source/row_gcc.cc \ + source/row_msa.cc \ + source/row_neon.cc \ source/row_neon64.cc \ - source/row_gcc.cc \ source/scale.cc \ source/scale_any.cc \ source/scale_argb.cc \ source/scale_common.cc \ - source/scale_mips.cc \ - source/scale_neon64.cc \ + source/scale_dspr2.cc \ source/scale_gcc.cc \ - source/video_common.cc \ - source/compare_neon.cc \ - source/rotate_neon.cc \ - source/row_neon.cc \ + source/scale_msa.cc \ source/scale_neon.cc \ - source/mjpeg_decoder.cc \ - source/convert_jpeg.cc \ - source/mjpeg_validate.cc + source/scale_neon64.cc \ + source/video_common.cc -common_CFLAGS := -Wall -fexceptions -DHAVE_JPEG -Wno-unused-parameter +common_CFLAGS := -Wall -fexceptions -DHAVE_JPEG LOCAL_CFLAGS += $(common_CFLAGS) LOCAL_SHARED_LIBRARIES := libjpeg LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/include diff --git a/files/BUILD.gn b/files/BUILD.gn index b091cbc2..57771b72 100644 --- a/files/BUILD.gn +++ b/files/BUILD.gn @@ -6,19 +6,37 @@ # in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. -import("//build/config/arm.gni") -import("//build/config/sanitizers/sanitizers.gni") +import("libyuv.gni") +import("//testing/test.gni") config("libyuv_config") { - include_dirs = [ - ".", - "include", - ] + include_dirs = [ "include" ] + if (is_android && current_cpu=="arm64") { + ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker64" ] + } + if (is_android && current_cpu != "arm64") { + ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker" ] + } } -use_neon = current_cpu == "arm64" || (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon)) +# This target is built when no specific target is specified on the command line. 
+group("default") { + testonly = true + deps = [ + ":libyuv", + ] + if (libyuv_include_tests) { + deps += [ + ":compare", + ":convert", + ":cpuid", + ":libyuv_unittest", + ":psnr", + ] + } +} -source_set("libyuv") { +static_library("libyuv") { sources = [ # Headers "include/libyuv.h", @@ -61,57 +79,56 @@ source_set("libyuv") { "source/rotate_any.cc", "source/rotate_argb.cc", "source/rotate_common.cc", - "source/rotate_mips.cc", + "source/rotate_dspr2.cc", "source/rotate_gcc.cc", "source/rotate_win.cc", "source/row_any.cc", "source/row_common.cc", - "source/row_mips.cc", + "source/row_dspr2.cc", "source/row_gcc.cc", "source/row_win.cc", "source/scale.cc", "source/scale_any.cc", "source/scale_argb.cc", "source/scale_common.cc", - "source/scale_mips.cc", + "source/scale_dspr2.cc", "source/scale_gcc.cc", "source/scale_win.cc", "source/video_common.cc", ] - configs -= [ "//build/config/compiler:chromium_code" ] - configs += [ "//build/config/compiler:no_chromium_code" ] - public_configs = [ ":libyuv_config" ] defines = [] + deps = [] if (!is_ios) { defines += [ "HAVE_JPEG" ] + deps += [ "//third_party:jpeg" ] } - if (is_msan) { - # MemorySanitizer does not support assembly code yet. - # http://crbug.com/344505 - defines += [ "LIBYUV_DISABLE_X86" ] + if (libyuv_use_neon) { + deps += [ ":libyuv_neon" ] } - deps = [ - "//third_party:jpeg", - ] - - if (use_neon) { - deps += [ ":libyuv_neon" ] + if (libyuv_use_msa) { + deps += [ ":libyuv_msa" ] } - if (is_nacl) { - # Always enable optimization under NaCl to workaround crbug.com/538243 . + # Always enable optimization for Release and NaCl builds (to workaround + # crbug.com/538243). + if (!is_debug || is_nacl) { configs -= [ "//build/config/compiler:default_optimization" ] + # Enable optimize for speed (-O2) over size (-Os). configs += [ "//build/config/compiler:optimize_max" ] } + + # To enable AVX2 or other cpu optimization, pass flag here + # cflags = [ "-mavx2" ] + } -if (use_neon) { +if (libyuv_use_neon) { static_library("libyuv_neon") { sources = [ # ARM Source Files @@ -127,9 +144,163 @@ if (use_neon) { public_configs = [ ":libyuv_config" ] + # Always enable optimization for Release and NaCl builds (to workaround + # crbug.com/538243). + if (!is_debug) { + configs -= [ "//build/config/compiler:default_optimization" ] + # Enable optimize for speed (-O2) over size (-Os). + configs += [ "//build/config/compiler:optimize_max" ] + } + if (current_cpu != "arm64") { configs -= [ "//build/config/compiler:compiler_arm_fpu" ] cflags = [ "-mfpu=neon" ] } } } + +if (libyuv_use_msa) { + static_library("libyuv_msa") { + sources = [ + # MSA Source Files + "source/row_msa.cc", + "source/scale_msa.cc", + "source/rotate_msa.cc", + ] + + public_configs = [ ":libyuv_config" ] + } +} + +if (libyuv_include_tests) { + config("libyuv_unittest_warnings_config") { + if (!is_win) { + cflags = [ + # TODO(fbarchard): Fix sign and unused variable warnings. 
+ "-Wno-sign-compare", + "-Wno-unused-variable" + ] + } + if (is_win) { + cflags = [ + "/wd4245", # signed/unsigned mismatch + "/wd4189", # local variable is initialized but not referenced + ] + } + } + config("libyuv_unittest_config") { + defines = [ "GTEST_RELATIVE_PATH" ] + } + + test("libyuv_unittest") { + testonly = true + + sources = [ + # headers + "unit_test/unit_test.h", + # sources + "unit_test/basictypes_test.cc", + "unit_test/compare_test.cc", + "unit_test/color_test.cc", + "unit_test/convert_test.cc", + "unit_test/cpu_test.cc", + "unit_test/math_test.cc", + "unit_test/planar_test.cc", + "unit_test/rotate_argb_test.cc", + "unit_test/rotate_test.cc", + "unit_test/scale_argb_test.cc", + "unit_test/scale_test.cc", + "unit_test/unit_test.cc", + "unit_test/video_common_test.cc", + ] + + deps = [ + ":libyuv", + "//testing/gtest", + "//third_party/gflags", + ] + + configs += [ ":libyuv_unittest_warnings_config" ] + + public_deps = [ "//testing/gtest" ] + public_configs = [ ":libyuv_unittest_config" ] + + defines = [] + + if (is_linux) { + cflags = [ "-fexceptions" ] + } + if (is_ios) { + configs -= [ "//build/config/compiler:default_symbols" ] + configs += [ "//build/config/compiler:symbols" ] + cflags = [ "-Wno-sometimes-uninitialized" ] + } + if (!is_ios && !libyuv_disable_jpeg) { + defines += [ "HAVE_JPEG" ] + } + if (is_android) { + deps += [ "//testing/android/native_test:native_test_native_code" ] + } + + # TODO(YangZhang): These lines can be removed when high accuracy + # YUV to RGB to Neon is ported. + if ((target_cpu=="armv7" || target_cpu=="armv7s" || + (target_cpu=="arm" && arm_version >= 7) || target_cpu=="arm64") && + (arm_use_neon || arm_optionally_use_neon)) { + defines += [ "LIBYUV_NEON" ] + } + + defines += [ + # Enable the following 3 macros to turn off assembly for specified CPU. + # "LIBYUV_DISABLE_X86", + # "LIBYUV_DISABLE_NEON", + # "LIBYUV_DISABLE_DSPR2", + # Enable the following macro to build libyuv as a shared library (dll). + # "LIBYUV_USING_SHARED_LIBRARY" + ] + } + + executable("compare") { + sources = [ + # sources + "util/compare.cc" + ] + deps = [ ":libyuv" ] + if (is_linux) { + cflags = [ "-fexceptions" ] + } + } + + executable("convert") { + sources = [ + # sources + "util/convert.cc" + ] + deps = [ ":libyuv" ] + if (is_linux) { + cflags = [ "-fexceptions" ] + } + } + + executable("psnr") { + sources = [ + # sources + "util/psnr_main.cc", + "util/psnr.cc", + "util/ssim.cc" + ] + deps = [ ":libyuv" ] + + if (!is_ios && !libyuv_disable_jpeg) { + defines = [ "HAVE_JPEG" ] + } + } + + executable("cpuid") { + sources = [ + # sources + "util/cpuid.c" + ] + deps = [ ":libyuv" ] + } +} diff --git a/files/CM_linux_packages.cmake b/files/CM_linux_packages.cmake new file mode 100644 index 00000000..5f676f89 --- /dev/null +++ b/files/CM_linux_packages.cmake @@ -0,0 +1,69 @@ +# determine the version number from the #define in libyuv/version.h +EXECUTE_PROCESS ( + COMMAND grep --perl-regex --only-matching "(?<=LIBYUV_VERSION )[0-9]+" include/libyuv/version.h + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + OUTPUT_VARIABLE YUV_VERSION_NUMBER + OUTPUT_STRIP_TRAILING_WHITESPACE ) +SET ( YUV_VER_MAJOR 0 ) +SET ( YUV_VER_MINOR 0 ) +SET ( YUV_VER_PATCH ${YUV_VERSION_NUMBER} ) +SET ( YUV_VERSION ${YUV_VER_MAJOR}.${YUV_VER_MINOR}.${YUV_VER_PATCH} ) +MESSAGE ( "Building ver.: ${YUV_VERSION}" ) + +# is this a 32-bit or 64-bit build? 
+IF ( CMAKE_SIZEOF_VOID_P EQUAL 8 ) + SET ( YUV_BIT_SIZE 64 ) +ELSEIF ( CMAKE_SIZEOF_VOID_P EQUAL 4 ) + SET ( YUV_BIT_SIZE 32 ) +ELSE () + MESSAGE ( FATAL_ERROR "CMAKE_SIZEOF_VOID_P=${CMAKE_SIZEOF_VOID_P}" ) +ENDIF () + +# detect if this is a ARM build +STRING (FIND "${CMAKE_CXX_COMPILER}" "arm-linux-gnueabihf-g++" pos) +IF ( ${pos} EQUAL -1 ) + SET ( YUV_CROSS_COMPILE_FOR_ARM7 FALSE ) +ELSE () + MESSAGE ( "Cross compiling for ARM7" ) + SET ( YUV_CROSS_COMPILE_FOR_ARM7 TRUE ) +ENDIF () +STRING (FIND "${CMAKE_SYSTEM_PROCESSOR}" "arm" pos) +IF ( ${pos} EQUAL -1 ) + SET ( YUV_COMPILE_FOR_ARM7 FALSE ) +ELSE () + MESSAGE ( "Compiling for ARM" ) + SET ( YUV_COMPILE_FOR_ARM7 TRUE ) +ENDIF () + +# setup the sytem name, such as "x86-32", "amd-64", and "arm-32 +IF ( ${YUV_CROSS_COMPILE_FOR_ARM7} OR ${YUV_COMPILE_FOR_ARM7} ) + SET ( YUV_SYSTEM_NAME "armhf-${YUV_BIT_SIZE}" ) +ELSE () + IF ( YUV_BIT_SIZE EQUAL 32 ) + SET ( YUV_SYSTEM_NAME "x86-${YUV_BIT_SIZE}" ) + ELSE () + SET ( YUV_SYSTEM_NAME "amd-${YUV_BIT_SIZE}" ) + ENDIF () +ENDIF () +MESSAGE ( "Packaging for: ${YUV_SYSTEM_NAME}" ) + +# define all the variables needed by CPack to create .deb and .rpm packages +SET ( CPACK_PACKAGE_VENDOR "Frank Barchard" ) +SET ( CPACK_PACKAGE_CONTACT "fbarchard@chromium.org" ) +SET ( CPACK_PACKAGE_VERSION ${YUV_VERSION} ) +SET ( CPACK_PACKAGE_VERSION_MAJOR ${YUV_VER_MAJOR} ) +SET ( CPACK_PACKAGE_VERSION_MINOR ${YUV_VER_MINOR} ) +SET ( CPACK_PACKAGE_VERSION_PATCH ${YUV_VER_PATCH} ) +SET ( CPACK_RESOURCE_FILE_LICENSE ${PROJECT_SOURCE_DIR}/LICENSE ) +SET ( CPACK_SYSTEM_NAME "linux-${YUV_SYSTEM_NAME}" ) +SET ( CPACK_PACKAGE_NAME "libyuv" ) +SET ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "YUV library" ) +SET ( CPACK_PACKAGE_DESCRIPTION "YUV library and YUV conversion tool" ) +SET ( CPACK_DEBIAN_PACKAGE_SECTION "other" ) +SET ( CPACK_DEBIAN_PACKAGE_PRIORITY "optional" ) +SET ( CPACK_DEBIAN_PACKAGE_MAINTAINER "Frank Barchard <fbarchard@chromium.org>" ) +SET ( CPACK_GENERATOR "DEB;RPM" ) + +# create the .deb and .rpm files (you'll need build-essential and rpm tools) +INCLUDE( CPack ) + diff --git a/files/CMakeLists.txt b/files/CMakeLists.txt index 718b47ad..7c95487f 100644 --- a/files/CMakeLists.txt +++ b/files/CMakeLists.txt @@ -1,110 +1,45 @@ -cmake_minimum_required(VERSION 2.8) - # CMakeLists for libyuv # Originally created for "roxlu build system" to compile libyuv on windows # Run with -DTEST=ON to build unit tests -option(TEST "Built unit tests" OFF) - -set(ly_base_dir ${CMAKE_CURRENT_LIST_DIR}) -set(ly_src_dir ${ly_base_dir}/source/) -set(ly_inc_dir ${ly_base_dir}/include) -set(ly_lib_name "yuv") - -set(ly_source_files - ${ly_src_dir}/compare.cc - ${ly_src_dir}/compare_common.cc - ${ly_src_dir}/compare_neon.cc - ${ly_src_dir}/compare_neon64.cc - ${ly_src_dir}/compare_gcc.cc - ${ly_src_dir}/compare_win.cc - ${ly_src_dir}/convert.cc - ${ly_src_dir}/convert_argb.cc - ${ly_src_dir}/convert_from.cc - ${ly_src_dir}/convert_from_argb.cc - ${ly_src_dir}/convert_jpeg.cc - ${ly_src_dir}/convert_to_argb.cc - ${ly_src_dir}/convert_to_i420.cc - ${ly_src_dir}/cpu_id.cc - ${ly_src_dir}/mjpeg_decoder.cc - ${ly_src_dir}/mjpeg_validate.cc - ${ly_src_dir}/planar_functions.cc - ${ly_src_dir}/rotate.cc - ${ly_src_dir}/rotate_any.cc - ${ly_src_dir}/rotate_argb.cc - ${ly_src_dir}/rotate_common.cc - ${ly_src_dir}/rotate_mips.cc - ${ly_src_dir}/rotate_neon.cc - ${ly_src_dir}/rotate_neon64.cc - ${ly_src_dir}/rotate_gcc.cc - ${ly_src_dir}/rotate_win.cc - ${ly_src_dir}/row_any.cc - ${ly_src_dir}/row_common.cc - ${ly_src_dir}/row_mips.cc - 
${ly_src_dir}/row_neon.cc - ${ly_src_dir}/row_neon64.cc - ${ly_src_dir}/row_gcc.cc - ${ly_src_dir}/row_win.cc - ${ly_src_dir}/scale.cc - ${ly_src_dir}/scale_any.cc - ${ly_src_dir}/scale_argb.cc - ${ly_src_dir}/scale_common.cc - ${ly_src_dir}/scale_mips.cc - ${ly_src_dir}/scale_neon.cc - ${ly_src_dir}/scale_neon64.cc - ${ly_src_dir}/scale_gcc.cc - ${ly_src_dir}/scale_win.cc - ${ly_src_dir}/video_common.cc -) - -set(ly_unittest_sources - ${ly_base_dir}/unit_test/basictypes_test.cc - ${ly_base_dir}/unit_test/color_test.cc - ${ly_base_dir}/unit_test/compare_test.cc - ${ly_base_dir}/unit_test/convert_test.cc - ${ly_base_dir}/unit_test/cpu_test.cc - ${ly_base_dir}/unit_test/math_test.cc - ${ly_base_dir}/unit_test/planar_test.cc - ${ly_base_dir}/unit_test/rotate_argb_test.cc - ${ly_base_dir}/unit_test/rotate_test.cc - ${ly_base_dir}/unit_test/scale_argb_test.cc - ${ly_base_dir}/unit_test/scale_test.cc - ${ly_base_dir}/unit_test/unit_test.cc - ${ly_base_dir}/unit_test/video_common_test.cc -) - -set(ly_header_files - ${ly_inc_dir}/libyuv/basic_types.h - ${ly_inc_dir}/libyuv/compare.h - ${ly_inc_dir}/libyuv/convert.h - ${ly_inc_dir}/libyuv/convert_argb.h - ${ly_inc_dir}/libyuv/convert_from.h - ${ly_inc_dir}/libyuv/convert_from_argb.h - ${ly_inc_dir}/libyuv/cpu_id.h - ${ly_inc_dir}/libyuv/planar_functions.h - ${ly_inc_dir}/libyuv/rotate.h - ${ly_inc_dir}/libyuv/rotate_argb.h - ${ly_inc_dir}/libyuv/rotate_row.h - ${ly_inc_dir}/libyuv/row.h - ${ly_inc_dir}/libyuv/scale.h - ${ly_inc_dir}/libyuv/scale_argb.h - ${ly_inc_dir}/libyuv/scale_row.h - ${ly_inc_dir}/libyuv/version.h - ${ly_inc_dir}/libyuv/video_common.h - ${ly_inc_dir}/libyuv/mjpeg_decoder.h -) - -include_directories(${ly_inc_dir}) - -add_library(${ly_lib_name} STATIC ${ly_source_files}) - -add_executable(convert ${ly_base_dir}/util/convert.cc) -target_link_libraries(convert ${ly_lib_name}) - -include(FindJPEG) + +PROJECT ( YUV C CXX ) # "C" is required even for C++ projects +CMAKE_MINIMUM_REQUIRED( VERSION 2.8 ) +OPTION( TEST "Built unit tests" OFF ) + +SET ( ly_base_dir ${PROJECT_SOURCE_DIR} ) +SET ( ly_src_dir ${ly_base_dir}/source ) +SET ( ly_inc_dir ${ly_base_dir}/include ) +SET ( ly_tst_dir ${ly_base_dir}/unit_test ) +SET ( ly_lib_name yuv ) +SET ( ly_lib_static ${ly_lib_name} ) +SET ( ly_lib_shared ${ly_lib_name}_shared ) + +FILE ( GLOB_RECURSE ly_source_files ${ly_src_dir}/*.cc ) +LIST ( SORT ly_source_files ) + +FILE ( GLOB_RECURSE ly_unittest_sources ${ly_tst_dir}/*.cc ) +LIST ( SORT ly_unittest_sources ) + +INCLUDE_DIRECTORIES( BEFORE ${ly_inc_dir} ) + +# this creates the static library (.a) +ADD_LIBRARY ( ${ly_lib_static} STATIC ${ly_source_files} ) + +# this creates the shared library (.so) +ADD_LIBRARY ( ${ly_lib_shared} SHARED ${ly_source_files} ) +SET_TARGET_PROPERTIES ( ${ly_lib_shared} PROPERTIES OUTPUT_NAME "${ly_lib_name}" ) +SET_TARGET_PROPERTIES ( ${ly_lib_shared} PROPERTIES PREFIX "lib" ) + +# this creates the conversion tool +ADD_EXECUTABLE ( convert ${ly_base_dir}/util/convert.cc ) +TARGET_LINK_LIBRARIES ( convert ${ly_lib_static} ) + + +INCLUDE ( FindJPEG ) if (JPEG_FOUND) - include_directories(${JPEG_INCLUDE_DIR}) - target_link_libraries(convert ${JPEG_LIBRARY}) - add_definitions(-DHAVE_JPEG) + include_directories( ${JPEG_INCLUDE_DIR} ) + target_link_libraries( convert ${JPEG_LIBRARY} ) + add_definitions( -DHAVE_JPEG ) endif() if(TEST) @@ -128,15 +63,21 @@ if(TEST) if (JPEG_FOUND) target_link_libraries(libyuv_unittest ${JPEG_LIBRARY}) endif() - + if(NACL AND NACL_LIBC STREQUAL "newlib") 
target_link_libraries(libyuv_unittest glibc-compat) endif() target_link_libraries(libyuv_unittest gflags) - endif() -install(TARGETS ${ly_lib_name} DESTINATION lib) -install(FILES ${ly_header_files} DESTINATION include/libyuv) -install(FILES ${ly_inc_dir}/libyuv.h DESTINATION include/) + +# install the conversion tool, .so, .a, and all the header files +INSTALL ( PROGRAMS ${CMAKE_BINARY_DIR}/convert DESTINATION bin RENAME yuvconvert ) +INSTALL ( TARGETS ${ly_lib_static} DESTINATION lib ) +INSTALL ( TARGETS ${ly_lib_shared} LIBRARY DESTINATION lib ) +INSTALL ( DIRECTORY ${PROJECT_SOURCE_DIR}/include/ DESTINATION include ) + +# create the .deb and .rpm packages using cpack +INCLUDE ( CM_linux_packages.cmake ) + @@ -2,41 +2,452 @@ vars = { # Override root_dir in your .gclient's custom_vars to specify a custom root # folder name. 'root_dir': 'libyuv', - 'extra_gyp_flag': '-Dextra_gyp_flag=0', 'chromium_git': 'https://chromium.googlesource.com', - - # Roll the Chromium Git hash to pick up newer versions of all the - # dependencies and tools linked to in setup_links.py. - 'chromium_revision': '2a818f54130d8c93f81490adce5a1e87307bf5f0', + 'chromium_revision': '222a3fe7a738486a887bb53cffb8e3b52376f609', + 'swarming_revision': 'ebc8dab6f8b8d79ec221c94de39a921145abd404', + # Three lines of non-changing comments so that + # the commit queue can handle CLs rolling lss + # and whatever else without interference from each other. + 'lss_revision': '3f6478ac95edf86cd3da300c2c0d34a438f5dbeb', + # Three lines of non-changing comments so that + # the commit queue can handle CLs rolling catapult + # and whatever else without interference from each other. + 'catapult_revision': '4ee31ea3b497ffe08391e88a5434e0a340e48342', } -# NOTE: Prefer revision numbers to tags for svn deps. Use http rather than -# https; the latter can cause problems for users behind proxies. 
deps = { + Var('root_dir') + '/build': + Var('chromium_git') + '/chromium/src/build' + '@' + '47e07d6798693fd71c02e25097c97865b5271c40', + Var('root_dir') + '/buildtools': + Var('chromium_git') + '/chromium/buildtools.git' + '@' + 'a7cc7a3e21a061975b33dcdcd81a9716ba614c3c', + Var('root_dir') + '/testing': + Var('chromium_git') + '/chromium/src/testing' + '@' + '178a302b13e943c679f3bbeb0a7e511f7c318404', + Var('root_dir') + '/testing/gtest': + Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + '6f8a66431cb592dad629028a50b3dd418a408c87', + Var('root_dir') + '/testing/gmock': + Var('chromium_git') + '/external/googlemock.git' + '@' + '0421b6f358139f02e102c9c332ce19a33faf75be', # from svn revision 566 + Var('root_dir') + '/third_party': + Var('chromium_git') + '/chromium/src/third_party' + '@' + '4f196478f68c139a5deec388fd1f426a9251b4b0', + Var('root_dir') + '/third_party/catapult': + Var('chromium_git') + '/external/github.com/catapult-project/catapult.git' + '@' + Var('catapult_revision'), + Var('root_dir') + '/third_party/colorama/src': + Var('chromium_git') + '/external/colorama.git' + '@' + '799604a1041e9b3bc5d2789ecbd7e8db2e18e6b8', + Var('root_dir') + '/third_party/libjpeg_turbo': + Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + '7260e4d8b8e1e40b17f03fafdf1cd83296900f76', + Var('root_dir') + '/third_party/yasm/source/patched-yasm': + Var('chromium_git') + '/chromium/deps/yasm/patched-yasm.git' + '@' + '7da28c6c7c6a1387217352ce02b31754deb54d2a', + Var('root_dir') + '/tools': + Var('chromium_git') + '/chromium/src/tools' + '@' + '54fd165044db88eca930ab9d20a6340b76136d91', + Var('root_dir') + '/tools/gyp': + Var('chromium_git') + '/external/gyp.git' + '@' + 'e7079f0e0e14108ab0dba58728ff219637458563', + Var('root_dir') + '/tools/swarming_client': + Var('chromium_git') + '/external/swarming.client.git' + '@' + Var('swarming_revision'), + + # libyuv-only dependencies (not present in Chromium). 
+ Var('root_dir') + '/third_party/gflags': + Var('chromium_git') + '/external/webrtc/deps/third_party/gflags' + '@' + '892576179b45861b53e04a112996a738309cf364', Var('root_dir') + '/third_party/gflags/src': - Var('chromium_git') + '/external/github.com/gflags/gflags@03bebcb065c83beff83d50ae025a55a4bf94dfca', + Var('chromium_git') + '/external/github.com/gflags/gflags' + '@' + '03bebcb065c83beff83d50ae025a55a4bf94dfca', + Var('root_dir') + '/third_party/gtest-parallel': + Var('chromium_git') + '/external/webrtc/deps/third_party/gtest-parallel' + '@' + '8768563f5c580f8fc416a13c35c8f23b8a602821', +} + +deps_os = { + 'android': { + Var('root_dir') + '/base': + Var('chromium_git') + '/chromium/src/base' + '@' + 'b9d4d9b0e5373bbdb5403c68d51e7385d78a09d0', + Var('root_dir') + '/third_party/android_tools': + Var('chromium_git') + '/android_tools.git' + '@' + 'b43a6a289a7588b1769814f04dd6c7d7176974cc', + Var('root_dir') + '/third_party/ced/src': + Var('chromium_git') + '/external/github.com/google/compact_enc_det.git' + '@' + '368a9cc09ad868a3d28f0b5ad4a733f263c46409', + Var('root_dir') + '/third_party/icu': + Var('chromium_git') + '/chromium/deps/icu.git' + '@' + '9cd2828740572ba6f694b9365236a8356fd06147', + Var('root_dir') + '/third_party/jsr-305/src': + Var('chromium_git') + '/external/jsr-305.git' + '@' + '642c508235471f7220af6d5df2d3210e3bfc0919', + Var('root_dir') + '/third_party/junit/src': + Var('chromium_git') + '/external/junit.git' + '@' + '64155f8a9babcfcf4263cf4d08253a1556e75481', + Var('root_dir') + '/third_party/lss': + Var('chromium_git') + '/linux-syscall-support.git' + '@' + Var('lss_revision'), + Var('root_dir') + '/third_party/mockito/src': + Var('chromium_git') + '/external/mockito/mockito.git' + '@' + 'de83ad4598ad4cf5ea53c69a8a8053780b04b850', + Var('root_dir') + '/third_party/requests/src': + Var('chromium_git') + '/external/github.com/kennethreitz/requests.git' + '@' + 'f172b30356d821d180fa4ecfa3e71c7274a32de4', + Var('root_dir') + '/third_party/robolectric/robolectric': + Var('chromium_git') + '/external/robolectric.git' + '@' + 'e38b49a12fdfa17a94f0382cc8ffaf69132fd09b', + }, + 'ios': { + Var('root_dir') + '/ios': + Var('chromium_git') + '/chromium/src/ios' + '@' + '291daef6af7764f8475089c65808d52ee50b496e', + }, + 'unix': { + Var('root_dir') + '/third_party/lss': + Var('chromium_git') + '/linux-syscall-support.git' + '@' + Var('lss_revision'), + }, + 'win': { + # Dependencies used by libjpeg-turbo + Var('root_dir') + '/third_party/yasm/binaries': + Var('chromium_git') + '/chromium/deps/yasm/binaries.git' + '@' + '52f9b3f4b0aa06da24ef8b123058bb61ee468881', + }, } # Define rules for which include paths are allowed in our source. include_rules = [ '+gflags' ] +pre_deps_hooks = [ + { + # Remove any symlinks from before 177567c518b121731e507e9b9c4049c4dc96e4c8. + # TODO(kjellander): Remove this in March 2017. + 'name': 'cleanup_links', + 'pattern': '.', + 'action': ['python', Var('root_dir') + '/cleanup_links.py'], + }, +] + hooks = [ { - # Clone chromium and its deps. - 'name': 'sync chromium', + # This clobbers when necessary (based on get_landmines.py). It should be + # an early hook but it will need to be run after syncing Chromium and + # setting up the links, so the script actually exists. + 'name': 'landmines', + 'pattern': '.', + 'action': [ + 'python', + Var('root_dir') + '/build/landmines.py', + '--landmine-scripts', + Var('root_dir') + '/tools_libyuv/get_landmines.py', + '--src-dir', + Var('root_dir') + '', + ], + }, + # Android dependencies. 
Many are downloaded using Google Storage these days. + # They're copied from https://cs.chromium.org/chromium/src/DEPS for all + # such dependencies we share with Chromium. + { + # This downloads SDK extras and puts them in the + # third_party/android_tools/sdk/extras directory. + 'name': 'sdkextras', + 'pattern': '.', + # When adding a new sdk extras package to download, add the package + # directory and zip file to .gitignore in third_party/android_tools. + 'action': ['python', + Var('root_dir') + '/build/android/play_services/update.py', + 'download' + ], + }, + { + 'name': 'intellij', + 'pattern': '.', + 'action': ['python', + Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py', + 'download', + '-b', 'chromium-intellij', + '-l', 'third_party/intellij' + ], + }, + { + 'name': 'javax_inject', + 'pattern': '.', + 'action': ['python', + Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py', + 'download', + '-b', 'chromium-javax-inject', + '-l', 'third_party/javax_inject' + ], + }, + { + 'name': 'hamcrest', + 'pattern': '.', + 'action': ['python', + Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py', + 'download', + '-b', 'chromium-hamcrest', + '-l', 'third_party/hamcrest' + ], + }, + { + 'name': 'guava', + 'pattern': '.', + 'action': ['python', + Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py', + 'download', + '-b', 'chromium-guava', + '-l', 'third_party/guava' + ], + }, + { + 'name': 'android_support_test_runner', + 'pattern': '.', + 'action': ['python', + Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py', + 'download', + '-b', 'chromium-android-support-test-runner', + '-l', 'third_party/android_support_test_runner' + ], + }, + { + 'name': 'byte_buddy', + 'pattern': '.', + 'action': ['python', + Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py', + 'download', + '-b', 'chromium-byte-buddy', + '-l', 'third_party/byte_buddy' + ], + }, + { + 'name': 'espresso', + 'pattern': '.', + 'action': ['python', + Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py', + 'download', + '-b', 'chromium-espresso', + '-l', 'third_party/espresso' + ], + }, + { + 'name': 'robolectric_libs', + 'pattern': '.', + 'action': ['python', + Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py', + 'download', + '-b', 'chromium-robolectric', + '-l', 'third_party/robolectric' + ], + }, + { + 'name': 'apache_velocity', + 'pattern': '.', + 'action': ['python', + Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py', + 'download', + '-b', 'chromium-apache-velocity', + '-l', 'third_party/apache_velocity' + ], + }, + { + 'name': 'ow2_asm', + 'pattern': '.', + 'action': ['python', + Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py', + 'download', + '-b', 'chromium-ow2-asm', + '-l', 'third_party/ow2_asm' + ], + }, + { + 'name': 'icu4j', + 'pattern': '.', + 'action': ['python', + Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py', + 'download', + '-b', 'chromium-icu4j', + '-l', 'third_party/icu4j' + ], + }, + { + 'name': 'accessibility_test_framework', 'pattern': '.', - 'action': ['python', '-u', Var('root_dir') + '/sync_chromium.py', - '--target-revision', Var('chromium_revision')], + 'action': ['python', + Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py', + 'download', + '-b', 'chromium-accessibility-test-framework', + '-l', 
'third_party/accessibility_test_framework' + ], }, { - # Create links to shared dependencies in Chromium. - 'name': 'setup_links', + 'name': 'bouncycastle', 'pattern': '.', - 'action': ['python', Var('root_dir') + '/setup_links.py'], + 'action': ['python', + Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py', + 'download', + '-b', 'chromium-bouncycastle', + '-l', 'third_party/bouncycastle' + ], }, { - # A change to a .gyp, .gypi, or to GYP itself should run the generator. + 'name': 'sqlite4java', 'pattern': '.', - 'action': ['python', Var('root_dir') + '/gyp_libyuv'], + 'action': ['python', + Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py', + 'download', + '-b', 'chromium-sqlite4java', + '-l', 'third_party/sqlite4java' + ], }, + { + 'name': 'objenesis', + 'pattern': '.', + 'action': ['python', + Var('root_dir') + '/build/android/update_deps/update_third_party_deps.py', + 'download', + '-b', 'chromium-objenesis', + '-l', 'third_party/objenesis' + ], + }, + { + # Downloads the current stable linux sysroot to build/linux/ if needed. + # This sysroot updates at about the same rate that the chrome build deps + # change. This script is a no-op except for linux users who are doing + # official chrome builds or cross compiling. + 'name': 'sysroot', + 'pattern': '.', + 'action': ['python', Var('root_dir') + '/build/linux/sysroot_scripts/install-sysroot.py', + '--running-as-hook'], + }, + { + # Update the Windows toolchain if necessary. + 'name': 'win_toolchain', + 'pattern': '.', + 'action': ['python', Var('root_dir') + '/build/vs_toolchain.py', 'update'], + }, + # Pull binutils for linux, enabled debug fission for faster linking / + # debugging when used with clang on Ubuntu Precise. + # https://code.google.com/p/chromium/issues/detail?id=352046 + { + 'name': 'binutils', + 'pattern': Var('root_dir') + '/third_party/binutils', + 'action': [ + 'python', + Var('root_dir') + '/third_party/binutils/download.py', + ], + }, + { + # Pull clang if needed or requested via GYP_DEFINES. + # Note: On Win, this should run after win_toolchain, as it may use it. + 'name': 'clang', + 'pattern': '.', + 'action': ['python', Var('root_dir') + '/tools/clang/scripts/update.py', '--if-needed'], + }, + { + # Update LASTCHANGE. + 'name': 'lastchange', + 'pattern': '.', + 'action': ['python', Var('root_dir') + '/build/util/lastchange.py', + '-o', Var('root_dir') + '/build/util/LASTCHANGE'], + }, + # Pull GN binaries. + { + 'name': 'gn_win', + 'pattern': '.', + 'action': [ 'download_from_google_storage', + '--no_resume', + '--platform=win32', + '--no_auth', + '--bucket', 'chromium-gn', + '-s', Var('root_dir') + '/buildtools/win/gn.exe.sha1', + ], + }, + { + 'name': 'gn_mac', + 'pattern': '.', + 'action': [ 'download_from_google_storage', + '--no_resume', + '--platform=darwin', + '--no_auth', + '--bucket', 'chromium-gn', + '-s', Var('root_dir') + '/buildtools/mac/gn.sha1', + ], + }, + { + 'name': 'gn_linux64', + 'pattern': '.', + 'action': [ 'download_from_google_storage', + '--no_resume', + '--platform=linux*', + '--no_auth', + '--bucket', 'chromium-gn', + '-s', Var('root_dir') + '/buildtools/linux64/gn.sha1', + ], + }, + # Pull clang-format binaries using checked-in hashes. 
+ { + 'name': 'clang_format_win', + 'pattern': '.', + 'action': [ 'download_from_google_storage', + '--no_resume', + '--platform=win32', + '--no_auth', + '--bucket', 'chromium-clang-format', + '-s', Var('root_dir') + '/buildtools/win/clang-format.exe.sha1', + ], + }, + { + 'name': 'clang_format_mac', + 'pattern': '.', + 'action': [ 'download_from_google_storage', + '--no_resume', + '--platform=darwin', + '--no_auth', + '--bucket', 'chromium-clang-format', + '-s', Var('root_dir') + '/buildtools/mac/clang-format.sha1', + ], + }, + { + 'name': 'clang_format_linux', + 'pattern': '.', + 'action': [ 'download_from_google_storage', + '--no_resume', + '--platform=linux*', + '--no_auth', + '--bucket', 'chromium-clang-format', + '-s', Var('root_dir') + '/buildtools/linux64/clang-format.sha1', + ], + }, + # Pull luci-go binaries (isolate, swarming) using checked-in hashes. + { + 'name': 'luci-go_win', + 'pattern': '.', + 'action': [ 'download_from_google_storage', + '--no_resume', + '--platform=win32', + '--no_auth', + '--bucket', 'chromium-luci', + '-d', Var('root_dir') + '/tools/luci-go/win64', + ], + }, + { + 'name': 'luci-go_mac', + 'pattern': '.', + 'action': [ 'download_from_google_storage', + '--no_resume', + '--platform=darwin', + '--no_auth', + '--bucket', 'chromium-luci', + '-d', Var('root_dir') + '/tools/luci-go/mac64', + ], + }, + { + 'name': 'luci-go_linux', + 'pattern': '.', + 'action': [ 'download_from_google_storage', + '--no_resume', + '--platform=linux*', + '--no_auth', + '--bucket', 'chromium-luci', + '-d', Var('root_dir') + '/tools/luci-go/linux64', + ], + }, + { + # Pull sanitizer-instrumented third-party libraries if requested via + # GYP_DEFINES. + # See src/third_party/instrumented_libraries/scripts/download_binaries.py. + # TODO(kjellander): Update comment when GYP is completely cleaned up. + 'name': 'instrumented_libraries', + 'pattern': '\\.sha1', + 'action': ['python', Var('root_dir') + '/third_party/instrumented_libraries/scripts/download_binaries.py'], + }, + { + 'name': 'clang_format_merge_driver', + 'pattern': '.', + 'action': [ 'python', + Var('root_dir') + '/tools/clang_format_merge_driver/install_git_hook.py', + ], + }, +] + +recursedeps = [ + # buildtools provides clang_format, libc++, and libc++abi. + Var('root_dir') + '/buildtools', + # android_tools manages the NDK. + Var('root_dir') + '/third_party/android_tools', ] diff --git a/files/OWNERS b/files/OWNERS index c1f7308f..e231f7b0 100644 --- a/files/OWNERS +++ b/files/OWNERS @@ -3,12 +3,12 @@ kjellander@google.com # magjed@chromium.org # torbjorng@chromium.org -per-file *.gyp=kjellander@google.com -per-file *.gn=kjellander@google.com +per-file *.gyp=kjellander@chromium.org +per-file *.gn=kjellander@chromium.org per-file .gitignore=* per-file AUTHORS=* per-file DEPS=* -per-file PRESUBMIT.py=kjellander@google.com -per-file gyp_libyuv.py=kjellander@google.com +per-file PRESUBMIT.py=kjellander@chromium.org +per-file gyp_libyuv.py=kjellander@chromium.org per-file setup_links.py=* -per-file sync_chromium.py=kjellander@google.com +per-file sync_chromium.py=kjellander@chromium.org diff --git a/files/PRESUBMIT.py b/files/PRESUBMIT.py index 58242bd9..2cf1542f 100755 --- a/files/PRESUBMIT.py +++ b/files/PRESUBMIT.py @@ -1,4 +1,4 @@ -# Copyright 2014 The LibYuv Project Authors. All rights reserved. +# Copyright 2017 The LibYuv Project Authors. All rights reserved. 
# # Use of this source code is governed by a BSD-style license # that can be found in the LICENSE file in the root of the source @@ -6,60 +6,67 @@ # in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. -import re -import sys +import os -def GetDefaultTryConfigs(bots=None): - """Returns a list of ('bot', set(['tests']), optionally filtered by [bots]. +def _RunPythonTests(input_api, output_api): + def join(*args): + return input_api.os_path.join(input_api.PresubmitLocalPath(), *args) - For WebRTC purposes, we always return an empty list of tests, since we want - to run all tests by default on all our trybots. - """ - return { 'tryserver.libyuv': dict((bot, []) for bot in bots)} + test_directories = [ + root for root, _, files in os.walk(join('tools_libyuv')) + if any(f.endswith('_test.py') for f in files) + ] + tests = [] + for directory in test_directories: + tests.extend( + input_api.canned_checks.GetUnitTestsInDirectory( + input_api, + output_api, + directory, + whitelist=[r'.+_test\.py$'])) + return input_api.RunTests(tests, parallel=True) -# pylint: disable=W0613 -def GetPreferredTryMasters(project, change): - files = change.LocalPaths() - bots = [ - 'win', - 'win_rel', - 'win_x64_rel', - 'win_x64_gn', - 'win_x64_gn_rel', - 'win_clang', - 'win_clang_rel', - 'win_x64_clang_rel', - 'mac', - 'mac_rel', - 'mac_gn', - 'mac_gn_rel', - 'mac_asan', - 'ios', - 'ios_rel', - 'ios_arm64', - 'ios_arm64_rel', - 'linux', - 'linux_rel', - 'linux_gn', - 'linux_gn_rel', - 'linux_memcheck', - 'linux_tsan2', - 'linux_asan', - 'linux_msan', - 'linux_ubsan', - 'linux_ubsan_vptr', - 'android', - 'android_rel', - 'android_clang', - 'android_arm64', - 'android_mips', - 'android_x64', - 'android_x86', - 'android_gn', - 'android_gn_rel', - ] - if not files or all(re.search(r'[\\/]OWNERS$', f) for f in files): - return {} - return GetDefaultTryConfigs(bots) + +def _CommonChecks(input_api, output_api): + """Checks common to both upload and commit.""" + results = [] + results.extend(input_api.canned_checks.RunPylint(input_api, output_api, + black_list=(r'^base[\\\/].*\.py$', + r'^build[\\\/].*\.py$', + r'^buildtools[\\\/].*\.py$', + r'^ios[\\\/].*\.py$', + r'^out.*[\\\/].*\.py$', + r'^testing[\\\/].*\.py$', + r'^third_party[\\\/].*\.py$', + r'^tools[\\\/].*\.py$', + # TODO(kjellander): should arguably be checked. 
+ r'^tools_libyuv[\\\/]valgrind[\\\/].*\.py$', + r'^xcodebuild.*[\\\/].*\.py$',), + disabled_warnings=['F0401', # Failed to import x + 'E0611', # No package y in x + 'W0232', # Class has no __init__ method + ], + pylintrc='pylintrc')) + results.extend(_RunPythonTests(input_api, output_api)) + return results + + +def CheckChangeOnUpload(input_api, output_api): + results = [] + results.extend(_CommonChecks(input_api, output_api)) + results.extend( + input_api.canned_checks.CheckGNFormatted(input_api, output_api)) + return results + + +def CheckChangeOnCommit(input_api, output_api): + results = [] + results.extend(_CommonChecks(input_api, output_api)) + results.extend(input_api.canned_checks.CheckOwners(input_api, output_api)) + results.extend(input_api.canned_checks.CheckChangeWasUploaded( + input_api, output_api)) + results.extend(input_api.canned_checks.CheckChangeHasDescription( + input_api, output_api)) + return results diff --git a/files/README.chromium b/files/README.chromium index 251f8676..b502436f 100644 --- a/files/README.chromium +++ b/files/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1602 +Version: 1645 License: BSD License File: LICENSE diff --git a/files/build_overrides/build.gni b/files/build_overrides/build.gni index 6d3aa1eb..0a6affbf 100644 --- a/files/build_overrides/build.gni +++ b/files/build_overrides/build.gni @@ -13,3 +13,34 @@ # remove this when Chromium drops 10.6 support and also requires 10.7. mac_sdk_min_build_override = "10.11" mac_deployment_target_build_override = "10.7" + +# Some non-Chromium builds don't use Chromium's third_party/binutils. +linux_use_bundled_binutils_override = true + +# Variable that can be used to support multiple build scenarios, like having +# Chromium specific targets in a client project's GN file etc. +build_with_chromium = false + +# Some non-Chromium builds don't support building java targets. +enable_java_templates = true + +# Allow using custom suppressions files (currently not used by libyuv). +asan_suppressions_file = "//build/sanitizers/asan_suppressions.cc" +lsan_suppressions_file = "//build/sanitizers/lsan_suppressions.cc" +tsan_suppressions_file = "//build/sanitizers/tsan_suppressions.cc" + +msan_blacklist_path = + rebase_path("//tools_libyuv/msan/blacklist.txt", root_build_dir) +ubsan_blacklist_path = + rebase_path("//tools_libyuv/ubsan/blacklist.txt", root_build_dir) +ubsan_vptr_blacklist_path = + rebase_path("//tools_libyuv/ubsan/vptr_blacklist.txt", root_build_dir) + +# For Chromium, Android 32-bit non-component, non-clang builds hit a 4GiB size +# limit, making them requiring symbol_level=2. WebRTC doesn't hit that problem +# so we just ignore that assert. See https://crbug.com/648948 for more info. +ignore_elf32_limitations = true + +# Use system Xcode installation instead of the Chromium bundled Mac toolchain, +# since it contains only SDK 10.11, not 10.12 which WebRTC needs. +use_system_xcode = true diff --git a/files/build_overrides/gtest.gni b/files/build_overrides/gtest.gni new file mode 100644 index 00000000..d3c3f68c --- /dev/null +++ b/files/build_overrides/gtest.gni @@ -0,0 +1,19 @@ +# Copyright (c) 2016 The LibYuv project authors. All Rights Reserved. +# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. 
All contributing project authors may +# be found in the AUTHORS file in the root of the source tree. + +# Include support for registering main function in multi-process tests. +gtest_include_multiprocess = true + +# Include support for platform-specific operations across unit tests. +gtest_include_platform_test = true + +# Exclude support for testing Objective C code on OS X and iOS. +gtest_include_objc_support = true + +# Exclude support for flushing coverage files on iOS. +gtest_include_ios_coverage = true diff --git a/files/cleanup_links.py b/files/cleanup_links.py new file mode 100755 index 00000000..ba290789 --- /dev/null +++ b/files/cleanup_links.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python +# Copyright 2017 The LibYuv Project Authors. All rights reserved. +# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. All contributing project authors may +# be found in the AUTHORS file in the root of the source tree. + +# This is a copy of the file from WebRTC in: +# https://chromium.googlesource.com/external/webrtc/+/master/cleanup_links.py + +"""Script to cleanup symlinks created from setup_links.py. + +Before 177567c518b121731e507e9b9c4049c4dc96e4c8 (#15754) we had a Chromium +checkout which we created symlinks into. In order to do clean syncs after +landing that change, this script cleans up any old symlinks, avoiding annoying +manual cleanup needed in order to complete gclient sync. +""" + +import logging +import optparse +import os +import shelve +import subprocess +import sys + + +ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) +LINKS_DB = 'links' + +# Version management to make future upgrades/downgrades easier to support. +SCHEMA_VERSION = 1 + +class WebRTCLinkSetup(object): + def __init__(self, links_db, dry_run=False): + self._dry_run = dry_run + self._links_db = links_db + + def CleanupLinks(self): + logging.debug('CleanupLinks') + for source, link_path in self._links_db.iteritems(): + if source == 'SCHEMA_VERSION': + continue + if os.path.islink(link_path) or sys.platform.startswith('win'): + # os.path.islink() always returns false on Windows + # See http://bugs.python.org/issue13143. + logging.debug('Removing link to %s at %s', source, link_path) + if not self._dry_run: + if os.path.exists(link_path): + if sys.platform.startswith('win') and os.path.isdir(link_path): + subprocess.check_call(['rmdir', '/q', '/s', link_path], + shell=True) + else: + os.remove(link_path) + del self._links_db[source] + + +def _initialize_database(filename): + links_database = shelve.open(filename) + # Wipe the database if this version of the script ends up looking at a + # newer (future) version of the links db, just to be sure. + version = links_database.get('SCHEMA_VERSION') + if version and version != SCHEMA_VERSION: + logging.info('Found database with schema version %s while this script only ' + 'supports %s. Wiping previous database contents.', version, + SCHEMA_VERSION) + links_database.clear() + links_database['SCHEMA_VERSION'] = SCHEMA_VERSION + return links_database + + +def main(): + parser = optparse.OptionParser() + parser.add_option('-d', '--dry-run', action='store_true', default=False, + help='Print what would be done, but don\'t perform any ' + 'operations. 
This will automatically set logging to ' + 'verbose.') + parser.add_option('-v', '--verbose', action='store_const', + const=logging.DEBUG, default=logging.INFO, + help='Print verbose output for debugging.') + options, _ = parser.parse_args() + + if options.dry_run: + options.verbose = logging.DEBUG + logging.basicConfig(format='%(message)s', level=options.verbose) + + # Work from the root directory of the checkout. + script_dir = os.path.dirname(os.path.abspath(__file__)) + os.chdir(script_dir) + + # The database file gets .db appended on some platforms. + db_filenames = [LINKS_DB, LINKS_DB + '.db'] + if any(os.path.isfile(f) for f in db_filenames): + links_database = _initialize_database(LINKS_DB) + try: + symlink_creator = WebRTCLinkSetup(links_database, options.dry_run) + symlink_creator.CleanupLinks() + finally: + for f in db_filenames: + if os.path.isfile(f): + os.remove(f) + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/files/codereview.settings b/files/codereview.settings index 9b538069..00ba1d37 100644 --- a/files/codereview.settings +++ b/files/codereview.settings @@ -1,12 +1,6 @@ -# This file is used by gcl to get repository specific information. +# This file is used by git cl to get repository specific information. CODE_REVIEW_SERVER: codereview.chromium.org -#CC_LIST: -VIEW_VC: https://chromium.googlesource.com/libyuv/libyuv/+/ -#STATUS: -FORCE_HTTPS_COMMIT_URL: True +GERRIT_HOST: True PROJECT: libyuv TRY_ON_UPLOAD: False -TRYSERVER_ROOT: src -TRYSERVER_SVN_URL: svn://svn.chromium.org/chrome-try/try-libyuv -#GITCL_PREUPLOAD: -#GITCL_PREDCOMMIT: +VIEW_VC: https://chromium.googlesource.com/libyuv/libyuv/+/ diff --git a/files/docs/deprecated_builds.md b/files/docs/deprecated_builds.md new file mode 100644 index 00000000..f623e50c --- /dev/null +++ b/files/docs/deprecated_builds.md @@ -0,0 +1,440 @@ +# Deprecated Builds + +Older documentation on build configs which are no longer supported. + +## Pre-requisites + +You'll need to have depot tools installed: https://www.chromium.org/developers/how-tos/install-depot-tools +Refer to chromium instructions for each platform for other prerequisites. + +## Getting the Code + +Create a working directory, enter it, and run: + + gclient config https://chromium.googlesource.com/libyuv/libyuv + gclient sync + + +Then you'll get a .gclient file like: + + solutions = [ + { "name" : "libyuv", + "url" : "https://chromium.googlesource.com/libyuv/libyuv", + "deps_file" : "DEPS", + "managed" : True, + "custom_deps" : { + }, + "safesync_url": "", + }, + ]; + + +For iOS add `;target_os=['ios'];` to your OSX .gclient and run `GYP_DEFINES="OS=ios" gclient sync.` + +Browse the Git reprository: https://chromium.googlesource.com/libyuv/libyuv/+/master + +### Android +For Android add `;target_os=['android'];` to your Linux .gclient + + + solutions = [ + { "name" : "libyuv", + "url" : "https://chromium.googlesource.com/libyuv/libyuv", + "deps_file" : "DEPS", + "managed" : True, + "custom_deps" : { + }, + "safesync_url": "", + }, + ]; + target_os = ["android", "unix"]; + +Then run: + + export GYP_DEFINES="OS=android" + gclient sync + +Caveat: Theres an error with Google Play services updates. If you get the error "Your version of the Google Play services library is not up to date", run the following: + + cd chromium/src + ./build/android/play_services/update.py download + cd ../.. + +For Windows the gclient sync must be done from an Administrator command prompt. 
+ +The sync will generate native build files for your environment using gyp (Windows: Visual Studio, OSX: XCode, Linux: make). This generation can also be forced manually: `gclient runhooks` + +To get just the source (not buildable): + + git clone https://chromium.googlesource.com/libyuv/libyuv + + +## Building the Library and Unittests + +### Windows + + set GYP_DEFINES=target_arch=ia32 + call python gyp_libyuv -fninja -G msvs_version=2013 + ninja -j7 -C out\Release + ninja -j7 -C out\Debug + + set GYP_DEFINES=target_arch=x64 + call python gyp_libyuv -fninja -G msvs_version=2013 + ninja -C out\Debug_x64 + ninja -C out\Release_x64 + +#### Building with clangcl + set GYP_DEFINES=clang=1 target_arch=ia32 + call python tools\clang\scripts\update.py + call python gyp_libyuv -fninja libyuv_test.gyp + ninja -C out\Debug + ninja -C out\Release + +### OSX + +Clang 64 bit shown. Remove `clang=1` for GCC and change x64 to ia32 for 32 bit. + + GYP_DEFINES="clang=1 target_arch=x64" ./gyp_libyuv + ninja -j7 -C out/Debug + ninja -j7 -C out/Release + + GYP_DEFINES="clang=1 target_arch=ia32" ./gyp_libyuv + ninja -j7 -C out/Debug + ninja -j7 -C out/Release + +### iOS +http://www.chromium.org/developers/how-tos/build-instructions-ios + +Add to .gclient last line: `target_os=['ios'];` + +armv7 + + GYP_DEFINES="OS=ios target_arch=armv7 target_subarch=arm32" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv + ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest + ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest + +arm64 + + GYP_DEFINES="OS=ios target_arch=arm64 target_subarch=arm64" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv + ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest + ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest + +both armv7 and arm64 (fat) + + GYP_DEFINES="OS=ios target_arch=armv7 target_subarch=both" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv + ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest + ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest + +simulator + + GYP_DEFINES="OS=ios target_arch=ia32 target_subarch=arm32" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_sim" ./gyp_libyuv + ninja -j7 -C out_sim/Debug-iphonesimulator libyuv_unittest + ninja -j7 -C out_sim/Release-iphonesimulator libyuv_unittest + +### Android +https://code.google.com/p/chromium/wiki/AndroidBuildInstructions + +Add to .gclient last line: `target_os=['android'];` + +armv7 + + GYP_DEFINES="OS=android" GYP_CROSSCOMPILE=1 ./gyp_libyuv + ninja -j7 -C out/Debug yuv_unittest_apk + ninja -j7 -C out/Release yuv_unittest_apk + +arm64 + + GYP_DEFINES="OS=android target_arch=arm64 target_subarch=arm64" GYP_CROSSCOMPILE=1 ./gyp_libyuv + ninja -j7 -C out/Debug yuv_unittest_apk + ninja -j7 -C out/Release yuv_unittest_apk + +ia32 + + GYP_DEFINES="OS=android target_arch=ia32" GYP_CROSSCOMPILE=1 ./gyp_libyuv + ninja -j7 -C out/Debug yuv_unittest_apk + ninja -j7 -C out/Release yuv_unittest_apk + + GYP_DEFINES="OS=android target_arch=ia32 android_full_debug=1" GYP_CROSSCOMPILE=1 ./gyp_libyuv + ninja -j7 -C out/Debug yuv_unittest_apk + +mipsel + + GYP_DEFINES="OS=android target_arch=mipsel" GYP_CROSSCOMPILE=1 ./gyp_libyuv + ninja -j7 -C out/Debug yuv_unittest_apk + ninja -j7 -C out/Release yuv_unittest_apk + +arm32 disassembly: + + third_party/android_tools/ndk/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-objdump -d out/Release/obj/source/libyuv.row_neon.o + +arm64 disassembly: + + 
third_party/android_tools/ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d out/Release/obj/source/libyuv.row_neon64.o + +Running tests: + + build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* + +Running test as benchmark: + + build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1" + +Running test with C code: + + build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=1 --libyuv_cpu_info=1" + +#### Building with GN + + gn gen out/Release "--args=is_debug=false target_cpu=\"x86\"" + gn gen out/Debug "--args=is_debug=true target_cpu=\"x86\"" + ninja -C out/Release + ninja -C out/Debug + +### Building Offical with GN + + gn gen out/Official "--args=is_debug=false is_official_build=true is_chrome_branded=true" + ninja -C out/Official + +#### Building mips with GN + +mipsel + gn gen out/Default "--args=is_debug=false target_cpu=\"mipsel\" target_os = \"android\" mips_arch_variant = \"r6\" mips_use_msa = true is_component_build = true is_clang = false" + ninja -C out/Default + +mips64el + gn gen out/Default "--args=is_debug=false target_cpu=\"mips64el\" target_os = \"android\" mips_arch_variant = \"r6\" mips_use_msa = true is_component_build = true is_clang = false" + ninja -C out/Default + +### Linux + + GYP_DEFINES="target_arch=x64" ./gyp_libyuv + ninja -j7 -C out/Debug + ninja -j7 -C out/Release + + GYP_DEFINES="target_arch=ia32" ./gyp_libyuv + ninja -j7 -C out/Debug + ninja -j7 -C out/Release + +#### CentOS + +On CentOS 32 bit the following work around allows a sync: + + export GYP_DEFINES="host_arch=ia32" + gclient sync + +### Windows Shared Library + +Modify libyuv.gyp from 'static_library' to 'shared_library', and add 'LIBYUV_BUILDING_SHARED_LIBRARY' to 'defines'. + + gclient runhooks + +After this command follow the building the library instructions above. + +If you get a compile error for atlthunk.lib on Windows, read http://www.chromium.org/developers/how-tos/build-instructions-windows + + +### Build targets + + ninja -C out/Debug libyuv + ninja -C out/Debug libyuv_unittest + ninja -C out/Debug compare + ninja -C out/Debug convert + ninja -C out/Debug psnr + ninja -C out/Debug cpuid + + +## Building the Library with make + +### Linux + + make -j7 V=1 -f linux.mk + make -j7 V=1 -f linux.mk clean + make -j7 V=1 -f linux.mk CXX=clang++ + +## Building the Library with cmake + +Install cmake: http://www.cmake.org/ + +Default debug build: + + mkdir out + cd out + cmake .. + cmake --build . + +Release build/install + + mkdir out + cd out + cmake -DCMAKE_INSTALL_PREFIX="/usr/lib" -DCMAKE_BUILD_TYPE="Release" .. + cmake --build . --config Release + sudo cmake --build . --target install --config Release + +### Windows 8 Phone + +Pre-requisite: + +* Install Visual Studio 2012 and Arm to your environment.<br> + +Then: + + call "c:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin\x86_arm\vcvarsx86_arm.bat" + +or with Visual Studio 2013: + + call "c:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\x86_arm\vcvarsx86_arm.bat" + nmake /f winarm.mk clean + nmake /f winarm.mk + +### Windows Shared Library + +Modify libyuv.gyp from 'static_library' to 'shared_library', and add 'LIBYUV_BUILDING_SHARED_LIBRARY' to 'defines'. Then run this. 
+ + gclient runhooks + +After this command follow the building the library instructions above. + +If you get a compile error for atlthunk.lib on Windows, read http://www.chromium.org/developers/how-tos/build-instructions-windows + +### 64 bit Windows + + set GYP_DEFINES=target_arch=x64 + gclient runhooks V=1 + +### ARM Linux + + export GYP_DEFINES="target_arch=arm" + export CROSSTOOL=`<path>`/arm-none-linux-gnueabi + export CXX=$CROSSTOOL-g++ + export CC=$CROSSTOOL-gcc + export AR=$CROSSTOOL-ar + export AS=$CROSSTOOL-as + export RANLIB=$CROSSTOOL-ranlib + gclient runhooks + +## Running Unittests + +### Windows + + out\Release\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter="*" + +### OSX + + out/Release/libyuv_unittest --gtest_filter="*" + +### Linux + + out/Release/libyuv_unittest --gtest_filter="*" + +Replace --gtest_filter="*" with specific unittest to run. May include wildcards. e.g. + + out/Release/libyuv_unittest --gtest_filter=libyuvTest.I420ToARGB_Opt + +## CPU Emulator tools + +### Intel SDE (Software Development Emulator) + +Pre-requisite: Install IntelSDE for Windows: http://software.intel.com/en-us/articles/intel-software-development-emulator + +Then run: + + c:\intelsde\sde -hsw -- out\release\libyuv_unittest.exe --gtest_filter=* + + +## Memory tools + +### Running Dr Memory memcheck for Windows + +Pre-requisite: Install Dr Memory for Windows and add it to your path: http://www.drmemory.org/docs/page_install_windows.html + + set GYP_DEFINES=build_for_tool=drmemory target_arch=ia32 + call python gyp_libyuv -fninja -G msvs_version=2013 + ninja -C out\Debug + drmemory out\Debug\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter=* + +### Running UBSan + +See Chromium instructions for sanitizers: https://www.chromium.org/developers/testing/undefinedbehaviorsanitizer + +Sanitizers available: TSan, MSan, ASan, UBSan, LSan + + GYP_DEFINES='ubsan=1' gclient runhooks + ninja -C out/Release + +### Running Valgrind memcheck + +Memory errors and race conditions can be found by running tests under special memory tools. [Valgrind] [1] is an instrumentation framework for building dynamic analysis tools. Various tests and profilers are built upon it to find memory handling errors and memory leaks, for instance. + +[1]: http://valgrind.org + + solutions = [ + { "name" : "libyuv", + "url" : "https://chromium.googlesource.com/libyuv/libyuv", + "deps_file" : "DEPS", + "managed" : True, + "custom_deps" : { + "libyuv/chromium/src/third_party/valgrind": "https://chromium.googlesource.com/chromium/deps/valgrind/binaries", + }, + "safesync_url": "", + }, + ] + +Then run: + + GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=memcheck" python gyp_libyuv + ninja -C out/Debug + valgrind out/Debug/libyuv_unittest + + +For more information, see http://www.chromium.org/developers/how-tos/using-valgrind + +### Running Thread Sanitizer (TSan) + + GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=tsan" python gyp_libyuv + ninja -C out/Debug + valgrind out/Debug/libyuv_unittest + +For more info, see http://www.chromium.org/developers/how-tos/using-valgrind/threadsanitizer + +### Running Address Sanitizer (ASan) + + GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=asan" python gyp_libyuv + ninja -C out/Debug + valgrind out/Debug/libyuv_unittest + +For more info, see http://dev.chromium.org/developers/testing/addresssanitizer + +## Benchmarking + +The unittests can be used to benchmark. 
+ +### Windows + + set LIBYUV_WIDTH=1280 + set LIBYUV_HEIGHT=720 + set LIBYUV_REPEAT=999 + set LIBYUV_FLAGS=-1 + out\Release\libyuv_unittest.exe --gtest_filter=*I420ToARGB_Opt + + +### Linux and Mac + + LIBYUV_WIDTH=1280 LIBYUV_HEIGHT=720 LIBYUV_REPEAT=1000 out/Release/libyuv_unittest --gtest_filter=*I420ToARGB_Opt + + libyuvTest.I420ToARGB_Opt (547 ms) + +Indicates 0.547 ms/frame for 1280 x 720 (547 ms / 1000 repeats). + +## Making a change + + gclient sync + git checkout -b mycl -t origin/master + git pull + <edit files> + git add -u + git commit -m "my change" + git cl lint + git cl try + git cl upload -r a-reviewer@chromium.org -s + <once approved..> + git cl land diff --git a/files/docs/formats.md b/files/docs/formats.md index a7cfed82..cddfe027 100644 --- a/files/docs/formats.md +++ b/files/docs/formats.md @@ -37,20 +37,18 @@ This is how OSX formats map to libyuv The following is extracted from video_common.h as a complete list of formats supported by libyuv. enum FourCC { - // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed. + // 8 Primary YUV formats: 4 planar, 2 biplanar, 2 packed. FOURCC_I420 = FOURCC('I', '4', '2', '0'), FOURCC_I422 = FOURCC('I', '4', '2', '2'), FOURCC_I444 = FOURCC('I', '4', '4', '4'), - FOURCC_I411 = FOURCC('I', '4', '1', '1'), FOURCC_I400 = FOURCC('I', '4', '0', '0'), FOURCC_NV21 = FOURCC('N', 'V', '2', '1'), FOURCC_NV12 = FOURCC('N', 'V', '1', '2'), FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), - // 2 Secondary YUV formats: row biplanar. + // 1 Secondary YUV format: row biplanar. FOURCC_M420 = FOURCC('M', '4', '2', '0'), - FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp. FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), @@ -102,6 +100,15 @@ The following is extracted from video_common.h as a complete list of formats sup // 1 Auxiliary compressed YUV format set aside for capturer. FOURCC_H264 = FOURCC('H', '2', '6', '4'), +# Planar YUV + The following formats contain a full size Y plane followed by 1 or 2 + planes for UV: I420, I422, I444, I400, NV21, NV12. + The size (subsampling) of the UV varies. + I420, NV12 and NV21 are half width, half height + I422, NV16 and NV61 are half width, full height + I444, NV24 and NV42 are full width, full height + I400 and J400 have no chroma channel. + # The ARGB FOURCC There are 4 ARGB layouts - ARGB, BGRA, ABGR and RGBA. ARGB is most common by far, used for screen formats, and windows webcam drivers.
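The Planar YUV note added above maps directly onto buffer math. As an illustrative sketch (not part of this patch; rounding up for odd dimensions is an assumption of the sketch), the I420 half-width, half-height subsampling works out to 1.5 bytes per pixel:

    // Sketch only: plane sizes for I420, where U and V are half width, half height.
    #include <cstdio>

    int main() {
      const int width = 1280, height = 720;
      const int y_size = width * height;   // full-size luma plane
      const int half_w = (width + 1) / 2;  // round up so odd widths keep chroma
      const int half_h = (height + 1) / 2; // round up so odd heights keep chroma
      const int uv_size = half_w * half_h; // each of the U and V planes
      // 1280x720: Y = 921600, U = V = 230400, total = 1382400 = 1.5 * Y.
      std::printf("Y=%d U=V=%d total=%d\n", y_size, uv_size, y_size + 2 * uv_size);
      return 0;
    }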
diff --git a/files/docs/getting_started.md b/files/docs/getting_started.md index 7cd56167..46c591b6 100644 --- a/files/docs/getting_started.md +++ b/files/docs/getting_started.md @@ -14,7 +14,6 @@ Create a working directory, enter it, and run: gclient config https://chromium.googlesource.com/libyuv/libyuv gclient sync - Then you'll get a .gclient file like: solutions = [ @@ -28,7 +27,6 @@ Then you'll get a .gclient file like: }, ]; - For iOS add `;target_os=['ios'];` to your OSX .gclient and run `GYP_DEFINES="OS=ios" gclient sync.` Browse the Git repository: https://chromium.googlesource.com/libyuv/libyuv/+/master @@ -36,7 +34,6 @@ Browse the Git repository: https://chromium.googlesource.com/libyuv/libyuv/+/ma ### Android For Android add `;target_os=['android'];` to your Linux .gclient - solutions = [ { "name" : "libyuv", "url" : "https://chromium.googlesource.com/libyuv/libyuv", @@ -47,7 +44,7 @@ For Android add `;target_os=['android'];` to your Linux .gclient "safesync_url": "", }, ]; - target_os = ["android", "unix"]; + target_os = ["android", "linux"]; Then run: @@ -55,6 +52,7 @@ Then run: gclient sync Caveat: There's an error with Google Play services updates. If you get the error "Your version of the Google Play services library is not up to date", run the following: + cd chromium/src ./build/android/play_services/update.py download cd ../.. @@ -64,6 +62,7 @@ For Windows the gclient sync must be done from an Administrator command prompt. The sync will generate native build files for your environment using gyp (Windows: Visual Studio, OSX: XCode, Linux: make). This generation can also be forced manually: `gclient runhooks` To get just the source (not buildable): + git clone https://chromium.googlesource.com/libyuv/libyuv @@ -71,64 +70,61 @@ To get just the source (not buildable): ### Windows - set GYP_DEFINES=target_arch=ia32 - call python gyp_libyuv -fninja -G msvs_version=2013 - ninja -j7 -C out\Release - ninja -j7 -C out\Debug + call gn gen out/Release "--args=is_debug=false target_cpu=\"x86\"" + call gn gen out/Debug "--args=is_debug=true target_cpu=\"x86\"" + ninja -v -C out/Release + ninja -v -C out/Debug + + call gn gen out/Release "--args=is_debug=false target_cpu=\"x64\"" + call gn gen out/Debug "--args=is_debug=true target_cpu=\"x64\"" + ninja -v -C out/Release + ninja -v -C out/Debug - set GYP_DEFINES=target_arch=x64 - call python gyp_libyuv -fninja -G msvs_version=2013 - ninja -C out\Debug_x64 - ninja -C out\Release_x64 +#### Building with clang-cl -#### Building with clangcl - set GYP_DEFINES=clang=1 target_arch=ia32 libyuv_enable_svn=1 - set LLVM_REPO_URL=svn://svn.chromium.org/llvm-project + set GYP_DEFINES=clang=1 target_arch=ia32 call python tools\clang\scripts\update.py - call python gyp_libyuv -fninja libyuv_test.gyp - ninja -C out\Debug - ninja -C out\Release -### OSX + call gn gen out/Release "--args=is_debug=false is_official_build=false is_clang=true target_cpu=\"x86\"" + call gn gen out/Debug "--args=is_debug=true is_official_build=false is_clang=true target_cpu=\"x86\"" + ninja -v -C out/Release + ninja -v -C out/Debug -Clang 64 bit shown. Remove `clang=1` for GCC and change x64 to ia32 for 32 bit.
+ call gn gen out/Release "--args=is_debug=false is_official_build=false is_clang=true target_cpu=\"x64\"" + call gn gen out/Debug "--args=is_debug=true is_official_build=false is_clang=true target_cpu=\"x64\"" + ninja -v -C out/Release + ninja -v -C out/Debug - GYP_DEFINES="clang=1 target_arch=x64" ./gyp_libyuv - ninja -j7 -C out/Debug - ninja -j7 -C out/Release +### macOS and Linux - GYP_DEFINES="clang=1 target_arch=ia32" ./gyp_libyuv - ninja -j7 -C out/Debug - ninja -j7 -C out/Release + gn gen out/Release "--args=is_debug=false" + gn gen out/Debug "--args=is_debug=true" + ninja -v -C out/Release + ninja -v -C out/Debug + +### Building Official with GN + + gn gen out/Official "--args=is_debug=false is_official_build=true is_chrome_branded=true" + ninja -C out/Official ### iOS http://www.chromium.org/developers/how-tos/build-instructions-ios Add to .gclient last line: `target_os=['ios'];` -armv7 - - GYP_DEFINES="OS=ios target_arch=armv7 target_subarch=arm32" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv - ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest - ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest - arm64 - GYP_DEFINES="OS=ios target_arch=arm64 target_subarch=arm64" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv - ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest - ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest - -both armv7 and arm64 (fat) + gn gen out/Release "--args=is_debug=false target_os=\"ios\" ios_enable_code_signing=false target_cpu=\"arm64\"" + gn gen out/Debug "--args=is_debug=true target_os=\"ios\" ios_enable_code_signing=false target_cpu=\"arm64\"" + ninja -v -C out/Debug libyuv_unittest + ninja -v -C out/Release libyuv_unittest - GYP_DEFINES="OS=ios target_arch=armv7 target_subarch=both" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv - ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest - ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest +iOS simulator -simulator - - GYP_DEFINES="OS=ios target_arch=ia32 target_subarch=arm32" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_sim" ./gyp_libyuv - ninja -j7 -C out_sim/Debug-iphonesimulator libyuv_unittest - ninja -j7 -C out_sim/Release-iphonesimulator libyuv_unittest + gn gen out/Release "--args=is_debug=false target_os=\"ios\" ios_enable_code_signing=false target_cpu=\"x86\"" + gn gen out/Debug "--args=is_debug=true target_os=\"ios\" ios_enable_code_signing=false target_cpu=\"x86\"" + ninja -v -C out/Debug libyuv_unittest + ninja -v -C out/Release libyuv_unittest ### Android https://code.google.com/p/chromium/wiki/AndroidBuildInstructions @@ -137,90 +133,56 @@ Add to .gclient last line: `target_os=['android'];` armv7 - GYP_DEFINES="OS=android" GYP_CROSSCOMPILE=1 ./gyp_libyuv - ninja -j7 -C out/Debug libyuv_unittest_apk - ninja -j7 -C out/Release libyuv_unittest_apk + gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"arm\"" + gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"arm\"" + ninja -v -C out/Debug libyuv_unittest + ninja -v -C out/Release libyuv_unittest arm64 - GYP_DEFINES="OS=android target_arch=arm64 target_subarch=arm64" GYP_CROSSCOMPILE=1 ./gyp_libyuv - ninja -j7 -C out/Debug libyuv_unittest_apk - ninja -j7 -C out/Release libyuv_unittest_apk + gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"arm64\"" + gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"arm64\"" + ninja -v -C out/Debug libyuv_unittest + ninja
-v -C out/Release libyuv_unittest ia32 - GYP_DEFINES="OS=android target_arch=ia32" GYP_CROSSCOMPILE=1 ./gyp_libyuv - ninja -j7 -C out/Debug libyuv_unittest_apk - ninja -j7 -C out/Release libyuv_unittest_apk - - GYP_DEFINES="OS=android target_arch=ia32 android_full_debug=1" GYP_CROSSCOMPILE=1 ./gyp_libyuv - ninja -j7 -C out/Debug libyuv_unittest_apk + gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"x86\"" + gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"x86\"" + ninja -v -C out/Debug libyuv_unittest + ninja -v -C out/Release libyuv_unittest mipsel - GYP_DEFINES="OS=android target_arch=mipsel" GYP_CROSSCOMPILE=1 ./gyp_libyuv - ninja -j7 -C out/Debug libyuv_unittest_apk - ninja -j7 -C out/Release libyuv_unittest_apk + gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mipsel\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false" + gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mipsel\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false" + ninja -v -C out/Debug libyuv_unittest + ninja -v -C out/Release libyuv_unittest + + gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false" + gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false" + ninja -v -C out/Debug libyuv_unittest + ninja -v -C out/Release libyuv_unittest -arm32 disassembly: +arm disassembly: - third_party/android_tools/ndk/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-objdump -d out/Release/obj/source/libyuv.row_neon.o + third_party/android_tools/ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv/row_common.o >row_common.txt -arm64 disassembly: + third_party/android_tools/ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon.o >row_neon.txt - third_party/android_tools/ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d out/Release/obj/source/libyuv.row_neon64.o + third_party/android_tools/ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt Running tests: - util/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* + build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* Running test as benchmark: - util/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1" + build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1 --libyuv_cpu_info=-1" Running test with C code: - util/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=1 --libyuv_cpu_info=1" - -#### Building with GN - - gn gen out/Release "--args=is_debug=false target_cpu=\"x86\"" - gn gen out/Debug 
"--args=is_debug=true target_cpu=\"x86\"" - ninja -C out/Release - ninja -C out/Debug - -### Building Offical with GN - - gn gen out/Official "--args=is_debug=false is_official_build=true is_chrome_branded=true" - ninja -C out/Official - -### Linux - - GYP_DEFINES="target_arch=x64" ./gyp_libyuv - ninja -j7 -C out/Debug - ninja -j7 -C out/Release - - GYP_DEFINES="target_arch=ia32" ./gyp_libyuv - ninja -j7 -C out/Debug - ninja -j7 -C out/Release - -#### CentOS - -On CentOS 32 bit the following work around allows a sync: - - export GYP_DEFINES="host_arch=ia32" - gclient sync - -### Windows Shared Library - -Modify libyuv.gyp from 'static_library' to 'shared_library', and add 'LIBYUV_BUILDING_SHARED_LIBRARY' to 'defines'. - - gclient runhooks - -After this command follow the building the library instructions above. - -If you get a compile error for atlthunk.lib on Windows, read http://www.chromium.org/developers/how-tos/build-instructions-windows - + build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=1 --libyuv_cpu_info=1" ### Build targets @@ -231,27 +193,33 @@ If you get a compile error for atlthunk.lib on Windows, read http://www.chromium ninja -C out/Debug psnr ninja -C out/Debug cpuid +### ARM Linux + + gn gen out/Release "--args=is_debug=false target_cpu=\"arm64\"" + gn gen out/Debug "--args=is_debug=true target_cpu=\"arm64\"" + ninja -v -C out/Debug libyuv_unittest + ninja -v -C out/Release libyuv_unittest ## Building the Library with make ### Linux - make -j7 V=1 -f linux.mk - make -j7 V=1 -f linux.mk clean - make -j7 V=1 -f linux.mk CXX=clang++ + make V=1 -f linux.mk + make V=1 -f linux.mk clean + make V=1 -f linux.mk CXX=clang++ -## Building the Library with cmake +## Building the library with cmake Install cmake: http://www.cmake.org/ -Default debug build: +### Default debug build: mkdir out cd out cmake .. cmake --build . -Release build/install +### Release build/install mkdir out cd out @@ -259,47 +227,31 @@ Release build/install cmake --build . --config Release sudo cmake --build . --target install --config Release -### Windows 8 Phone - -Pre-requisite: - -* Install Visual Studio 2012 and Arm to your environment.<br> - -Then: - - call "c:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin\x86_arm\vcvarsx86_arm.bat" - -or with Visual Studio 2013: - - call "c:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\x86_arm\vcvarsx86_arm.bat" - nmake /f winarm.mk clean - nmake /f winarm.mk - -### Windows Shared Library +### Build RPM/DEB packages -Modify libyuv.gyp from 'static_library' to 'shared_library', and add 'LIBYUV_BUILDING_SHARED_LIBRARY' to 'defines'. Then run this. - - gclient runhooks - -After this command follow the building the library instructions above. + mkdir out + cd out + cmake -DCMAKE_BUILD_TYPE=Release .. 
+ make -j4 + make package -If you get a compile error for atlthunk.lib on Windows, read http://www.chromium.org/developers/how-tos/build-instructions-windows +## Setup for ARM cross compile -### 64 bit Windows +See also https://www.ccoderun.ca/programming/2015-12-20_CrossCompiling/index.html - set GYP_DEFINES=target_arch=x64 - gclient runhooks V=1 + sudo apt-get install ssh dkms build-essential linux-headers-generic + sudo apt-get install kdevelop cmake git subversion + sudo apt-get install graphviz doxygen doxygen-gui + sudo apt-get install manpages manpages-dev manpages-posix manpages-posix-dev + sudo apt-get install libboost-all-dev libboost-dev libssl-dev + sudo apt-get install rpm terminator fish + sudo apt-get install g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf -### ARM Linux +### Build psnr tool - export GYP_DEFINES="target_arch=arm" - export CROSSTOOL=`<path>`/arm-none-linux-gnueabi - export CXX=$CROSSTOOL-g++ - export CC=$CROSSTOOL-gcc - export AR=$CROSSTOOL-ar - export AS=$CROSSTOOL-as - export RANLIB=$CROSSTOOL-ranlib - gclient runhooks + cd util + arm-linux-gnueabihf-g++ psnr_main.cc psnr.cc ssim.cc -o psnr + arm-linux-gnueabihf-objdump -d psnr ## Running Unittests @@ -317,113 +269,29 @@ If you get a compile error for atlthunk.lib on Windows, read http://www.chromium Replace --gtest_filter="*" with specific unittest to run. May include wildcards. e.g. - out/Release/libyuv_unittest --gtest_filter=libyuvTest.I420ToARGB_Opt + out/Release/libyuv_unittest --gtest_filter=*I420ToARGB_Opt ## CPU Emulator tools ### Intel SDE (Software Development Emulator) -Pre-requisite: Install IntelSDE for Windows: http://software.intel.com/en-us/articles/intel-software-development-emulator +Pre-requisite: Install IntelSDE: http://software.intel.com/en-us/articles/intel-software-development-emulator Then run: - c:\intelsde\sde -hsw -- out\release\libyuv_unittest.exe --gtest_filter=* + c:\intelsde\sde -hsw -- out\Release\libyuv_unittest.exe --gtest_filter=* + + ~/intelsde/sde -skx -- out/Release/libyuv_unittest --gtest_filter=**I420ToARGB_Opt +## Sanitizers -## Memory tools + gn gen out/Debug "--args=is_debug=true is_asan=true" + ninja -v -C out/Debug + + Sanitizers available: tsan, msan, asan, ubsan, lsan ### Running Dr Memory memcheck for Windows Pre-requisite: Install Dr Memory for Windows and add it to your path: http://www.drmemory.org/docs/page_install_windows.html - set GYP_DEFINES=build_for_tool=drmemory target_arch=ia32 - call python gyp_libyuv -fninja -G msvs_version=2013 - ninja -C out\Debug drmemory out\Debug\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter=*
- -[1]: http://valgrind.org - - solutions = [ - { "name" : "libyuv", - "url" : "https://chromium.googlesource.com/libyuv/libyuv", - "deps_file" : "DEPS", - "managed" : True, - "custom_deps" : { - "libyuv/chromium/src/third_party/valgrind": "https://chromium.googlesource.com/chromium/deps/valgrind/binaries", - }, - "safesync_url": "", - }, - ] - -Then run: - - GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=memcheck" python gyp_libyuv - ninja -C out/Debug - valgrind out/Debug/libyuv_unittest - - -For more information, see http://www.chromium.org/developers/how-tos/using-valgrind - -### Running Thread Sanitizer (TSan) - - GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=tsan" python gyp_libyuv - ninja -C out/Debug - valgrind out/Debug/libyuv_unittest - -For more info, see http://www.chromium.org/developers/how-tos/using-valgrind/threadsanitizer - -### Running Address Sanitizer (ASan) - - GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=asan" python gyp_libyuv - ninja -C out/Debug - valgrind out/Debug/libyuv_unittest - -For more info, see http://dev.chromium.org/developers/testing/addresssanitizer - -## Benchmarking - -The unittests can be used to benchmark. - -### Windows - - set LIBYUV_WIDTH=1280 - set LIBYUV_HEIGHT=720 - set LIBYUV_REPEAT=999 - set LIBYUV_FLAGS=-1 - out\Release\libyuv_unittest.exe --gtest_filter=*I420ToARGB_Opt - - -### Linux and Mac - - LIBYUV_WIDTH=1280 LIBYUV_HEIGHT=720 LIBYUV_REPEAT=1000 out/Release/libyuv_unittest --gtest_filter=*I420ToARGB_Opt - - libyuvTest.I420ToARGB_Opt (547 ms) - -Indicates 0.547 ms/frame for 1280 x 720. - -## Making a change - - gclient sync - git checkout -b mycl -t origin/master - git pull - <edit files> - git add -u - git commit -m "my change" - git cl lint - git cl try - git cl upload -r a-reviewer@chomium.org -s - <once approved..> - git cl land diff --git a/files/gyp_libyuv.py b/files/gyp_libyuv.py index ac42038d..bb32ec39 100644 --- a/files/gyp_libyuv.py +++ b/files/gyp_libyuv.py @@ -9,7 +9,7 @@ # be found in the AUTHORS file in the root of the source tree. -# This script is a modified copy of the src/build/gyp_chromium.py file. +# This script is a modified copy of the src/build/gyp_chromium.py file. # It is needed for parallel processing. # This file is (possibly, depending on python version) imported by diff --git a/files/include/libyuv.h b/files/include/libyuv.h index de652836..aeffd5ef 100644 --- a/files/include/libyuv.h +++ b/files/include/libyuv.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_H_ #define INCLUDE_LIBYUV_H_ #include "libyuv/basic_types.h" @@ -29,4 +29,4 @@ #include "libyuv/version.h" #include "libyuv/video_common.h" -#endif // INCLUDE_LIBYUV_H_ NOLINT +#endif // INCLUDE_LIBYUV_H_ diff --git a/files/include/libyuv/basic_types.h b/files/include/libyuv/basic_types.h index beb750ba..7d98bb93 100644 --- a/files/include/libyuv/basic_types.h +++ b/files/include/libyuv/basic_types.h @@ -8,12 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ #define INCLUDE_LIBYUV_BASIC_TYPES_H_ #include <stddef.h> // for NULL, size_t -#if defined(__ANDROID__) || (defined(_MSC_VER) && (_MSC_VER < 1600)) +#if defined(_MSC_VER) && (_MSC_VER < 1600) #include <sys/types.h> // for uintptr_t on x86 #else #include <stdint.h> // for uintptr_t @@ -26,31 +26,31 @@ typedef unsigned __int64 uint64; typedef __int64 int64; #ifndef INT64_C -#define INT64_C(x) x ## I64 +#define INT64_C(x) x##I64 #endif #ifndef UINT64_C -#define UINT64_C(x) x ## UI64 +#define UINT64_C(x) x##UI64 #endif #define INT64_F "I64" #else // COMPILER_MSVC #if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__) typedef unsigned long uint64; // NOLINT -typedef long int64; // NOLINT +typedef long int64; // NOLINT #ifndef INT64_C -#define INT64_C(x) x ## L +#define INT64_C(x) x##L #endif #ifndef UINT64_C -#define UINT64_C(x) x ## UL +#define UINT64_C(x) x##UL #endif #define INT64_F "l" #else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__) typedef unsigned long long uint64; // NOLINT -typedef long long int64; // NOLINT +typedef long long int64; // NOLINT #ifndef INT64_C -#define INT64_C(x) x ## LL +#define INT64_C(x) x##LL #endif #ifndef UINT64_C -#define UINT64_C(x) x ## ULL +#define UINT64_C(x) x##ULL #endif #define INT64_F "ll" #endif // __LP64__ @@ -58,15 +58,15 @@ typedef long long int64; // NOLINT typedef unsigned int uint32; typedef int int32; typedef unsigned short uint16; // NOLINT -typedef short int16; // NOLINT +typedef short int16; // NOLINT typedef unsigned char uint8; typedef signed char int8; #endif // INT_TYPES_DEFINED #endif // GG_LONGLONG // Detect compiler is for x86 or x64. -#if defined(__x86_64__) || defined(_M_X64) || \ - defined(__i386__) || defined(_M_IX86) +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ + defined(_M_IX86) #define CPU_X86 1 #endif // Detect compiler is for ARM. @@ -76,12 +76,12 @@ typedef signed char int8; #ifndef ALIGNP #ifdef __cplusplus -#define ALIGNP(p, t) \ - (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \ - ((t) - 1)) & ~((t) - 1)))) +#define ALIGNP(p, t) \ + reinterpret_cast<uint8*>( \ + ((reinterpret_cast<uintptr_t>(p) + ((t)-1)) & ~((t)-1))) #else #define ALIGNP(p, t) \ - ((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1)))) /* NOLINT */ + (uint8*)((((uintptr_t)(p) + ((t)-1)) & ~((t)-1))) /* NOLINT */ #endif #endif @@ -95,9 +95,9 @@ typedef signed char int8; #define LIBYUV_API #endif // LIBYUV_BUILDING_SHARED_LIBRARY #elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \ - (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \ - defined(LIBYUV_USING_SHARED_LIBRARY)) -#define LIBYUV_API __attribute__ ((visibility ("default"))) + (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \ + defined(LIBYUV_USING_SHARED_LIBRARY)) +#define LIBYUV_API __attribute__((visibility("default"))) #else #define LIBYUV_API #endif // __GNUC__ @@ -108,11 +108,10 @@ typedef signed char int8; #define LIBYUV_TRUE 1 // Visual C x86 or GCC little endian. 
-#if defined(__x86_64__) || defined(_M_X64) || \ - defined(__i386__) || defined(_M_IX86) || \ - defined(__arm__) || defined(_M_ARM) || \ - (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ + defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) || \ + (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) #define LIBYUV_LITTLE_ENDIAN #endif -#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ NOLINT +#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ diff --git a/files/include/libyuv/compare.h b/files/include/libyuv/compare.h index 08b2bb2e..4deca97f 100644 --- a/files/include/libyuv/compare.h +++ b/files/include/libyuv/compare.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_COMPARE_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_COMPARE_H_ #define INCLUDE_LIBYUV_COMPARE_H_ #include "libyuv/basic_types.h" @@ -29,13 +29,15 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height); // Sum Square Error - used to compute Mean Square Error or PSNR. LIBYUV_API -uint64 ComputeSumSquareError(const uint8* src_a, - const uint8* src_b, int count); +uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, int count); LIBYUV_API -uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height); +uint64 ComputeSumSquareErrorPlane(const uint8* src_a, + int stride_a, + const uint8* src_b, + int stride_b, + int width, + int height); static const int kMaxPsnr = 128; @@ -43,36 +45,56 @@ LIBYUV_API double SumSquareErrorToPsnr(uint64 sse, uint64 count); LIBYUV_API -double CalcFramePsnr(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height); +double CalcFramePsnr(const uint8* src_a, + int stride_a, + const uint8* src_b, + int stride_b, + int width, + int height); LIBYUV_API -double I420Psnr(const uint8* src_y_a, int stride_y_a, - const uint8* src_u_a, int stride_u_a, - const uint8* src_v_a, int stride_v_a, - const uint8* src_y_b, int stride_y_b, - const uint8* src_u_b, int stride_u_b, - const uint8* src_v_b, int stride_v_b, - int width, int height); +double I420Psnr(const uint8* src_y_a, + int stride_y_a, + const uint8* src_u_a, + int stride_u_a, + const uint8* src_v_a, + int stride_v_a, + const uint8* src_y_b, + int stride_y_b, + const uint8* src_u_b, + int stride_u_b, + const uint8* src_v_b, + int stride_v_b, + int width, + int height); LIBYUV_API -double CalcFrameSsim(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height); +double CalcFrameSsim(const uint8* src_a, + int stride_a, + const uint8* src_b, + int stride_b, + int width, + int height); LIBYUV_API -double I420Ssim(const uint8* src_y_a, int stride_y_a, - const uint8* src_u_a, int stride_u_a, - const uint8* src_v_a, int stride_v_a, - const uint8* src_y_b, int stride_y_b, - const uint8* src_u_b, int stride_u_b, - const uint8* src_v_b, int stride_v_b, - int width, int height); +double I420Ssim(const uint8* src_y_a, + int stride_y_a, + const uint8* src_u_a, + int stride_u_a, + const uint8* src_v_a, + int stride_v_a, + const uint8* src_y_b, + int stride_y_b, + const uint8* src_u_b, + int stride_u_b, + const uint8* src_v_b, + int stride_v_b, + int width, + int height); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_COMPARE_H_ NOLINT +#endif // INCLUDE_LIBYUV_COMPARE_H_ diff --git 
a/files/include/libyuv/compare_row.h b/files/include/libyuv/compare_row.h index 38a957b2..7abc2d4a 100644 --- a/files/include/libyuv/compare_row.h +++ b/files/include/libyuv/compare_row.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_COMPARE_ROW_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_COMPARE_ROW_H_ #define INCLUDE_LIBYUV_COMPARE_ROW_H_ #include "libyuv/basic_types.h" @@ -30,8 +30,8 @@ extern "C" { #endif // Visual C 2012 required for AVX2. -#if defined(_M_IX86) && !defined(__clang__) && \ - defined(_MSC_VER) && _MSC_VER >= 1700 +#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \ + _MSC_VER >= 1700 #define VISUALC_HAS_AVX2 1 #endif // VisualStudio >= 2012 @@ -42,8 +42,8 @@ extern "C" { #endif // clang >= 3.4 #endif // __clang__ -#if !defined(LIBYUV_DISABLE_X86) && \ - defined(_M_IX86) && (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ + (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) #define HAS_HASHDJB2_AVX2 #endif @@ -81,4 +81,4 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed); } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_COMPARE_ROW_H_ NOLINT +#endif // INCLUDE_LIBYUV_COMPARE_ROW_H_ diff --git a/files/include/libyuv/convert.h b/files/include/libyuv/convert.h index a2cdc571..f096d193 100644 --- a/files/include/libyuv/convert.h +++ b/files/include/libyuv/convert.h @@ -8,13 +8,18 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_CONVERT_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_CONVERT_H_ #define INCLUDE_LIBYUV_CONVERT_H_ #include "libyuv/basic_types.h" #include "libyuv/rotate.h" // For enum RotationMode. +// TODO(fbarchard): fix WebRTC source to include following libyuv headers: +#include "libyuv/convert_argb.h" // For WebRTC I420ToARGB. b/620 +#include "libyuv/convert_from.h" // For WebRTC ConvertFromI420. b/620 +#include "libyuv/planar_functions.h" // For WebRTC I420Rect, CopyPlane. b/618 + #ifdef __cplusplus namespace libyuv { extern "C" { @@ -22,184 +27,295 @@ extern "C" { // Convert I444 to I420. LIBYUV_API -int I444ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I444ToI420(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // Convert I422 to I420. LIBYUV_API -int I422ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -// Convert I411 to I420. 
-LIBYUV_API -int I411ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I422ToI420(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // Copy I420 to I420. #define I420ToI420 I420Copy LIBYUV_API -int I420Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I420Copy(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // Convert I400 (grey) to I420. LIBYUV_API -int I400ToI420(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I400ToI420(const uint8* src_y, + int src_stride_y, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); #define J400ToJ420 I400ToI420 // Convert NV12 to I420. LIBYUV_API -int NV12ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int NV12ToI420(const uint8* src_y, + int src_stride_y, + const uint8* src_uv, + int src_stride_uv, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // Convert NV21 to I420. LIBYUV_API -int NV21ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_vu, int src_stride_vu, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int NV21ToI420(const uint8* src_y, + int src_stride_y, + const uint8* src_vu, + int src_stride_vu, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // Convert YUY2 to I420. LIBYUV_API -int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int YUY2ToI420(const uint8* src_yuy2, + int src_stride_yuy2, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // Convert UYVY to I420. LIBYUV_API -int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int UYVYToI420(const uint8* src_uyvy, + int src_stride_uyvy, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // Convert M420 to I420. 
LIBYUV_API -int M420ToI420(const uint8* src_m420, int src_stride_m420, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int M420ToI420(const uint8* src_m420, + int src_stride_m420, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert Android420 to I420. +LIBYUV_API +int Android420ToI420(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + int pixel_stride_uv, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // ARGB little endian (bgra in memory) to I420. LIBYUV_API -int ARGBToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToI420(const uint8* src_frame, + int src_stride_frame, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // BGRA little endian (argb in memory) to I420. LIBYUV_API -int BGRAToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int BGRAToI420(const uint8* src_frame, + int src_stride_frame, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // ABGR little endian (rgba in memory) to I420. LIBYUV_API -int ABGRToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ABGRToI420(const uint8* src_frame, + int src_stride_frame, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // RGBA little endian (abgr in memory) to I420. LIBYUV_API -int RGBAToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int RGBAToI420(const uint8* src_frame, + int src_stride_frame, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // RGB little endian (bgr in memory) to I420. LIBYUV_API -int RGB24ToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int RGB24ToI420(const uint8* src_frame, + int src_stride_frame, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // RGB big endian (rgb in memory) to I420. LIBYUV_API -int RAWToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int RAWToI420(const uint8* src_frame, + int src_stride_frame, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // RGB16 (RGBP fourcc) little endian to I420. 
LIBYUV_API -int RGB565ToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int RGB565ToI420(const uint8* src_frame, + int src_stride_frame, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // RGB15 (RGBO fourcc) little endian to I420. LIBYUV_API -int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGB1555ToI420(const uint8* src_frame, + int src_stride_frame, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // RGB12 (R444 fourcc) little endian to I420. LIBYUV_API -int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGB4444ToI420(const uint8* src_frame, + int src_stride_frame, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); #ifdef HAVE_JPEG // src_width/height provided by capture. // dst_width/height for clipping determine final size. LIBYUV_API -int MJPGToI420(const uint8* sample, size_t sample_size, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int src_width, int src_height, - int dst_width, int dst_height); +int MJPGToI420(const uint8* sample, + size_t sample_size, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int src_width, + int src_height, + int dst_width, + int dst_height); // Query size of MJPG in pixels. LIBYUV_API -int MJPGSize(const uint8* sample, size_t sample_size, - int* width, int* height); +int MJPGSize(const uint8* sample, size_t sample_size, int* width, int* height); #endif // Convert camera sample to I420 with cropping, rotation and vertical flip. @@ -225,13 +341,20 @@ int MJPGSize(const uint8* sample, size_t sample_size, // "format" is a fourcc. ie 'I420', 'YUY2' // Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. LIBYUV_API -int ConvertToI420(const uint8* src_frame, size_t src_size, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int crop_x, int crop_y, - int src_width, int src_height, - int crop_width, int crop_height, +int ConvertToI420(const uint8* src_frame, + size_t src_size, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, enum RotationMode rotation, uint32 format); @@ -240,4 +363,4 @@ int ConvertToI420(const uint8* src_frame, size_t src_size, } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_CONVERT_H_ NOLINT +#endif // INCLUDE_LIBYUV_CONVERT_H_ diff --git a/files/include/libyuv/convert_argb.h b/files/include/libyuv/convert_argb.h index 996f4768..f43a5060 100644 --- a/files/include/libyuv/convert_argb.h +++ b/files/include/libyuv/convert_argb.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ #define INCLUDE_LIBYUV_CONVERT_ARGB_H_ #include "libyuv/basic_types.h" @@ -30,245 +30,385 @@ extern "C" { // Copy ARGB to ARGB. LIBYUV_API -int ARGBCopy(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBCopy(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert I420 to ARGB. LIBYUV_API -int I420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I420ToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Duplicate prototype for function in convert_from.h for remoting. +LIBYUV_API +int I420ToABGR(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert I422 to ARGB. LIBYUV_API -int I422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I422ToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert I444 to ARGB. LIBYUV_API -int I444ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I444ToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert J444 to ARGB. LIBYUV_API -int J444ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int J444ToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert I444 to ABGR. LIBYUV_API -int I444ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); - -// Convert I411 to ARGB. -LIBYUV_API -int I411ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I444ToABGR(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert I420 with Alpha to preattenuated ARGB. 
LIBYUV_API -int I420AlphaToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - const uint8* src_a, int src_stride_a, - uint8* dst_argb, int dst_stride_argb, - int width, int height, int attenuate); +int I420AlphaToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + const uint8* src_a, + int src_stride_a, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height, + int attenuate); // Convert I420 with Alpha to preattenuated ABGR. LIBYUV_API -int I420AlphaToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - const uint8* src_a, int src_stride_a, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height, int attenuate); +int I420AlphaToABGR(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + const uint8* src_a, + int src_stride_a, + uint8* dst_abgr, + int dst_stride_abgr, + int width, + int height, + int attenuate); // Convert I400 (grey) to ARGB. Reverse of ARGBToI400. LIBYUV_API -int I400ToARGB(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I400ToARGB(const uint8* src_y, + int src_stride_y, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert J400 (jpeg grey) to ARGB. LIBYUV_API -int J400ToARGB(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int J400ToARGB(const uint8* src_y, + int src_stride_y, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Alias. #define YToARGB I400ToARGB // Convert NV12 to ARGB. LIBYUV_API -int NV12ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int NV12ToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_uv, + int src_stride_uv, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert NV21 to ARGB. LIBYUV_API -int NV21ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_vu, int src_stride_vu, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int NV21ToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_vu, + int src_stride_vu, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert M420 to ARGB. LIBYUV_API -int M420ToARGB(const uint8* src_m420, int src_stride_m420, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int M420ToARGB(const uint8* src_m420, + int src_stride_m420, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert YUY2 to ARGB. LIBYUV_API -int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int YUY2ToARGB(const uint8* src_yuy2, + int src_stride_yuy2, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert UYVY to ARGB. LIBYUV_API -int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int UYVYToARGB(const uint8* src_uyvy, + int src_stride_uyvy, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert J420 to ARGB. 
LIBYUV_API -int J420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int J420ToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert J422 to ARGB. LIBYUV_API -int J422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int J422ToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert J420 to ABGR. LIBYUV_API -int J420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); +int J420ToABGR(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert J422 to ABGR. LIBYUV_API -int J422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); +int J422ToABGR(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert H420 to ARGB. LIBYUV_API -int H420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int H420ToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert H422 to ARGB. LIBYUV_API -int H422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int H422ToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert H420 to ABGR. LIBYUV_API -int H420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); +int H420ToABGR(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert H422 to ABGR. 
LIBYUV_API -int H422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); +int H422ToABGR(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_abgr, + int dst_stride_abgr, + int width, + int height); // BGRA little endian (argb in memory) to ARGB. LIBYUV_API -int BGRAToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int BGRAToARGB(const uint8* src_frame, + int src_stride_frame, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // ABGR little endian (rgba in memory) to ARGB. LIBYUV_API -int ABGRToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ABGRToARGB(const uint8* src_frame, + int src_stride_frame, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // RGBA little endian (abgr in memory) to ARGB. LIBYUV_API -int RGBAToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int RGBAToARGB(const uint8* src_frame, + int src_stride_frame, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Deprecated function name. #define BG24ToARGB RGB24ToARGB // RGB little endian (bgr in memory) to ARGB. LIBYUV_API -int RGB24ToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int RGB24ToARGB(const uint8* src_frame, + int src_stride_frame, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // RGB big endian (rgb in memory) to ARGB. LIBYUV_API -int RAWToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int RAWToARGB(const uint8* src_frame, + int src_stride_frame, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // RGB16 (RGBP fourcc) little endian to ARGB. LIBYUV_API -int RGB565ToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int RGB565ToARGB(const uint8* src_frame, + int src_stride_frame, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // RGB15 (RGBO fourcc) little endian to ARGB. LIBYUV_API -int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGB1555ToARGB(const uint8* src_frame, + int src_stride_frame, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // RGB12 (R444 fourcc) little endian to ARGB. LIBYUV_API -int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGB4444ToARGB(const uint8* src_frame, + int src_stride_frame, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); +#ifdef HAVE_JPEG // src_width/height provided by capture // dst_width/height for clipping determine final size. 
LIBYUV_API -int MJPGToARGB(const uint8* sample, size_t sample_size, - uint8* dst_argb, int dst_stride_argb, - int src_width, int src_height, - int dst_width, int dst_height); +int MJPGToARGB(const uint8* sample, + size_t sample_size, + uint8* dst_argb, + int dst_stride_argb, + int src_width, + int src_height, + int dst_width, + int dst_height); +#endif // Convert camera sample to ARGB with cropping, rotation and vertical flip. // "src_size" is needed to parse MJPG. @@ -293,11 +433,16 @@ int MJPGToARGB(const uint8* sample, size_t sample_size, // "format" is a fourcc. ie 'I420', 'YUY2' // Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. LIBYUV_API -int ConvertToARGB(const uint8* src_frame, size_t src_size, - uint8* dst_argb, int dst_stride_argb, - int crop_x, int crop_y, - int src_width, int src_height, - int crop_width, int crop_height, +int ConvertToARGB(const uint8* src_frame, + size_t src_size, + uint8* dst_argb, + int dst_stride_argb, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, enum RotationMode rotation, uint32 format); @@ -306,4 +451,4 @@ int ConvertToARGB(const uint8* src_frame, size_t src_size, } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ NOLINT +#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ diff --git a/files/include/libyuv/convert_from.h b/files/include/libyuv/convert_from.h index 7522ea5c..7ddebd4f 100644 --- a/files/include/libyuv/convert_from.h +++ b/files/include/libyuv/convert_from.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ #define INCLUDE_LIBYUV_CONVERT_FROM_H_ #include "libyuv/basic_types.h" @@ -24,151 +24,237 @@ extern "C" { // I420Copy in convert to I420ToI420. LIBYUV_API -int I420ToI422(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I420ToI422(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); LIBYUV_API -int I420ToI444(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -LIBYUV_API -int I420ToI411(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I420ToI444(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21. 
LIBYUV_API -int I400Copy(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height); +int I400Copy(const uint8* src_y, + int src_stride_y, + uint8* dst_y, + int dst_stride_y, + int width, + int height); LIBYUV_API -int I420ToNV12(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height); +int I420ToNV12(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_uv, + int dst_stride_uv, + int width, + int height); LIBYUV_API -int I420ToNV21(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_vu, int dst_stride_vu, - int width, int height); +int I420ToNV21(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_vu, + int dst_stride_vu, + int width, + int height); LIBYUV_API -int I420ToYUY2(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I420ToYUY2(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_frame, + int dst_stride_frame, + int width, + int height); LIBYUV_API -int I420ToUYVY(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I420ToUYVY(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_frame, + int dst_stride_frame, + int width, + int height); LIBYUV_API -int I420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I420ToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); LIBYUV_API -int I420ToBGRA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I420ToBGRA(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); LIBYUV_API -int I420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I420ToABGR(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); LIBYUV_API -int I420ToRGBA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height); +int I420ToRGBA(const 
uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_rgba, + int dst_stride_rgba, + int width, + int height); LIBYUV_API -int I420ToRGB24(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I420ToRGB24(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_frame, + int dst_stride_frame, + int width, + int height); LIBYUV_API -int I420ToRAW(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I420ToRAW(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_frame, + int dst_stride_frame, + int width, + int height); -//LIBYUV_API -int I420ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +LIBYUV_API +int I420ToRGB565(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_frame, + int dst_stride_frame, + int width, + int height); // Convert I420 To RGB565 with 4x4 dither matrix (16 bytes). // Values in dither matrix from 0 to 7 recommended. // The order of the dither matrix is first byte is upper left. LIBYUV_API -int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - const uint8* dither4x4, int width, int height); +int I420ToRGB565Dither(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_frame, + int dst_stride_frame, + const uint8* dither4x4, + int width, + int height); LIBYUV_API -int I420ToARGB1555(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I420ToARGB1555(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_frame, + int dst_stride_frame, + int width, + int height); LIBYUV_API -int I420ToARGB4444(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I420ToARGB4444(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_frame, + int dst_stride_frame, + int width, + int height); // Convert I420 to specified format. // "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the // buffer has contiguous rows. Can be negative. A multiple of 16 is optimal. 
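A sketch for the ConvertFromI420 declaration that follows, emitting packed YUY2; FOURCC_YUY2 comes from libyuv/video_common.h, which is outside this hunk, and the buffers are hypothetical:

    #include "libyuv/video_common.h"  // FOURCC constants
    int ret = ConvertFromI420(src_y, y_stride,
                              src_u, u_stride,
                              src_v, v_stride,
                              dst_yuy2, 0,   // 0: rows are contiguous, per the comment above
                              width, height,
                              libyuv::FOURCC_YUY2);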
LIBYUV_API -int ConvertFromI420(const uint8* y, int y_stride, - const uint8* u, int u_stride, - const uint8* v, int v_stride, - uint8* dst_sample, int dst_sample_stride, - int width, int height, +int ConvertFromI420(const uint8* y, + int y_stride, + const uint8* u, + int u_stride, + const uint8* v, + int v_stride, + uint8* dst_sample, + int dst_sample_stride, + int width, + int height, uint32 format); #ifdef __cplusplus @@ -176,4 +262,4 @@ int ConvertFromI420(const uint8* y, int y_stride, } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ NOLINT +#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ diff --git a/files/include/libyuv/convert_from_argb.h b/files/include/libyuv/convert_from_argb.h index 1df53200..50722d76 100644 --- a/files/include/libyuv/convert_from_argb.h +++ b/files/include/libyuv/convert_from_argb.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ #define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ #include "libyuv/basic_types.h" @@ -21,45 +21,66 @@ extern "C" { // Copy ARGB to ARGB. #define ARGBToARGB ARGBCopy LIBYUV_API -int ARGBCopy(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBCopy(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert ARGB To BGRA. LIBYUV_API -int ARGBToBGRA(const uint8* src_argb, int src_stride_argb, - uint8* dst_bgra, int dst_stride_bgra, - int width, int height); +int ARGBToBGRA(const uint8* src_argb, + int src_stride_argb, + uint8* dst_bgra, + int dst_stride_bgra, + int width, + int height); // Convert ARGB To ABGR. LIBYUV_API -int ARGBToABGR(const uint8* src_argb, int src_stride_argb, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); +int ARGBToABGR(const uint8* src_argb, + int src_stride_argb, + uint8* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert ARGB To RGBA. LIBYUV_API -int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height); +int ARGBToRGBA(const uint8* src_argb, + int src_stride_argb, + uint8* dst_rgba, + int dst_stride_rgba, + int width, + int height); // Convert ARGB To RGB24. LIBYUV_API -int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb24, int dst_stride_rgb24, - int width, int height); +int ARGBToRGB24(const uint8* src_argb, + int src_stride_argb, + uint8* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); // Convert ARGB To RAW. LIBYUV_API -int ARGBToRAW(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb, int dst_stride_rgb, - int width, int height); +int ARGBToRAW(const uint8* src_argb, + int src_stride_argb, + uint8* dst_rgb, + int dst_stride_rgb, + int width, + int height); // Convert ARGB To RGB565. LIBYUV_API -int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb565, int dst_stride_rgb565, - int width, int height); +int ARGBToRGB565(const uint8* src_argb, + int src_stride_argb, + uint8* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); // Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). // Values in dither matrix from 0 to 7 recommended. @@ -67,124 +88,178 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, // TODO(fbarchard): Consider pointer to 2d array for dither4x4. 
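For the ARGBToRGB565Dither declaration just below, a sketch with a 16-byte matrix; the pattern itself is only an illustration, but it honors the documented layout (first byte is the upper left) and the recommended 0 to 7 value range:

    static const uint8 kDither4x4[16] = {
        0, 4, 1, 5,
        6, 2, 7, 3,
        1, 5, 0, 4,
        7, 3, 6, 2,
    };
    ARGBToRGB565Dither(src_argb, src_stride_argb,
                       dst_rgb565, dst_stride_rgb565,
                       kDither4x4, width, height);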
// const uint8(*dither)[4][4]; LIBYUV_API -int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb565, int dst_stride_rgb565, - const uint8* dither4x4, int width, int height); +int ARGBToRGB565Dither(const uint8* src_argb, + int src_stride_argb, + uint8* dst_rgb565, + int dst_stride_rgb565, + const uint8* dither4x4, + int width, + int height); // Convert ARGB To ARGB1555. LIBYUV_API -int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb1555, int dst_stride_argb1555, - int width, int height); +int ARGBToARGB1555(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb1555, + int dst_stride_argb1555, + int width, + int height); // Convert ARGB To ARGB4444. LIBYUV_API -int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb4444, int dst_stride_argb4444, - int width, int height); +int ARGBToARGB4444(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb4444, + int dst_stride_argb4444, + int width, + int height); // Convert ARGB To I444. LIBYUV_API -int ARGBToI444(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToI444(const uint8* src_argb, + int src_stride_argb, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // Convert ARGB To I422. LIBYUV_API -int ARGBToI422(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToI422(const uint8* src_argb, + int src_stride_argb, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // Convert ARGB To I420. (also in convert.h) LIBYUV_API -int ARGBToI420(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToI420(const uint8* src_argb, + int src_stride_argb, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // Convert ARGB to J420. (JPeg full range I420). LIBYUV_API -int ARGBToJ420(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToJ420(const uint8* src_argb, + int src_stride_argb, + uint8* dst_yj, + int dst_stride_yj, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // Convert ARGB to J422. LIBYUV_API -int ARGBToJ422(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -// Convert ARGB To I411. -LIBYUV_API -int ARGBToI411(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToJ422(const uint8* src_argb, + int src_stride_argb, + uint8* dst_yj, + int dst_stride_yj, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // Convert ARGB to J400. (JPeg full range). 
LIBYUV_API -int ARGBToJ400(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - int width, int height); +int ARGBToJ400(const uint8* src_argb, + int src_stride_argb, + uint8* dst_yj, + int dst_stride_yj, + int width, + int height); // Convert ARGB to I400. LIBYUV_API -int ARGBToI400(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - int width, int height); +int ARGBToI400(const uint8* src_argb, + int src_stride_argb, + uint8* dst_y, + int dst_stride_y, + int width, + int height); // Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB) LIBYUV_API -int ARGBToG(const uint8* src_argb, int src_stride_argb, - uint8* dst_g, int dst_stride_g, - int width, int height); +int ARGBToG(const uint8* src_argb, + int src_stride_argb, + uint8* dst_g, + int dst_stride_g, + int width, + int height); // Convert ARGB To NV12. LIBYUV_API -int ARGBToNV12(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height); +int ARGBToNV12(const uint8* src_argb, + int src_stride_argb, + uint8* dst_y, + int dst_stride_y, + uint8* dst_uv, + int dst_stride_uv, + int width, + int height); // Convert ARGB To NV21. LIBYUV_API -int ARGBToNV21(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_vu, int dst_stride_vu, - int width, int height); +int ARGBToNV21(const uint8* src_argb, + int src_stride_argb, + uint8* dst_y, + int dst_stride_y, + uint8* dst_vu, + int dst_stride_vu, + int width, + int height); // Convert ARGB To NV21. LIBYUV_API -int ARGBToNV21(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_vu, int dst_stride_vu, - int width, int height); +int ARGBToNV21(const uint8* src_argb, + int src_stride_argb, + uint8* dst_y, + int dst_stride_y, + uint8* dst_vu, + int dst_stride_vu, + int width, + int height); // Convert ARGB To YUY2. LIBYUV_API -int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, - uint8* dst_yuy2, int dst_stride_yuy2, - int width, int height); +int ARGBToYUY2(const uint8* src_argb, + int src_stride_argb, + uint8* dst_yuy2, + int dst_stride_yuy2, + int width, + int height); // Convert ARGB To UYVY. LIBYUV_API -int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, - uint8* dst_uyvy, int dst_stride_uyvy, - int width, int height); +int ARGBToUYVY(const uint8* src_argb, + int src_stride_argb, + uint8* dst_uyvy, + int dst_stride_uyvy, + int width, + int height); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ NOLINT +#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ diff --git a/files/include/libyuv/cpu_id.h b/files/include/libyuv/cpu_id.h index dfb7445e..bcddb32e 100644 --- a/files/include/libyuv/cpu_id.h +++ b/files/include/libyuv/cpu_id.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_CPU_ID_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_CPU_ID_H_ #define INCLUDE_LIBYUV_CPU_ID_H_ #include "libyuv/basic_types.h" @@ -31,17 +31,20 @@ static const int kCpuHasX86 = 0x10; static const int kCpuHasSSE2 = 0x20; static const int kCpuHasSSSE3 = 0x40; static const int kCpuHasSSE41 = 0x80; -static const int kCpuHasSSE42 = 0x100; +static const int kCpuHasSSE42 = 0x100; // unused at this time. 
static const int kCpuHasAVX = 0x200; static const int kCpuHasAVX2 = 0x400; static const int kCpuHasERMS = 0x800; static const int kCpuHasFMA3 = 0x1000; static const int kCpuHasAVX3 = 0x2000; -// 0x2000, 0x4000, 0x8000 reserved for future X86 flags. +static const int kCpuHasF16C = 0x4000; + +// 0x8000 reserved for future X86 flags. // These flags are only valid on MIPS processors. static const int kCpuHasMIPS = 0x10000; static const int kCpuHasDSPR2 = 0x20000; +static const int kCpuHasMSA = 0x40000; // Internal function used to auto-init. LIBYUV_API @@ -77,4 +80,4 @@ void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info); } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_CPU_ID_H_ NOLINT +#endif // INCLUDE_LIBYUV_CPU_ID_H_ diff --git a/files/include/libyuv/macros_msa.h b/files/include/libyuv/macros_msa.h new file mode 100644 index 00000000..61be352e --- /dev/null +++ b/files/include/libyuv/macros_msa.h @@ -0,0 +1,233 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_MACROS_MSA_H_ +#define INCLUDE_LIBYUV_MACROS_MSA_H_ + +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include <msa.h> +#include <stdint.h> + +#if (__mips_isa_rev >= 6) +#define LW(psrc) \ + ({ \ + uint8* psrc_lw_m = (uint8*)(psrc); /* NOLINT */ \ + uint32 val_m; \ + asm volatile("lw %[val_m], %[psrc_lw_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_lw_m] "m"(*psrc_lw_m)); \ + val_m; \ + }) + +#if (__mips == 64) +#define LD(psrc) \ + ({ \ + uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \ + uint64 val_m = 0; \ + asm volatile("ld %[val_m], %[psrc_ld_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_ld_m] "m"(*psrc_ld_m)); \ + val_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \ + uint32 val0_m, val1_m; \ + uint64 val_m = 0; \ + val0_m = LW(psrc_ld_m); \ + val1_m = LW(psrc_ld_m + 4); \ + val_m = (uint64)(val1_m); /* NOLINT */ \ + val_m = (uint64)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \ + val_m = (uint64)(val_m | (uint64)val0_m); /* NOLINT */ \ + val_m; \ + }) +#endif // (__mips == 64) + +#define SW(val, pdst) \ + ({ \ + uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \ + uint32_t val_m = (val); \ + asm volatile("sw %[val_m], %[pdst_sw_m] \n" \ + : [pdst_sw_m] "=m"(*pdst_sw_m) \ + : [val_m] "r"(val_m)); \ + }) + +#if (__mips == 64) +#define SD(val, pdst) \ + ({ \ + uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ + uint64_t val_m = (val); \ + asm volatile("sd %[val_m], %[pdst_sd_m] \n" \ + : [pdst_sd_m] "=m"(*pdst_sd_m) \ + : [val_m] "r"(val_m)); \ + }) +#else // !(__mips == 64) +#define SD(val, pdst) \ + ({ \ + uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ + uint32_t val0_m, val1_m; \ + val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + SW(val0_m, pdst_sd_m); \ + SW(val1_m, pdst_sd_m + 4); \ + }) +#endif // !(__mips == 64) +#else // !(__mips_isa_rev >= 6) +#define LW(psrc) \ + ({ \ + uint8* psrc_lw_m = (uint8*)(psrc); /* NOLINT */ \ + uint32 val_m; \ + asm volatile("ulw %[val_m], %[psrc_lw_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_lw_m] "m"(*psrc_lw_m)); \ + val_m; \ + }) + +#if 
(__mips == 64) +#define LD(psrc) \ + ({ \ + uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \ + uint64 val_m = 0; \ + asm volatile("uld %[val_m], %[psrc_ld_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_ld_m] "m"(*psrc_ld_m)); \ + val_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \ + uint32 val0_m, val1_m; \ + uint64 val_m = 0; \ + val0_m = LW(psrc_ld_m); \ + val1_m = LW(psrc_ld_m + 4); \ + val_m = (uint64)(val1_m); /* NOLINT */ \ + val_m = (uint64)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \ + val_m = (uint64)(val_m | (uint64)val0_m); /* NOLINT */ \ + val_m; \ + }) +#endif // (__mips == 64) + +#define SW(val, pdst) \ + ({ \ + uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \ + uint32_t val_m = (val); \ + asm volatile("usw %[val_m], %[pdst_sw_m] \n" \ + : [pdst_sw_m] "=m"(*pdst_sw_m) \ + : [val_m] "r"(val_m)); \ + }) + +#define SD(val, pdst) \ + ({ \ + uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ + uint32_t val0_m, val1_m; \ + val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + SW(val0_m, pdst_sd_m); \ + SW(val1_m, pdst_sd_m + 4); \ + }) +#endif // (__mips_isa_rev >= 6) + +// TODO(fbarchard): Consider removing __VAR_ARGS versions. +#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */ +#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) + +#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */ +#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) + +#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */ +#define ST_UH(...) ST_H(v8u16, __VA_ARGS__) + +/* Description : Load two vectors with 16 'byte' sized elements + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Load 16 byte elements in 'out0' from (psrc) + Load 16 byte elements in 'out1' from (psrc + stride) +*/ +#define LD_B2(RTYPE, psrc, stride, out0, out1) \ + { \ + out0 = LD_B(RTYPE, (psrc)); \ + out1 = LD_B(RTYPE, (psrc) + stride); \ + } +#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) + +#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ + { \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + } +#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) + +/* Description : Store two vectors with stride each having 16 'byte' sized + elements + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 16 byte elements from 'in0' to (pdst) + Store 16 byte elements from 'in1' to (pdst + stride) +*/ +#define ST_B2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_B(RTYPE, in0, (pdst)); \ + ST_B(RTYPE, in1, (pdst) + stride); \ + } +#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) + +#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ + { \ + ST_B2(RTYPE, in0, in1, (pdst), stride); \ + ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + } +#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) + +/* Description : Store vectors of 8 halfword elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 8 halfword elements from 'in0' to (pdst) + Store 8 halfword elements from 'in1' to (pdst + stride) +*/ +#define ST_H2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_H(RTYPE, in0, (pdst)); \ + ST_H(RTYPE, in1, (pdst) + stride); \ + } +#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__) + +// TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly. 
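A hypothetical sketch of how the paired MSA load/store helpers defined above fit together; it only compiles on MIPS builds with __mips_msa, and the pointers and strides are illustrative:

    #include "libyuv/macros_msa.h"  // inert unless __mips_msa is defined
    v16u8 row0, row1;
    LD_UB2(src, src_stride, row0, row1);   // 16 bytes from src, 16 from src + src_stride
    ST_UB2(row0, row1, dst, dst_stride);   // mirror stores to dst and dst + dst_stride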
+/* Description : Shuffle byte vector elements as per mask vector + Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0' & 'in1' are copied selectively to + 'out0' as per control vector 'mask0' +*/ +#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ + } +#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) + +/* Description : Interleave both left and right half of input vectors + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements from 'in0' and 'in1' are + interleaved and written to 'out0' +*/ +#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + } +#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) + +#endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */ + +#endif // INCLUDE_LIBYUV_MACROS_MSA_H_ diff --git a/files/include/libyuv/mjpeg_decoder.h b/files/include/libyuv/mjpeg_decoder.h index 8423121d..8a4f2822 100644 --- a/files/include/libyuv/mjpeg_decoder.h +++ b/files/include/libyuv/mjpeg_decoder.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ #define INCLUDE_LIBYUV_MJPEG_DECODER_H_ #include "libyuv/basic_types.h" @@ -37,7 +37,6 @@ static const uint32 kUnknownDataSize = 0xFFFFFFFF; enum JpegSubsamplingType { kJpegYuv420, kJpegYuv422, - kJpegYuv411, kJpegYuv444, kJpegYuv400, kJpegUnknown @@ -145,12 +144,16 @@ class LIBYUV_API MJpegDecoder { // callback function. Each call will get the data for a whole number of // image scanlines. // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. - LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque, - int dst_width, int dst_height); + LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, + void* opaque, + int dst_width, + int dst_height); // The helper function which recognizes the jpeg sub-sampling type. static JpegSubsamplingType JpegSubsamplingTypeHelper( - int* subsample_x, int* subsample_y, int number_of_components); + int* subsample_x, + int* subsample_y, + int number_of_components); private: void AllocOutputBuffers(int num_outbufs); @@ -189,4 +192,4 @@ class LIBYUV_API MJpegDecoder { } // namespace libyuv #endif // __cplusplus -#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ NOLINT +#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ diff --git a/files/include/libyuv/planar_functions.h b/files/include/libyuv/planar_functions.h index 881b0c5c..040839c2 100644 --- a/files/include/libyuv/planar_functions.h +++ b/files/include/libyuv/planar_functions.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ #define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ #include "libyuv/basic_types.h" @@ -24,86 +24,164 @@ extern "C" { // Copy a plane of data. 
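A sketch for the CopyPlane declaration that follows; strides are in bytes and may exceed width, and CopyPlane_16 (declared alongside it) is the same pattern for 16-bit planes. Names are hypothetical:

    CopyPlane(src_y, src_stride_y,
              dst_y, dst_stride_y,
              width, height);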
LIBYUV_API -void CopyPlane(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height); +void CopyPlane(const uint8* src_y, + int src_stride_y, + uint8* dst_y, + int dst_stride_y, + int width, + int height); LIBYUV_API -void CopyPlane_16(const uint16* src_y, int src_stride_y, - uint16* dst_y, int dst_stride_y, - int width, int height); +void CopyPlane_16(const uint16* src_y, + int src_stride_y, + uint16* dst_y, + int dst_stride_y, + int width, + int height); // Set a plane of data to a 32 bit value. LIBYUV_API -void SetPlane(uint8* dst_y, int dst_stride_y, - int width, int height, +void SetPlane(uint8* dst_y, + int dst_stride_y, + int width, + int height, uint32 value); +// Split interleaved UV plane into separate U and V planes. +LIBYUV_API +void SplitUVPlane(const uint8* src_uv, + int src_stride_uv, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); + +// Merge separate U and V planes into one interleaved UV plane. +LIBYUV_API +void MergeUVPlane(const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_uv, + int dst_stride_uv, + int width, + int height); + // Copy I400. Supports inverting. LIBYUV_API -int I400ToI400(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height); +int I400ToI400(const uint8* src_y, + int src_stride_y, + uint8* dst_y, + int dst_stride_y, + int width, + int height); #define J400ToJ400 I400ToI400 // Copy I422 to I422. #define I422ToI422 I422Copy LIBYUV_API -int I422Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I422Copy(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // Copy I444 to I444. #define I444ToI444 I444Copy LIBYUV_API -int I444Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I444Copy(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // Convert YUY2 to I422. LIBYUV_API -int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int YUY2ToI422(const uint8* src_yuy2, + int src_stride_yuy2, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // Convert UYVY to I422. 
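The SplitUVPlane/MergeUVPlane pair introduced above is new in this drop; a round-trip sketch on NV12-style chroma, assuming halfwidth/halfheight describe the subsampled UV plane (the UYVYToI422 declaration continues below):

    int halfwidth = (width + 1) / 2;
    int halfheight = (height + 1) / 2;
    // De-interleave NV12 UV pairs into I420-style U and V planes...
    SplitUVPlane(src_uv, src_stride_uv,
                 dst_u, dst_stride_u,
                 dst_v, dst_stride_v,
                 halfwidth, halfheight);
    // ...and interleave them back.
    MergeUVPlane(dst_u, dst_stride_u,
                 dst_v, dst_stride_v,
                 src_uv, src_stride_uv,
                 halfwidth, halfheight);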
LIBYUV_API -int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -LIBYUV_API -int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height); - -LIBYUV_API -int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height); +int UYVYToI422(const uint8* src_uyvy, + int src_stride_uyvy, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); + +LIBYUV_API +int YUY2ToNV12(const uint8* src_yuy2, + int src_stride_yuy2, + uint8* dst_y, + int dst_stride_y, + uint8* dst_uv, + int dst_stride_uv, + int width, + int height); + +LIBYUV_API +int UYVYToNV12(const uint8* src_uyvy, + int src_stride_uyvy, + uint8* dst_y, + int dst_stride_y, + uint8* dst_uv, + int dst_stride_uv, + int width, + int height); + +LIBYUV_API +int YUY2ToY(const uint8* src_yuy2, + int src_stride_yuy2, + uint8* dst_y, + int dst_stride_y, + int width, + int height); // Convert I420 to I400. (calls CopyPlane ignoring u/v). LIBYUV_API -int I420ToI400(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - int width, int height); +int I420ToI400(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + int width, + int height); // Alias #define J420ToJ400 I420ToI400 @@ -111,13 +189,20 @@ int I420ToI400(const uint8* src_y, int src_stride_y, // I420 mirror. LIBYUV_API -int I420Mirror(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I420Mirror(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // Alias #define I400ToI400Mirror I400Mirror @@ -125,87 +210,139 @@ int I420Mirror(const uint8* src_y, int src_stride_y, // I400 mirror. A single plane is mirrored horizontally. // Pass negative height to achieve 180 degree rotation. LIBYUV_API -int I400Mirror(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height); +int I400Mirror(const uint8* src_y, + int src_stride_y, + uint8* dst_y, + int dst_stride_y, + int width, + int height); // Alias #define ARGBToARGBMirror ARGBMirror // ARGB mirror. LIBYUV_API -int ARGBMirror(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBMirror(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert NV12 to RGB565. 
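A sketch for the NV12ToRGB565 declaration that follows; RGB565 packs one pixel into 2 bytes, so a tight destination stride is width * 2 (buffers hypothetical):

    NV12ToRGB565(src_y, src_stride_y,
                 src_uv, src_stride_uv,   // interleaved UV at half resolution
                 dst_rgb565, width * 2,
                 width, height);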
LIBYUV_API -int NV12ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_rgb565, int dst_stride_rgb565, - int width, int height); +int NV12ToRGB565(const uint8* src_y, + int src_stride_y, + const uint8* src_uv, + int src_stride_uv, + uint8* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); // I422ToARGB is in convert_argb.h // Convert I422 to BGRA. LIBYUV_API -int I422ToBGRA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_bgra, int dst_stride_bgra, - int width, int height); +int I422ToBGRA(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_bgra, + int dst_stride_bgra, + int width, + int height); // Convert I422 to ABGR. LIBYUV_API -int I422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); +int I422ToABGR(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert I422 to RGBA. LIBYUV_API -int I422ToRGBA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height); +int I422ToRGBA(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_rgba, + int dst_stride_rgba, + int width, + int height); // Alias #define RGB24ToRAW RAWToRGB24 LIBYUV_API -int RAWToRGB24(const uint8* src_raw, int src_stride_raw, - uint8* dst_rgb24, int dst_stride_rgb24, - int width, int height); +int RAWToRGB24(const uint8* src_raw, + int src_stride_raw, + uint8* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); // Draw a rectangle into I420. LIBYUV_API -int I420Rect(uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int x, int y, int width, int height, - int value_y, int value_u, int value_v); +int I420Rect(uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int x, + int y, + int width, + int height, + int value_y, + int value_u, + int value_v); // Draw a rectangle into ARGB. LIBYUV_API -int ARGBRect(uint8* dst_argb, int dst_stride_argb, - int x, int y, int width, int height, uint32 value); +int ARGBRect(uint8* dst_argb, + int dst_stride_argb, + int x, + int y, + int width, + int height, + uint32 value); // Convert ARGB to gray scale ARGB. LIBYUV_API -int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBGrayTo(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Make a rectangle of ARGB gray scale. LIBYUV_API -int ARGBGray(uint8* dst_argb, int dst_stride_argb, - int x, int y, int width, int height); +int ARGBGray(uint8* dst_argb, + int dst_stride_argb, + int x, + int y, + int width, + int height); // Make a rectangle of ARGB Sepia tone. 
LIBYUV_API -int ARGBSepia(uint8* dst_argb, int dst_stride_argb, - int x, int y, int width, int height); +int ARGBSepia(uint8* dst_argb, + int dst_stride_argb, + int x, + int y, + int width, + int height); // Apply a matrix rotation to each ARGB pixel. // matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2. @@ -214,10 +351,13 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb, // The next 4 coefficients apply to B, G, R, A and produce R of the output. // The last 4 coefficients apply to B, G, R, A and produce A of the output. LIBYUV_API -int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, +int ARGBColorMatrix(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, const int8* matrix_argb, - int width, int height); + int width, + int height); // Deprecated. Use ARGBColorMatrix instead. // Apply a matrix rotation to each ARGB pixel. @@ -226,32 +366,47 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, // The next 4 coefficients apply to B, G, R, A and produce G of the output. // The last 4 coefficients apply to B, G, R, A and produce R of the output. LIBYUV_API -int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, +int RGBColorMatrix(uint8* dst_argb, + int dst_stride_argb, const int8* matrix_rgb, - int x, int y, int width, int height); + int x, + int y, + int width, + int height); // Apply a color table to each ARGB pixel. // Table contains 256 ARGB values. LIBYUV_API -int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, +int ARGBColorTable(uint8* dst_argb, + int dst_stride_argb, const uint8* table_argb, - int x, int y, int width, int height); + int x, + int y, + int width, + int height); // Apply a color table to each ARGB pixel but preserve destination alpha. // Table contains 256 ARGB values. LIBYUV_API -int RGBColorTable(uint8* dst_argb, int dst_stride_argb, +int RGBColorTable(uint8* dst_argb, + int dst_stride_argb, const uint8* table_argb, - int x, int y, int width, int height); + int x, + int y, + int width, + int height); // Apply a luma/color table to each ARGB pixel but preserve destination alpha. // Table contains 32768 values indexed by [Y][C] where Y is 7 bit luma from // RGB (YJ style) and C is an 8 bit color component (R, G or B). LIBYUV_API -int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, +int ARGBLumaColorTable(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, const uint8* luma_rgb_table, - int width, int height); + int width, + int height); // Apply a 3 term polynomial to ARGB values. // poly points to a 4x4 matrix. The first row is constants. The 2nd row is // @@ -262,46 +417,80 @@ int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, // A polynomial approximation can be derived using software such as 'R'. LIBYUV_API -int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, +int ARGBPolynomial(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, const float* poly, - int width, int height); + int width, + int height); + +// Convert plane of 16 bit shorts to half floats. +// Source values are multiplied by scale before storing as half float. +LIBYUV_API +int HalfFloatPlane(const uint16* src_y, + int src_stride_y, + uint16* dst_y, + int dst_stride_y, + float scale, + int width, + int height); // Quantize a rectangle of ARGB. Alpha unaffected.
// scale is a 16 bit fractional fixed point scaler between 0 and 65535. // interval_size should be a value between 1 and 255. // interval_offset should be a value between 0 and 255. LIBYUV_API -int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, - int scale, int interval_size, int interval_offset, - int x, int y, int width, int height); +int ARGBQuantize(uint8* dst_argb, + int dst_stride_argb, + int scale, + int interval_size, + int interval_offset, + int x, + int y, + int width, + int height); // Copy ARGB to ARGB. LIBYUV_API -int ARGBCopy(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBCopy(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Copy Alpha channel of ARGB to alpha of ARGB. LIBYUV_API -int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBCopyAlpha(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Extract the alpha channel from ARGB. LIBYUV_API -int ARGBExtractAlpha(const uint8* src_argb, int src_stride_argb, - uint8* dst_a, int dst_stride_a, - int width, int height); +int ARGBExtractAlpha(const uint8* src_argb, + int src_stride_argb, + uint8* dst_a, + int dst_stride_a, + int width, + int height); // Copy Y channel to Alpha of ARGB. LIBYUV_API -int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBCopyYToAlpha(const uint8* src_y, + int src_stride_y, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); -typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width); +typedef void (*ARGBBlendRow)(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width); // Get function to Alpha Blend ARGB pixels and store to destination. LIBYUV_API @@ -311,92 +500,143 @@ ARGBBlendRow GetARGBBlend(); // Source is pre-multiplied by alpha using ARGBAttenuate. // Alpha of destination is set to 255. LIBYUV_API -int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBBlend(const uint8* src_argb0, + int src_stride_argb0, + const uint8* src_argb1, + int src_stride_argb1, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Alpha Blend plane and store to destination. // Source is not pre-multiplied by alpha. LIBYUV_API -int BlendPlane(const uint8* src_y0, int src_stride_y0, - const uint8* src_y1, int src_stride_y1, - const uint8* alpha, int alpha_stride, - uint8* dst_y, int dst_stride_y, - int width, int height); +int BlendPlane(const uint8* src_y0, + int src_stride_y0, + const uint8* src_y1, + int src_stride_y1, + const uint8* alpha, + int alpha_stride, + uint8* dst_y, + int dst_stride_y, + int width, + int height); // Alpha Blend YUV images and store to destination. // Source is not pre-multiplied by alpha. // Alpha is full width x height and subsampled to half size to apply to UV. 
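A sketch for the I420Blend declaration that follows: two unattenuated I420 sources blended through a full-resolution 8-bit alpha plane, which the function subsamples internally for UV. Buffer names are hypothetical, and the assumption that alpha weights the first source is a convention not restated in this hunk:

    I420Blend(fg_y, fg_stride_y, fg_u, fg_stride_u, fg_v, fg_stride_v,
              bg_y, bg_stride_y, bg_u, bg_stride_u, bg_v, bg_stride_v,
              alpha, alpha_stride,   // width x height, full resolution
              dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
              width, height);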
LIBYUV_API -int I420Blend(const uint8* src_y0, int src_stride_y0, - const uint8* src_u0, int src_stride_u0, - const uint8* src_v0, int src_stride_v0, - const uint8* src_y1, int src_stride_y1, - const uint8* src_u1, int src_stride_u1, - const uint8* src_v1, int src_stride_v1, - const uint8* alpha, int alpha_stride, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I420Blend(const uint8* src_y0, + int src_stride_y0, + const uint8* src_u0, + int src_stride_u0, + const uint8* src_v0, + int src_stride_v0, + const uint8* src_y1, + int src_stride_y1, + const uint8* src_u1, + int src_stride_u1, + const uint8* src_v1, + int src_stride_v1, + const uint8* alpha, + int alpha_stride, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height); // Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255. LIBYUV_API -int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBMultiply(const uint8* src_argb0, + int src_stride_argb0, + const uint8* src_argb1, + int src_stride_argb1, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Add ARGB image with ARGB image. Saturates to 255. LIBYUV_API -int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBAdd(const uint8* src_argb0, + int src_stride_argb0, + const uint8* src_argb1, + int src_stride_argb1, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0. LIBYUV_API -int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBSubtract(const uint8* src_argb0, + int src_stride_argb0, + const uint8* src_argb1, + int src_stride_argb1, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert I422 to YUY2. LIBYUV_API -int I422ToYUY2(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I422ToYUY2(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_frame, + int dst_stride_frame, + int width, + int height); // Convert I422 to UYVY. LIBYUV_API -int I422ToUYVY(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I422ToUYVY(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_frame, + int dst_stride_frame, + int width, + int height); // Convert unattenuated ARGB to preattenuated ARGB. LIBYUV_API -int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBAttenuate(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert preattenuated ARGB to unattenuated ARGB.
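ARGBAttenuate above and the ARGBUnattenuate declared just below form an approximate round trip; unattenuation cannot fully recover color where alpha is near zero. A sketch with hypothetical buffers:

    ARGBAttenuate(src_argb, src_stride, premul, premul_stride, width, height);
    ARGBUnattenuate(premul, premul_stride, restored, restored_stride, width, height);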
LIBYUV_API -int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBUnattenuate(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Internal function - do not call directly. // Computes table of cumulative sum for image where the value is the sum // of all values above and to the left of the entry. Used by ARGBBlur. LIBYUV_API -int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, - int32* dst_cumsum, int dst_stride32_cumsum, - int width, int height); +int ARGBComputeCumulativeSum(const uint8* src_argb, + int src_stride_argb, + int32* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height); // Blur ARGB image. // dst_cumsum table of width * (height + 1) * 16 bytes aligned to @@ -405,49 +645,79 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, // radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5. // Blur is optimized for radius of 5 (11x11) or less. LIBYUV_API -int ARGBBlur(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int32* dst_cumsum, int dst_stride32_cumsum, - int width, int height, int radius); +int ARGBBlur(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int32* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height, + int radius); // Multiply ARGB image by ARGB value. LIBYUV_API -int ARGBShade(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height, uint32 value); +int ARGBShade(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height, + uint32 value); // Interpolate between two images using specified amount of interpolation // (0 to 255) and store to destination. // 'interpolation' is specified as 8 bit fraction where 0 means 100% src0 // and 255 means 1% src0 and 99% src1. LIBYUV_API -int InterpolatePlane(const uint8* src0, int src_stride0, - const uint8* src1, int src_stride1, - uint8* dst, int dst_stride, - int width, int height, int interpolation); +int InterpolatePlane(const uint8* src0, + int src_stride0, + const uint8* src1, + int src_stride1, + uint8* dst, + int dst_stride, + int width, + int height, + int interpolation); // Interpolate between two ARGB images using specified amount of interpolation // Internally calls InterpolatePlane with width * 4 (bpp). LIBYUV_API -int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height, int interpolation); +int ARGBInterpolate(const uint8* src_argb0, + int src_stride_argb0, + const uint8* src_argb1, + int src_stride_argb1, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height, + int interpolation); // Interpolate between two YUV images using specified amount of interpolation // Internally calls InterpolatePlane on each plane where the U and V planes // are half width and half height. 
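A sketch for the I420Interpolate declaration that follows: on the 0 to 255 scale described above, interpolation = 128 gives roughly an even crossfade between two I420 frames (buffers hypothetical):

    I420Interpolate(a_y, a_stride_y, a_u, a_stride_u, a_v, a_stride_v,
                    b_y, b_stride_y, b_u, b_stride_u, b_v, b_stride_v,
                    dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
                    width, height, 128);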
LIBYUV_API -int I420Interpolate(const uint8* src0_y, int src0_stride_y, - const uint8* src0_u, int src0_stride_u, - const uint8* src0_v, int src0_stride_v, - const uint8* src1_y, int src1_stride_y, - const uint8* src1_u, int src1_stride_u, - const uint8* src1_v, int src1_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height, int interpolation); +int I420Interpolate(const uint8* src0_y, + int src0_stride_y, + const uint8* src0_u, + int src0_stride_u, + const uint8* src0_v, + int src0_stride_v, + const uint8* src1_y, + int src1_stride_y, + const uint8* src1_u, + int src1_stride_u, + const uint8* src1_v, + int src1_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height, + int interpolation); #if defined(__pnacl__) || defined(__CLR_VER) || \ (defined(__i386__) && !defined(__SSE2__)) @@ -468,40 +738,59 @@ int I420Interpolate(const uint8* src0_y, int src0_stride_y, // Row function for copying pixels from a source with a slope to a row // of destination. Useful for scaling, rotation, mirror, texture mapping. LIBYUV_API -void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width); +void ARGBAffineRow_C(const uint8* src_argb, + int src_argb_stride, + uint8* dst_argb, + const float* uv_dudv, + int width); LIBYUV_API -void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width); +void ARGBAffineRow_SSE2(const uint8* src_argb, + int src_argb_stride, + uint8* dst_argb, + const float* uv_dudv, + int width); // Shuffle ARGB channel order. e.g. BGRA to ARGB. // shuffler is 16 bytes and must be aligned. LIBYUV_API -int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_argb, int dst_stride_argb, - const uint8* shuffler, int width, int height); +int ARGBShuffle(const uint8* src_bgra, + int src_stride_bgra, + uint8* dst_argb, + int dst_stride_argb, + const uint8* shuffler, + int width, + int height); // Sobel ARGB effect with planar output. LIBYUV_API -int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - int width, int height); +int ARGBSobelToPlane(const uint8* src_argb, + int src_stride_argb, + uint8* dst_y, + int dst_stride_y, + int width, + int height); // Sobel ARGB effect. LIBYUV_API -int ARGBSobel(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBSobel(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); // Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB. LIBYUV_API -int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBSobelXY(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ NOLINT +#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ diff --git a/files/include/libyuv/rotate.h b/files/include/libyuv/rotate.h index 8af60b89..b9f7154a 100644 --- a/files/include/libyuv/rotate.h +++ b/files/include/libyuv/rotate.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef INCLUDE_LIBYUV_ROTATE_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_ROTATE_H_ #define INCLUDE_LIBYUV_ROTATE_H_ #include "libyuv/basic_types.h" @@ -20,8 +20,8 @@ extern "C" { // Supported rotation. typedef enum RotationMode { - kRotate0 = 0, // No rotation. - kRotate90 = 90, // Rotate 90 degrees clockwise. + kRotate0 = 0, // No rotation. + kRotate90 = 90, // Rotate 90 degrees clockwise. kRotate180 = 180, // Rotate 180 degrees. kRotate270 = 270, // Rotate 270 degrees clockwise. @@ -33,85 +33,132 @@ typedef enum RotationMode { // Rotate I420 frame. LIBYUV_API -int I420Rotate(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int src_width, int src_height, enum RotationMode mode); +int I420Rotate(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int src_width, + int src_height, + enum RotationMode mode); // Rotate NV12 input and store in I420. LIBYUV_API -int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int src_width, int src_height, enum RotationMode mode); +int NV12ToI420Rotate(const uint8* src_y, + int src_stride_y, + const uint8* src_uv, + int src_stride_uv, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int src_width, + int src_height, + enum RotationMode mode); // Rotate a plane by 0, 90, 180, or 270. LIBYUV_API -int RotatePlane(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int src_width, int src_height, enum RotationMode mode); +int RotatePlane(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int src_width, + int src_height, + enum RotationMode mode); // Rotate planes by 90, 180, 270. Deprecated. LIBYUV_API -void RotatePlane90(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height); +void RotatePlane90(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width, + int height); LIBYUV_API -void RotatePlane180(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height); +void RotatePlane180(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width, + int height); LIBYUV_API -void RotatePlane270(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height); +void RotatePlane270(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width, + int height); LIBYUV_API -void RotateUV90(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); +void RotateUV90(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width, + int height); // Rotations for when U and V are interleaved. // These functions take one input pointer and // split the data into two buffers while // rotating them. Deprecated. 
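New code would normally reach for the I420Rotate entry point declared earlier in this hunk rather than the deprecated RotateUV* helpers that follow; a sketch, noting that a kRotate90 result lands in a destination allocated src_height wide by src_width tall (an implication of rotation, not a line from this header):

    I420Rotate(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
               dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
               src_width, src_height, kRotate90);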
LIBYUV_API -void RotateUV180(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); +void RotateUV180(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width, + int height); LIBYUV_API -void RotateUV270(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); +void RotateUV270(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width, + int height); // The 90 and 270 functions are based on transposes. // Doing a transpose with reversing the read/write // order will result in a rotation by +- 90 degrees. // Deprecated. LIBYUV_API -void TransposePlane(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height); +void TransposePlane(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width, + int height); LIBYUV_API -void TransposeUV(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); +void TransposeUV(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width, + int height); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_ROTATE_H_ NOLINT +#endif // INCLUDE_LIBYUV_ROTATE_H_ diff --git a/files/include/libyuv/rotate_argb.h b/files/include/libyuv/rotate_argb.h index 660ff557..be0190c1 100644 --- a/files/include/libyuv/rotate_argb.h +++ b/files/include/libyuv/rotate_argb.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ #define INCLUDE_LIBYUV_ROTATE_ARGB_H_ #include "libyuv/basic_types.h" @@ -21,13 +21,17 @@ extern "C" { // Rotate ARGB frame LIBYUV_API -int ARGBRotate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int src_width, int src_height, enum RotationMode mode); +int ARGBRotate(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int src_width, + int src_height, + enum RotationMode mode); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ NOLINT +#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ diff --git a/files/include/libyuv/rotate_row.h b/files/include/libyuv/rotate_row.h index ebc487f9..2c51584e 100644 --- a/files/include/libyuv/rotate_row.h +++ b/files/include/libyuv/rotate_row.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ #define INCLUDE_LIBYUV_ROTATE_ROW_H_ #include "libyuv/basic_types.h" @@ -36,7 +36,8 @@ extern "C" { // The following are available for GCC 32 or 64 bit but not NaCL for 64 bit: #if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) + (defined(__i386__) || \ + (defined(__x86_64__) && !defined(__native_client__))) #define HAS_TRANSPOSEWX8_SSSE3 #endif @@ -53,69 +54,175 @@ extern "C" { #define HAS_TRANSPOSEUVWX8_NEON #endif -#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ - defined(__mips__) && \ - defined(__mips_dsp) && (__mips_dsp_rev >= 2) +#if !defined(LIBYUV_DISABLE_DSPR2) && !defined(__native_client__) && \ + defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2) #define HAS_TRANSPOSEWX8_DSPR2 #define HAS_TRANSPOSEUVWX8_DSPR2 #endif // defined(__mips__) -void TransposeWxH_C(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width, int height); - -void TransposeWx8_C(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_NEON(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); - -void TransposeWx8_Any_NEON(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_Any_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); - -void TransposeUVWxH_C(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); - -void TransposeUVWx8_C(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_SSE2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_NEON(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_DSPR2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); - -void TransposeUVWx8_Any_SSE2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_Any_NEON(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_Any_DSPR2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#define HAS_TRANSPOSEWX16_MSA +#define HAS_TRANSPOSEUVWX16_MSA +#endif + +void TransposeWxH_C(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width, + int height); + +void TransposeWx8_C(const uint8* src, + int src_stride, + 
uint8* dst, + int dst_stride, + int width); +void TransposeWx16_C(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width); +void TransposeWx8_NEON(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width); +void TransposeWx8_SSSE3(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width); +void TransposeWx8_Fast_SSSE3(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width); +void TransposeWx8_DSPR2(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width); +void TransposeWx8_Fast_DSPR2(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width); +void TransposeWx16_MSA(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width); + +void TransposeWx8_Any_NEON(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width); +void TransposeWx8_Any_SSSE3(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width); +void TransposeWx8_Fast_Any_SSSE3(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width); +void TransposeWx8_Any_DSPR2(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width); +void TransposeWx16_Any_MSA(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width); + +void TransposeUVWxH_C(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width, + int height); + +void TransposeUVWx8_C(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx16_C(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx8_SSE2(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx8_NEON(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx8_DSPR2(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx16_MSA(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width); + +void TransposeUVWx8_Any_SSE2(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx8_Any_NEON(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx8_Any_DSPR2(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx16_Any_MSA(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ NOLINT +#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ diff --git a/files/include/libyuv/row.h b/files/include/libyuv/row.h index 055880ba..3e5dd200 100644 --- a/files/include/libyuv/row.h +++ b/files/include/libyuv/row.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_ROW_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_ROW_H_ #define INCLUDE_LIBYUV_ROW_H_ #include <stdlib.h> // For malloc. 
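Stepping back from the rotate_row.h declarations above: as their comments note, a plus or minus 90 degree rotation is a transpose combined with a reversed read/write order, and the TransposeWx8/TransposeWx16 kernels process 8 or 16 rows per pass. A scalar sketch of the underlying idea, as a hypothetical helper over an 8-bit plane:

#include "libyuv/basic_types.h" /* for uint8 */

/* Rotate a width x height plane 90 degrees clockwise: source row y
   becomes destination column (height - 1 - y). The destination is
   height pixels wide and width pixels tall. */
static void RotatePlane90_Scalar(const uint8* src, int src_stride,
                                 uint8* dst, int dst_stride,
                                 int width, int height) {
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      dst[x * dst_stride + (height - 1 - y)] = src[y * src_stride + x];
    }
  }
}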
@@ -20,21 +20,14 @@ namespace libyuv { extern "C" { #endif -#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1))) +#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1))) -#ifdef __cplusplus -#define align_buffer_64(var, size) \ - uint8* var##_mem = reinterpret_cast<uint8*>(malloc((size) + 63)); \ - uint8* var = reinterpret_cast<uint8*> \ - ((reinterpret_cast<intptr_t>(var##_mem) + 63) & ~63) -#else -#define align_buffer_64(var, size) \ - uint8* var##_mem = (uint8*)(malloc((size) + 63)); /* NOLINT */ \ - uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */ -#endif +#define align_buffer_64(var, size) \ + uint8* var##_mem = (uint8*)(malloc((size) + 63)); /* NOLINT */ \ + uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */ #define free_aligned_buffer_64(var) \ - free(var##_mem); \ + free(var##_mem); \ var = 0 #if defined(__pnacl__) || defined(__CLR_VER) || \ @@ -77,8 +70,8 @@ extern "C" { #endif // __clang__ // Visual C 2012 required for AVX2. -#if defined(_M_IX86) && !defined(__clang__) && \ - defined(_MSC_VER) && _MSC_VER >= 1700 +#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \ + _MSC_VER >= 1700 #define VISUALC_HAS_AVX2 1 #endif // VisualStudio >= 2012 @@ -90,6 +83,7 @@ extern "C" { #define HAS_ABGRTOYROW_SSSE3 #define HAS_ARGB1555TOARGBROW_SSE2 #define HAS_ARGB4444TOARGBROW_SSE2 +#define HAS_ARGBEXTRACTALPHAROW_SSE2 #define HAS_ARGBSETROW_X86 #define HAS_ARGBSHUFFLEROW_SSE2 #define HAS_ARGBSHUFFLEROW_SSSE3 @@ -104,12 +98,12 @@ extern "C" { #define HAS_ARGBTOUVROW_SSSE3 #define HAS_ARGBTOYJROW_SSSE3 #define HAS_ARGBTOYROW_SSSE3 -#define HAS_ARGBEXTRACTALPHAROW_SSE2 #define HAS_BGRATOUVROW_SSSE3 #define HAS_BGRATOYROW_SSSE3 #define HAS_COPYROW_ERMS #define HAS_COPYROW_SSE2 #define HAS_H422TOARGBROW_SSSE3 +#define HAS_HALFFLOATROW_SSE2 #define HAS_I400TOARGBROW_SSE2 #define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOARGB4444ROW_SSSE3 @@ -180,11 +174,8 @@ extern "C" { // The following functions fail on gcc/clang 32 bit with fpic and framepointer. // caveat: clangcl uses row_win.cc which works. -#if defined(NDEBUG) || !(defined(_DEBUG) && defined(__i386__)) || \ - !defined(__i386__) || defined(_MSC_VER) -// TODO(fbarchard): fix build error on x86 debug -// https://code.google.com/p/libyuv/issues/detail?id=524 -#define HAS_I411TOARGBROW_SSSE3 +#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ + defined(_MSC_VER) // TODO(fbarchard): fix build error on android_full_debug=1 // https://code.google.com/p/libyuv/issues/detail?id=517 #define HAS_I422ALPHATOARGBROW_SSSE3 @@ -194,10 +185,12 @@ extern "C" { // The following are available on all x86 platforms, but // require VS2012, clang 3.4 or gcc 4.7. // The code supports NaCL but requires a new compiler and validator. 
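Before the x86 feature blocks that the comment above introduces, a usage note on the aligned-buffer helpers consolidated in this hunk: align_buffer_64 declares both var##_mem (the raw allocation) and var (a 64-byte aligned view into it), so it must be paired with free_aligned_buffer_64 in the same scope. A minimal hypothetical sketch:

#include <assert.h>
#include "libyuv/row.h"

void UseScratchRow(int width) {
  align_buffer_64(row, width);  /* malloc(width + 63), round up to 64. */
  assert(IS_ALIGNED(row, 64));  /* Holds by construction. */
  /* ... use 'row' as scratch space for a row function ... */
  free_aligned_buffer_64(row);  /* Frees row_mem and zeroes 'row'. */
}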
-#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \ - defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ + defined(GCC_HAS_AVX2)) #define HAS_ARGBCOPYALPHAROW_AVX2 #define HAS_ARGBCOPYYTOALPHAROW_AVX2 +#define HAS_ARGBEXTRACTALPHAROW_AVX2 #define HAS_ARGBMIRRORROW_AVX2 #define HAS_ARGBPOLYNOMIALROW_AVX2 #define HAS_ARGBSHUFFLEROW_AVX2 @@ -208,13 +201,9 @@ extern "C" { #define HAS_ARGBTOYROW_AVX2 #define HAS_COPYROW_AVX #define HAS_H422TOARGBROW_AVX2 +#define HAS_HALFFLOATROW_AVX2 +// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast #define HAS_I400TOARGBROW_AVX2 -#if !(defined(_DEBUG) && defined(__i386__)) -// TODO(fbarchard): fix build error on android_full_debug=1 -// https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I422ALPHATOARGBROW_AVX2 -#endif -#define HAS_I411TOARGBROW_AVX2 #define HAS_I422TOARGB1555ROW_AVX2 #define HAS_I422TOARGB4444ROW_AVX2 #define HAS_I422TOARGBROW_AVX2 @@ -246,6 +235,13 @@ extern "C" { #define HAS_ARGBSUBTRACTROW_AVX2 #define HAS_ARGBUNATTENUATEROW_AVX2 #define HAS_BLENDPLANEROW_AVX2 + +#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ + defined(_MSC_VER) +// TODO(fbarchard): fix build error on android_full_debug=1 +// https://code.google.com/p/libyuv/issues/detail?id=517 +#define HAS_I422ALPHATOARGBROW_AVX2 +#endif #endif // The following are available for AVX2 Visual C and clangcl 32 bit: @@ -279,6 +275,7 @@ extern "C" { #define HAS_ARGB4444TOARGBROW_NEON #define HAS_ARGB4444TOUVROW_NEON #define HAS_ARGB4444TOYROW_NEON +#define HAS_ARGBEXTRACTALPHAROW_NEON #define HAS_ARGBSETROW_NEON #define HAS_ARGBTOARGB1555ROW_NEON #define HAS_ARGBTOARGB4444ROW_NEON @@ -286,18 +283,16 @@ extern "C" { #define HAS_ARGBTORGB24ROW_NEON #define HAS_ARGBTORGB565DITHERROW_NEON #define HAS_ARGBTORGB565ROW_NEON -#define HAS_ARGBTOUV411ROW_NEON #define HAS_ARGBTOUV444ROW_NEON #define HAS_ARGBTOUVJROW_NEON #define HAS_ARGBTOUVROW_NEON #define HAS_ARGBTOYJROW_NEON #define HAS_ARGBTOYROW_NEON -#define HAS_ARGBEXTRACTALPHAROW_NEON #define HAS_BGRATOUVROW_NEON #define HAS_BGRATOYROW_NEON #define HAS_COPYROW_NEON +#define HAS_HALFFLOATROW_NEON #define HAS_I400TOARGBROW_NEON -#define HAS_I411TOARGBROW_NEON #define HAS_I422ALPHATOARGBROW_NEON #define HAS_I422TOARGB1555ROW_NEON #define HAS_I422TOARGB4444ROW_NEON @@ -360,7 +355,7 @@ extern "C" { #endif // The following are available on Mips platforms: -#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \ +#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips__) && \ (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6) #define HAS_COPYROW_MIPS #if defined(__mips_dsp) && (__mips_dsp_rev >= 2) @@ -369,12 +364,101 @@ extern "C" { #define HAS_MIRRORROW_DSPR2 #define HAS_MIRRORUVROW_DSPR2 #define HAS_SPLITUVROW_DSPR2 +#define HAS_RGB24TOARGBROW_DSPR2 +#define HAS_RAWTOARGBROW_DSPR2 +#define HAS_RGB565TOARGBROW_DSPR2 +#define HAS_ARGB1555TOARGBROW_DSPR2 +#define HAS_ARGB4444TOARGBROW_DSPR2 +#define HAS_I444TOARGBROW_DSPR2 +#define HAS_I422TOARGB4444ROW_DSPR2 +#define HAS_I422TOARGB1555ROW_DSPR2 +#define HAS_NV12TOARGBROW_DSPR2 +#define HAS_BGRATOUVROW_DSPR2 +#define HAS_BGRATOYROW_DSPR2 +#define HAS_ABGRTOUVROW_DSPR2 +#define HAS_ARGBTOYROW_DSPR2 +#define HAS_ABGRTOYROW_DSPR2 +#define HAS_RGBATOUVROW_DSPR2 +#define HAS_RGBATOYROW_DSPR2 +#define HAS_ARGBTOUVROW_DSPR2 #endif #endif +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#define HAS_ARGBMIRRORROW_MSA +#define 
HAS_I422TOUYVYROW_MSA +#define HAS_I422TOYUY2ROW_MSA +#define HAS_MIRRORROW_MSA +#define HAS_UYVYTOUVROW_MSA +#define HAS_UYVYTOYROW_MSA +#define HAS_YUY2TOUV422ROW_MSA +#define HAS_YUY2TOUVROW_MSA +#define HAS_YUY2TOYROW_MSA +#define HAS_ARGB4444TOARGBROW_MSA +#define HAS_ARGBTOYROW_MSA +#define HAS_ARGBTOUVROW_MSA +#define HAS_I422TOARGBROW_MSA +#define HAS_I422TORGBAROW_MSA +#define HAS_I422ALPHATOARGBROW_MSA +#define HAS_I422TORGB24ROW_MSA +#define HAS_ARGBTORGB24ROW_MSA +#define HAS_ARGBTORAWROW_MSA +#define HAS_ARGBTORGB565ROW_MSA +#define HAS_ARGBTOARGB1555ROW_MSA +#define HAS_ARGBTOARGB4444ROW_MSA +#define HAS_ARGBTOUV444ROW_MSA +#define HAS_ARGBMULTIPLYROW_MSA +#define HAS_ARGBADDROW_MSA +#define HAS_ARGBSUBTRACTROW_MSA +#define HAS_ARGBATTENUATEROW_MSA +#define HAS_ARGBTORGB565DITHERROW_MSA +#define HAS_ARGBSHUFFLEROW_MSA +#define HAS_ARGBSHADEROW_MSA +#define HAS_ARGBGRAYROW_MSA +#define HAS_ARGBSEPIAROW_MSA +#define HAS_ARGB1555TOARGBROW_MSA +#define HAS_RGB565TOARGBROW_MSA +#define HAS_RGB24TOARGBROW_MSA +#define HAS_RAWTOARGBROW_MSA +#define HAS_ARGB1555TOYROW_MSA +#define HAS_RGB565TOYROW_MSA +#define HAS_RGB24TOYROW_MSA +#define HAS_RAWTOYROW_MSA +#define HAS_ARGB1555TOUVROW_MSA +#define HAS_RGB565TOUVROW_MSA +#define HAS_RGB24TOUVROW_MSA +#define HAS_RAWTOUVROW_MSA +#define HAS_NV12TOARGBROW_MSA +#define HAS_NV12TORGB565ROW_MSA +#define HAS_NV21TOARGBROW_MSA +#define HAS_SOBELROW_MSA +#define HAS_SOBELTOPLANEROW_MSA +#define HAS_SOBELXYROW_MSA +#define HAS_ARGBTOYJROW_MSA +#define HAS_BGRATOYROW_MSA +#define HAS_ABGRTOYROW_MSA +#define HAS_RGBATOYROW_MSA +#define HAS_ARGBTOUVJROW_MSA +#define HAS_BGRATOUVROW_MSA +#define HAS_ABGRTOUVROW_MSA +#define HAS_RGBATOUVROW_MSA +#define HAS_I444TOARGBROW_MSA +#define HAS_I400TOARGBROW_MSA +#define HAS_J400TOARGBROW_MSA +#define HAS_YUY2TOARGBROW_MSA +#define HAS_UYVYTOARGBROW_MSA +#define HAS_INTERPOLATEROW_MSA +#define HAS_ARGBSETROW_MSA +#define HAS_RAWTORGB24ROW_MSA +#define HAS_MERGEUVROW_MSA +#endif + #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) +#if defined(VISUALC_HAS_AVX2) +#define SIMD_ALIGNED(var) __declspec(align(32)) var +#else #define SIMD_ALIGNED(var) __declspec(align(16)) var -#define SIMD_ALIGNED32(var) __declspec(align(64)) var +#endif typedef __declspec(align(16)) int16 vec16[8]; typedef __declspec(align(16)) int32 vec32[4]; typedef __declspec(align(16)) int8 vec8[16]; @@ -389,8 +473,11 @@ typedef __declspec(align(32)) uint32 ulvec32[8]; typedef __declspec(align(32)) uint8 ulvec8[32]; #elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__)) // Caveat GCC 4.2 to 4.7 have a known issue using vectors with const. +#if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2) +#define SIMD_ALIGNED(var) var __attribute__((aligned(32))) +#else #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) -#define SIMD_ALIGNED32(var) var __attribute__((aligned(64))) +#endif typedef int16 __attribute__((vector_size(16))) vec16; typedef int32 __attribute__((vector_size(16))) vec32; typedef int8 __attribute__((vector_size(16))) vec8; @@ -405,7 +492,6 @@ typedef uint32 __attribute__((vector_size(32))) ulvec32; typedef uint8 __attribute__((vector_size(32))) ulvec8; #else #define SIMD_ALIGNED(var) var -#define SIMD_ALIGNED32(var) var typedef int16 vec16[8]; typedef int32 vec32[4]; typedef int8 vec8[16]; @@ -441,34 +527,34 @@ struct YuvConstants { #else // This struct is for Intel color conversion. 
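One note before the Intel variant of YuvConstants that the comment above introduces: SIMD_ALIGNED now selects 32-byte alignment whenever an AVX2-capable compiler is detected and 16 bytes otherwise, which is why the separate SIMD_ALIGNED32 macro is gone. A hypothetical constant declared the same way as the kYuv*Constants externs below:

/* 32-byte aligned under AVX2 toolchains, 16-byte aligned otherwise. */
static const int16 SIMD_ALIGNED(kSampleBias[16]) = {16, 16, 16, 16, 16, 16,
                                                    16, 16, 16, 16, 16, 16,
                                                    16, 16, 16, 16};

For reference, the KUVTOB/KUVBIASB offsets that follow are consistent with the Intel struct layout: three 32-byte vectors at 0, 32 and 64, then four 32-byte int16 arrays at 96, 128, 160 and 192.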
struct YuvConstants { - lvec8 kUVToB; - lvec8 kUVToG; - lvec8 kUVToR; - lvec16 kUVBiasB; - lvec16 kUVBiasG; - lvec16 kUVBiasR; - lvec16 kYToRgb; + int8 kUVToB[32]; + int8 kUVToG[32]; + int8 kUVToR[32]; + int16 kUVBiasB[16]; + int16 kUVBiasG[16]; + int16 kUVBiasR[16]; + int16 kYToRgb[16]; }; // Offsets into YuvConstants structure -#define KUVTOB 0 -#define KUVTOG 32 -#define KUVTOR 64 +#define KUVTOB 0 +#define KUVTOG 32 +#define KUVTOR 64 #define KUVBIASB 96 #define KUVBIASG 128 #define KUVBIASR 160 -#define KYTORGB 192 +#define KYTORGB 192 #endif // Conversion matrix for YUV to RGB -extern const struct YuvConstants kYuvI601Constants; // BT.601 -extern const struct YuvConstants kYuvJPEGConstants; // JPeg color space -extern const struct YuvConstants kYuvH709Constants; // BT.709 +extern const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants); // BT.601 +extern const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants); // JPeg +extern const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants); // BT.709 // Conversion matrix for YVU to BGR -extern const struct YuvConstants kYvuI601Constants; // BT.601 -extern const struct YuvConstants kYvuJPEGConstants; // JPeg color space -extern const struct YuvConstants kYvuH709Constants; // BT.709 +extern const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants); // BT.601 +extern const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants); // JPeg +extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709 #if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) #define OMITFP @@ -490,60 +576,53 @@ extern const struct YuvConstants kYvuH709Constants; // BT.709 #define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")" #define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")" #define MEMLEA(offset, base) #offset "(%q" #base ")" -#define MEMLEA3(offset, index, scale) \ - #offset "(,%q" #index "," #scale ")" +#define MEMLEA3(offset, index, scale) #offset "(,%q" #index "," #scale ")" #define MEMLEA4(offset, base, index, scale) \ - #offset "(%q" #base ",%q" #index "," #scale ")" + #offset "(%q" #base ",%q" #index "," #scale ")" #define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15" #define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15" -#define MEMOPREG(opcode, offset, base, index, scale, reg) \ - BUNDLELOCK \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #opcode " (%%r15,%%r14),%%" #reg "\n" \ - BUNDLEUNLOCK -#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ - BUNDLELOCK \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #opcode " %%" #reg ",(%%r15,%%r14)\n" \ - BUNDLEUNLOCK -#define MEMOPARG(opcode, offset, base, index, scale, arg) \ - BUNDLELOCK \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #opcode " (%%r15,%%r14),%" #arg "\n" \ - BUNDLEUNLOCK -#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \ - BUNDLELOCK \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #opcode " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" \ - BUNDLEUNLOCK -#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \ - BUNDLELOCK \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #op " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" \ - BUNDLEUNLOCK +#define MEMOPREG(opcode, offset, base, index, scale, reg) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" #opcode \ + " (%%r15,%%r14),%%" #reg "\n" BUNDLEUNLOCK +#define MEMOPMEM(opcode, reg, offset, base, index, 
scale) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" #opcode \ + " %%" #reg ",(%%r15,%%r14)\n" BUNDLEUNLOCK +#define MEMOPARG(opcode, offset, base, index, scale, arg) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" #opcode \ + " (%%r15,%%r14),%" #arg "\n" BUNDLEUNLOCK +#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" #opcode \ + " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" BUNDLEUNLOCK +#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" #op \ + " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" BUNDLEUNLOCK #else // defined(__native_client__) && defined(__x86_64__) #define NACL_R14 #define BUNDLEALIGN #define MEMACCESS(base) "(%" #base ")" #define MEMACCESS2(offset, base) #offset "(%" #base ")" #define MEMLEA(offset, base) #offset "(%" #base ")" -#define MEMLEA3(offset, index, scale) \ - #offset "(,%" #index "," #scale ")" +#define MEMLEA3(offset, index, scale) #offset "(,%" #index "," #scale ")" #define MEMLEA4(offset, base, index, scale) \ - #offset "(%" #base ",%" #index "," #scale ")" + #offset "(%" #base ",%" #index "," #scale ")" #define MEMMOVESTRING(s, d) #define MEMSTORESTRING(reg, d) #define MEMOPREG(opcode, offset, base, index, scale, reg) \ - #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n" + #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n" #define MEMOPMEM(opcode, reg, offset, base, index, scale) \ - #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" + #opcode " %%" #reg "," #offset "(%" #base ",%" #index "," #scale ")\n" #define MEMOPARG(opcode, offset, base, index, scale, arg) \ - #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n" -#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \ - #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 ",%%" \ - #reg2 "\n" + #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n" +#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 \ + ",%%" #reg2 "\n" #define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \ - #op " $" #sel ",%%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" + #op " $" #sel ",%%" #reg "," #offset "(%" #base ",%" #index "," #scale ")\n" #endif // defined(__native_client__) && defined(__x86_64__) #if defined(__arm__) || defined(__aarch64__) @@ -555,6 +634,57 @@ extern const struct YuvConstants kYvuH709Constants; // BT.709 #endif #endif +// Intel Code Analyzer markers. Insert IACA_START IACA_END around code to be +// measured and then run with iaca -64 libyuv_unittest. +// IACA_ASM_START and IACA_ASM_END are equivalents that can be used within +// inline assembly blocks.
+// example of iaca: +// ~/iaca-lin64/bin/iaca.sh -64 -analysis LATENCY out/Release/libyuv_unittest + +#if defined(__x86_64__) || defined(__i386__) + +#define IACA_ASM_START \ + ".byte 0x0F, 0x0B\n" \ + " movl $111, %%ebx\n" \ + ".byte 0x64, 0x67, 0x90\n" + +#define IACA_ASM_END \ + " movl $222, %%ebx\n" \ + ".byte 0x64, 0x67, 0x90\n" \ + ".byte 0x0F, 0x0B\n" + +#define IACA_SSC_MARK(MARK_ID) \ + __asm__ __volatile__("\n\t movl $" #MARK_ID \ + ", %%ebx" \ + "\n\t .byte 0x64, 0x67, 0x90" \ + : \ + : \ + : "memory"); + +#define IACA_UD_BYTES __asm__ __volatile__("\n\t .byte 0x0F, 0x0B"); + +#else /* Visual C */ +#define IACA_UD_BYTES \ + { __asm _emit 0x0F __asm _emit 0x0B } + +#define IACA_SSC_MARK(x) \ + { __asm mov ebx, x __asm _emit 0x64 __asm _emit 0x67 __asm _emit 0x90 } + +#define IACA_VC64_START __writegsbyte(111, 111); +#define IACA_VC64_END __writegsbyte(222, 222); +#endif + +#define IACA_START \ + { \ + IACA_UD_BYTES \ + IACA_SSC_MARK(111) \ + } +#define IACA_END \ + { \ + IACA_SSC_MARK(222) \ + IACA_UD_BYTES \ + } + void I444ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -580,12 +710,6 @@ void I422ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I411ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void I422ToRGBARow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -639,6 +763,102 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I444ToARGBRow_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I444ToARGBRow_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB4444Row_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB1555Row_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToARGBRow_DSPR2(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); + +void I422ToARGBRow_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGBARow_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_MSA(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB24Row_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB565Row_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB4444Row_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB1555Row_MSA(const uint8* src_y, + const 
uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToARGBRow_MSA(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_MSA(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToARGBRow_MSA(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void YUY2ToARGBRow_MSA(const uint8* src_yuy2, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void UYVYToARGBRow_MSA(const uint8* src_uyvy, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width); void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width); @@ -653,30 +873,111 @@ void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width); void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int width); void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width); void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, +void ARGBToYRow_MSA(const uint8* src_argb, uint8* dst_y, int width); +void ARGBToYJRow_MSA(const uint8* src_argb, uint8* dst_y, int width); +void ARGBToUV444Row_NEON(const uint8* src_argb, + uint8* dst_u, + uint8* dst_v, int width); -void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, +void ARGBToUVRow_NEON(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGBToUV444Row_MSA(const uint8* src_argb, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGBToUVRow_MSA(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGBToUVJRow_NEON(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width); +void BGRAToUVRow_NEON(const uint8* src_bgra, + int src_stride_bgra, + uint8* dst_u, + uint8* dst_v, + int width); +void ABGRToUVRow_NEON(const uint8* src_abgr, + int src_stride_abgr, + uint8* dst_u, + uint8* dst_v, + int width); +void RGBAToUVRow_NEON(const uint8* src_rgba, + int src_stride_rgba, + uint8* dst_u, + uint8* dst_v, + int width); +void RGB24ToUVRow_NEON(const uint8* src_rgb24, + int src_stride_rgb24, + uint8* dst_u, + uint8* dst_v, + int width); +void RAWToUVRow_NEON(const uint8* src_raw, + int src_stride_raw, + uint8* dst_u, + uint8* dst_v, + int width); +void RGB565ToUVRow_NEON(const uint8* src_rgb565, + int src_stride_rgb565, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, + int src_stride_argb1555, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, + int src_stride_argb4444, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGBToUVJRow_MSA(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width); +void BGRAToUVRow_MSA(const uint8* src_bgra, + int src_stride_bgra, + uint8* dst_u, + uint8* dst_v, + int width); +void ABGRToUVRow_MSA(const uint8* src_abgr, + int src_stride_abgr, + uint8* dst_u, + uint8* dst_v, + int width); +void RGBAToUVRow_MSA(const uint8* src_rgba, + int src_stride_rgba, + uint8* dst_u, + uint8* dst_v, + int width); +void RGB24ToUVRow_MSA(const uint8* src_rgb24, + int src_stride_rgb24, + 
uint8* dst_u, + uint8* dst_v, + int width); +void RAWToUVRow_MSA(const uint8* src_raw, + int src_stride_raw, + uint8* dst_u, + uint8* dst_v, + int width); +void RGB565ToUVRow_MSA(const uint8* src_rgb565, + int src_stride_rgb565, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGB1555ToUVRow_MSA(const uint8* src_argb1555, + int src_stride_argb1555, + uint8* dst_u, + uint8* dst_v, int width); -void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width); -void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width); -void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width); -void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width); -void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width); -void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width); -void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width); void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width); void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width); void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width); @@ -685,6 +986,37 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width); void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width); void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width); void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width); +void BGRAToYRow_MSA(const uint8* src_bgra, uint8* dst_y, int width); +void ABGRToYRow_MSA(const uint8* src_abgr, uint8* dst_y, int width); +void RGBAToYRow_MSA(const uint8* src_rgba, uint8* dst_y, int width); +void RGB24ToYRow_MSA(const uint8* src_rgb24, uint8* dst_y, int width); +void RAWToYRow_MSA(const uint8* src_raw, uint8* dst_y, int width); +void RGB565ToYRow_MSA(const uint8* src_rgb565, uint8* dst_y, int width); +void ARGB1555ToYRow_MSA(const uint8* src_argb1555, uint8* dst_y, int width); +void BGRAToUVRow_DSPR2(const uint8* src_bgra, + int src_stride_bgra, + uint8* dst_u, + uint8* dst_v, + int width); +void BGRAToYRow_DSPR2(const uint8* src_bgra, uint8* dst_y, int width); +void ABGRToUVRow_DSPR2(const uint8* src_abgr, + int src_stride_abgr, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGBToYRow_DSPR2(const uint8* src_argb, uint8* dst_y, int width); +void ABGRToYRow_DSPR2(const uint8* src_abgr, uint8* dst_y, int width); +void RGBAToUVRow_DSPR2(const uint8* src_rgba, + int src_stride_rgba, + uint8* dst_u, + uint8* dst_v, + int width); +void RGBAToYRow_DSPR2(const uint8* src_rgba, uint8* dst_y, int width); +void ARGBToUVRow_DSPR2(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width); void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int width); void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int width); void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int width); @@ -710,154 +1042,400 @@ void 
RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int width); void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int width); void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int width); void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int width); -void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y, +void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, + uint8* dst_y, int width); -void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, +void BGRAToYRow_Any_DSPR2(const uint8* src_bgra, uint8* dst_y, int width); +void ARGBToYRow_Any_DSPR2(const uint8* src_argb, uint8* dst_y, int width); +void ABGRToYRow_Any_DSPR2(const uint8* src_abgr, uint8* dst_y, int width); +void RGBAToYRow_Any_DSPR2(const uint8* src_rgba, uint8* dst_y, int width); +void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, + uint8* dst_y, int width); - -void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_AVX2(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width); -void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_Any_AVX2(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width); -void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width); -void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, +void BGRAToYRow_Any_MSA(const uint8* src_bgra, uint8* dst_y, int width); +void ABGRToYRow_Any_MSA(const uint8* src_abgr, uint8* dst_y, int width); +void RGBAToYRow_Any_MSA(const uint8* src_rgba, uint8* dst_y, int width); +void ARGBToYJRow_Any_MSA(const uint8* src_argb, uint8* dst_y, int width); +void ARGBToYRow_Any_MSA(const uint8* src_argb, uint8* dst_y, int width); +void RGB24ToYRow_Any_MSA(const uint8* src_rgb24, uint8* dst_y, int width); +void RAWToYRow_Any_MSA(const uint8* src_raw, uint8* dst_y, int width); +void RGB565ToYRow_Any_MSA(const uint8* src_rgb565, uint8* dst_y, int width); +void ARGB1555ToYRow_Any_MSA(const uint8* src_argb1555, uint8* dst_y, int width); + +void ARGBToUVRow_AVX2(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGBToUVJRow_AVX2(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, 
+ int width); +void ARGBToUVRow_SSSE3(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGBToUVJRow_SSSE3(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width); +void BGRAToUVRow_SSSE3(const uint8* src_bgra, + int src_stride_bgra, + uint8* dst_u, + uint8* dst_v, + int width); +void ABGRToUVRow_SSSE3(const uint8* src_abgr, + int src_stride_abgr, + uint8* dst_u, + uint8* dst_v, + int width); +void RGBAToUVRow_SSSE3(const uint8* src_rgba, + int src_stride_rgba, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGBToUVRow_Any_AVX2(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGBToUVJRow_Any_AVX2(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width); +void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, + int src_stride_bgra, + uint8* dst_u, + uint8* dst_v, + int width); +void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, + int src_stride_abgr, + uint8* dst_u, + uint8* dst_v, + int width); +void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, + int src_stride_rgba, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGBToUV444Row_Any_NEON(const uint8* src_argb, + uint8* dst_u, + uint8* dst_v, int width); -void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width); -void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width); -void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width); -void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width); -void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVRow_Any_NEON(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGBToUV444Row_Any_MSA(const uint8* src_argb, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGBToUVRow_Any_MSA(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGBToUVJRow_Any_NEON(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width); +void BGRAToUVRow_Any_NEON(const uint8* src_bgra, + int src_stride_bgra, + uint8* dst_u, + uint8* dst_v, + int width); +void ABGRToUVRow_Any_NEON(const uint8* src_abgr, + int src_stride_abgr, + uint8* dst_u, + uint8* dst_v, + int width); +void RGBAToUVRow_Any_NEON(const uint8* src_rgba, + int src_stride_rgba, + uint8* dst_u, + uint8* dst_v, + int width); +void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, + int src_stride_rgb24, + uint8* dst_u, + uint8* dst_v, + int width); +void RAWToUVRow_Any_NEON(const uint8* src_raw, + int src_stride_raw, + uint8* dst_u, + uint8* dst_v, + int width); +void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, + int 
src_stride_rgb565, + uint8* dst_u, + uint8* dst_v, + int width); void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, + uint8* dst_v, + int width); void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width); -void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width); -void RGB24ToUVRow_C(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width); -void RAWToUVRow_C(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width); -void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width); -void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width); -void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, + uint8* dst_v, + int width); +void ARGBToUVJRow_Any_MSA(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width); +void BGRAToUVRow_Any_MSA(const uint8* src_bgra, + int src_stride_bgra, + uint8* dst_u, + uint8* dst_v, + int width); +void ABGRToUVRow_Any_MSA(const uint8* src_abgr, + int src_stride_abgr, + uint8* dst_u, + uint8* dst_v, + int width); +void RGBAToUVRow_Any_MSA(const uint8* src_rgba, + int src_stride_rgba, + uint8* dst_u, + uint8* dst_v, + int width); +void RGB24ToUVRow_Any_MSA(const uint8* src_rgb24, + int src_stride_rgb24, + uint8* dst_u, + uint8* dst_v, + int width); +void RAWToUVRow_Any_MSA(const uint8* src_raw, + int src_stride_raw, + uint8* dst_u, + uint8* dst_v, + int width); +void RGB565ToUVRow_Any_MSA(const uint8* src_rgb565, + int src_stride_rgb565, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGB1555ToUVRow_Any_MSA(const uint8* src_argb1555, + int src_stride_argb1555, + uint8* dst_u, + uint8* dst_v, + int width); +void BGRAToUVRow_Any_DSPR2(const uint8* src_bgra, + int src_stride_bgra, + uint8* dst_u, + uint8* dst_v, + int width); +void ABGRToUVRow_Any_DSPR2(const uint8* src_abgr, + int src_stride_abgr, + uint8* dst_u, + uint8* dst_v, + int width); +void RGBAToUVRow_Any_DSPR2(const uint8* src_rgba, + int src_stride_rgba, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGBToUVRow_Any_DSPR2(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGBToUVRow_C(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGBToUVJRow_C(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGBToUVRow_C(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGBToUVJRow_C(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width); +void BGRAToUVRow_C(const uint8* src_bgra, + int src_stride_bgra, + uint8* dst_u, + uint8* dst_v, + int width); +void ABGRToUVRow_C(const uint8* src_abgr, + int 
src_stride_abgr, + uint8* dst_u, + uint8* dst_v, + int width); +void RGBAToUVRow_C(const uint8* src_rgba, + int src_stride_rgba, + uint8* dst_u, + uint8* dst_v, + int width); +void RGB24ToUVRow_C(const uint8* src_rgb24, + int src_stride_rgb24, + uint8* dst_u, + uint8* dst_v, + int width); +void RAWToUVRow_C(const uint8* src_raw, + int src_stride_raw, + uint8* dst_u, + uint8* dst_v, + int width); +void RGB565ToUVRow_C(const uint8* src_rgb565, + int src_stride_rgb565, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGB1555ToUVRow_C(const uint8* src_argb1555, + int src_stride_argb1555, + uint8* dst_u, + uint8* dst_v, + int width); +void ARGB4444ToUVRow_C(const uint8* src_argb4444, + int src_stride_argb4444, + uint8* dst_u, + uint8* dst_v, + int width); void ARGBToUV444Row_SSSE3(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, + uint8* dst_v, + int width); void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, + uint8* dst_v, + int width); void ARGBToUV444Row_C(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUV411Row_C(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, + uint8* dst_v, + int width); void MirrorRow_AVX2(const uint8* src, uint8* dst, int width); void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width); void MirrorRow_NEON(const uint8* src, uint8* dst, int width); void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width); +void MirrorRow_MSA(const uint8* src, uint8* dst, int width); void MirrorRow_C(const uint8* src, uint8* dst, int width); void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width); void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width); void MirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width); void MirrorRow_Any_NEON(const uint8* src, uint8* dst, int width); +void MirrorRow_Any_MSA(const uint8* src, uint8* dst, int width); -void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void MirrorUVRow_SSSE3(const uint8* src_uv, + uint8* dst_u, + uint8* dst_v, int width); -void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void MirrorUVRow_NEON(const uint8* src_uv, + uint8* dst_u, + uint8* dst_v, int width); -void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void MirrorUVRow_DSPR2(const uint8* src_uv, + uint8* dst_u, + uint8* dst_v, int width); void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width); void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width); void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width); void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width); void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width); void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width); void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width); void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_Any_MSA(const uint8* src, uint8* dst, int width); void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width); -void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_SSE2(const uint8* src_uv, + uint8* dst_u, + uint8* dst_v, int width); -void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_AVX2(const uint8* src_uv, + uint8* dst_u, + uint8* dst_v, int width); -void 
SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_NEON(const uint8* src_uv, + uint8* dst_u, + uint8* dst_v, int width); -void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_DSPR2(const uint8* src_uv, + uint8* dst_u, + uint8* dst_v, int width); -void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_Any_SSE2(const uint8* src_uv, + uint8* dst_u, + uint8* dst_v, int width); -void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_Any_AVX2(const uint8* src_uv, + uint8* dst_u, + uint8* dst_v, int width); -void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_Any_NEON(const uint8* src_uv, + uint8* dst_u, + uint8* dst_v, int width); -void SplitUVRow_Any_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_Any_DSPR2(const uint8* src_uv, + uint8* dst_u, + uint8* dst_v, int width); -void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_C(const uint8* src_u, + const uint8* src_v, + uint8* dst_uv, int width); -void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_SSE2(const uint8* src_u, + const uint8* src_v, + uint8* dst_uv, int width); -void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_AVX2(const uint8* src_u, + const uint8* src_v, + uint8* dst_uv, int width); -void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_NEON(const uint8* src_u, + const uint8* src_v, + uint8* dst_uv, int width); -void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_MSA(const uint8* src_u, + const uint8* src_v, + uint8* dst_uv, + int width); +void MergeUVRow_Any_SSE2(const uint8* src_u, + const uint8* src_v, + uint8* dst_uv, int width); -void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_Any_AVX2(const uint8* src_u, + const uint8* src_v, + uint8* dst_uv, int width); -void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_Any_NEON(const uint8* src_u, + const uint8* src_v, + uint8* dst_uv, int width); +void MergeUVRow_Any_MSA(const uint8* src_u, + const uint8* src_v, + uint8* dst_uv, + int width); void CopyRow_SSE2(const uint8* src, uint8* dst, int count); void CopyRow_AVX(const uint8* src, uint8* dst, int count); @@ -874,25 +1452,35 @@ void CopyRow_16_C(const uint16* src, uint16* dst, int count); void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width); void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBCopyAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, +void ARGBCopyAlphaRow_Any_SSE2(const uint8* src_argb, + uint8* dst_argb, int width); -void ARGBCopyAlphaRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, +void ARGBCopyAlphaRow_Any_AVX2(const uint8* src_argb, + uint8* dst_argb, int width); void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width); void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width); +void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width); void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width); -void ARGBExtractAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_a, +void 
ARGBExtractAlphaRow_Any_SSE2(const uint8* src_argb, + uint8* dst_a, int width); -void ARGBExtractAlphaRow_Any_NEON(const uint8* src_argb, uint8* dst_a, +void ARGBExtractAlphaRow_Any_AVX2(const uint8* src_argb, + uint8* dst_a, + int width); +void ARGBExtractAlphaRow_Any_NEON(const uint8* src_argb, + uint8* dst_a, int width); void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width); void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); -void ARGBCopyYToAlphaRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, +void ARGBCopyYToAlphaRow_Any_SSE2(const uint8* src_y, + uint8* dst_argb, int width); -void ARGBCopyYToAlphaRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, +void ARGBCopyYToAlphaRow_Any_AVX2(const uint8* src_y, + uint8* dst_argb, int width); void SetRow_C(uint8* dst, uint8 v8, int count); @@ -906,83 +1494,173 @@ void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count); void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count); void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count); void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count); +void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int count); +void ARGBSetRow_Any_MSA(uint8* dst_argb, uint32 v32, int count); // ARGBShufflers for BGRAToARGB etc. -void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); +void ARGBShuffleRow_C(const uint8* src_argb, + uint8* dst_argb, + const uint8* shuffler, + int width); +void ARGBShuffleRow_SSE2(const uint8* src_argb, + uint8* dst_argb, + const uint8* shuffler, + int width); +void ARGBShuffleRow_SSSE3(const uint8* src_argb, + uint8* dst_argb, + const uint8* shuffler, + int width); +void ARGBShuffleRow_AVX2(const uint8* src_argb, + uint8* dst_argb, + const uint8* shuffler, + int width); +void ARGBShuffleRow_NEON(const uint8* src_argb, + uint8* dst_argb, + const uint8* shuffler, + int width); +void ARGBShuffleRow_MSA(const uint8* src_argb, + uint8* dst_argb, + const uint8* shuffler, + int width); +void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, + uint8* dst_argb, + const uint8* shuffler, + int width); +void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, + uint8* dst_argb, + const uint8* shuffler, + int width); +void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, + uint8* dst_argb, + const uint8* shuffler, + int width); +void ARGBShuffleRow_Any_NEON(const uint8* src_argb, + uint8* dst_argb, + const uint8* shuffler, + int width); +void ARGBShuffleRow_Any_MSA(const uint8* src_argb, + uint8* dst_argb, + const uint8* shuffler, + int width); void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* 
dst_argb, int width); void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width); void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width); void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int width); -void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, +void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, + uint8* dst_argb, int width); -void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, +void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, + uint8* dst_argb, int width); void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int width); -void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, +void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, + uint8* dst_argb, int width); -void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, +void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, + uint8* dst_argb, int width); void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width); +void RGB24ToARGBRow_MSA(const uint8* src_rgb24, uint8* dst_argb, int width); void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width); +void RAWToARGBRow_MSA(const uint8* src_raw, uint8* dst_argb, int width); void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width); +void RAWToRGB24Row_MSA(const uint8* src_raw, uint8* dst_rgb24, int width); void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width); -void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, +void RGB565ToARGBRow_MSA(const uint8* src_rgb565, uint8* dst_argb, int width); +void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, + uint8* dst_argb, int width); -void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, +void ARGB1555ToARGBRow_MSA(const uint8* src_argb1555, + uint8* dst_argb, + int width); +void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, + uint8* dst_argb, int width); +void RGB24ToARGBRow_DSPR2(const uint8* src_rgb24, uint8* dst_argb, int width); +void RAWToARGBRow_DSPR2(const uint8* src_raw, uint8* dst_argb, int width); +void RGB565ToARGBRow_DSPR2(const uint8* src_rgb565, uint8* dst_argb, int width); +void ARGB1555ToARGBRow_DSPR2(const uint8* src_argb1555, + uint8* dst_argb, + int width); +void ARGB4444ToARGBRow_DSPR2(const uint8* src_argb4444, + uint8* dst_argb, + int width); +void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444, + uint8* dst_argb, + int width); void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width); void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width); void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width); void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width); void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width); void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width); -void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, +void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, + uint8* dst_argb, int width); void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int width); void RAWToRGB24Row_Any_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width); -void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb, +void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, + uint8* dst_argb, int width); -void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb, +void ARGB1555ToARGBRow_Any_SSE2(const uint8* 
src_argb1555, + uint8* dst_argb, int width); -void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb, +void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, + uint8* dst_argb, int width); -void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb, +void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, + uint8* dst_argb, int width); -void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb, +void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, + uint8* dst_argb, int width); -void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb, +void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, + uint8* dst_argb, int width); -void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, +void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, + uint8* dst_argb, int width); +void RGB24ToARGBRow_Any_MSA(const uint8* src_rgb24, uint8* dst_argb, int width); void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int width); +void RAWToARGBRow_Any_MSA(const uint8* src_raw, uint8* dst_argb, int width); void RAWToRGB24Row_Any_NEON(const uint8* src_raw, uint8* dst_rgb24, int width); -void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb, +void RAWToRGB24Row_Any_MSA(const uint8* src_raw, uint8* dst_rgb24, int width); +void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, + uint8* dst_argb, int width); -void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb, +void RGB565ToARGBRow_Any_MSA(const uint8* src_rgb565, + uint8* dst_argb, + int width); +void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, + uint8* dst_argb, int width); -void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb, +void ARGB1555ToARGBRow_Any_MSA(const uint8* src_argb1555, + uint8* dst_argb, + int width); +void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, + uint8* dst_argb, int width); +void RGB24ToARGBRow_Any_DSPR2(const uint8* src_rgb24, + uint8* dst_argb, + int width); +void RAWToARGBRow_Any_DSPR2(const uint8* src_raw, uint8* dst_argb, int width); +void RGB565ToARGBRow_Any_DSPR2(const uint8* src_rgb565, + uint8* dst_argb, + int width); +void ARGB1555ToARGBRow_Any_DSPR2(const uint8* src_argb1555, + uint8* dst_argb, + int width); +void ARGB4444ToARGBRow_Any_DSPR2(const uint8* src_argb4444, + uint8* dst_argb, + int width); + +void ARGB4444ToARGBRow_Any_MSA(const uint8* src_argb4444, + uint8* dst_argb, + int width); void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); @@ -990,12 +1668,18 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); -void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); -void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); +void ARGBToRGB565DitherRow_C(const uint8* src_argb, + uint8* dst_rgb, + const uint32 dither4, + int width); +void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, + uint8* dst_rgb, + const uint32 dither4, + int width); +void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, + uint8* dst_rgb, + const uint32 dither4, + int width); void 
ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width); @@ -1006,8 +1690,19 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); +void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, + uint8* dst_rgb, + const uint32 dither4, + int width); +void ARGBToRGB24Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToRAWRow_MSA(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToRGB565Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToARGB1555Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToARGB4444Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToRGB565DitherRow_MSA(const uint8* src_argb, + uint8* dst_rgb, + const uint32 dither4, + int width); void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width); @@ -1019,10 +1714,12 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width); void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width); +void J400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width); void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width); void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width); void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width); void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width); +void J400ToARGBRow_Any_MSA(const uint8* src_y, uint8* dst_argb, int width); void I444ToARGBRow_C(const uint8* src_y, const uint8* src_u, @@ -1049,12 +1746,6 @@ void I422AlphaToARGBRow_C(const uint8* y_buf, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I411ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void NV12ToARGBRow_C(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, @@ -1176,18 +1867,6 @@ void I422ToARGBRow_SSSE3(const uint8* src_y, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I411ToARGBRow_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I411ToARGBRow_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void NV12ToARGBRow_SSSE3(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, @@ -1332,18 +2011,6 @@ void I422AlphaToARGBRow_Any_AVX2(const uint8* y_buf, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I411ToARGBRow_Any_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I411ToARGBRow_Any_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - const struct 
YuvConstants* yuvconstants, - int width); void NV12ToARGBRow_Any_SSSE3(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, @@ -1449,108 +2116,222 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width); void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width); +void I400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width); void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width); void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width); void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width); +void I400ToARGBRow_Any_MSA(const uint8* src_y, uint8* dst_argb, int width); // ARGB preattenuated alpha blend. -void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); +void ARGBBlendRow_SSSE3(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBBlendRow_NEON(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBBlendRow_C(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); // Unattenuated planar alpha blend. -void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width); -void BlendPlaneRow_Any_SSSE3(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width); -void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width); -void BlendPlaneRow_Any_AVX2(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width); -void BlendPlaneRow_C(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width); +void BlendPlaneRow_SSSE3(const uint8* src0, + const uint8* src1, + const uint8* alpha, + uint8* dst, + int width); +void BlendPlaneRow_Any_SSSE3(const uint8* src0, + const uint8* src1, + const uint8* alpha, + uint8* dst, + int width); +void BlendPlaneRow_AVX2(const uint8* src0, + const uint8* src1, + const uint8* alpha, + uint8* dst, + int width); +void BlendPlaneRow_Any_AVX2(const uint8* src0, + const uint8* src1, + const uint8* alpha, + uint8* dst, + int width); +void BlendPlaneRow_C(const uint8* src0, + const uint8* src1, + const uint8* alpha, + uint8* dst, + int width); // ARGB multiply images. Same API as Blend, but these require // pointer and width alignment for SSE2. 
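The multiply, add, and subtract groups that follow all share this two-source row signature (two const source rows, a destination row, and a pixel width). As a reader aid, here is a condensed sketch, not the literal library code, of how the planar wrappers in source/planar_functions.cc typically choose among these kernels; TestCpuFlag and kCpuHasSSE2 are from libyuv/cpu_id.h, and the 4-pixel width multiple shown is illustrative:

    #include "libyuv/cpu_id.h"
    #include "libyuv/row.h"

    using namespace libyuv;  // row.h wraps these in namespace libyuv for C++.

    static void MultiplyPlaneSketch(const uint8* src0, int src_stride0,
                                    const uint8* src1, int src_stride1,
                                    uint8* dst, int dst_stride,
                                    int width, int height) {
      // Portable fallback; upgraded below if the CPU allows.
      void (*MultiplyRow)(const uint8*, const uint8*, uint8*, int) =
          ARGBMultiplyRow_C;
    #if defined(HAS_ARGBMULTIPLYROW_SSE2)
      if (TestCpuFlag(kCpuHasSSE2)) {
        MultiplyRow = ARGBMultiplyRow_Any_SSE2;  // any width; handles remainder
        if (IS_ALIGNED(width, 4)) {
          MultiplyRow = ARGBMultiplyRow_SSE2;    // full-SIMD fast path
        }
      }
    #endif
      for (int y = 0; y < height; ++y) {
        MultiplyRow(src0, src1, dst, width);
        src0 += src_stride0;
        src1 += src_stride1;
        dst += dst_stride;
      }
    }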
-void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); +void ARGBMultiplyRow_C(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBMultiplyRow_SSE2(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBMultiplyRow_AVX2(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBMultiplyRow_NEON(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBMultiplyRow_MSA(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBMultiplyRow_Any_MSA(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); // ARGB add images. -void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); +void ARGBAddRow_C(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBAddRow_SSE2(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBAddRow_Any_SSE2(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBAddRow_AVX2(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBAddRow_Any_AVX2(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBAddRow_NEON(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBAddRow_Any_NEON(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBAddRow_MSA(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBAddRow_Any_MSA(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); // ARGB subtract images. Same API as Blend, but these require // pointer and width alignment for SSE2. 
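Before the subtract declarations continue, a note on the _Any_ suffix used throughout this header: the SIMD kernels consume a fixed number of pixels per iteration, so the _Any_ wrappers (macro-generated in source/row_any.cc) run the kernel over the body of the row and push the leftover pixels through an aligned scratch buffer that the kernel may safely over-read. A simplified sketch of that idea, with an assumed 8-pixel step and an arbitrary kernel choice:

    #include <string.h>
    #include "libyuv/row.h"

    using namespace libyuv;

    static void AddRow_Any_Sketch(const uint8* src0, const uint8* src1,
                                  uint8* dst, int width) {
      SIMD_ALIGNED(uint8 temp[3][64]);  // scratch rows; SIMD_ALIGNED is from
                                        // libyuv/basic_types.h
      int r = width & 7;                // leftover pixels
      int n = width & ~7;               // body width, a multiple of the step
      if (n > 0) {
        ARGBAddRow_SSE2(src0, src1, dst, n);
      }
      memset(temp, 0, sizeof(temp));
      memcpy(temp[0], src0 + n * 4, r * 4);  // 4 bytes per ARGB pixel
      memcpy(temp[1], src1 + n * 4, r * 4);
      ARGBAddRow_SSE2(temp[0], temp[1], temp[2], 8);  // over-read is confined
      memcpy(dst + n * 4, temp[2], r * 4);            // to the scratch rows
    }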
-void ARGBSubtractRow_C(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); +void ARGBSubtractRow_C(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBSubtractRow_SSE2(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBSubtractRow_AVX2(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBSubtractRow_NEON(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBSubtractRow_Any_NEON(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBSubtractRow_MSA(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); +void ARGBSubtractRow_Any_MSA(const uint8* src_argb, + const uint8* src_argb1, + uint8* dst_argb, + int width); void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, +void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, + uint8* dst_rgb, int width); -void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, +void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, + uint8* dst_rgb, int width); -void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); -void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); +void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, + uint8* dst_rgb, + const uint32 dither4, + int width); +void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, + uint8* dst_rgb, + const uint32 dither4, + int width); void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, +void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, + uint8* dst_rgb, int width); -void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, +void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, + uint8* dst_rgb, int width); void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, +void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, + uint8* dst_rgb, int width); -void 
ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, +void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, + uint8* dst_rgb, int width); -void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); +void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, + uint8* dst_rgb, + const uint32 dither4, + int width); +void ARGBToRGB24Row_Any_MSA(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToRAWRow_Any_MSA(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToRGB565Row_Any_MSA(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToARGB1555Row_Any_MSA(const uint8* src_argb, + uint8* dst_rgb, + int width); +void ARGBToARGB4444Row_Any_MSA(const uint8* src_argb, + uint8* dst_rgb, + int width); +void ARGBToRGB565DitherRow_Any_MSA(const uint8* src_argb, + uint8* dst_rgb, + const uint32 dither4, + int width); void I444ToARGBRow_Any_NEON(const uint8* src_y, const uint8* src_u, @@ -1571,12 +2352,6 @@ void I422AlphaToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I411ToARGBRow_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void I422ToRGBARow_Any_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1630,175 +2405,445 @@ void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I444ToARGBRow_Any_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB4444Row_Any_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGBRow_Any_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_DSPR2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422ToARGB1555Row_Any_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I411ToARGBRow_Any_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToARGBRow_Any_DSPR2(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_DSPR2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I444ToARGBRow_Any_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGBRow_Any_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGBARow_Any_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_Any_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + const uint8* src_a, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void 
I422ToRGB24Row_Any_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB565Row_Any_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB4444Row_Any_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB1555Row_Any_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToARGBRow_Any_MSA(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_Any_MSA(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToARGBRow_Any_MSA(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void YUY2ToARGBRow_Any_MSA(const uint8* src_yuy2, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void UYVYToARGBRow_Any_MSA(const uint8* src_uyvy, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width); -void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width); +void YUY2ToUVRow_AVX2(const uint8* src_yuy2, + int stride_yuy2, + uint8* dst_u, + uint8* dst_v, + int width); void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, + uint8* dst_v, + int width); void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width); -void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width); +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, + int stride_yuy2, + uint8* dst_u, + uint8* dst_v, + int width); void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, + uint8* dst_v, + int width); void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width); -void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width); +void YUY2ToUVRow_NEON(const uint8* src_yuy2, + int stride_yuy2, + uint8* dst_u, + uint8* dst_v, + int width); void YUY2ToUV422Row_NEON(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, + uint8* dst_v, + int width); +void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width); +void YUY2ToUVRow_MSA(const uint8* src_yuy2, + int stride_yuy2, + uint8* dst_u, + uint8* dst_v, + int width); +void YUY2ToUV422Row_MSA(const uint8* src_yuy2, + uint8* dst_u, + uint8* dst_v, + int width); void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width); -void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width); +void YUY2ToUVRow_C(const uint8* src_yuy2, + int stride_yuy2, + uint8* dst_u, + uint8* dst_v, + int width); void YUY2ToUV422Row_C(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, + uint8* dst_v, + int width); void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int width); -void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width); +void YUY2ToUVRow_Any_AVX2(const 
uint8* src_yuy2, + int stride_yuy2, + uint8* dst_u, + uint8* dst_v, + int width); void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, + uint8* dst_v, + int width); void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int width); -void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width); +void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, + int stride_yuy2, + uint8* dst_u, + uint8* dst_v, + int width); void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, + uint8* dst_v, + int width); void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int width); -void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width); +void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, + int stride_yuy2, + uint8* dst_u, + uint8* dst_v, + int width); void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, + uint8* dst_v, + int width); +void YUY2ToYRow_Any_MSA(const uint8* src_yuy2, uint8* dst_y, int width); +void YUY2ToUVRow_Any_MSA(const uint8* src_yuy2, + int stride_yuy2, + uint8* dst_u, + uint8* dst_v, + int width); +void YUY2ToUV422Row_Any_MSA(const uint8* src_yuy2, + uint8* dst_u, + uint8* dst_v, + int width); void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); +void UYVYToUVRow_AVX2(const uint8* src_uyvy, + int stride_uyvy, + uint8* dst_u, + uint8* dst_v, + int width); void UYVYToUV422Row_AVX2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, + uint8* dst_v, + int width); void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); +void UYVYToUVRow_SSE2(const uint8* src_uyvy, + int stride_uyvy, + uint8* dst_u, + uint8* dst_v, + int width); void UYVYToUV422Row_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, + uint8* dst_v, + int width); void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); +void UYVYToUVRow_AVX2(const uint8* src_uyvy, + int stride_uyvy, + uint8* dst_u, + uint8* dst_v, + int width); void UYVYToUV422Row_AVX2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, + uint8* dst_v, + int width); void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); +void UYVYToUVRow_NEON(const uint8* src_uyvy, + int stride_uyvy, + uint8* dst_u, + uint8* dst_v, + int width); void UYVYToUV422Row_NEON(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, + uint8* dst_v, + int width); +void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width); +void UYVYToUVRow_MSA(const uint8* src_uyvy, + int stride_uyvy, + uint8* dst_u, + uint8* dst_v, + int width); +void UYVYToUV422Row_MSA(const uint8* src_uyvy, + uint8* dst_u, + uint8* dst_v, + int width); void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); +void UYVYToUVRow_C(const uint8* 
src_uyvy, + int stride_uyvy, + uint8* dst_u, + uint8* dst_v, + int width); void UYVYToUV422Row_C(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, + uint8* dst_v, + int width); void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); +void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, + int stride_uyvy, + uint8* dst_u, + uint8* dst_v, + int width); void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, + uint8* dst_v, + int width); void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); +void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, + int stride_uyvy, + uint8* dst_u, + uint8* dst_v, + int width); void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, + uint8* dst_v, + int width); void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); +void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, + int stride_uyvy, + uint8* dst_u, + uint8* dst_v, + int width); void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, + uint8* dst_v, + int width); +void UYVYToYRow_Any_MSA(const uint8* src_uyvy, uint8* dst_y, int width); +void UYVYToUVRow_Any_MSA(const uint8* src_uyvy, + int stride_uyvy, + uint8* dst_u, + uint8* dst_v, + int width); +void UYVYToUV422Row_Any_MSA(const uint8* src_uyvy, + uint8* dst_u, + uint8* dst_v, + int width); void I422ToYUY2Row_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* dst_yuy2, int width); + uint8* dst_yuy2, + int width); void I422ToUYVYRow_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* dst_uyvy, int width); + uint8* dst_uyvy, + int width); void I422ToYUY2Row_SSE2(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* dst_yuy2, int width); + uint8* dst_yuy2, + int width); void I422ToUYVYRow_SSE2(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* dst_uyvy, int width); + uint8* dst_uyvy, + int width); void I422ToYUY2Row_Any_SSE2(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* dst_yuy2, int width); + uint8* dst_yuy2, + int width); void I422ToUYVYRow_Any_SSE2(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* dst_uyvy, int width); + uint8* dst_uyvy, + int width); void I422ToYUY2Row_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* dst_yuy2, int width); + uint8* dst_yuy2, + int width); void I422ToUYVYRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* dst_uyvy, int width); + uint8* dst_uyvy, + int width); void I422ToYUY2Row_Any_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* dst_yuy2, int width); + uint8* dst_yuy2, + int width); void I422ToUYVYRow_Any_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* dst_uyvy, int width); + uint8* dst_uyvy, + int width); +void I422ToYUY2Row_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, + int width); +void I422ToUYVYRow_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, + int width); +void 
I422ToYUY2Row_Any_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, + int width); +void I422ToUYVYRow_Any_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, + int width); // Effects related row functions. void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, +void ARGBAttenuateRow_MSA(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, + uint8* dst_argb, int width); -void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, +void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, + uint8* dst_argb, int width); -void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, +void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb, + uint8* dst_argb, int width); -void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, +void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, + uint8* dst_argb, int width); +void ARGBAttenuateRow_Any_MSA(const uint8* src_argb, + uint8* dst_argb, + int width); // Inverse table for unattenuate, shared by C and SSE2. extern const uint32 fixed_invtbl8[256]; void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, +void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb, + uint8* dst_argb, int width); -void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, +void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb, + uint8* dst_argb, int width); void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width); void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBGrayRow_MSA(const uint8* src_argb, uint8* dst_argb, int width); void ARGBSepiaRow_C(uint8* dst_argb, int width); void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width); void ARGBSepiaRow_NEON(uint8* dst_argb, int width); +void ARGBSepiaRow_MSA(uint8* dst_argb, int width); -void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width); -void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width); -void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width); +void ARGBColorMatrixRow_C(const uint8* src_argb, + uint8* dst_argb, + const int8* matrix_argb, + int width); +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, + uint8* dst_argb, + const int8* matrix_argb, + int width); +void ARGBColorMatrixRow_NEON(const uint8* src_argb, + uint8* dst_argb, + const int8* matrix_argb, + int width); void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width); void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width); @@ -1806,134 +2851,311 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width); void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int 
width); void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width); -void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width); -void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width); -void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width); +void ARGBQuantizeRow_C(uint8* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width); +void ARGBQuantizeRow_SSE2(uint8* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width); +void ARGBQuantizeRow_NEON(uint8* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width); -void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, +void ARGBShadeRow_C(const uint8* src_argb, + uint8* dst_argb, + int width, uint32 value); -void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, +void ARGBShadeRow_SSE2(const uint8* src_argb, + uint8* dst_argb, + int width, uint32 value); -void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, +void ARGBShadeRow_NEON(const uint8* src_argb, + uint8* dst_argb, + int width, uint32 value); +void ARGBShadeRow_MSA(const uint8* src_argb, + uint8* dst_argb, + int width, + uint32 value); // Used for blur. -void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, - int width, int area, uint8* dst, int count); -void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width); +void CumulativeSumToAverageRow_SSE2(const int32* topleft, + const int32* botleft, + int width, + int area, + uint8* dst, + int count); +void ComputeCumulativeSumRow_SSE2(const uint8* row, + int32* cumsum, + const int32* previous_cumsum, + int width); -void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft, - int width, int area, uint8* dst, int count); -void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width); +void CumulativeSumToAverageRow_C(const int32* topleft, + const int32* botleft, + int width, + int area, + uint8* dst, + int count); +void ComputeCumulativeSumRow_C(const uint8* row, + int32* cumsum, + const int32* previous_cumsum, + int width); LIBYUV_API -void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width); +void ARGBAffineRow_C(const uint8* src_argb, + int src_argb_stride, + uint8* dst_argb, + const float* uv_dudv, + int width); LIBYUV_API -void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width); +void ARGBAffineRow_SSE2(const uint8* src_argb, + int src_argb_stride, + uint8* dst_argb, + const float* uv_dudv, + int width); // Used for I420Scale, ARGBScale, and ARGBInterpolate. 
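InterpolateRow is the workhorse shared by the scalers and ARGBInterpolate: it blends a row with the row src_stride below it, using an 8-bit fraction (0 copies the top row, 128 is an even average). A minimal scalar sketch of what the declarations below compute; the rounding term is illustrative, and the reference version lives in source/row_common.cc:

    #include <stddef.h>                // ptrdiff_t
    #include "libyuv/basic_types.h"   // uint8

    static void InterpolateRowSketch(uint8* dst_ptr, const uint8* src_ptr,
                                     ptrdiff_t src_stride_ptr,
                                     int width, int source_y_fraction) {
      const uint8* src_ptr1 = src_ptr + src_stride_ptr;  // the row below
      int y1 = source_y_fraction;  // weight of the lower row, 0..255
      int y0 = 256 - y1;           // weight of the upper row
      for (int x = 0; x < width; ++x) {
        dst_ptr[x] = (uint8)((src_ptr[x] * y0 + src_ptr1[x] * y1 + 128) >> 8);
      }
    }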
-void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, +void InterpolateRow_C(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride_ptr, - int width, int source_y_fraction); -void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, + int width, + int source_y_fraction); +void InterpolateRow_SSSE3(uint8* dst_ptr, + const uint8* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); -void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_AVX2(uint8* dst_ptr, + const uint8* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); -void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_NEON(uint8* dst_ptr, + const uint8* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); -void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_DSPR2(uint8* dst_ptr, + const uint8* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); -void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_MSA(uint8* dst_ptr, + const uint8* src_ptr, + ptrdiff_t src_stride_ptr, + int width, + int source_y_fraction); +void InterpolateRow_Any_NEON(uint8* dst_ptr, + const uint8* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); -void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_Any_SSSE3(uint8* dst_ptr, + const uint8* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); -void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_Any_AVX2(uint8* dst_ptr, + const uint8* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); -void InterpolateRow_Any_DSPR2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_Any_DSPR2(uint8* dst_ptr, + const uint8* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); - -void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, +void InterpolateRow_Any_MSA(uint8* dst_ptr, + const uint8* src_ptr, + ptrdiff_t src_stride_ptr, + int width, + int source_y_fraction); + +void InterpolateRow_16_C(uint16* dst_ptr, + const uint16* src_ptr, ptrdiff_t src_stride_ptr, - int width, int source_y_fraction); + int width, + int source_y_fraction); // Sobel images. 
-void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, - uint8* dst_sobelx, int width); -void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width); -void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width); -void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width); -void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width); -void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width); -void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width); -void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width); -void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width); -void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width); -void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width); -void SobelXYRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelXYRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); +void SobelXRow_C(const uint8* src_y0, + const uint8* src_y1, + const uint8* src_y2, + uint8* dst_sobelx, + int width); +void SobelXRow_SSE2(const uint8* src_y0, + const uint8* src_y1, + const uint8* src_y2, + uint8* dst_sobelx, + int width); +void SobelXRow_NEON(const uint8* src_y0, + const uint8* src_y1, + const uint8* src_y2, + uint8* dst_sobelx, + int width); +void SobelYRow_C(const uint8* src_y0, + const uint8* src_y1, + uint8* dst_sobely, + int width); +void SobelYRow_SSE2(const uint8* src_y0, + const uint8* src_y1, + uint8* dst_sobely, + int width); +void SobelYRow_NEON(const uint8* src_y0, + const uint8* src_y1, + uint8* dst_sobely, + int width); +void SobelRow_C(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width); +void SobelRow_SSE2(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width); +void SobelRow_NEON(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width); +void SobelRow_MSA(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width); +void SobelToPlaneRow_C(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_y, + int width); +void SobelToPlaneRow_SSE2(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_y, + int width); +void SobelToPlaneRow_NEON(const uint8* src_sobelx, + const uint8* 
src_sobely, + uint8* dst_y, + int width); +void SobelToPlaneRow_MSA(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_y, + int width); +void SobelXYRow_C(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width); +void SobelXYRow_SSE2(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width); +void SobelXYRow_NEON(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width); +void SobelXYRow_MSA(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width); +void SobelRow_Any_SSE2(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width); +void SobelRow_Any_NEON(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width); +void SobelRow_Any_MSA(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width); +void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_y, + int width); +void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_y, + int width); +void SobelToPlaneRow_Any_MSA(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_y, + int width); +void SobelXYRow_Any_SSE2(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width); +void SobelXYRow_Any_NEON(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width); +void SobelXYRow_Any_MSA(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width); void ARGBPolynomialRow_C(const uint8* src_argb, - uint8* dst_argb, const float* poly, + uint8* dst_argb, + const float* poly, int width); void ARGBPolynomialRow_SSE2(const uint8* src_argb, - uint8* dst_argb, const float* poly, + uint8* dst_argb, + const float* poly, int width); void ARGBPolynomialRow_AVX2(const uint8* src_argb, - uint8* dst_argb, const float* poly, + uint8* dst_argb, + const float* poly, int width); -void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, - const uint8* luma, uint32 lumacoeff); -void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, +// Scale and convert to half float. 
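New in this drop: the HalfFloatRow family below multiplies 16-bit samples by a float scale (for example 1.0f/1023.0f to normalize 10-bit video) and stores the results as IEEE 754 binary16 bit patterns. A scalar sketch of the semantics; the truncating float-to-half helper here is a toy with no NaN or rounding care, whereas the real kernels use F16C or NEON hardware conversion:

    #include <string.h>
    #include "libyuv/basic_types.h"  // uint16, uint32

    static uint16 FloatToHalfSketch(float f) {
      uint32 u;
      memcpy(&u, &f, 4);                             // reinterpret the bits
      uint32 sign = (u >> 16) & 0x8000;
      int exp = (int)((u >> 23) & 0xff) - 127 + 15;  // rebias the exponent
      uint32 mant = (u >> 13) & 0x3ff;               // top 10 mantissa bits
      if (exp <= 0) return (uint16)sign;              // flush to +/-0
      if (exp >= 31) return (uint16)(sign | 0x7c00);  // overflow to infinity
      return (uint16)(sign | ((uint32)exp << 10) | mant);
    }

    static void HalfFloatRowSketch(const uint16* src, uint16* dst,
                                   float scale, int width) {
      for (int i = 0; i < width; ++i) {
        dst[i] = FloatToHalfSketch((float)src[i] * scale);
      }
    }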
+void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_Any_SSE2(const uint16* src,
+                           uint16* dst,
+                           float scale,
+                           int width);
+void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_Any_AVX2(const uint16* src,
+                           uint16* dst,
+                           float scale,
+                           int width);
+void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_Any_F16C(const uint16* src,
+                           uint16* dst,
+                           float scale,
+                           int width);
+void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloat1Row_Any_F16C(const uint16* src,
+                            uint16* dst,
+                            float scale,
+                            int width);
+void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_Any_NEON(const uint16* src,
+                           uint16* dst,
+                           float scale,
+                           int width);
+void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloat1Row_Any_NEON(const uint16* src,
+                            uint16* dst,
+                            float scale,
+                            int width);
+
+void ARGBLumaColorTableRow_C(const uint8* src_argb,
+                             uint8* dst_argb,
+                             int width,
+                             const uint8* luma,
+                             uint32 lumacoeff);
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
+                                 uint8* dst_argb,
                                  int width,
-                                 const uint8* luma, uint32 lumacoeff);
+                                 const uint8* luma,
+                                 uint32 lumacoeff);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_ROW_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_ROW_H_
diff --git a/files/include/libyuv/scale.h b/files/include/libyuv/scale.h
index 102158d1..6d6b9a85 100644
--- a/files/include/libyuv/scale.h
+++ b/files/include/libyuv/scale.h
@@ -8,7 +8,7 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_SCALE_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_SCALE_H_
 #define INCLUDE_LIBYUV_SCALE_H_
 
 #include "libyuv/basic_types.h"
@@ -20,25 +20,33 @@ extern "C" {
 
 // Supported filtering.
 typedef enum FilterMode {
-  kFilterNone = 0,  // Point sample; Fastest.
-  kFilterLinear = 1,  // Filter horizontally only.
+  kFilterNone = 0,      // Point sample; Fastest.
+  kFilterLinear = 1,    // Filter horizontally only.
   kFilterBilinear = 2,  // Faster than box, but lower quality scaling down.
-  kFilterBox = 3  // Highest quality.
+  kFilterBox = 3        // Highest quality.
 } FilterModeEnum;
 
 // Scale a YUV plane.
 LIBYUV_API
-void ScalePlane(const uint8* src, int src_stride,
-                int src_width, int src_height,
-                uint8* dst, int dst_stride,
-                int dst_width, int dst_height,
+void ScalePlane(const uint8* src,
+                int src_stride,
+                int src_width,
+                int src_height,
+                uint8* dst,
+                int dst_stride,
+                int dst_width,
+                int dst_height,
                 enum FilterMode filtering);
 
 LIBYUV_API
-void ScalePlane_16(const uint16* src, int src_stride,
-                   int src_width, int src_height,
-                   uint16* dst, int dst_stride,
-                   int dst_width, int dst_height,
+void ScalePlane_16(const uint16* src,
+                   int src_stride,
+                   int src_width,
+                   int src_height,
+                   uint16* dst,
+                   int dst_stride,
+                   int dst_width,
+                   int dst_height,
                    enum FilterMode filtering);
 
 // Scales a YUV 4:2:0 image from the src width and height to the
@@ -52,42 +60,73 @@ void ScalePlane_16(const uint16* src, int src_stride,
 // Returns 0 if successful.
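The I420Scale declaration follows. As a usage illustration with invented dimensions: strides are in bytes per row, and for 4:2:0 the chroma planes are half the luma size in each direction, so their strides are half as well:

    #include "libyuv/scale.h"

    // Downscale 640x360 I420 to 320x180. Returns 0 on success.
    int ScaleExampleSketch(const uint8* src_y, const uint8* src_u,
                           const uint8* src_v, uint8* dst_y, uint8* dst_u,
                           uint8* dst_v) {
      return libyuv::I420Scale(src_y, 640, src_u, 320, src_v, 320,
                               640, 360,
                               dst_y, 320, dst_u, 160, dst_v, 160,
                               320, 180,
                               libyuv::kFilterBilinear);
    }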
 LIBYUV_API
-int I420Scale(const uint8* src_y, int src_stride_y,
-              const uint8* src_u, int src_stride_u,
-              const uint8* src_v, int src_stride_v,
-              int src_width, int src_height,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int dst_width, int dst_height,
+int I420Scale(const uint8* src_y,
+              int src_stride_y,
+              const uint8* src_u,
+              int src_stride_u,
+              const uint8* src_v,
+              int src_stride_v,
+              int src_width,
+              int src_height,
+              uint8* dst_y,
+              int dst_stride_y,
+              uint8* dst_u,
+              int dst_stride_u,
+              uint8* dst_v,
+              int dst_stride_v,
+              int dst_width,
+              int dst_height,
               enum FilterMode filtering);
 
 LIBYUV_API
-int I420Scale_16(const uint16* src_y, int src_stride_y,
-                 const uint16* src_u, int src_stride_u,
-                 const uint16* src_v, int src_stride_v,
-                 int src_width, int src_height,
-                 uint16* dst_y, int dst_stride_y,
-                 uint16* dst_u, int dst_stride_u,
-                 uint16* dst_v, int dst_stride_v,
-                 int dst_width, int dst_height,
+int I420Scale_16(const uint16* src_y,
+                 int src_stride_y,
+                 const uint16* src_u,
+                 int src_stride_u,
+                 const uint16* src_v,
+                 int src_stride_v,
+                 int src_width,
+                 int src_height,
+                 uint16* dst_y,
+                 int dst_stride_y,
+                 uint16* dst_u,
+                 int dst_stride_u,
+                 uint16* dst_v,
+                 int dst_stride_v,
+                 int dst_width,
+                 int dst_height,
                  enum FilterMode filtering);
 
 #ifdef __cplusplus
 // Legacy API. Deprecated.
 LIBYUV_API
-int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
-          int src_stride_y, int src_stride_u, int src_stride_v,
-          int src_width, int src_height,
-          uint8* dst_y, uint8* dst_u, uint8* dst_v,
-          int dst_stride_y, int dst_stride_u, int dst_stride_v,
-          int dst_width, int dst_height,
+int Scale(const uint8* src_y,
+          const uint8* src_u,
+          const uint8* src_v,
+          int src_stride_y,
+          int src_stride_u,
+          int src_stride_v,
+          int src_width,
+          int src_height,
+          uint8* dst_y,
+          uint8* dst_u,
+          uint8* dst_v,
+          int dst_stride_y,
+          int dst_stride_u,
+          int dst_stride_v,
+          int dst_width,
+          int dst_height,
          LIBYUV_BOOL interpolate);
 
 // Legacy API. Deprecated.
 LIBYUV_API
-int ScaleOffset(const uint8* src_i420, int src_width, int src_height,
-                uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset,
+int ScaleOffset(const uint8* src_i420,
+                int src_width,
+                int src_height,
+                uint8* dst_i420,
+                int dst_width,
+                int dst_height,
+                int dst_yoffset,
                 LIBYUV_BOOL interpolate);
 
 // For testing, allow disabling of specialized scalers.
@@ -100,4 +139,4 @@ void SetUseReferenceImpl(LIBYUV_BOOL use);
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_SCALE_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_SCALE_H_
diff --git a/files/include/libyuv/scale_argb.h b/files/include/libyuv/scale_argb.h
index b56cf520..3d25e579 100644
--- a/files/include/libyuv/scale_argb.h
+++ b/files/include/libyuv/scale_argb.h
@@ -8,7 +8,7 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_
 #define INCLUDE_LIBYUV_SCALE_ARGB_H_
 
 #include "libyuv/basic_types.h"
@@ -20,32 +20,52 @@ extern "C" {
 #endif
 
 LIBYUV_API
-int ARGBScale(const uint8* src_argb, int src_stride_argb,
-              int src_width, int src_height,
-              uint8* dst_argb, int dst_stride_argb,
-              int dst_width, int dst_height,
+int ARGBScale(const uint8* src_argb,
+              int src_stride_argb,
+              int src_width,
+              int src_height,
+              uint8* dst_argb,
+              int dst_stride_argb,
+              int dst_width,
+              int dst_height,
               enum FilterMode filtering);
 
 // Clipped scale takes destination rectangle coordinates for clip values.
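As with I420Scale above, a small invented example of the clipped call declared below: scale 1280x720 ARGB down to 640x360, but render only the left half of the destination (clip rectangle at 0,0 sized 320x360). Packed ARGB strides are width * 4 bytes here:

    #include "libyuv/scale_argb.h"

    int ClipScaleSketch(const uint8* src_argb, uint8* dst_argb) {
      return libyuv::ARGBScaleClip(src_argb, 1280 * 4, 1280, 720,
                                   dst_argb, 640 * 4, 640, 360,
                                   0, 0, 320, 360,
                                   libyuv::kFilterBox);
    }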
 LIBYUV_API
-int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
-                  int src_width, int src_height,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int dst_width, int dst_height,
-                  int clip_x, int clip_y, int clip_width, int clip_height,
+int ARGBScaleClip(const uint8* src_argb,
+                  int src_stride_argb,
+                  int src_width,
+                  int src_height,
+                  uint8* dst_argb,
+                  int dst_stride_argb,
+                  int dst_width,
+                  int dst_height,
+                  int clip_x,
+                  int clip_y,
+                  int clip_width,
+                  int clip_height,
                   enum FilterMode filtering);
 
 // Scale with YUV conversion to ARGB and clipping.
 LIBYUV_API
-int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
-                       const uint8* src_u, int src_stride_u,
-                       const uint8* src_v, int src_stride_v,
+int YUVToARGBScaleClip(const uint8* src_y,
+                       int src_stride_y,
+                       const uint8* src_u,
+                       int src_stride_u,
+                       const uint8* src_v,
+                       int src_stride_v,
                        uint32 src_fourcc,
-                       int src_width, int src_height,
-                       uint8* dst_argb, int dst_stride_argb,
+                       int src_width,
+                       int src_height,
+                       uint8* dst_argb,
+                       int dst_stride_argb,
                        uint32 dst_fourcc,
-                       int dst_width, int dst_height,
-                       int clip_x, int clip_y, int clip_width, int clip_height,
+                       int dst_width,
+                       int dst_height,
+                       int clip_x,
+                       int clip_y,
+                       int clip_width,
+                       int clip_height,
                        enum FilterMode filtering);
 
 #ifdef __cplusplus
@@ -53,4 +73,4 @@ int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_SCALE_ARGB_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_SCALE_ARGB_H_
diff --git a/files/include/libyuv/scale_row.h b/files/include/libyuv/scale_row.h
index df699e6c..edb46cc8 100644
--- a/files/include/libyuv/scale_row.h
+++ b/files/include/libyuv/scale_row.h
@@ -8,7 +8,7 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_
 #define INCLUDE_LIBYUV_SCALE_ROW_H_
 
 #include "libyuv/basic_types.h"
@@ -45,8 +45,8 @@ extern "C" {
 #endif  // __clang__
 
 // Visual C 2012 required for AVX2.
-#if defined(_M_IX86) && !defined(__clang__) && \
-    defined(_MSC_VER) && _MSC_VER >= 1700
+#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
+    _MSC_VER >= 1700
 #define VISUALC_HAS_AVX2 1
 #endif  // VisualStudio >= 2012
 
@@ -72,8 +72,9 @@ extern "C" {
 // The following are available on all x86 platforms, but
 // require VS2012, clang 3.4 or gcc 4.7.
 // The code supports NaCL but requires a new compiler and validator.
-#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
-    defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
+    defined(GCC_HAS_AVX2))
 #define HAS_SCALEADDROW_AVX2
 #define HAS_SCALEROWDOWN2_AVX2
 #define HAS_SCALEROWDOWN4_AVX2
@@ -94,32 +95,56 @@
 #endif
 
 // The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
+#if !defined(LIBYUV_DISABLE_DSPR2) && !defined(__native_client__) && \
     defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
 #define HAS_SCALEROWDOWN2_DSPR2
 #define HAS_SCALEROWDOWN4_DSPR2
 #define HAS_SCALEROWDOWN34_DSPR2
 #define HAS_SCALEROWDOWN38_DSPR2
+#define HAS_SCALEADDROW_DSPR2
+#endif
+
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_SCALEARGBROWDOWN2_MSA
+#define HAS_SCALEARGBROWDOWNEVEN_MSA
+#define HAS_SCALEROWDOWN2_MSA
+#define HAS_SCALEROWDOWN4_MSA
+#define HAS_SCALEROWDOWN38_MSA
+#define HAS_SCALEADDROW_MSA
 #endif
 
 // Scale ARGB vertically with bilinear interpolation.
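Before the vertical-scale helpers below, note how the HAS_ blocks above are meant to be used: the #if lines gate what gets compiled for a target, and at runtime the scalers pair each HAS_ macro with TestCpuFlag from libyuv/cpu_id.h, falling back to the _C kernel. A sketch of the selection idiom; ScaleRowDown2_MSA is declared further down this header, and the exact flag spelling is an assumption taken from this drop's cpu_id.h:

    #include "libyuv/cpu_id.h"
    #include "libyuv/scale_row.h"

    typedef void (*ScaleRowDown2Fn)(const uint8* src_ptr, ptrdiff_t src_stride,
                                    uint8* dst, int dst_width);

    static ScaleRowDown2Fn PickScaleRowDown2Sketch(void) {
      ScaleRowDown2Fn fn = libyuv::ScaleRowDown2_C;  // portable fallback
    #if defined(HAS_SCALEROWDOWN2_MSA)
      if (libyuv::TestCpuFlag(libyuv::kCpuHasMSA)) {
        fn = libyuv::ScaleRowDown2_MSA;  // MIPS SIMD Architecture build
      }
    #endif
      return fn;
    }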
void ScalePlaneVertical(int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int y, int dy, - int bpp, enum FilterMode filtering); + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8* src_argb, + uint8* dst_argb, + int x, + int y, + int dy, + int bpp, + enum FilterMode filtering); void ScalePlaneVertical_16(int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_argb, uint16* dst_argb, - int x, int y, int dy, - int wpp, enum FilterMode filtering); + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16* src_argb, + uint16* dst_argb, + int x, + int y, + int dy, + int wpp, + enum FilterMode filtering); // Simplify the filtering based on scale factors. -enum FilterMode ScaleFilterReduce(int src_width, int src_height, - int dst_width, int dst_height, +enum FilterMode ScaleFilterReduce(int src_width, + int src_height, + int dst_width, + int dst_height, enum FilterMode filtering); // Divide num by div and return as 16.16 fixed point result. @@ -137,367 +162,768 @@ int FixedDiv1_X86(int num, int div); #endif // Compute slope values for stepping. -void ScaleSlope(int src_width, int src_height, - int dst_width, int dst_height, +void ScaleSlope(int src_width, + int src_height, + int dst_width, + int dst_height, enum FilterMode filtering, - int* x, int* y, int* dx, int* dy); - -void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* d, int dst_width); -void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* d, int dst_width); -void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); -void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx); -void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int, int); -void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, 
int, int); -void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); -void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx); -void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); -void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx); -void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); + int* x, + int* y, + int* dx, + int* dy); + +void ScaleRowDown2_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleRowDown2_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst, + int dst_width); +void ScaleRowDown2Linear_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleRowDown2Linear_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst, + int dst_width); +void ScaleRowDown2Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleRowDown2Box_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst, + int dst_width); +void ScaleRowDown4_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleRowDown4_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst, + int dst_width); +void ScaleRowDown4Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleRowDown4Box_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst, + int dst_width); +void ScaleRowDown34_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleRowDown34_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst, + int dst_width); +void ScaleRowDown34_0_Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* d, + int dst_width); +void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* d, + int dst_width); +void ScaleRowDown34_1_Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* d, + int dst_width); +void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* d, + int dst_width); +void ScaleCols_C(uint8* dst_ptr, + const uint8* src_ptr, + int dst_width, + int x, + int dx); +void ScaleCols_16_C(uint16* dst_ptr, + const uint16* src_ptr, + int dst_width, + int x, + int dx); +void ScaleColsUp2_C(uint8* dst_ptr, + const uint8* src_ptr, + int dst_width, + int, + int); +void ScaleColsUp2_16_C(uint16* dst_ptr, + const uint16* src_ptr, + int dst_width, + int, + int); +void ScaleFilterCols_C(uint8* dst_ptr, + const uint8* src_ptr, + int dst_width, + int x, + int dx); +void ScaleFilterCols_16_C(uint16* dst_ptr, + const uint16* src_ptr, + int dst_width, + int x, + int dx); +void ScaleFilterCols64_C(uint8* dst_ptr, + const uint8* src_ptr, + int dst_width, + int x, + int dx); +void ScaleFilterCols64_16_C(uint16* dst_ptr, + const uint16* src_ptr, + int dst_width, + int x, + int dx); +void ScaleRowDown38_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleRowDown38_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst, + int dst_width); void ScaleRowDown38_3_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int 
dst_width); + uint8* dst_ptr, + int dst_width); void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); -void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); + uint16* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst_ptr, + int dst_width); void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width); void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width); void ScaleARGBRowDown2_C(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); + uint8* dst_argb, + int dst_width); void ScaleARGBRowDown2Linear_C(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_C(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, + int dst_width); +void ScaleARGBRowDownEven_C(const uint8* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); + uint8* dst_argb, + int dst_width); void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int, int); -void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); + uint8* dst_argb, + int dst_width); +void ScaleARGBCols_C(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBCols64_C(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBColsUp2_C(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int, + int); +void ScaleARGBFilterCols_C(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols64_C(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx); // Specialized scalers for x86. 
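Before the x86 specializations below, one note on the x and dx parameters threaded through the Cols functions above: they are 16.16 fixed point, the format the FixedDiv helpers earlier in this header produce. A standalone sketch of the convention (all names and constants here are illustrative):

#include <stdio.h>

// 16.16 fixed point: upper 16 bits integer part, lower 16 bits fraction.
enum { kFractionBits = 16, kFixedOne = 1 << kFractionBits };

int main(void) {
  // Stepping 640 source pixels across 480 destination pixels:
  int dx = (640 << kFractionBits) / 480;  // 87381, i.e. ~1.3333 in 16.16
  int x = 0;  // starting phase; ScaleSlope may offset this for centering
  for (int i = 0; i < 4; ++i, x += dx) {
    printf("dst col %d <- src col %d + %d/65536\n", i, x >> kFractionBits,
           x & (kFixedOne - 1));
  }
  return 0;
}

The x86 declarations resume below.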
-void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - -void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown2_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown2_AVX2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown2Box_AVX2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown4_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown4_AVX2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown4Box_AVX2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); + +void ScaleRowDown34_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8* dst_ptr, + int dst_width); void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8* dst_ptr, + int dst_width); +void ScaleRowDown38_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8* dst_ptr, + int dst_width); void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - 
uint8* dst_ptr, int dst_width); -void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - -void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8* dst_ptr, + int dst_width); +void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); + +void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8* dst_ptr, + int dst_width); void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8* dst_ptr, + int dst_width); +void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8* dst_ptr, + int dst_width); void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8* dst_ptr, + int dst_width); void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width); void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width); void ScaleAddRow_Any_SSE2(const uint8* 
src_ptr, uint16* dst_ptr, int src_width); void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width); -void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); -void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); - +void ScaleFilterCols_SSSE3(uint8* dst_ptr, + const uint8* src_ptr, + int dst_width, + int x, + int dx); +void ScaleColsUp2_SSE2(uint8* dst_ptr, + const uint8* src_ptr, + int dst_width, + int x, + int dx); // ARGB Column functions -void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); +void ScaleARGBCols_SSE2(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_NEON(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBCols_NEON(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBCols_Any_NEON(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx); // ARGB Row functions -void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, + int dst_width); +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, + int dst_width); +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleARGBRowDown2_MSA(const uint8_t* 
src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, + int dst_width); void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); + uint8* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, + int dst_width); +void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); - -void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleARGBRowDown2_Any_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Linear_Any_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); + +void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, + int dst_width); +void ScaleARGBRowDownEven_NEON(const uint8* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, + int dst_width); +void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_MSA(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, + int dst_width); +void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); + uint8* dst_argb, + int dst_width); void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, + 
int dst_width); +void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); + uint8* dst_argb, + int dst_width); void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); + uint8* dst_argb, + int dst_width); +void ScaleARGBRowDownEven_Any_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_Any_MSA(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, + int dst_width); // ScaleRowDown2Box also used by planar functions // NEON downscalers with interpolation. // Note - not static due to reuse in convert for 444 to 420. -void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); - -void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown2_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleRowDown2Linear_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleRowDown2Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); + +void ScaleRowDown4_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown4Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); // Down scale from 4 to 3 pixels. Use the neon multilane read/write // to load every 4th pixel into 4 different registers. // Point samples 32 pixels to 24 pixels.
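For reference, a plain-C sketch of the selection that comment describes (hypothetical name ScaleRowDown34_Sketch; the pixel choice mirrors the library's C reference ScaleRowDown34_C, stated here as a sketch rather than taken from this patch). The NEON declarations, which do the same thing 32 pixels at a time with multilane loads, follow.

#include <stdint.h>

// 4 -> 3 point sample: from every 4 source pixels keep pixels 0, 1 and 3.
void ScaleRowDown34_Sketch(const uint8_t* src_ptr, uint8_t* dst,
                           int dst_width) {  // dst_width a multiple of 3
  for (int x = 0; x < dst_width; x += 3) {
    dst[0] = src_ptr[0];
    dst[1] = src_ptr[1];
    dst[2] = src_ptr[3];
    dst += 3;
    src_ptr += 4;
  }
}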
void ScaleRowDown34_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8* dst_ptr, + int dst_width); void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8* dst_ptr, + int dst_width); void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8* dst_ptr, + int dst_width); // 32 -> 12 void ScaleRowDown38_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8* dst_ptr, + int dst_width); // 32x3 -> 12x1 void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8* dst_ptr, + int dst_width); // 32x2 -> 12x1 void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - -void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8* dst_ptr, + int dst_width); + +void ScaleRowDown2_Any_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleRowDown4_Any_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown34_Any_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); // 32 -> 12 -void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown38_Any_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); // 32x3 -> 12x1 -void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); // 32x2 -> 12x1 -void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_Any_NEON(const uint8* 
src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width); void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width); -void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); - -void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); - -void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleFilterCols_NEON(uint8* dst_ptr, + const uint8* src_ptr, + int dst_width, + int x, + int dx); + +void ScaleFilterCols_Any_NEON(uint8* dst_ptr, + const uint8* src_ptr, + int dst_width, + int x, + int dx); + +void ScaleRowDown2_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleRowDown4_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleRowDown34_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* d, + int dst_width); +void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* d, + int dst_width); +void ScaleRowDown38_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width); +void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width); +void ScaleAddRow_DSPR2(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void ScaleAddRow_Any_DSPR2(const uint8* src_ptr, + uint16* dst_ptr, + int src_width); + +void ScaleRowDown2_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown38_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + 
uint8_t* dst, + int dst_width); +void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Linear_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown38_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown38_2_Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleAddRow_Any_MSA(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ NOLINT +#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ diff --git a/files/include/libyuv/version.h b/files/include/libyuv/version.h index ca0c062e..dccc479b 100644 --- a/files/include/libyuv/version.h +++ b/files/include/libyuv/version.h @@ -8,9 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1602 +#define LIBYUV_VERSION 1645 -#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT +#endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/files/include/libyuv/video_common.h b/files/include/libyuv/video_common.h index ad934e42..f3711c42 100644 --- a/files/include/libyuv/video_common.h +++ b/files/include/libyuv/video_common.h @@ -10,7 +10,7 @@ // Common definitions for video, including fourcc and VideoFormat. -#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ #define INCLUDE_LIBYUV_VIDEO_COMMON_H_ #include "libyuv/basic_types.h" @@ -28,13 +28,13 @@ extern "C" { // Needs to be a macro otherwise the OS X compiler complains when the kFormat* // constants are used in a switch. #ifdef __cplusplus -#define FOURCC(a, b, c, d) ( \ - (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \ - (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24)) +#define FOURCC(a, b, c, d) \ + ((static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \ + (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24)) #else -#define FOURCC(a, b, c, d) ( \ - ((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \ - ((uint32)(c) << 16) | ((uint32)(d) << 24)) /* NOLINT */ +#define FOURCC(a, b, c, d) \ + (((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \ + ((uint32)(c) << 16) | ((uint32)(d) << 24)) /* NOLINT */ #endif // Some pages discussing FourCC codes: @@ -49,18 +49,18 @@ extern "C" { // Secondary formats are converted in 2 steps. // Auxilliary formats call primary converters. enum FourCC { - // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed. 
+ // 8 Primary YUV formats: 5 planar, 2 biplanar, 2 packed. FOURCC_I420 = FOURCC('I', '4', '2', '0'), FOURCC_I422 = FOURCC('I', '4', '2', '2'), FOURCC_I444 = FOURCC('I', '4', '4', '4'), - FOURCC_I411 = FOURCC('I', '4', '1', '1'), + FOURCC_I411 = FOURCC('I', '4', '1', '1'), // deprecated. FOURCC_I400 = FOURCC('I', '4', '0', '0'), FOURCC_NV21 = FOURCC('N', 'V', '2', '1'), FOURCC_NV12 = FOURCC('N', 'V', '1', '2'), FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), - // 2 Secondary YUV formats: row biplanar. + // 1 Secondary YUV format: row biplanar. FOURCC_M420 = FOURCC('M', '4', '2', '0'), FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated. @@ -69,7 +69,7 @@ enum FourCC { FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), - FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), + FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'), FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE. FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE. @@ -137,7 +137,7 @@ enum FourCCBpp { FOURCC_BPP_ABGR = 32, FOURCC_BPP_RGBA = 32, FOURCC_BPP_24BG = 24, - FOURCC_BPP_RAW = 24, + FOURCC_BPP_RAW = 24, FOURCC_BPP_RGBP = 16, FOURCC_BPP_RGBO = 16, FOURCC_BPP_R444 = 16, @@ -170,7 +170,7 @@ enum FourCCBpp { FOURCC_BPP_CM24 = 24, // Match any fourcc. - FOURCC_BPP_ANY = 0, // 0 means unknown. + FOURCC_BPP_ANY = 0, // 0 means unknown. }; // Converts fourcc aliases into canonical ones. @@ -181,4 +181,4 @@ LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc); } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ NOLINT +#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ diff --git a/files/infra/config/OWNERS b/files/infra/config/OWNERS new file mode 100644 index 00000000..02eccd5e --- /dev/null +++ b/files/infra/config/OWNERS @@ -0,0 +1,3 @@ +set noparent +agable@chromium.org +kjellander@chromium.org diff --git a/files/infra/config/README.md b/files/infra/config/README.md new file mode 100644 index 00000000..c036d610 --- /dev/null +++ b/files/infra/config/README.md @@ -0,0 +1 @@ +This directory contains configuration files for infra services. diff --git a/files/infra/config/cq.cfg b/files/infra/config/cq.cfg new file mode 100644 index 00000000..7a0d2d84 --- /dev/null +++ b/files/infra/config/cq.cfg @@ -0,0 +1,61 @@ +# Commit Queue configuration file. The documentation of the format can be found +# at http://luci-config.appspot.com/schemas/projects/refs:cq.cfg. 
+ +version: 1 +cq_name: "libyuv" +cq_status_url: "https://chromium-cq-status.appspot.com" +git_repo_url: "https://chromium.googlesource.com/libyuv/libyuv.git" + +gerrit {} +rietveld { + url: "https://codereview.chromium.org" +} + + +verifiers { + reviewer_lgtm { + committer_list: "project-libyuv-committers" + } + + try_job { + buckets { + name: "master.tryserver.libyuv" + builders { name: "win" } + builders { name: "win_rel" } + builders { name: "win_x64_rel" } + builders { name: "win_clang" } + builders { name: "win_clang_rel" } + builders { name: "win_x64_clang_rel" } + builders { name: "mac" } + builders { name: "mac_rel" } + builders { name: "mac_asan" } + builders { name: "ios" } + builders { name: "ios_rel" } + builders { name: "ios_arm64" } + builders { name: "ios_arm64_rel" } + builders { name: "linux" } + builders { name: "linux_rel" } + builders { + name: "linux_gcc" + experiment_percentage: 100 + } + builders { name: "linux_memcheck" } + builders { name: "linux_msan" } + builders { name: "linux_tsan2" } + builders { name: "linux_asan" } + builders { name: "linux_msan" } + builders { name: "linux_ubsan" } + builders { name: "linux_ubsan_vptr" } + builders { name: "android" } + builders { name: "android_rel" } + builders { name: "android_clang" } + builders { name: "android_arm64" } + builders { name: "android_x86" } + builders { name: "android_x64" } + builders { + name: "android_mips" + experiment_percentage: 100 + } + } + } +} diff --git a/files/libyuv.gni b/files/libyuv.gni new file mode 100644 index 00000000..89e4d382 --- /dev/null +++ b/files/libyuv.gni @@ -0,0 +1,20 @@ +# Copyright 2016 The LibYuv Project Authors. All rights reserved. +# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. All contributing project authors may +# be found in the AUTHORS file in the root of the source tree. + +import("//build_overrides/build.gni") +import("//build/config/arm.gni") +import("//build/config/mips.gni") + +declare_args() { + libyuv_include_tests = !build_with_chromium + libyuv_disable_jpeg = false + libyuv_use_neon = (current_cpu == "arm64" || + (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon))) + libyuv_use_msa = (current_cpu == "mips64el" || current_cpu == "mipsel") && + mips_use_msa +} diff --git a/files/libyuv.gyp b/files/libyuv.gyp index db4b5490..f73a1a4b 100644 --- a/files/libyuv.gyp +++ b/files/libyuv.gyp @@ -18,21 +18,28 @@ }, 'variables': { 'use_system_libjpeg%': 0, - 'libyuv_disable_jpeg%': 0, + # Can be enabled if your jpeg has GYP support. + 'libyuv_disable_jpeg%': 1, # 'chromium_code' treats libyuv as internal and increases warning level. 'chromium_code': 1, # clang compiler default variable usable by other apps that include libyuv. 'clang%': 0, # Link-Time Optimizations. 'use_lto%': 0, + 'mips_msa%': 0, # Default to msa off. 
'build_neon': 0, + 'build_msa': 0, 'conditions': [ ['(target_arch == "armv7" or target_arch == "armv7s" or \ (target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\ - and (arm_neon == 1 or arm_neon_optional == 1)', - { + and (arm_neon == 1 or arm_neon_optional == 1)', { 'build_neon': 1, }], + ['(target_arch == "mipsel" or target_arch == "mips64el")\ + and (mips_msa == 1)', + { + 'build_msa': 1, + }], ], }, @@ -80,6 +87,11 @@ }], ], }], + ['build_msa != 0', { + 'defines': [ + 'LIBYUV_MSA', + ], + }], ['OS != "ios" and libyuv_disable_jpeg != 1', { 'defines': [ 'HAVE_JPEG' @@ -109,7 +121,7 @@ # Enable the following 3 macros to turn off assembly for specified CPU. # 'LIBYUV_DISABLE_X86', # 'LIBYUV_DISABLE_NEON', - # 'LIBYUV_DISABLE_MIPS', + # 'LIBYUV_DISABLE_DSPR2', # Enable the following macro to build libyuv as a shared library (dll). # 'LIBYUV_USING_SHARED_LIBRARY', # TODO(fbarchard): Make these into gyp defines. diff --git a/files/libyuv.gypi b/files/libyuv.gypi index 73fdec0a..18b2feca 100644 --- a/files/libyuv.gypi +++ b/files/libyuv.gypi @@ -18,6 +18,7 @@ 'include/libyuv/convert_from.h', 'include/libyuv/convert_from_argb.h', 'include/libyuv/cpu_id.h', + 'include/libyuv/macros_msa.h', 'include/libyuv/mjpeg_decoder.h', 'include/libyuv/planar_functions.h', 'include/libyuv/rotate.h', @@ -53,14 +54,16 @@ 'source/rotate_argb.cc', 'source/rotate_common.cc', 'source/rotate_gcc.cc', - 'source/rotate_mips.cc', + 'source/rotate_dspr2.cc', + 'source/rotate_msa.cc', 'source/rotate_neon.cc', 'source/rotate_neon64.cc', 'source/rotate_win.cc', 'source/row_any.cc', 'source/row_common.cc', 'source/row_gcc.cc', - 'source/row_mips.cc', + 'source/row_dspr2.cc', + 'source/row_msa.cc', 'source/row_neon.cc', 'source/row_neon64.cc', 'source/row_win.cc', @@ -69,7 +72,8 @@ 'source/scale_argb.cc', 'source/scale_common.cc', 'source/scale_gcc.cc', - 'source/scale_mips.cc', + 'source/scale_dspr2.cc', + 'source/scale_msa.cc', 'source/scale_neon.cc', 'source/scale_neon64.cc', 'source/scale_win.cc', diff --git a/files/libyuv_test.gyp b/files/libyuv_test.gyp index 27b330f6..88860f5c 100644 --- a/files/libyuv_test.gyp +++ b/files/libyuv_test.gyp @@ -8,7 +8,9 @@ { 'variables': { - 'libyuv_disable_jpeg%': 0, + # Can be enabled if your jpeg has GYP support. + 'libyuv_disable_jpeg%': 1, + 'mips_msa%': 0, # Default to msa off. }, 'targets': [ { @@ -52,11 +54,6 @@ '-fexceptions', ], }], - [ 'OS == "ios" and target_subarch == 64', { - 'defines': [ - 'LIBYUV_DISABLE_NEON' - ], - }], [ 'OS == "ios"', { 'xcode_settings': { 'DEBUGGING_SYMBOLS': 'YES', @@ -91,12 +88,18 @@ 'LIBYUV_NEON' ], }], + [ '(target_arch == "mipsel" or target_arch == "mips64el") \ + and (mips_msa == 1)', { + 'defines': [ + 'LIBYUV_MSA' + ], + }], ], # conditions 'defines': [ # Enable the following 3 macros to turn off assembly for specified CPU. # 'LIBYUV_DISABLE_X86', # 'LIBYUV_DISABLE_NEON', - # 'LIBYUV_DISABLE_MIPS', + # 'LIBYUV_DISABLE_DSPR2', # Enable the following macro to build libyuv as a shared library (dll). # 'LIBYUV_USING_SHARED_LIBRARY', ], @@ -151,12 +154,6 @@ 'libyuv.gyp:libyuv', ], 'conditions': [ - [ 'OS == "ios" and target_subarch == 64', { - 'defines': [ - 'LIBYUV_DISABLE_NEON' - ], - }], - [ 'OS != "ios" and libyuv_disable_jpeg != 1', { 'defines': [ 'HAVE_JPEG', @@ -181,40 +178,16 @@ ['OS=="android"', { 'targets': [ { - # TODO(kjellander): Figure out what to change in build/apk_test.gypi - # to it can be used instead of the copied code below. 
Using it in its - # current version was not possible, since the target starts with 'lib', - # which somewhere confuses the variables. - 'target_name': 'libyuv_unittest_apk', + 'target_name': 'yuv_unittest_apk', 'type': 'none', 'variables': { - # These are used to configure java_apk.gypi included below. - 'test_type': 'gtest', - 'apk_name': 'libyuv_unittest', - 'test_suite_name': 'libyuv_unittest', - 'intermediate_dir': '<(PRODUCT_DIR)/libyuv_unittest_apk', - 'input_shlib_path': '<(SHARED_LIB_DIR)/<(SHARED_LIB_PREFIX)libyuv_unittest<(SHARED_LIB_SUFFIX)', - 'final_apk_path': '<(intermediate_dir)/libyuv_unittest-debug.apk', - 'java_in_dir': '<(DEPTH)/testing/android/native_test/java', - 'test_runner_path': '<(DEPTH)/util/android/test_runner.py', - 'native_lib_target': 'libyuv_unittest', - 'gyp_managed_install': 0, + 'test_suite_name': 'yuv_unittest', + 'input_shlib_path': '<(SHARED_LIB_DIR)/(SHARED_LIB_PREFIX)libyuv_unittest<(SHARED_LIB_SUFFIX)', }, 'includes': [ - 'build/android/test_runner.gypi', - 'build/java_apk.gypi', - ], + 'build/apk_test.gypi', + ], 'dependencies': [ - '<(DEPTH)/base/base.gyp:base_java', - # TODO(kjellander): Figure out why base_build_config_gen is needed - # here. It really shouldn't since it's a dependency of base_java - # above, but there's always 0 tests run if it's missing. - '<(DEPTH)/base/base.gyp:base_build_config_gen', - '<(DEPTH)/build/android/pylib/device/commands/commands.gyp:chromium_commands', - '<(DEPTH)/build/android/pylib/remote/device/dummy/dummy.gyp:remote_device_dummy_apk', - '<(DEPTH)/testing/android/appurify_support.gyp:appurify_support_java', - '<(DEPTH)/testing/android/on_device_instrumentation.gyp:reporter_java', - '<(DEPTH)/tools/android/android_tools.gyp:android_tools', 'libyuv_unittest', ], }, diff --git a/files/linux.mk b/files/linux.mk index ee5a3a70..923345ae 100644 --- a/files/linux.mk +++ b/files/linux.mk @@ -32,14 +32,14 @@ LOCAL_OBJ_FILES := \ source/rotate.o \ source/rotate_common.o \ source/rotate_gcc.o \ - source/rotate_mips.o \ + source/rotate_dspr2.o \ source/rotate_neon64.o \ source/rotate_neon.o \ source/rotate_win.o \ source/row_any.o \ source/row_common.o \ source/row_gcc.o \ - source/row_mips.o \ + source/row_dspr2.o \ source/row_neon64.o \ source/row_neon.o \ source/row_win.o \ @@ -48,7 +48,7 @@ LOCAL_OBJ_FILES := \ source/scale.o \ source/scale_common.o \ source/scale_gcc.o \ - source/scale_mips.o \ + source/scale_dspr2.o \ source/scale_neon64.o \ source/scale_neon.o \ source/scale_win.o \ @@ -74,6 +74,8 @@ psnr: util/psnr.cc $(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/psnr.cc util/psnr_main.cc util/ssim.cc # A C test utility that uses libyuv conversion from C. +# gcc 4.4 and older require -fno-exceptions to avoid link error on __gxx_personality_v0 +# CC=gcc-4.4 CXXFLAGS=-fno-exceptions CXX=g++-4.4 make -f linux.mk cpuid: util/cpuid.c libyuv.a $(CC) $(CFLAGS) -o $@ util/cpuid.c libyuv.a diff --git a/files/pylintrc b/files/pylintrc new file mode 100644 index 00000000..b8bea334 --- /dev/null +++ b/files/pylintrc @@ -0,0 +1,17 @@ +[MESSAGES CONTROL] + +# Disable the message, report, category or checker with the given id(s). +# TODO(kjellander): Reduce this list to as small as possible. 
+disable=I0010,I0011,bad-continuation,broad-except,duplicate-code,eval-used,exec-used,fixme,invalid-name,missing-docstring,no-init,no-member,too-few-public-methods,too-many-ancestors,too-many-arguments,too-many-branches,too-many-function-args,too-many-instance-attributes,too-many-lines,too-many-locals,too-many-public-methods,too-many-return-statements,too-many-statements + + +[REPORTS] + +# Don't write out full reports, just messages. +reports=no + + +[FORMAT] + +# We use two spaces for indents, instead of the usual four spaces or tab. +indent-string=' ' diff --git a/files/source/compare.cc b/files/source/compare.cc index e3846bdf..1facd27b 100644 --- a/files/source/compare.cc +++ b/files/source/compare.cc @@ -32,8 +32,7 @@ LIBYUV_API uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { const int kBlockSize = 1 << 15; // 32768; int remainder; - uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = - HashDjb2_C; + uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C; #if defined(HAS_HASHDJB2_SSE41) if (TestCpuFlag(kCpuHasSSE41)) { HashDjb2_SSE = HashDjb2_SSE41; @@ -50,13 +49,13 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { src += kBlockSize; count -= kBlockSize; } - remainder = (int)(count) & ~15; + remainder = (int)count & ~15; if (remainder) { seed = HashDjb2_SSE(src, remainder, seed); src += remainder; count -= remainder; } - remainder = (int)(count) & 15; + remainder = (int)count & 15; if (remainder) { seed = HashDjb2_C(src, remainder, seed); } @@ -113,7 +112,8 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) { // TODO(fbarchard): Refactor into row function. LIBYUV_API -uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, +uint64 ComputeSumSquareError(const uint8* src_a, + const uint8* src_b, int count) { // SumSquareError returns values 0 to 65535 for each squared difference. // Up to 65536 of those can be summed and remain within a uint32. @@ -142,7 +142,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, } #endif #ifdef _OPENMP -#pragma omp parallel for reduction(+: sse) +#pragma omp parallel for reduction(+ : sse) #endif for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { sse += SumSquareError(src_a + i, src_b + i, kBlockSize); @@ -162,14 +162,16 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, } LIBYUV_API -uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height) { +uint64 ComputeSumSquareErrorPlane(const uint8* src_a, + int stride_a, + const uint8* src_b, + int stride_b, + int width, + int height) { uint64 sse = 0; int h; // Coalesce rows. 
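The "Coalesce rows." comment above marks a small recurring libyuv optimization, which the reflowed hunk below applies: when both strides equal the width there is no row padding, so the plane can be walked as one long row and the SIMD inner loop runs once with a single large count. A standalone sketch (SumPlane is a hypothetical example, not a libyuv function):

#include <stdint.h>

// If rows are contiguous (stride == width), fold the row loop away so the
// inner loop processes the whole plane as one long row.
uint64_t SumPlane(const uint8_t* p, int stride, int width, int height) {
  if (stride == width) {  // coalesce rows
    width *= height;
    height = 1;
    stride = 0;
  }
  uint64_t sum = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) sum += p[x];
    p += stride;
  }
  return sum;
}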
- if (stride_a == width && - stride_b == width) { + if (stride_a == width && stride_b == width) { width *= height; height = 1; stride_a = stride_b = 0; @@ -186,10 +188,10 @@ LIBYUV_API double SumSquareErrorToPsnr(uint64 sse, uint64 count) { double psnr; if (sse > 0) { - double mse = (double)(count) / (double)(sse); + double mse = (double)count / (double)sse; psnr = 10.0 * log10(255.0 * 255.0 * mse); } else { - psnr = kMaxPsnr; // Limit to prevent divide by 0 + psnr = kMaxPsnr; // Limit to prevent divide by 0 } if (psnr > kMaxPsnr) @@ -199,45 +201,53 @@ double SumSquareErrorToPsnr(uint64 sse, uint64 count) { } LIBYUV_API -double CalcFramePsnr(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height) { +double CalcFramePsnr(const uint8* src_a, + int stride_a, + const uint8* src_b, + int stride_b, + int width, + int height) { const uint64 samples = width * height; - const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a, - src_b, stride_b, - width, height); + const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b, + stride_b, width, height); return SumSquareErrorToPsnr(sse, samples); } LIBYUV_API -double I420Psnr(const uint8* src_y_a, int stride_y_a, - const uint8* src_u_a, int stride_u_a, - const uint8* src_v_a, int stride_v_a, - const uint8* src_y_b, int stride_y_b, - const uint8* src_u_b, int stride_u_b, - const uint8* src_v_b, int stride_v_b, - int width, int height) { - const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a, - src_y_b, stride_y_b, - width, height); +double I420Psnr(const uint8* src_y_a, + int stride_y_a, + const uint8* src_u_a, + int stride_u_a, + const uint8* src_v_a, + int stride_v_a, + const uint8* src_y_b, + int stride_y_b, + const uint8* src_u_b, + int stride_u_b, + const uint8* src_v_b, + int stride_v_b, + int width, + int height) { + const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a, src_y_b, + stride_y_b, width, height); const int width_uv = (width + 1) >> 1; const int height_uv = (height + 1) >> 1; - const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a, - src_u_b, stride_u_b, - width_uv, height_uv); - const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a, - src_v_b, stride_v_b, - width_uv, height_uv); + const uint64 sse_u = ComputeSumSquareErrorPlane( + src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv); + const uint64 sse_v = ComputeSumSquareErrorPlane( + src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv); const uint64 samples = width * height + 2 * (width_uv * height_uv); const uint64 sse = sse_y + sse_u + sse_v; return SumSquareErrorToPsnr(sse, samples); } -static const int64 cc1 = 26634; // (64^2*(.01*255)^2 +static const int64 cc1 = 26634; // (64^2*(.01*255)^2 static const int64 cc2 = 239708; // (64^2*(.03*255)^2 -static double Ssim8x8_C(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b) { +static double Ssim8x8_C(const uint8* src_a, + int stride_a, + const uint8* src_b, + int stride_b) { int64 sum_a = 0; int64 sum_b = 0; int64 sum_sq_a = 0; @@ -270,12 +280,12 @@ static double Ssim8x8_C(const uint8* src_a, int stride_a, const int64 ssim_n = (2 * sum_a_x_sum_b + c1) * (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2); - const int64 sum_a_sq = sum_a*sum_a; - const int64 sum_b_sq = sum_b*sum_b; + const int64 sum_a_sq = sum_a * sum_a; + const int64 sum_b_sq = sum_b * sum_b; - const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) * - (count * sum_sq_a - sum_a_sq + - count * sum_sq_b - sum_b_sq + c2); + const 
int64 ssim_d = + (sum_a_sq + sum_b_sq + c1) * + (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2); if (ssim_d == 0.0) { return DBL_MAX; @@ -288,13 +298,16 @@ static double Ssim8x8_C(const uint8* src_a, int stride_a, // on the 4x4 pixel grid. Such arrangement allows the windows to overlap // block boundaries to penalize blocking artifacts. LIBYUV_API -double CalcFrameSsim(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height) { +double CalcFrameSsim(const uint8* src_a, + int stride_a, + const uint8* src_b, + int stride_b, + int width, + int height) { int samples = 0; double ssim_total = 0; - double (*Ssim8x8)(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b) = Ssim8x8_C; + double (*Ssim8x8)(const uint8* src_a, int stride_a, const uint8* src_b, + int stride_b) = Ssim8x8_C; // sample point start with each 4x4 location int i; @@ -314,22 +327,27 @@ double CalcFrameSsim(const uint8* src_a, int stride_a, } LIBYUV_API -double I420Ssim(const uint8* src_y_a, int stride_y_a, - const uint8* src_u_a, int stride_u_a, - const uint8* src_v_a, int stride_v_a, - const uint8* src_y_b, int stride_y_b, - const uint8* src_u_b, int stride_u_b, - const uint8* src_v_b, int stride_v_b, - int width, int height) { - const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a, - src_y_b, stride_y_b, width, height); +double I420Ssim(const uint8* src_y_a, + int stride_y_a, + const uint8* src_u_a, + int stride_u_a, + const uint8* src_v_a, + int stride_v_a, + const uint8* src_y_b, + int stride_y_b, + const uint8* src_u_b, + int stride_u_b, + const uint8* src_v_b, + int stride_v_b, + int width, + int height) { + const double ssim_y = + CalcFrameSsim(src_y_a, stride_y_a, src_y_b, stride_y_b, width, height); const int width_uv = (width + 1) >> 1; const int height_uv = (height + 1) >> 1; - const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, - src_u_b, stride_u_b, + const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv); - const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, - src_v_b, stride_v_b, + const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv); return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v); } diff --git a/files/source/compare_gcc.cc b/files/source/compare_gcc.cc index 1b83edb1..64522aaa 100644 --- a/files/source/compare_gcc.cc +++ b/files/source/compare_gcc.cc @@ -62,30 +62,30 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { return sse; } -static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +static uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 static uvec32 kHashMul0 = { - 0x0c3525e1, // 33 ^ 15 - 0xa3476dc1, // 33 ^ 14 - 0x3b4039a1, // 33 ^ 13 - 0x4f5f0981, // 33 ^ 12 + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 }; static uvec32 kHashMul1 = { - 0x30f35d61, // 33 ^ 11 - 0x855cb541, // 33 ^ 10 - 0x040a9121, // 33 ^ 9 - 0x747c7101, // 33 ^ 8 + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 }; static uvec32 kHashMul2 = { - 0xec41d4e1, // 33 ^ 7 - 0x4cfa3cc1, // 33 ^ 6 - 0x025528a1, // 33 ^ 5 - 0x00121881, // 33 ^ 4 + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 }; static uvec32 kHashMul3 = { - 0x00008c61, // 33 ^ 3 - 0x00000441, // 33 ^ 2 - 0x00000021, // 33 ^ 1 - 0x00000001, // 33 ^ 0 + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 
^ 1 + 0x00000001, // 33 ^ 0 }; uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { @@ -148,4 +148,3 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { } // extern "C" } // namespace libyuv #endif - diff --git a/files/source/compare_win.cc b/files/source/compare_win.cc index dc86fe25..b17fc8e1 100644 --- a/files/source/compare_win.cc +++ b/files/source/compare_win.cc @@ -21,12 +21,12 @@ extern "C" { // This module is for 32 bit Visual C x86 and clangcl #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) -__declspec(naked) -uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { +__declspec(naked) uint32 + SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { __asm { - mov eax, [esp + 4] // src_a - mov edx, [esp + 8] // src_b - mov ecx, [esp + 12] // count + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count pxor xmm0, xmm0 pxor xmm5, xmm5 @@ -61,13 +61,13 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { // Visual C 2012 required for AVX2. #if _MSC_VER >= 1700 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. -#pragma warning(disable: 4752) -__declspec(naked) -uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { +#pragma warning(disable : 4752) +__declspec(naked) uint32 + SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { __asm { - mov eax, [esp + 4] // src_a - mov edx, [esp + 8] // src_b - mov ecx, [esp + 12] // count + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count vpxor ymm0, ymm0, ymm0 // sum vpxor ymm5, ymm5, ymm5 // constant 0 for unpck sub edx, eax @@ -101,65 +101,65 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { } #endif // _MSC_VER >= 1700 -uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 uvec32 kHashMul0 = { - 0x0c3525e1, // 33 ^ 15 - 0xa3476dc1, // 33 ^ 14 - 0x3b4039a1, // 33 ^ 13 - 0x4f5f0981, // 33 ^ 12 + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 }; uvec32 kHashMul1 = { - 0x30f35d61, // 33 ^ 11 - 0x855cb541, // 33 ^ 10 - 0x040a9121, // 33 ^ 9 - 0x747c7101, // 33 ^ 8 + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 }; uvec32 kHashMul2 = { - 0xec41d4e1, // 33 ^ 7 - 0x4cfa3cc1, // 33 ^ 6 - 0x025528a1, // 33 ^ 5 - 0x00121881, // 33 ^ 4 + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 }; uvec32 kHashMul3 = { - 0x00008c61, // 33 ^ 3 - 0x00000441, // 33 ^ 2 - 0x00000021, // 33 ^ 1 - 0x00000001, // 33 ^ 0 + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 }; -__declspec(naked) -uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { +__declspec(naked) uint32 + HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { __asm { - mov eax, [esp + 4] // src - mov ecx, [esp + 8] // count + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count movd xmm0, [esp + 12] // seed - pxor xmm7, xmm7 // constant 0 for unpck + pxor xmm7, xmm7 // constant 0 for unpck movdqa xmm6, xmmword ptr kHash16x33 wloop: - movdqu xmm1, [eax] // src[0-15] + movdqu xmm1, [eax] // src[0-15] lea eax, [eax + 16] - pmulld xmm0, xmm6 // hash *= 33 ^ 16 + pmulld xmm0, xmm6 // hash *= 33 ^ 16 movdqa xmm5, xmmword ptr kHashMul0 movdqa xmm2, xmm1 - punpcklbw 
xmm2, xmm7 // src[0-7] + punpcklbw xmm2, xmm7 // src[0-7] movdqa xmm3, xmm2 - punpcklwd xmm3, xmm7 // src[0-3] + punpcklwd xmm3, xmm7 // src[0-3] pmulld xmm3, xmm5 movdqa xmm5, xmmword ptr kHashMul1 movdqa xmm4, xmm2 - punpckhwd xmm4, xmm7 // src[4-7] + punpckhwd xmm4, xmm7 // src[4-7] pmulld xmm4, xmm5 movdqa xmm5, xmmword ptr kHashMul2 - punpckhbw xmm1, xmm7 // src[8-15] + punpckhbw xmm1, xmm7 // src[8-15] movdqa xmm2, xmm1 - punpcklwd xmm2, xmm7 // src[8-11] + punpcklwd xmm2, xmm7 // src[8-11] pmulld xmm2, xmm5 movdqa xmm5, xmmword ptr kHashMul3 - punpckhwd xmm1, xmm7 // src[12-15] + punpckhwd xmm1, xmm7 // src[12-15] pmulld xmm1, xmm5 - paddd xmm3, xmm4 // add 16 results + paddd xmm3, xmm4 // add 16 results paddd xmm1, xmm2 paddd xmm1, xmm3 @@ -171,18 +171,18 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { sub ecx, 16 jg wloop - movd eax, xmm0 // return hash + movd eax, xmm0 // return hash ret } } // Visual C 2012 required for AVX2. #if _MSC_VER >= 1700 -__declspec(naked) -uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { +__declspec(naked) uint32 + HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { __asm { - mov eax, [esp + 4] // src - mov ecx, [esp + 8] // count + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count vmovd xmm0, [esp + 12] // seed wloop: @@ -196,7 +196,7 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { vpmulld xmm2, xmm2, xmmword ptr kHashMul2 lea eax, [eax + 16] vpmulld xmm1, xmm1, xmmword ptr kHashMul3 - vpaddd xmm3, xmm3, xmm4 // add 16 results + vpaddd xmm3, xmm3, xmm4 // add 16 results vpaddd xmm1, xmm1, xmm2 vpaddd xmm1, xmm1, xmm3 vpshufd xmm2, xmm1, 0x0e // upper 2 dwords @@ -207,7 +207,7 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { sub ecx, 16 jg wloop - vmovd eax, xmm0 // return hash + vmovd eax, xmm0 // return hash vzeroupper ret } diff --git a/files/source/convert.cc b/files/source/convert.cc index e332bc50..f79acaca 100644 --- a/files/source/convert.cc +++ b/files/source/convert.cc @@ -28,31 +28,37 @@ static __inline int Abs(int v) { } // Any I4xx To I420 format with mirroring. 
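The helper that follows rescales each plane independently with ScalePlane, which is what lets one routine serve any 4:x:x source. A 4:2:0 destination needs chroma planes of half the luma dimensions, rounded up, so odd sizes keep their final row and column; a minimal sketch of that rounding (the helper name is illustrative, not libyuv API, but the (v + 1) >> 1 idiom recurs throughout this file):

// Halve a dimension with round-up: a 5-pixel-wide luma row still needs
// 3 chroma samples, and (5 + 1) >> 1 == 3.
static int HalfRoundUp(int v) {
  return (v + 1) >> 1;
}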
-static int I4xxToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int src_y_width, int src_y_height, - int src_uv_width, int src_uv_height) { +static int I4xxToI420(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int src_y_width, + int src_y_height, + int src_uv_width, + int src_uv_height) { const int dst_y_width = Abs(src_y_width); const int dst_y_height = Abs(src_y_height); const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1); - if (src_y_width == 0 || src_y_height == 0 || - src_uv_width == 0 || src_uv_height == 0) { + if (src_uv_width == 0 || src_uv_height == 0) { return -1; } - ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, - dst_y, dst_stride_y, dst_y_width, dst_y_height, - kFilterBilinear); - ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, - dst_u, dst_stride_u, dst_uv_width, dst_uv_height, - kFilterBilinear); - ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, - dst_v, dst_stride_v, dst_uv_width, dst_uv_height, - kFilterBilinear); + if (dst_y) { + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, + dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); + } + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, + dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, + dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); return 0; } @@ -60,18 +66,23 @@ static int I4xxToI420(const uint8* src_y, int src_stride_y, // TODO(fbarchard): Use Scale plane which supports mirroring, but ensure // is does row coalescing. LIBYUV_API -int I420Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I420Copy(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_y || !src_u || !src_v || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
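The comment above names a convention shared by every converter in this file: a negative height requests a vertical flip, implemented by repointing the plane at its bottom row and negating the stride so that each row step walks upward. A self-contained sketch of the idiom:

#include <stdint.h>

// Normalize a possibly negative height by flipping the plane: point at the
// last row, then advance by a negative stride.
static const uint8_t* FlipPlane(const uint8_t* src, int* stride, int* height) {
  if (*height < 0) {
    *height = -*height;
    src += (*height - 1) * *stride;
    *stride = -*stride;
  }
  return src;
}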
@@ -98,76 +109,63 @@ int I420Copy(const uint8* src_y, int src_stride_y, // 422 chroma is 1/2 width, 1x height // 420 chroma is 1/2 width, 1/2 height LIBYUV_API -int I422ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I422ToI420(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { const int src_uv_width = SUBSAMPLE(width, 1, 1); - return I4xxToI420(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - src_uv_width, height); + return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, src_uv_width, height); } // 444 chroma is 1x width, 1x height // 420 chroma is 1/2 width, 1/2 height LIBYUV_API -int I444ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - return I4xxToI420(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - width, height); -} - -// 411 chroma is 1/4 width, 1x height -// 420 chroma is 1/2 width, 1/2 height -LIBYUV_API -int I411ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - const int src_uv_width = SUBSAMPLE(width, 3, 2); - return I4xxToI420(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - src_uv_width, height); +int I444ToI420(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { + return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, width, height); } // I400 is greyscale typically used in MJPG LIBYUV_API -int I400ToI420(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I400ToI420(const uint8* src_y, + int src_stride_y, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_y || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
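I400ToI420, continued in the next hunk, has no source chroma at all: it copies (or flips) the Y plane and fills both half-size chroma planes with 128, the neutral point of the offset-binary chroma encoding, so the frame renders as pure greyscale. A sketch of what that SetPlane call amounts to:

#include <stdint.h>
#include <string.h>

// Fill a plane with one byte value; 128 in both U and V means zero chroma.
static void FillPlane(uint8_t* dst, int dst_stride, int width, int height,
                      uint8_t value) {
  for (int y = 0; y < height; ++y) {
    memset(dst, value, (size_t)width);
    dst += dst_stride;
  }
}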
@@ -177,15 +175,21 @@ int I400ToI420(const uint8* src_y, int src_stride_y, src_y = src_y + (height - 1) * src_stride_y; src_stride_y = -src_stride_y; } - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128); SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128); return 0; } -static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, - uint8* dst, int dst_stride, - int width, int height) { +static void CopyPlane2(const uint8* src, + int src_stride_0, + int src_stride_1, + uint8* dst, + int dst_stride, + int width, + int height) { int y; void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; #if defined(HAS_COPYROW_SSE2) @@ -236,27 +240,30 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, // The UV plane is half width, but 2 values, so src_stride_m420 applies to // this as well as the two Y planes. static int X420ToI420(const uint8* src_y, - int src_stride_y0, int src_stride_y1, - const uint8* src_uv, int src_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - int y; + int src_stride_y0, + int src_stride_y1, + const uint8* src_uv, + int src_stride_uv, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width) = SplitUVRow_C; - if (!src_y || !src_uv || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; halfheight = (height + 1) >> 1; - dst_y = dst_y + (height - 1) * dst_stride_y; + if (dst_y) { + dst_y = dst_y + (height - 1) * dst_stride_y; + } dst_u = dst_u + (halfheight - 1) * dst_stride_u; dst_v = dst_v + (halfheight - 1) * dst_stride_v; dst_stride_y = -dst_stride_y; @@ -264,56 +271,19 @@ static int X420ToI420(const uint8* src_y, dst_stride_v = -dst_stride_v; } // Coalesce rows. - if (src_stride_y0 == width && - src_stride_y1 == width && + if (src_stride_y0 == width && src_stride_y1 == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y0 = src_stride_y1 = dst_stride_y = 0; } // Coalesce rows. 
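The "Coalesce rows" blocks here exploit contiguity: when every stride equals the row width, source and destination rows are back to back in memory, so the whole plane can be processed as one row of width * height pixels and the per-row call and dispatch overhead is paid once. The transform, restated as a sketch:

// Collapse a dense plane into a single long row. The strides are zeroed
// because, with height == 1, they must never be advanced again.
static void CoalesceRows(int* width, int* height,
                         int* src_stride, int* dst_stride) {
  if (*src_stride == *width && *dst_stride == *width) {
    *width *= *height;
    *height = 1;
    *src_stride = *dst_stride = 0;
  }
}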
- if (src_stride_uv == halfwidth * 2 && - dst_stride_u == halfwidth && + if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth && dst_stride_v == halfwidth) { halfwidth *= halfheight; halfheight = 1; src_stride_uv = dst_stride_u = dst_stride_v = 0; } -#if defined(HAS_SPLITUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SplitUVRow = SplitUVRow_Any_SSE2; - if (IS_ALIGNED(halfwidth, 16)) { - SplitUVRow = SplitUVRow_SSE2; - } - } -#endif -#if defined(HAS_SPLITUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - SplitUVRow = SplitUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { - SplitUVRow = SplitUVRow_AVX2; - } - } -#endif -#if defined(HAS_SPLITUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SplitUVRow = SplitUVRow_Any_NEON; - if (IS_ALIGNED(halfwidth, 16)) { - SplitUVRow = SplitUVRow_NEON; - } - } -#endif -#if defined(HAS_SPLITUVROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) && - IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) && - IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) { - SplitUVRow = SplitUVRow_Any_DSPR2; - if (IS_ALIGNED(halfwidth, 16)) { - SplitUVRow = SplitUVRow_DSPR2; - } - } -#endif if (dst_y) { if (src_stride_y0 == src_stride_y1) { @@ -324,75 +294,86 @@ static int X420ToI420(const uint8* src_y, } } - for (y = 0; y < halfheight; ++y) { - // Copy a row of UV. - SplitUVRow(src_uv, dst_u, dst_v, halfwidth); - dst_u += dst_stride_u; - dst_v += dst_stride_v; - src_uv += src_stride_uv; - } + // Split UV plane - NV12 / NV21 + SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; } // Convert NV12 to I420. LIBYUV_API -int NV12ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - return X420ToI420(src_y, src_stride_y, src_stride_y, - src_uv, src_stride_uv, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height); +int NV12ToI420(const uint8* src_y, + int src_stride_y, + const uint8* src_uv, + int src_stride_uv, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, src_uv, src_stride_uv, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, width, height); } // Convert NV21 to I420. Same as NV12 but u and v pointers swapped. LIBYUV_API -int NV21ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_vu, int src_stride_vu, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - return X420ToI420(src_y, src_stride_y, src_stride_y, - src_vu, src_stride_vu, - dst_y, dst_stride_y, - dst_v, dst_stride_v, - dst_u, dst_stride_u, - width, height); +int NV21ToI420(const uint8* src_y, + int src_stride_y, + const uint8* src_vu, + int src_stride_vu, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, src_vu, src_stride_vu, + dst_y, dst_stride_y, dst_v, dst_stride_v, dst_u, + dst_stride_u, width, height); } // Convert M420 to I420. 
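The deletion above is the point of this hunk: X420ToI420 no longer carries its own per-row SplitUVRow CPU dispatch but defers to SplitUVPlane, which the NV12, NV21 and M420 wrappers below all reach. The underlying operation deinterleaves packed UVUV... chroma into planar U and V; a scalar sketch of one row (the real helper substitutes the SSE2/AVX2/NEON/DSPR2 kernels removed here):

#include <stdint.h>

// Deinterleave one row of packed chroma (NV12 byte order) into U and V.
static void SplitUvRowScalar(const uint8_t* src_uv, uint8_t* dst_u,
                             uint8_t* dst_v, int width) {
  for (int x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x + 0];
    dst_v[x] = src_uv[2 * x + 1];
  }
}

NV21 needs no second kernel: as NV21ToI420 below shows, swapping the dst_u and dst_v arguments reinterprets the VUVU... ordering.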
LIBYUV_API -int M420ToI420(const uint8* src_m420, int src_stride_m420, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int M420ToI420(const uint8* src_m420, + int src_stride_m420, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2, - src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, + src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height); } // Convert YUY2 to I420. LIBYUV_API -int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int YUY2ToI420(const uint8* src_yuy2, + int src_stride_yuy2, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) = YUY2ToUVRow_C; - void (*YUY2ToYRow)(const uint8* src_yuy2, - uint8* dst_y, int width) = YUY2ToYRow_C; + void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2, uint8* dst_u, + uint8* dst_v, int width) = YUY2ToUVRow_C; + void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) = + YUY2ToYRow_C; // Negative height means invert the image. if (height < 0) { height = -height; @@ -429,6 +410,16 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + YUY2ToUVRow = YUY2ToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + YUY2ToUVRow = YUY2ToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); @@ -448,16 +439,21 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, // Convert UYVY to I420. LIBYUV_API -int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int UYVYToI420(const uint8* src_uyvy, + int src_stride_uyvy, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) = UYVYToUVRow_C; - void (*UYVYToYRow)(const uint8* src_uyvy, - uint8* dst_y, int width) = UYVYToYRow_C; + void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy, uint8* dst_u, + uint8* dst_v, int width) = UYVYToUVRow_C; + void (*UYVYToYRow)(const uint8* src_uyvy, uint8* dst_y, int width) = + UYVYToYRow_C; // Negative height means invert the image. 
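YUY2 and UYVY are both packed 4:2:2, differing only in byte order: YUY2 stores Y0 U Y1 V per pixel pair, UYVY stores U Y0 V Y1. Producing I420 also halves chroma vertically, which is why these loops consume two source rows per iteration and hand src_stride to the UV row function so it can blend the row pair. A scalar sketch for the YUY2 case; width is assumed even here, and the rounding choice is illustrative:

#include <stdint.h>

// One pass of the two-row YUY2 loop: copy luma, average chroma vertically.
static void Yuy2PairToI420(const uint8_t* row0, const uint8_t* row1,
                           uint8_t* dst_y0, uint8_t* dst_y1,
                           uint8_t* dst_u, uint8_t* dst_v, int width) {
  for (int x = 0; x < width / 2; ++x) {
    dst_y0[2 * x + 0] = row0[4 * x + 0];  // Y0
    dst_y0[2 * x + 1] = row0[4 * x + 2];  // Y1
    dst_y1[2 * x + 0] = row1[4 * x + 0];
    dst_y1[2 * x + 1] = row1[4 * x + 2];
    dst_u[x] = (uint8_t)((row0[4 * x + 1] + row1[4 * x + 1] + 1) >> 1);
    dst_v[x] = (uint8_t)((row0[4 * x + 3] + row1[4 * x + 3] + 1) >> 1);
  }
}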
if (height < 0) { height = -height; @@ -494,6 +490,16 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToYRow = UYVYToYRow_Any_MSA; + UYVYToUVRow = UYVYToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_MSA; + UYVYToUVRow = UYVYToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); @@ -513,19 +519,22 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, // Convert ARGB to I420. LIBYUV_API -int ARGBToI420(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGBToI420(const uint8* src_argb, + int src_stride_argb, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, + uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; - if (!src_argb || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -570,6 +579,38 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + ARGBToYRow = ARGBToYRow_Any_DSPR2; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_DSPR2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + ARGBToUVRow = ARGBToUVRow_Any_DSPR2; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_DSPR2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); @@ -589,19 +630,22 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, // Convert BGRA to I420. LIBYUV_API -int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int BGRAToI420(const uint8* src_bgra, + int src_stride_bgra, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C; + void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra, uint8* dst_u, + uint8* dst_v, int width) = BGRAToUVRow_C; void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int width) = BGRAToYRow_C; - if (!src_bgra || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
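Whatever the source byte order (ARGB, BGRA, ABGR, RGBA), the *ToYRow kernels selected above all compute the same fixed-point BT.601 studio-swing luma. A scalar sketch; the constants are the common 8.8 fixed-point approximations and should be read as illustrative rather than as the exact values libyuv ships:

#include <stdint.h>

// Y = 16 + 0.257*R + 0.504*G + 0.098*B. 66, 129 and 25 are those weights
// times 256; 0x1080 is 16.5 * 256, folding in the +16 offset plus rounding.
static uint8_t RgbToY601(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
}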
@@ -629,12 +673,44 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, } #endif #if defined(HAS_BGRATOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - BGRAToUVRow = BGRAToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - BGRAToUVRow = BGRAToUVRow_NEON; - } + if (TestCpuFlag(kCpuHasNEON)) { + BGRAToUVRow = BGRAToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_NEON; } + } +#endif +#if defined(HAS_BGRATOYROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + BGRAToYRow = BGRAToYRow_Any_DSPR2; + if (IS_ALIGNED(width, 8)) { + BGRAToYRow = BGRAToYRow_DSPR2; + } + } +#endif +#if defined(HAS_BGRATOUVROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + BGRAToUVRow = BGRAToUVRow_Any_DSPR2; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_DSPR2; + } + } +#endif +#if defined(HAS_BGRATOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + BGRAToYRow = BGRAToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + BGRAToYRow = BGRAToYRow_MSA; + } + } +#endif +#if defined(HAS_BGRATOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + BGRAToUVRow = BGRAToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_MSA; + } + } #endif for (y = 0; y < height - 1; y += 2) { @@ -655,19 +731,22 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, // Convert ABGR to I420. LIBYUV_API -int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ABGRToI420(const uint8* src_abgr, + int src_stride_abgr, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C; + void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr, uint8* dst_u, + uint8* dst_v, int width) = ABGRToUVRow_C; void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int width) = ABGRToYRow_C; - if (!src_abgr || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -702,6 +781,38 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + ABGRToYRow = ABGRToYRow_Any_DSPR2; + if (IS_ALIGNED(width, 8)) { + ABGRToYRow = ABGRToYRow_DSPR2; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + ABGRToUVRow = ABGRToUVRow_Any_DSPR2; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_DSPR2; + } + } +#endif +#if defined(HAS_ABGRTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYRow = ABGRToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_MSA; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToUVRow = ABGRToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); @@ -721,19 +832,22 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, // Convert RGBA to I420. 
LIBYUV_API -int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int RGBAToI420(const uint8* src_rgba, + int src_stride_rgba, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C; + void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba, uint8* dst_u, + uint8* dst_v, int width) = RGBAToUVRow_C; void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int width) = RGBAToYRow_C; - if (!src_rgba || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -768,6 +882,38 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, } } #endif +#if defined(HAS_RGBATOYROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + RGBAToYRow = RGBAToYRow_Any_DSPR2; + if (IS_ALIGNED(width, 8)) { + RGBAToYRow = RGBAToYRow_DSPR2; + } + } +#endif +#if defined(HAS_RGBATOUVROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + RGBAToUVRow = RGBAToUVRow_Any_DSPR2; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_DSPR2; + } + } +#endif +#if defined(HAS_RGBATOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGBAToYRow = RGBAToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGBAToYRow = RGBAToYRow_MSA; + } + } +#endif +#if defined(HAS_RGBATOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGBAToUVRow = RGBAToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width); @@ -787,27 +933,31 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, // Convert RGB24 to I420. LIBYUV_API -int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int RGB24ToI420(const uint8* src_rgb24, + int src_stride_rgb24, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int y; -#if defined(HAS_RGB24TOYROW_NEON) +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C; + uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C; void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int width) = RGB24ToYRow_C; #else void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = RGB24ToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, + uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; #endif - if (!src_rgb24 || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
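Every converter in this file repeats one selection idiom: start from the portable C row function, promote the pointer to the tail-safe *_Any_* variant once TestCpuFlag reports the feature, and take the bare SIMD kernel only when the width meets the multiple that IS_ALIGNED checks. A compilable sketch with hypothetical names (MyRow_* stands in for kernels like ARGBToYRow_NEON):

#include <stdint.h>

typedef void (*RowFn)(const uint8_t* src, uint8_t* dst, int width);

// Stand-ins for real kernels; bodies elided. The (void) casts keep the
// unused-parameter warning this change re-enables from firing.
static void MyRow_C(const uint8_t* s, uint8_t* d, int w) { (void)s; (void)d; (void)w; }
static void MyRow_Any_NEON(const uint8_t* s, uint8_t* d, int w) { (void)s; (void)d; (void)w; }
static void MyRow_NEON(const uint8_t* s, uint8_t* d, int w) { (void)s; (void)d; (void)w; }

static RowFn PickRow(int width, int cpu_has_neon) {
  RowFn row = MyRow_C;           // always-correct fallback
  if (cpu_has_neon) {            // cf. TestCpuFlag(kCpuHasNEON)
    row = MyRow_Any_NEON;        // any width: vector body, scalar tail
    if ((width % 16) == 0) {     // cf. IS_ALIGNED(width, 16)
      row = MyRow_NEON;          // width is a whole number of vectors
    }
  }
  return row;
}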
@@ -829,6 +979,15 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, } } } +#elif defined(HAS_RGB24TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToUVRow = RGB24ToUVRow_Any_MSA; + RGB24ToYRow = RGB24ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB24ToYRow = RGB24ToYRow_MSA; + RGB24ToUVRow = RGB24ToUVRow_MSA; + } + } // Other platforms do intermediate conversion from RGB24 to ARGB. #else #if defined(HAS_RGB24TOARGBROW_SSSE3) @@ -865,63 +1024,67 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, align_buffer_64(row, kRowSize * 2); #endif - for (y = 0; y < height - 1; y += 2) { -#if defined(HAS_RGB24TOYROW_NEON) - RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); - RGB24ToYRow(src_rgb24, dst_y, width); - RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); + for (y = 0; y < height - 1; y += 2) { +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) + RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); + RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); #else - RGB24ToARGBRow(src_rgb24, row, width); - RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + RGB24ToARGBRow(src_rgb24, row, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); #endif - src_rgb24 += src_stride_rgb24 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { -#if defined(HAS_RGB24TOYROW_NEON) - RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); - RGB24ToYRow(src_rgb24, dst_y, width); + src_rgb24 += src_stride_rgb24 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) + RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); #else - RGB24ToARGBRow(src_rgb24, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); + RGB24ToARGBRow(src_rgb24, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); #endif - } -#if !defined(HAS_RGB24TOYROW_NEON) - free_aligned_buffer_64(row); } +#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) + free_aligned_buffer_64(row); +} #endif - return 0; +return 0; } // Convert RAW to I420. 
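RAWToI420 below mirrors RGB24ToI420 above (RAW is the mirror-ordered 24-bit RGB): on platforms without direct row kernels, two source rows are first widened into an aligned ARGB scratch buffer so the existing ARGBToYRow/ARGBToUVRow kernels can be reused. The buffer's row pitch is rounded up so vector loads never overrun it; a sketch of the computation both functions use:

// Round a row of `width` ARGB pixels (4 bytes each) up to the next 32-byte
// multiple: add 31, then clear the low five bits. Two such rows are
// allocated because the UV kernel consumes a row pair per call.
static int AlignedArgbRowSize(int width) {
  return (width * 4 + 31) & ~31;
}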
LIBYUV_API -int RAWToI420(const uint8* src_raw, int src_stride_raw, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int RAWToI420(const uint8* src_raw, + int src_stride_raw, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int y; -#if defined(HAS_RAWTOYROW_NEON) - void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C; +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) + void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw, uint8* dst_u, + uint8* dst_v, int width) = RAWToUVRow_C; void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int width) = RAWToYRow_C; #else void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = RAWToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, + uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; #endif - if (!src_raw || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -943,99 +1106,121 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, } } } +#elif defined(HAS_RAWTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToUVRow = RAWToUVRow_Any_MSA; + RAWToYRow = RAWToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToYRow = RAWToYRow_MSA; + RAWToUVRow = RAWToUVRow_MSA; + } + } // Other platforms do intermediate conversion from RAW to ARGB. #else #if defined(HAS_RAWTOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToARGBRow = RAWToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } } - } #endif #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } } - } #endif #if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } } - } #endif - { - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); +#if defined(HAS_RAWTOARGBROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + RAWToARGBRow = RAWToARGBRow_Any_DSPR2; + if (IS_ALIGNED(width, 4)) { + RAWToARGBRow = RAWToARGBRow_DSPR2; + } + } +#endif + { + // Allocate 2 rows of ARGB. 
+ const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); #endif - for (y = 0; y < height - 1; y += 2) { -#if defined(HAS_RAWTOYROW_NEON) - RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); - RAWToYRow(src_raw, dst_y, width); - RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); + for (y = 0; y < height - 1; y += 2) { +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) + RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); + RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); #else - RAWToARGBRow(src_raw, row, width); - RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + RAWToARGBRow(src_raw, row, width); + RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); #endif - src_raw += src_stride_raw * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { -#if defined(HAS_RAWTOYROW_NEON) - RAWToUVRow(src_raw, 0, dst_u, dst_v, width); - RAWToYRow(src_raw, dst_y, width); + src_raw += src_stride_raw * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) + RAWToUVRow(src_raw, 0, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); #else - RAWToARGBRow(src_raw, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); + RAWToARGBRow(src_raw, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); #endif - } -#if !defined(HAS_RAWTOYROW_NEON) - free_aligned_buffer_64(row); } +#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) + free_aligned_buffer_64(row); +} #endif - return 0; +return 0; } // Convert RGB565 to I420. 
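RGB565ToI420 below widens each 16-bit pixel to 8-bit channels before the YUV math, directly in the NEON/MSA kernels or via RGB565ToARGBRow elsewhere. A per-pixel sketch, assuming the usual little-endian 5:6:5 layout with red in the top bits; the shift-or replication refills the low bits so that full-scale values map to exactly 255:

#include <stdint.h>

// Expand 5:6:5 to 8:8:8: ((v << 3) | (v >> 2)) maps 0..31 onto 0..255.
static void Rgb565ToRgb888(uint16_t px,
                           uint8_t* r, uint8_t* g, uint8_t* b) {
  uint8_t r5 = (uint8_t)((px >> 11) & 0x1f);
  uint8_t g6 = (uint8_t)((px >> 5) & 0x3f);
  uint8_t b5 = (uint8_t)(px & 0x1f);
  *r = (uint8_t)((r5 << 3) | (r5 >> 2));
  *g = (uint8_t)((g6 << 2) | (g6 >> 4));
  *b = (uint8_t)((b5 << 3) | (b5 >> 2));
}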
LIBYUV_API -int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int RGB565ToI420(const uint8* src_rgb565, + int src_stride_rgb565, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int y; -#if defined(HAS_RGB565TOYROW_NEON) +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C; + uint8* dst_u, uint8* dst_v, int width) = + RGB565ToUVRow_C; void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int width) = RGB565ToYRow_C; #else void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = RGB565ToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, + uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; #endif - if (!src_rgb565 || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1057,107 +1242,130 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, } } } +#elif defined(HAS_RGB565TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB565ToUVRow = RGB565ToUVRow_Any_MSA; + RGB565ToYRow = RGB565ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB565ToYRow = RGB565ToYRow_MSA; + RGB565ToUVRow = RGB565ToUVRow_MSA; + } + } // Other platforms do intermediate conversion from RGB565 to ARGB. 
#else #if defined(HAS_RGB565TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - RGB565ToARGBRow = RGB565ToARGBRow_SSE2; - } - } + if (TestCpuFlag(kCpuHasSSE2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_SSE2; + } + } #endif #if defined(HAS_RGB565TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - RGB565ToARGBRow = RGB565ToARGBRow_AVX2; - } - } + if (TestCpuFlag(kCpuHasAVX2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_AVX2; + } + } #endif #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } #endif #if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } #endif - { - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); +#if defined(HAS_RGB565TOARGBROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_DSPR2; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_DSPR2; + } + } +#endif + { + // Allocate 2 rows of ARGB. 
+ const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); #endif - for (y = 0; y < height - 1; y += 2) { -#if defined(HAS_RGB565TOYROW_NEON) - RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); - RGB565ToYRow(src_rgb565, dst_y, width); - RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); + for (y = 0; y < height - 1; y += 2) { +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) + RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); + RGB565ToYRow(src_rgb565, dst_y, width); + RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); #else - RGB565ToARGBRow(src_rgb565, row, width); - RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + RGB565ToARGBRow(src_rgb565, row, width); + RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); #endif - src_rgb565 += src_stride_rgb565 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { -#if defined(HAS_RGB565TOYROW_NEON) - RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); - RGB565ToYRow(src_rgb565, dst_y, width); + src_rgb565 += src_stride_rgb565 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) + RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); + RGB565ToYRow(src_rgb565, dst_y, width); #else - RGB565ToARGBRow(src_rgb565, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); + RGB565ToARGBRow(src_rgb565, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); #endif - } -#if !defined(HAS_RGB565TOYROW_NEON) - free_aligned_buffer_64(row); } +#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) + free_aligned_buffer_64(row); +} #endif - return 0; +return 0; } // Convert ARGB1555 to I420. 
LIBYUV_API -int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGB1555ToI420(const uint8* src_argb1555, + int src_stride_argb1555, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int y; -#if defined(HAS_ARGB1555TOYROW_NEON) +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C; + uint8* dst_u, uint8* dst_v, int width) = + ARGB1555ToUVRow_C; void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int width) = ARGB1555ToYRow_C; #else void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = ARGB1555ToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, + uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; #endif - if (!src_argb1555 || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -1179,109 +1387,124 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, } } } +#elif defined(HAS_ARGB1555TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA; + ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToYRow = ARGB1555ToYRow_MSA; + ARGB1555ToUVRow = ARGB1555ToUVRow_MSA; + } + } // Other platforms do intermediate conversion from ARGB1555 to ARGB. 
#else #if defined(HAS_ARGB1555TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; - } - } + if (TestCpuFlag(kCpuHasSSE2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; + } + } #endif #if defined(HAS_ARGB1555TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2; - } - } + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2; + } + } #endif #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } #endif #if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif - { - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif + { + // Allocate 2 rows of ARGB. 
+ const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); #endif - for (y = 0; y < height - 1; y += 2) { -#if defined(HAS_ARGB1555TOYROW_NEON) - ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); - ARGB1555ToYRow(src_argb1555, dst_y, width); - ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, - width); + for (y = 0; y < height - 1; y += 2) { +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) + ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); + ARGB1555ToYRow(src_argb1555, dst_y, width); + ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, + width); #else - ARGB1555ToARGBRow(src_argb1555, row, width); - ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize, - width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); -#endif - src_argb1555 += src_stride_argb1555 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { -#if defined(HAS_ARGB1555TOYROW_NEON) - ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); - ARGB1555ToYRow(src_argb1555, dst_y, width); + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize, + width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_argb1555 += src_stride_argb1555 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) + ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); + ARGB1555ToYRow(src_argb1555, dst_y, width); #else - ARGB1555ToARGBRow(src_argb1555, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); #endif - } -#if !defined(HAS_ARGB1555TOYROW_NEON) - free_aligned_buffer_64(row); } +#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) + free_aligned_buffer_64(row); +} #endif - return 0; +return 0; } // Convert ARGB4444 to I420. 
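ARGB4444ToI420 below is the same scheme one step down in precision: four 4-bit channels per pixel, widened either directly (NEON) or through ARGB4444ToARGBRow. Expanding a nibble by replication, (v << 4) | v, which equals v * 17, maps 0..15 exactly onto 0..255. A per-pixel sketch assuming the little-endian layout with alpha in the top nibble:

#include <stdint.h>

// Widen one 4:4:4:4 pixel to 8-bit ARGB by nibble replication.
static void Argb4444ToArgb8888(uint16_t px, uint8_t argb[4]) {
  argb[0] = (uint8_t)((px & 0x0f) * 17);          // B
  argb[1] = (uint8_t)(((px >> 4) & 0x0f) * 17);   // G
  argb[2] = (uint8_t)(((px >> 8) & 0x0f) * 17);   // R
  argb[3] = (uint8_t)(((px >> 12) & 0x0f) * 17);  // A
}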
LIBYUV_API -int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGB4444ToI420(const uint8* src_argb4444, + int src_stride_argb4444, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int y; #if defined(HAS_ARGB4444TOYROW_NEON) void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C; + uint8* dst_u, uint8* dst_v, int width) = + ARGB4444ToUVRow_C; void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int width) = ARGB4444ToYRow_C; #else void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = ARGB4444ToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, + uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; #endif - if (!src_argb4444 || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -1321,6 +1544,14 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, } } #endif +#if defined(HAS_ARGB4444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA; + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; @@ -1341,18 +1572,30 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } + } +#endif { // Allocate 2 rows of ARGB. 
const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif - for (y = 0; y < height - 1; y += 2) { + for (y = 0; y < height - 1; y += 2) { #if defined(HAS_ARGB4444TOYROW_NEON) - ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width); - ARGB4444ToYRow(src_argb4444, dst_y, width); - ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, - width); + ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width); + ARGB4444ToYRow(src_argb4444, dst_y, width); + ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, + width); #else ARGB4444ToARGBRow(src_argb4444, row, width); ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize, @@ -1361,25 +1604,107 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, ARGBToYRow(row, dst_y, width); ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); #endif - src_argb4444 += src_stride_argb4444 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { + src_argb4444 += src_stride_argb4444 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { #if defined(HAS_ARGB4444TOYROW_NEON) - ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width); - ARGB4444ToYRow(src_argb4444, dst_y, width); + ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width); + ARGB4444ToYRow(src_argb4444, dst_y, width); #else ARGB4444ToARGBRow(src_argb4444, row, width); ARGBToUVRow(row, 0, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); #endif - } -#if !defined(HAS_ARGB4444TOYROW_NEON) - free_aligned_buffer_64(row); } +#if !defined(HAS_ARGB4444TOYROW_NEON) + free_aligned_buffer_64(row); +} #endif +return 0; +} + +static void SplitPixels(const uint8* src_u, + int src_pixel_stride_uv, + uint8* dst_u, + int width) { + int i; + for (i = 0; i < width; ++i) { + *dst_u = *src_u; + ++dst_u; + src_u += src_pixel_stride_uv; + } +} + +// Convert Android420 to I420. +LIBYUV_API +int Android420ToI420(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + const ptrdiff_t vu_off = src_v - src_u; + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
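Android420ToI420 absorbs the flexible chroma layout of the Android camera API. The dispatch in the hunk that follows keys on src_pixel_stride_uv: 1 means the chroma is already planar (plain copies), 2 with the V samples one byte after or before the U samples means NV12 or NV21 (one SplitUVPlane call), and anything else falls back to the per-sample SplitPixels gather defined above. Restated as a compilable sketch (the real code additionally requires equal U and V strides for the NV paths):

#include <stddef.h>
#include <stdint.h>

enum ChromaLayout { kPlanar420, kNV12, kNV21, kGenericGather };

static enum ChromaLayout ClassifyAndroid420(const uint8_t* u, const uint8_t* v,
                                            int pixel_stride) {
  ptrdiff_t vu_off = v - u;
  if (pixel_stride == 1) return kPlanar420;        // u0 u1 u2 ...
  if (pixel_stride == 2 && vu_off == 1) return kNV12;   // u0 v0 u1 v1 ...
  if (pixel_stride == 2 && vu_off == -1) return kNV21;  // v0 u0 v1 u1 ...
  return kGenericGather;  // arbitrary stride: SplitPixels, sample by sample
}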
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + + // Copy UV planes as is - I420 + if (src_pixel_stride_uv == 1) { + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; + // Split UV planes - NV21 + } else if (src_pixel_stride_uv == 2 && vu_off == -1 && + src_stride_u == src_stride_v) { + SplitUVPlane(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u, + halfwidth, halfheight); + return 0; + // Split UV planes - NV12 + } else if (src_pixel_stride_uv == 2 && vu_off == 1 && + src_stride_u == src_stride_v) { + SplitUVPlane(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + } + + for (y = 0; y < halfheight; ++y) { + SplitPixels(src_u, src_pixel_stride_uv, dst_u, halfwidth); + SplitPixels(src_v, src_pixel_stride_uv, dst_v, halfwidth); + src_u += src_stride_u; + src_v += src_stride_v; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } return 0; } diff --git a/files/source/convert_argb.cc b/files/source/convert_argb.cc index fb9582d6..5007bdb9 100644 --- a/files/source/convert_argb.cc +++ b/files/source/convert_argb.cc @@ -26,11 +26,13 @@ extern "C" { // Copy ARGB with optional flipping LIBYUV_API -int ARGBCopy(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - if (!src_argb || !dst_argb || - width <= 0 || height == 0) { +int ARGBCopy(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { + if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -40,27 +42,29 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } - CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, - width * 4, height); + CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width * 4, + height); return 0; } // Convert I422 to ARGB with matrix -static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, +static int I420ToARGBMatrix(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToARGBRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || - width <= 0 || height == 0) { + void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, + const uint8* v_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
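// Every MSA hunk in this CL follows the same runtime-dispatch pattern as the
// existing SSSE3/AVX2/NEON blocks: start from the portable C row function,
// upgrade to the Any_ variant once TestCpuFlag reports the feature, and only
// take the fully vectorized variant when the width meets its alignment
// requirement. Condensed sketch of the pattern, using the names from the
// hunk below:
//
//   void (*I422ToARGBRow)(const uint8*, const uint8*, const uint8*, uint8*,
//                         const struct YuvConstants*, int) = I422ToARGBRow_C;
// #if defined(HAS_I422TOARGBROW_MSA)
//   if (TestCpuFlag(kCpuHasMSA)) {
//     I422ToARGBRow = I422ToARGBRow_Any_MSA;  // handles any width
//     if (IS_ALIGNED(width, 8)) {
//       I422ToARGBRow = I422ToARGBRow_MSA;    // full-vector fast path
//     }
//   }
// #endif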
@@ -102,6 +106,14 @@ static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y, I422ToARGBRow = I422ToARGBRow_DSPR2; } #endif +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -117,111 +129,130 @@ static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y, // Convert I420 to ARGB. LIBYUV_API -int I420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvI601Constants, - width, height); +int I420ToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); } // Convert I420 to ABGR. LIBYUV_API -int I420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int I420ToABGR(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert J420 to ARGB. LIBYUV_API -int J420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvJPEGConstants, - width, height); +int J420ToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvJPEGConstants, width, height); } // Convert J420 to ABGR. 
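// As with I420ToABGR above, the ABGR wrappers reuse the ARGB matrix path by
// feeding the V plane where U is expected (and vice versa) and selecting the
// YVU constant set, which flips the output byte order without needing a
// dedicated row function.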
LIBYUV_API -int J420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int J420ToABGR(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuJPEGConstants, // Use Yvu matrix width, height); } // Convert H420 to ARGB. LIBYUV_API -int H420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvH709Constants, - width, height); +int H420ToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); } // Convert H420 to ABGR. LIBYUV_API -int H420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int H420ToABGR(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuH709Constants, // Use Yvu matrix width, height); } // Convert I422 to ARGB with matrix -static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, +static int I422ToARGBMatrix(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToARGBRow_C; - if (!src_y || !src_u || !src_v || - !dst_argb || - width <= 0 || height == 0) { + void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, + const uint8* v_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -231,10 +262,8 @@ static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. 
- if (src_stride_y == width && - src_stride_u * 2 == width && - src_stride_v * 2 == width && - dst_stride_argb == width * 4) { + if (src_stride_y == width && src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; @@ -272,6 +301,14 @@ static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y, I422ToARGBRow = I422ToARGBRow_DSPR2; } #endif +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -285,111 +322,130 @@ static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y, // Convert I422 to ARGB. LIBYUV_API -int I422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvI601Constants, - width, height); +int I422ToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); } // Convert I422 to ABGR. LIBYUV_API -int I422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int I422ToABGR(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert J422 to ARGB. LIBYUV_API -int J422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvJPEGConstants, - width, height); +int J422ToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvJPEGConstants, width, height); } // Convert J422 to ABGR. 
LIBYUV_API -int J422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int J422ToABGR(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuJPEGConstants, // Use Yvu matrix width, height); } // Convert H422 to ARGB. LIBYUV_API -int H422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvH709Constants, - width, height); +int H422ToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); } // Convert H422 to ABGR. LIBYUV_API -int H422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int H422ToABGR(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuH709Constants, // Use Yvu matrix width, height); } // Convert I444 to ARGB with matrix -static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, +static int I444ToARGBMatrix(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I444ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I444ToARGBRow_C; - if (!src_y || !src_u || !src_v || - !dst_argb || - width <= 0 || height == 0) { + void (*I444ToARGBRow)(const uint8* y_buf, const uint8* u_buf, + const uint8* v_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -399,9 +455,7 @@ static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. 
- if (src_stride_y == width && - src_stride_u == width && - src_stride_v == width && + if (src_stride_y == width && src_stride_u == width && src_stride_v == width && dst_stride_argb == width * 4) { width *= height; height = 1; @@ -431,6 +485,22 @@ static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I444TOARGBROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + I444ToARGBRow = I444ToARGBRow_Any_DSPR2; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_DSPR2; + } + } +#endif +#if defined(HAS_I444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444ToARGBRow = I444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -444,138 +514,81 @@ static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y, // Convert I444 to ARGB. LIBYUV_API -int I444ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I444ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvI601Constants, - width, height); +int I444ToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); } // Convert I444 to ABGR. LIBYUV_API -int I444ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I444ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int I444ToABGR(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert J444 to ARGB. LIBYUV_API -int J444ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I444ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvJPEGConstants, - width, height); -} - -// Convert I411 to ARGB. -LIBYUV_API -int I411ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - int y; - void (*I411ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I411ToARGBRow_C; - if (!src_y || !src_u || !src_v || - !dst_argb || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - // Coalesce rows. - if (src_stride_y == width && - src_stride_u * 4 == width && - src_stride_v * 4 == width && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; - } -#if defined(HAS_I411TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I411ToARGBRow = I411ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I411ToARGBRow = I411ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I411TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I411ToARGBRow = I411ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I411ToARGBRow = I411ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I411TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I411ToARGBRow = I411ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I411ToARGBRow = I411ToARGBRow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - I411ToARGBRow(src_y, src_u, src_v, dst_argb, &kYuvI601Constants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; +int J444ToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvJPEGConstants, width, height); } // Convert I420 with Alpha to preattenuated ARGB. -static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - const uint8* src_a, int src_stride_a, - uint8* dst_argb, int dst_stride_argb, +static int I420AlphaToARGBMatrix(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + const uint8* src_a, + int src_stride_a, + uint8* dst_argb, + int dst_stride_argb, const struct YuvConstants* yuvconstants, - int width, int height, int attenuate) { + int width, + int height, + int attenuate) { int y; - void (*I422AlphaToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, + void (*I422AlphaToARGBRow)(const uint8* y_buf, const uint8* u_buf, + const uint8* v_buf, const uint8* a_buf, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width) = I422AlphaToARGBRow_C; - void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, - int width) = ARGBAttenuateRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || - width <= 0 || height == 0) { + void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, int width) = + ARGBAttenuateRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
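// When its attenuate flag is set, I420AlphaToARGBMatrix premultiplies every
// converted pixel by its alpha through ARGBAttenuateRow, for which the hunks
// below add an MSA dispatch. Per channel the operation is conceptually as
// follows (the row functions use fixed-point equivalents of this sketch):
//
//   dst_b = src_b * a / 255;
//   dst_g = src_g * a / 255;
//   dst_r = src_r * a / 255;  // alpha itself passes through unchanged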
@@ -617,6 +630,14 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y, I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2; } #endif +#if defined(HAS_I422ALPHATOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_MSA; + } + } +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -641,6 +662,14 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -661,49 +690,59 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y, // Convert I420 with Alpha to ARGB. LIBYUV_API -int I420AlphaToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - const uint8* src_a, int src_stride_a, - uint8* dst_argb, int dst_stride_argb, - int width, int height, int attenuate) { - return I420AlphaToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - src_a, src_stride_a, - dst_argb, dst_stride_argb, - &kYuvI601Constants, - width, height, attenuate); +int I420AlphaToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + const uint8* src_a, + int src_stride_a, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height, + int attenuate) { + return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_a, src_stride_a, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, + height, attenuate); } // Convert I420 with Alpha to ABGR. LIBYUV_API -int I420AlphaToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - const uint8* src_a, int src_stride_a, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height, int attenuate) { - return I420AlphaToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - src_a, src_stride_a, - dst_abgr, dst_stride_abgr, - &kYvuI601Constants, // Use Yvu matrix - width, height, attenuate); +int I420AlphaToABGR(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + const uint8* src_a, + int src_stride_a, + uint8* dst_abgr, + int dst_stride_abgr, + int width, + int height, + int attenuate) { + return I420AlphaToARGBMatrix( + src_y, src_stride_y, src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height, attenuate); } // Convert I400 to ARGB. 
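// I400 is a single-plane, Y-only (grayscale) format. I400ToARGB runs each
// luma sample through a BT.601-style range expansion (roughly
// (Y - 16) * 1.164, clamped) before writing it to B, G and R with alpha set
// to 255, whereas J400ToARGB below assumes full-range luma and copies Y
// straight into the three color channels.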
LIBYUV_API -int I400ToARGB(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int I400ToARGB(const uint8* src_y, + int src_stride_y, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*I400ToARGBRow)(const uint8* y_buf, - uint8* rgb_buf, - int width) = I400ToARGBRow_C; - if (!src_y || !dst_argb || - width <= 0 || height == 0) { + void (*I400ToARGBRow)(const uint8* y_buf, uint8* rgb_buf, int width) = + I400ToARGBRow_C; + if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -713,8 +752,7 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. - if (src_stride_y == width && - dst_stride_argb == width * 4) { + if (src_stride_y == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_y = dst_stride_argb = 0; @@ -743,6 +781,14 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I400TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I400ToARGBRow = I400ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + I400ToARGBRow = I400ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I400ToARGBRow(src_y, dst_argb, width); @@ -754,14 +800,16 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, // Convert J400 to ARGB. LIBYUV_API -int J400ToARGB(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int J400ToARGB(const uint8* src_y, + int src_stride_y, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int width) = J400ToARGBRow_C; - if (!src_y || !dst_argb || - width <= 0 || height == 0) { + if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -771,8 +819,7 @@ int J400ToARGB(const uint8* src_y, int src_stride_y, src_stride_y = -src_stride_y; } // Coalesce rows. - if (src_stride_y == width && - dst_stride_argb == width * 4) { + if (src_stride_y == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_y = dst_stride_argb = 0; @@ -801,6 +848,14 @@ int J400ToARGB(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_J400TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + J400ToARGBRow = J400ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + J400ToARGBRow = J400ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { J400ToARGBRow(src_y, dst_argb, width); src_y += src_stride_y; @@ -810,85 +865,89 @@ int J400ToARGB(const uint8* src_y, int src_stride_y, } // Shuffle table for converting BGRA to ARGB. -static uvec8 kShuffleMaskBGRAToARGB = { - 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u -}; +static uvec8 kShuffleMaskBGRAToARGB = {3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, + 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u}; // Shuffle table for converting ABGR to ARGB. -static uvec8 kShuffleMaskABGRToARGB = { - 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u -}; +static uvec8 kShuffleMaskABGRToARGB = {2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, + 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u}; // Shuffle table for converting RGBA to ARGB. -static uvec8 kShuffleMaskRGBAToARGB = { - 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u -}; +static uvec8 kShuffleMaskRGBAToARGB = {1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, + 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u}; // Convert BGRA to ARGB. 
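// The conversions below are pure byte reorders, so they all funnel into
// ARGBShuffle with one of the 16-entry masks above: entry i selects which
// source byte, within each 16-byte group of four pixels, lands in output
// byte i. kShuffleMaskABGRToARGB = {2, 1, 0, 3, ...}, for example, swaps
// bytes 0 and 2 of every pixel, exchanging the B and R channels while G and
// A stay put.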
LIBYUV_API -int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return ARGBShuffle(src_bgra, src_stride_bgra, - dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskBGRAToARGB), - width, height); +int BGRAToARGB(const uint8* src_bgra, + int src_stride_bgra, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, + (const uint8*)(&kShuffleMaskBGRAToARGB), width, height); } // Convert ARGB to BGRA (same as BGRAToARGB). LIBYUV_API -int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return ARGBShuffle(src_bgra, src_stride_bgra, - dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskBGRAToARGB), - width, height); +int ARGBToBGRA(const uint8* src_bgra, + int src_stride_bgra, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, + (const uint8*)(&kShuffleMaskBGRAToARGB), width, height); } // Convert ABGR to ARGB. LIBYUV_API -int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return ARGBShuffle(src_abgr, src_stride_abgr, - dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskABGRToARGB), - width, height); +int ABGRToARGB(const uint8* src_abgr, + int src_stride_abgr, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, + (const uint8*)(&kShuffleMaskABGRToARGB), width, height); } // Convert ARGB to ABGR to (same as ABGRToARGB). LIBYUV_API -int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return ARGBShuffle(src_abgr, src_stride_abgr, - dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskABGRToARGB), - width, height); +int ARGBToABGR(const uint8* src_abgr, + int src_stride_abgr, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, + (const uint8*)(&kShuffleMaskABGRToARGB), width, height); } // Convert RGBA to ARGB. LIBYUV_API -int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return ARGBShuffle(src_rgba, src_stride_rgba, - dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskRGBAToARGB), - width, height); +int RGBAToARGB(const uint8* src_rgba, + int src_stride_rgba, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb, + (const uint8*)(&kShuffleMaskRGBAToARGB), width, height); } // Convert RGB24 to ARGB. LIBYUV_API -int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int RGB24ToARGB(const uint8* src_rgb24, + int src_stride_rgb24, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = RGB24ToARGBRow_C; - if (!src_rgb24 || !dst_argb || - width <= 0 || height == 0) { + if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
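// The "coalesce rows" blocks in the hunks below are a small throughput
// optimization: when both images are stored contiguously (stride equals the
// exact row width in bytes), the whole frame can be treated as one long row,
// so the per-row loop body runs once. Sketch of the idea for a packed
// 3-byte-per-pixel source and 4-byte-per-pixel destination:
//
//   if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) {
//     width *= height;  // process the frame as a single row
//     height = 1;
//     src_stride_rgb24 = dst_stride_argb = 0;
//   }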
@@ -898,8 +957,7 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, src_stride_rgb24 = -src_stride_rgb24; } // Coalesce rows. - if (src_stride_rgb24 == width * 3 && - dst_stride_argb == width * 4) { + if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_rgb24 = dst_stride_argb = 0; @@ -920,6 +978,22 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, } } #endif +#if defined(HAS_RGB24TOARGBROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_DSPR2; + if (IS_ALIGNED(width, 8)) { + RGB24ToARGBRow = RGB24ToARGBRow_DSPR2; + } + } +#endif +#if defined(HAS_RGB24TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { RGB24ToARGBRow(src_rgb24, dst_argb, width); @@ -931,14 +1005,16 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, // Convert RAW to ARGB. LIBYUV_API -int RAWToARGB(const uint8* src_raw, int src_stride_raw, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int RAWToARGB(const uint8* src_raw, + int src_stride_raw, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = RAWToARGBRow_C; - if (!src_raw || !dst_argb || - width <= 0 || height == 0) { + if (!src_raw || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -948,8 +1024,7 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw, src_stride_raw = -src_stride_raw; } // Coalesce rows. - if (src_stride_raw == width * 3 && - dst_stride_argb == width * 4) { + if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_raw = dst_stride_argb = 0; @@ -970,6 +1045,22 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw, } } #endif +#if defined(HAS_RAWTOARGBROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + RAWToARGBRow = RAWToARGBRow_Any_DSPR2; + if (IS_ALIGNED(width, 8)) { + RAWToARGBRow = RAWToARGBRow_DSPR2; + } + } +#endif +#if defined(HAS_RAWTOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToARGBRow = RAWToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { RAWToARGBRow(src_raw, dst_argb, width); @@ -981,14 +1072,16 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw, // Convert RGB565 to ARGB. LIBYUV_API -int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int RGB565ToARGB(const uint8* src_rgb565, + int src_stride_rgb565, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int width) = RGB565ToARGBRow_C; - if (!src_rgb565 || !dst_argb || - width <= 0 || height == 0) { + if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -998,8 +1091,7 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, src_stride_rgb565 = -src_stride_rgb565; } // Coalesce rows. 
- if (src_stride_rgb565 == width * 2 && - dst_stride_argb == width * 4) { + if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_rgb565 = dst_stride_argb = 0; @@ -1028,6 +1120,22 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, } } #endif +#if defined(HAS_RGB565TOARGBROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_DSPR2; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_DSPR2; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { RGB565ToARGBRow(src_rgb565, dst_argb, width); @@ -1039,14 +1147,16 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, // Convert ARGB1555 to ARGB. LIBYUV_API -int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGB1555ToARGB(const uint8* src_argb1555, + int src_stride_argb1555, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb, - int width) = ARGB1555ToARGBRow_C; - if (!src_argb1555 || !dst_argb || - width <= 0 || height == 0) { + int width) = ARGB1555ToARGBRow_C; + if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1056,8 +1166,7 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, src_stride_argb1555 = -src_stride_argb1555; } // Coalesce rows. - if (src_stride_argb1555 == width * 2 && - dst_stride_argb == width * 4) { + if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb1555 = dst_stride_argb = 0; @@ -1086,6 +1195,22 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, } } #endif +#if defined(HAS_ARGB1555TOARGBROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_DSPR2; + if (IS_ALIGNED(width, 4)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_DSPR2; + } + } +#endif +#if defined(HAS_ARGB1555TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGB1555ToARGBRow(src_argb1555, dst_argb, width); @@ -1097,14 +1222,16 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, // Convert ARGB4444 to ARGB. LIBYUV_API -int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGB4444ToARGB(const uint8* src_argb4444, + int src_stride_argb4444, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb, - int width) = ARGB4444ToARGBRow_C; - if (!src_argb4444 || !dst_argb || - width <= 0 || height == 0) { + int width) = ARGB4444ToARGBRow_C; + if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1114,8 +1241,7 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, src_stride_argb4444 = -src_stride_argb4444; } // Coalesce rows. 
- if (src_stride_argb4444 == width * 2 && - dst_stride_argb == width * 4) { + if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb4444 = dst_stride_argb = 0; @@ -1144,6 +1270,22 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, } } #endif +#if defined(HAS_ARGB4444TOARGBROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_DSPR2; + if (IS_ALIGNED(width, 4)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_DSPR2; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGB4444ToARGBRow(src_argb4444, dst_argb, width); @@ -1155,18 +1297,19 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, // Convert NV12 to ARGB. LIBYUV_API -int NV12ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int NV12ToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_uv, + int src_stride_uv, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*NV12ToARGBRow)(const uint8* y_buf, - const uint8* uv_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = NV12ToARGBRow_C; - if (!src_y || !src_uv || !dst_argb || - width <= 0 || height == 0) { + void (*NV12ToARGBRow)(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + NV12ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1199,6 +1342,22 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_NV12TOARGBROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + NV12ToARGBRow = NV12ToARGBRow_Any_DSPR2; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_DSPR2; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV12ToARGBRow = NV12ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { NV12ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width); @@ -1213,18 +1372,19 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y, // Convert NV21 to ARGB. LIBYUV_API -int NV21ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int NV21ToARGB(const uint8* src_y, + int src_stride_y, + const uint8* src_uv, + int src_stride_uv, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*NV21ToARGBRow)(const uint8* y_buf, - const uint8* uv_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = NV21ToARGBRow_C; - if (!src_y || !src_uv || !dst_argb || - width <= 0 || height == 0) { + void (*NV21ToARGBRow)(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + NV21ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
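// NV12 and NV21 share the same geometry - a full-resolution Y plane plus one
// half-resolution interleaved chroma plane - and differ only in chroma byte
// order, which is why NV21ToARGB is a separate entry point backed by mirror
// row functions:
//
//   NV12 chroma plane: U0 V0 U1 V1 ...  (U first)
//   NV21 chroma plane: V0 U0 V1 U1 ...  (V first)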
@@ -1257,6 +1417,14 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_NV21TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV21ToARGBRow = NV21ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { NV21ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width); @@ -1271,17 +1439,17 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y, // Convert M420 to ARGB. LIBYUV_API -int M420ToARGB(const uint8* src_m420, int src_stride_m420, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int M420ToARGB(const uint8* src_m420, + int src_stride_m420, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*NV12ToARGBRow)(const uint8* y_buf, - const uint8* uv_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = NV12ToARGBRow_C; - if (!src_m420 || !dst_argb || - width <= 0 || height == 0) { + void (*NV12ToARGBRow)(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + NV12ToARGBRow_C; + if (!src_m420 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1314,6 +1482,22 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420, } } #endif +#if defined(HAS_NV12TOARGBROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + NV12ToARGBRow = NV12ToARGBRow_Any_DSPR2; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_DSPR2; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV12ToARGBRow = NV12ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, @@ -1332,17 +1516,17 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420, // Convert YUY2 to ARGB. LIBYUV_API -int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int YUY2ToARGB(const uint8* src_yuy2, + int src_stride_yuy2, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*YUY2ToARGBRow)(const uint8* src_yuy2, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) = + void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) = YUY2ToARGBRow_C; - if (!src_yuy2 || !dst_argb || - width <= 0 || height == 0) { + if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1352,8 +1536,7 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, src_stride_yuy2 = -src_stride_yuy2; } // Coalesce rows. - if (src_stride_yuy2 == width * 2 && - dst_stride_argb == width * 4) { + if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_yuy2 = dst_stride_argb = 0; @@ -1382,6 +1565,14 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, } } #endif +#if defined(HAS_YUY2TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + YUY2ToARGBRow = YUY2ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width); src_yuy2 += src_stride_yuy2; @@ -1392,17 +1583,17 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, // Convert UYVY to ARGB. 
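// UYVY is the byte-swapped sibling of YUY2 handled above: both pack two
// pixels into four bytes around a shared U/V pair, but UYVY stores chroma
// first (U0 Y0 V0 Y1) where YUY2 stores luma first (Y0 U0 Y1 V0). That
// shared 2-bytes-per-pixel layout is also why both converters coalesce rows
// when src_stride == width * 2 and dst_stride_argb == width * 4.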
LIBYUV_API -int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int UYVYToARGB(const uint8* src_uyvy, + int src_stride_uyvy, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*UYVYToARGBRow)(const uint8* src_uyvy, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) = + void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, + const struct YuvConstants* yuvconstants, int width) = UYVYToARGBRow_C; - if (!src_uyvy || !dst_argb || - width <= 0 || height == 0) { + if (!src_uyvy || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1412,8 +1603,7 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, src_stride_uyvy = -src_stride_uyvy; } // Coalesce rows. - if (src_stride_uyvy == width * 2 && - dst_stride_argb == width * 4) { + if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_uyvy = dst_stride_argb = 0; @@ -1442,6 +1632,14 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, } } #endif +#if defined(HAS_UYVYTOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToARGBRow = UYVYToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + UYVYToARGBRow = UYVYToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width); src_uyvy += src_stride_uyvy; diff --git a/files/source/convert_from.cc b/files/source/convert_from.cc index 46abdebc..e6ff5243 100644 --- a/files/source/convert_from.cc +++ b/files/source/convert_from.cc @@ -30,107 +30,100 @@ static __inline int Abs(int v) { } // I420 To any I4xx YUV format with mirroring. -static int I420ToI4xx(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int src_y_width, int src_y_height, - int dst_uv_width, int dst_uv_height) { +static int I420ToI4xx(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int src_y_width, + int src_y_height, + int dst_uv_width, + int dst_uv_height) { const int dst_y_width = Abs(src_y_width); const int dst_y_height = Abs(src_y_height); const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1); const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1); - if (src_y_width == 0 || src_y_height == 0 || - dst_uv_width <= 0 || dst_uv_height <= 0) { + if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 || + dst_uv_height <= 0) { return -1; } - ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, - dst_y, dst_stride_y, dst_y_width, dst_y_height, - kFilterBilinear); - ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, - dst_u, dst_stride_u, dst_uv_width, dst_uv_height, - kFilterBilinear); - ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, - dst_v, dst_stride_v, dst_uv_width, dst_uv_height, - kFilterBilinear); + if (dst_y) { + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, + dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); + } + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, + dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, 
dst_v, + dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); return 0; } // 420 chroma is 1/2 width, 1/2 height // 422 chroma is 1/2 width, 1x height LIBYUV_API -int I420ToI422(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I420ToI422(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { const int dst_uv_width = (Abs(width) + 1) >> 1; const int dst_uv_height = Abs(height); - return I420ToI4xx(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - dst_uv_width, dst_uv_height); + return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, dst_uv_width, + dst_uv_height); } // 420 chroma is 1/2 width, 1/2 height // 444 chroma is 1x width, 1x height LIBYUV_API -int I420ToI444(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I420ToI444(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { const int dst_uv_width = Abs(width); const int dst_uv_height = Abs(height); - return I420ToI4xx(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - dst_uv_width, dst_uv_height); -} - -// 420 chroma is 1/2 width, 1/2 height -// 411 chroma is 1/4 width, 1x height -LIBYUV_API -int I420ToI411(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - const int dst_uv_width = (Abs(width) + 3) >> 2; - const int dst_uv_height = Abs(height); - return I420ToI4xx(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - dst_uv_width, dst_uv_height); + return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, dst_uv_width, + dst_uv_height); } // Copy to I400. Source can be I420,422,444,400,NV12,NV21 LIBYUV_API -int I400Copy(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height) { - if (!src_y || !dst_y || - width <= 0 || height == 0) { +int I400Copy(const uint8* src_y, + int src_stride_y, + uint8* dst_y, + int dst_stride_y, + int width, + int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
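// I420ToI422 and I420ToI444 above are now thin wrappers over I420ToI4xx,
// which rescales the chroma planes with ScalePlane (and, after this CL,
// skips the Y plane when dst_y is NULL). dst_uv_width = (Abs(width) + 1) >> 1
// rounds odd widths up, so a 101x30 I420 frame yields 51-column U and V
// planes. Minimal call sketch; buffer names and sizes here are hypothetical:
//
//   uint8 dst_y[101 * 30], dst_u[51 * 30], dst_v[51 * 30];
//   I420ToI422(src_y, 101, src_u, 51, src_v, 51,
//              dst_y, 101, dst_u, 51, dst_v, 51, 101, 30);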
@@ -144,17 +137,21 @@ int I400Copy(const uint8* src_y, int src_stride_y, } LIBYUV_API -int I422ToYUY2(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_yuy2, int dst_stride_yuy2, - int width, int height) { +int I422ToYUY2(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { int y; void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C; - if (!src_y || !src_u || !src_v || !dst_yuy2 || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -164,10 +161,8 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y, dst_stride_yuy2 = -dst_stride_yuy2; } // Coalesce rows. - if (src_stride_y == width && - src_stride_u * 2 == width && - src_stride_v * 2 == width && - dst_stride_yuy2 == width * 2) { + if (src_stride_y == width && src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) { width *= height; height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0; @@ -200,17 +195,21 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y, } LIBYUV_API -int I420ToYUY2(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_yuy2, int dst_stride_yuy2, - int width, int height) { +int I420ToYUY2(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { int y; void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C; - if (!src_y || !src_u || !src_v || !dst_yuy2 || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -235,6 +234,14 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToYUY2Row = I422ToYUY2Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); @@ -252,17 +259,21 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y, } LIBYUV_API -int I422ToUYVY(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_uyvy, int dst_stride_uyvy, - int width, int height) { +int I422ToUYVY(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_uyvy, + int dst_stride_uyvy, + int width, + int height) { int y; void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C; - if (!src_y || !src_u || !src_v || !dst_uyvy || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -272,10 +283,8 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y, dst_stride_uyvy = -dst_stride_uyvy; } // Coalesce rows. 
- if (src_stride_y == width && - src_stride_u * 2 == width && - src_stride_v * 2 == width && - dst_stride_uyvy == width * 2) { + if (src_stride_y == width && src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_uyvy == width * 2) { width *= height; height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0; @@ -296,6 +305,14 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); @@ -308,17 +325,21 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y, } LIBYUV_API -int I420ToUYVY(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_uyvy, int dst_stride_uyvy, - int width, int height) { +int I420ToUYVY(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_uyvy, + int dst_stride_uyvy, + int width, + int height) { int y; void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C; - if (!src_y || !src_u || !src_v || !dst_uyvy || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -343,6 +364,14 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); @@ -359,113 +388,70 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y, return 0; } +// TODO(fbarchard): test negative height for invert. LIBYUV_API -int I420ToNV12(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { - int y; - void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) = MergeUVRow_C; - // Coalesce rows. - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || - width <= 0 || height == 0) { +int I420ToNV12(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 || + height == 0) { return -1; } - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv; - dst_stride_y = -dst_stride_y; - dst_stride_uv = -dst_stride_uv; - } - if (src_stride_y == width && - dst_stride_y == width) { - width *= height; - height = 1; - src_stride_y = dst_stride_y = 0; - } - // Coalesce rows. 
- if (src_stride_u == halfwidth && - src_stride_v == halfwidth && - dst_stride_uv == halfwidth * 2) { - halfwidth *= halfheight; - halfheight = 1; - src_stride_u = src_stride_v = dst_stride_uv = 0; - } -#if defined(HAS_MERGEUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow_ = MergeUVRow_Any_SSE2; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_SSE2; - } - } -#endif -#if defined(HAS_MERGEUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { - MergeUVRow_ = MergeUVRow_AVX2; - } - } -#endif -#if defined(HAS_MERGEUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeUVRow_ = MergeUVRow_Any_NEON; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_NEON; - } - } -#endif - - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - for (y = 0; y < halfheight; ++y) { - // Merge a row of U and V into a row of UV. - MergeUVRow_(src_u, src_v, dst_uv, halfwidth); - src_u += src_stride_u; - src_v += src_stride_v; - dst_uv += dst_stride_uv; + int halfwidth = (width + 1) / 2; + int halfheight = height > 0 ? (height + 1) / 2 : (height - 1) / 2; + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } + MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv, + halfwidth, halfheight); return 0; } LIBYUV_API -int I420ToNV21(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_vu, int dst_stride_vu, - int width, int height) { - return I420ToNV12(src_y, src_stride_y, - src_v, src_stride_v, - src_u, src_stride_u, - dst_y, dst_stride_y, - dst_vu, dst_stride_vu, +int I420ToNV21(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_vu, + int dst_stride_vu, + int width, + int height) { + return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu, width, height); } // Convert I422 to RGBA with matrix -static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, +static int I420ToRGBAMatrix(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_rgba, + int dst_stride_rgba, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I422ToRGBARow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToRGBARow_C; - if (!src_y || !src_u || !src_v || !dst_rgba || - width <= 0 || height == 0) { + void (*I422ToRGBARow)(const uint8* y_buf, const uint8* u_buf, + const uint8* v_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
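The rewritten I420ToNV12 above no longer carries its own MergeUVRow dispatch: it copies the luma plane with CopyPlane (skipped when dst_y is NULL) and delegates chroma interleaving to MergeUVPlane, and I420ToNV21 becomes a U/V pointer swap on top of it. A usage sketch under the assumption of tightly packed buffers (the include path is an assumption; the argument order matches the signature shown above):

  #include "libyuv/convert_from.h"  // assumed header declaring I420ToNV12

  // Convert a packed I420 frame to NV12. Per the new implementation,
  // out_y may be NULL to emit only the interleaved UV plane.
  int PackedI420ToNV12(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                       uint8_t* out_y, uint8_t* out_uv, int width, int height) {
    int half = (width + 1) / 2;  // chroma stride of a packed I420 frame
    return libyuv::I420ToNV12(y, width, u, half, v, half,
                              out_y, width, out_uv, half * 2,  // UV pairs
                              width, height);
  }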
@@ -507,6 +493,14 @@ static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y, I422ToRGBARow = I422ToRGBARow_DSPR2; } #endif +#if defined(HAS_I422TORGBAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGBARow = I422ToRGBARow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); @@ -522,50 +516,58 @@ static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y, // Convert I420 to RGBA. LIBYUV_API -int I420ToRGBA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height) { - return I420ToRGBAMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_rgba, dst_stride_rgba, - &kYuvI601Constants, - width, height); +int I420ToRGBA(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgba, dst_stride_rgba, + &kYuvI601Constants, width, height); } // Convert I420 to BGRA. LIBYUV_API -int I420ToBGRA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_bgra, int dst_stride_bgra, - int width, int height) { - return I420ToRGBAMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_bgra, dst_stride_bgra, +int I420ToBGRA(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_bgra, + int dst_stride_bgra, + int width, + int height) { + return I420ToRGBAMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_bgra, dst_stride_bgra, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert I420 to RGB24 with matrix -static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgb24, int dst_stride_rgb24, +static int I420ToRGB24Matrix(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_rgb24, + int dst_stride_rgb24, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I422ToRGB24Row)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToRGB24Row_C; - if (!src_y || !src_u || !src_v || !dst_rgb24 || - width <= 0 || height == 0) { + void (*I422ToRGB24Row)(const uint8* y_buf, const uint8* u_buf, + const uint8* v_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB24Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
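I420ToBGRA above reuses the RGBA path by swapping the U and V plane pointers and substituting the kYvuI601Constants, so no dedicated BGRA row kernel is needed. A scalar illustration of why that works (classic BT.601 studio-swing integer coefficients, approximate and not the patch's SIMD code):

  #include <cstdint>

  static uint8_t Clamp255(int v) {
    return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
  }

  // One pixel of YUV -> RGB with BT.601 integer coefficients.
  void YuvToRgbPixel(int y, int u, int v,
                     uint8_t* r, uint8_t* g, uint8_t* b) {
    int c = y - 16, d = u - 128, e = v - 128;
    *r = Clamp255((298 * c + 409 * e + 128) >> 8);            // driven by V
    *g = Clamp255((298 * c - 100 * d - 208 * e + 128) >> 8);
    *b = Clamp255((298 * c + 516 * d + 128) >> 8);            // driven by U
  }

Feeding (v, u) instead of (u, v), with constants swapped to match, exchanges the roles of R and B, which is how BGRA output falls out of the unchanged RGBA kernel.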
@@ -598,6 +600,14 @@ static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB24Row = I422ToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); @@ -613,50 +623,59 @@ static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y, // Convert I420 to RGB24. LIBYUV_API -int I420ToRGB24(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgb24, int dst_stride_rgb24, - int width, int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_rgb24, dst_stride_rgb24, - &kYuvI601Constants, - width, height); +int I420ToRGB24(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvI601Constants, width, height); } // Convert I420 to RAW. LIBYUV_API -int I420ToRAW(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_raw, int dst_stride_raw, - int width, int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_raw, dst_stride_raw, +int I420ToRAW(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert I420 to ARGB1555. LIBYUV_API -int I420ToARGB1555(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb1555, int dst_stride_argb1555, - int width, int height) { +int I420ToARGB1555(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb1555, + int dst_stride_argb1555, + int width, + int height) { int y; - void (*I422ToARGB1555Row)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, + void (*I422ToARGB1555Row)(const uint8* y_buf, const uint8* u_buf, + const uint8* v_buf, uint8* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGB1555Row_C; - if (!src_y || !src_u || !src_v || !dst_argb1555 || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. 
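I420ToRAW above plays the same swap trick against the RGB24 kernel. In libyuv's naming, RGB24 (FOURCC 24BG) stores bytes as B,G,R while RAW stores R,G,B; a byte-layout sketch:

  #include <cstdint>

  // libyuv RGB24 / FOURCC 24BG: little-endian BGR triplets in memory.
  void PackRGB24(uint8_t r, uint8_t g, uint8_t b, uint8_t* dst) {
    dst[0] = b; dst[1] = g; dst[2] = r;
  }

  // libyuv RAW: the same pixel with the opposite byte order.
  void PackRAW(uint8_t r, uint8_t g, uint8_t b, uint8_t* dst) {
    dst[0] = r; dst[1] = g; dst[2] = b;
  }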
@@ -689,6 +708,22 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOARGB1555ROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_DSPR2; + if (IS_ALIGNED(width, 4)) { + I422ToARGB1555Row = I422ToARGB1555Row_DSPR2; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants, @@ -703,23 +738,25 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, return 0; } - // Convert I420 to ARGB4444. LIBYUV_API -int I420ToARGB4444(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb4444, int dst_stride_argb4444, - int width, int height) { +int I420ToARGB4444(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_argb4444, + int dst_stride_argb4444, + int width, + int height) { int y; - void (*I422ToARGB4444Row)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, + void (*I422ToARGB4444Row)(const uint8* y_buf, const uint8* u_buf, + const uint8* v_buf, uint8* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGB4444Row_C; - if (!src_y || !src_u || !src_v || !dst_argb4444 || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -752,6 +789,22 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOARGB4444ROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_DSPR2; + if (IS_ALIGNED(width, 4)) { + I422ToARGB4444Row = I422ToARGB4444Row_DSPR2; + } + } +#endif +#if defined(HAS_I422TOARGB4444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants, @@ -768,20 +821,22 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, // Convert I420 to RGB565. LIBYUV_API -int I420ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgb565, int dst_stride_rgb565, - int width, int height) { +int I420ToRGB565(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { int y; - void (*I422ToRGB565Row)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToRGB565Row_C; - if (!src_y || !src_u || !src_v || !dst_rgb565 || - width <= 0 || height == 0) { + void (*I422ToRGB565Row)(const uint8* y_buf, const uint8* u_buf, + const uint8* v_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB565Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
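The I420ToARGB1555, I420ToARGB4444, and I420ToRGB565 paths above all end in a simple bit-truncating pack of an 8-bit-per-channel pixel into 16 bits; the new DSPR2/MSA branches only change which row kernel performs it. A scalar packing sketch for the three layouts:

  #include <cstdint>

  // ARGB1555: 1-bit alpha, 5 bits per color channel.
  uint16_t PackARGB1555(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
    return (uint16_t)(((a >> 7) << 15) | ((r >> 3) << 10) |
                      ((g >> 3) << 5) | (b >> 3));
  }

  // ARGB4444: 4 bits per channel, alpha included.
  uint16_t PackARGB4444(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
    return (uint16_t)(((a >> 4) << 12) | ((r >> 4) << 8) |
                      ((g >> 4) << 4) | (b >> 4));
  }

  // RGB565: no alpha; green keeps the extra bit.
  uint16_t PackRGB565(uint8_t r, uint8_t g, uint8_t b) {
    return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
  }

The I420ToRGB565Dither path further below is the same truncation with a position-dependent 0..7 bias added to each channel first, selected from kDither565_4x4 by (y & 3) for the row and the low bits of x within the row.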
@@ -814,6 +869,14 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB565Row = I422ToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width); @@ -829,30 +892,31 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, // Ordered 8x8 dither for 888 to 565. Values from 0 to 7. static const uint8 kDither565_4x4[16] = { - 0, 4, 1, 5, - 6, 2, 7, 3, - 1, 5, 0, 4, - 7, 3, 6, 2, + 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, }; // Convert I420 to RGB565 with dithering. LIBYUV_API -int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgb565, int dst_stride_rgb565, - const uint8* dither4x4, int width, int height) { +int I420ToRGB565Dither(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_rgb565, + int dst_stride_rgb565, + const uint8* dither4x4, + int width, + int height) { int y; - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToARGBRow_C; + void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, + const uint8* v_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) = ARGBToRGB565DitherRow_C; - if (!src_y || !src_u || !src_v || !dst_rgb565 || - width <= 0 || height == 0) { + const uint32 dither4, int width) = + ARGBToRGB565DitherRow_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -896,6 +960,14 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, I422ToARGBRow = I422ToARGBRow_DSPR2; } #endif +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } + } +#endif #if defined(HAS_ARGBTORGB565DITHERROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; @@ -920,13 +992,22 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; + } + } +#endif { // Allocate a row of argb. 
align_buffer_64(row_argb, width * 4); for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width); ARGBToRGB565DitherRow(row_argb, dst_rgb565, - *(uint32*)(dither4x4 + ((y & 3) << 2)), width); + *(uint32*)(dither4x4 + ((y & 3) << 2)), + width); // NOLINT dst_rgb565 += dst_stride_rgb565; src_y += src_stride_y; if (y & 1) { @@ -941,218 +1022,156 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, // Convert I420 to specified format LIBYUV_API -int ConvertFromI420(const uint8* y, int y_stride, - const uint8* u, int u_stride, - const uint8* v, int v_stride, - uint8* dst_sample, int dst_sample_stride, - int width, int height, +int ConvertFromI420(const uint8* y, + int y_stride, + const uint8* u, + int u_stride, + const uint8* v, + int v_stride, + uint8* dst_sample, + int dst_sample_stride, + int width, + int height, uint32 fourcc) { uint32 format = CanonicalFourCC(fourcc); int r = 0; - if (!y || !u|| !v || !dst_sample || - width <= 0 || height == 0) { + if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) { return -1; } switch (format) { // Single plane formats case FOURCC_YUY2: - r = I420ToYUY2(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, - width, height); + r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, width, + height); break; case FOURCC_UYVY: - r = I420ToUYVY(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, - width, height); + r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, width, + height); break; case FOURCC_RGBP: - r = I420ToRGB565(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, - width, height); + r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, width, + height); break; case FOURCC_RGBO: - r = I420ToARGB1555(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, + r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width * 2, width, height); break; case FOURCC_R444: - r = I420ToARGB4444(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, + r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width * 2, width, height); break; case FOURCC_24BG: - r = I420ToRGB24(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 3, - width, height); + r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, width, + height); break; case FOURCC_RAW: - r = I420ToRAW(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 3, - width, height); + r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, width, + height); break; case FOURCC_ARGB: - r = I420ToARGB(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, - width, height); + r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? 
dst_sample_stride : width * 4, width, + height); break; case FOURCC_BGRA: - r = I420ToBGRA(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, - width, height); + r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); break; case FOURCC_ABGR: - r = I420ToABGR(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, - width, height); + r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); break; case FOURCC_RGBA: - r = I420ToRGBA(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, - width, height); + r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); break; case FOURCC_I400: - r = I400Copy(y, y_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width, - width, height); + r = I400Copy(y, y_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width, width, + height); break; case FOURCC_NV12: { uint8* dst_uv = dst_sample + width * height; - r = I420ToNV12(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width, - dst_uv, - dst_sample_stride ? dst_sample_stride : width, - width, height); + r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width, dst_uv, + dst_sample_stride ? dst_sample_stride : width, width, + height); break; } case FOURCC_NV21: { uint8* dst_vu = dst_sample + width * height; - r = I420ToNV21(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width, - dst_vu, - dst_sample_stride ? dst_sample_stride : width, - width, height); + r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width, dst_vu, + dst_sample_stride ? dst_sample_stride : width, width, + height); break; } // TODO(fbarchard): Add M420. // Triplanar formats - // TODO(fbarchard): halfstride instead of halfwidth case FOURCC_I420: case FOURCC_YV12: { - int halfwidth = (width + 1) / 2; + dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; + int halfstride = (dst_sample_stride + 1) / 2; int halfheight = (height + 1) / 2; uint8* dst_u; uint8* dst_v; if (format == FOURCC_YV12) { - dst_v = dst_sample + width * height; - dst_u = dst_v + halfwidth * halfheight; + dst_v = dst_sample + dst_sample_stride * height; + dst_u = dst_v + halfstride * halfheight; } else { - dst_u = dst_sample + width * height; - dst_v = dst_u + halfwidth * halfheight; + dst_u = dst_sample + dst_sample_stride * height; + dst_v = dst_u + halfstride * halfheight; } - r = I420Copy(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, width, - dst_u, halfwidth, - dst_v, halfwidth, + r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride, dst_u, halfstride, dst_v, halfstride, width, height); break; } case FOURCC_I422: case FOURCC_YV16: { - int halfwidth = (width + 1) / 2; + dst_sample_stride = dst_sample_stride ? 
dst_sample_stride : width; + int halfstride = (dst_sample_stride + 1) / 2; uint8* dst_u; uint8* dst_v; if (format == FOURCC_YV16) { - dst_v = dst_sample + width * height; - dst_u = dst_v + halfwidth * height; + dst_v = dst_sample + dst_sample_stride * height; + dst_u = dst_v + halfstride * height; } else { - dst_u = dst_sample + width * height; - dst_v = dst_u + halfwidth * height; + dst_u = dst_sample + dst_sample_stride * height; + dst_v = dst_u + halfstride * height; } - r = I420ToI422(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, width, - dst_u, halfwidth, - dst_v, halfwidth, + r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride, dst_u, halfstride, dst_v, halfstride, width, height); break; } case FOURCC_I444: case FOURCC_YV24: { + dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; uint8* dst_u; uint8* dst_v; if (format == FOURCC_YV24) { - dst_v = dst_sample + width * height; - dst_u = dst_v + width * height; + dst_v = dst_sample + dst_sample_stride * height; + dst_u = dst_v + dst_sample_stride * height; } else { - dst_u = dst_sample + width * height; - dst_v = dst_u + width * height; + dst_u = dst_sample + dst_sample_stride * height; + dst_v = dst_u + dst_sample_stride * height; } - r = I420ToI444(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, width, - dst_u, width, - dst_v, width, - width, height); + r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride, dst_u, dst_sample_stride, dst_v, + dst_sample_stride, width, height); break; } - case FOURCC_I411: { - int quarterwidth = (width + 3) / 4; - uint8* dst_u = dst_sample + width * height; - uint8* dst_v = dst_u + quarterwidth * height; - r = I420ToI411(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, width, - dst_u, quarterwidth, - dst_v, quarterwidth, - width, height); - break; - } - // Formats not supported - MJPG, biplanar, some rgb formats. default: return -1; // unknown fourcc - return failure code. diff --git a/files/source/convert_from_argb.cc b/files/source/convert_from_argb.cc index 2a8682b7..88f38279 100644 --- a/files/source/convert_from_argb.cc +++ b/files/source/convert_from_argb.cc @@ -22,16 +22,21 @@ extern "C" { // ARGB little endian (bgra in memory) to I444 LIBYUV_API -int ARGBToI444(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGBToI444(const uint8* src_argb, + int src_stride_argb, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int y; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width) = ARGBToUV444Row_C; + int width) = ARGBToUV444Row_C; if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } @@ -41,20 +46,18 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. 
- if (src_stride_argb == width * 4 && - dst_stride_y == width && - dst_stride_u == width && - dst_stride_v == width) { + if (src_stride_argb == width * 4 && dst_stride_y == width && + dst_stride_u == width && dst_stride_v == width) { width *= height; height = 1; src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; } #if defined(HAS_ARGBTOUV444ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUV444Row = ARGBToUV444Row_SSSE3; - } + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_SSSE3; + } } #endif #if defined(HAS_ARGBTOUV444ROW_NEON) @@ -65,6 +68,14 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOUV444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUV444Row = ARGBToUV444Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_MSA; + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -89,6 +100,22 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + ARGBToYRow = ARGBToYRow_Any_DSPR2; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_DSPR2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToUV444Row(src_argb, dst_u, dst_v, width); @@ -103,19 +130,22 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, // ARGB little endian (bgra in memory) to I422 LIBYUV_API -int ARGBToI422(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGBToI422(const uint8* src_argb, + int src_stride_argb, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, + uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; - if (!src_argb || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -125,10 +155,8 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. 
- if (src_stride_argb == width * 4 && - dst_stride_y == width && - dst_stride_u * 2 == width && - dst_stride_v * 2 == width) { + if (src_stride_argb == width * 4 && dst_stride_y == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width) { width *= height; height = 1; src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; @@ -169,82 +197,42 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, } } #endif - - for (y = 0; y < height; ++y) { - ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); - ARGBToYRow(src_argb, dst_y, width); - src_argb += src_stride_argb; - dst_y += dst_stride_y; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - return 0; -} - -// ARGB little endian (bgra in memory) to I411 -LIBYUV_API -int ARGBToI411(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - int y; - void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width) = ARGBToUV411Row_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = - ARGBToYRow_C; - if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_y == width && - dst_stride_u * 4 == width && - dst_stride_v * 4 == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; - } -#if defined(HAS_ARGBTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; +#if defined(HAS_ARGBTOYROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + ARGBToYRow = ARGBToYRow_Any_DSPR2; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_DSPR2; } } #endif -#if defined(HAS_ARGBTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYRow = ARGBToYRow_AVX2; +#if defined(HAS_ARGBTOUVROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + ARGBToUVRow = ARGBToUVRow_Any_DSPR2; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_DSPR2; } } #endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; + +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; } } #endif -#if defined(HAS_ARGBTOUV411ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUV411Row = ARGBToUV411Row_Any_NEON; +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; if (IS_ALIGNED(width, 32)) { - ARGBToUV411Row = ARGBToUV411Row_NEON; + ARGBToUVRow = ARGBToUVRow_MSA; } } #endif for (y = 0; y < height; ++y) { - ARGBToUV411Row(src_argb, dst_u, dst_v, width); + ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); ARGBToYRow(src_argb, dst_y, width); src_argb += src_stride_argb; dst_y += dst_stride_y; @@ -255,21 +243,23 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb, } LIBYUV_API -int ARGBToNV12(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { +int ARGBToNV12(const uint8* src_argb, + int src_stride_argb, + uint8* dst_y, + 
int dst_stride_y, + uint8* dst_uv, + int dst_stride_uv, + int width, + int height) { int y; int halfwidth = (width + 1) >> 1; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, + uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) = MergeUVRow_C; - if (!src_argb || - !dst_y || !dst_uv || - width <= 0 || height == 0) { + if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -314,6 +304,22 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -338,6 +344,30 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + ARGBToYRow = ARGBToYRow_Any_DSPR2; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_DSPR2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + ARGBToUVRow = ARGBToUVRow_Any_DSPR2; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_DSPR2; + } + } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow_ = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } +#endif { // Allocate a rows of uv. align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); @@ -364,21 +394,23 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, // Same as NV12 but U and V swapped. LIBYUV_API -int ARGBToNV21(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { +int ARGBToNV21(const uint8* src_argb, + int src_stride_argb, + uint8* dst_y, + int dst_stride_y, + uint8* dst_uv, + int dst_stride_uv, + int width, + int height) { int y; int halfwidth = (width + 1) >> 1; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, + uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) = MergeUVRow_C; - if (!src_argb || - !dst_y || !dst_uv || - width <= 0 || height == 0) { + if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
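ARGBToNV12 above converts through temporary U and V rows before MergeUVRow_ interleaves them, and the scratch block align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2) holds both rows at once, each rounded up to a whole number of 32-byte vectors so the "Any" SIMD variants can safely write past the visible tail. A sketch of that sizing arithmetic:

  // Round a chroma row up to a multiple of 32 bytes, then size one
  // block holding the U row followed by the V row.
  int ScratchUVBytes(int width) {
    int halfwidth = (width + 1) >> 1;
    int aligned_row = (halfwidth + 31) & ~31;  // per-row bytes, 32-aligned
    return aligned_row * 2;                    // U row, then V row
  }

Presumably the V row then lives at row_u + aligned_row, matching how the elided loop body addresses the two halves of the buffer.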
@@ -423,6 +455,22 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -447,6 +495,30 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + ARGBToYRow = ARGBToYRow_Any_DSPR2; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_DSPR2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + ARGBToUVRow = ARGBToUVRow_Any_DSPR2; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_DSPR2; + } + } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow_ = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } +#endif { // Allocate a rows of uv. align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); @@ -473,19 +545,22 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, // Convert ARGB to YUY2. LIBYUV_API -int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, - uint8* dst_yuy2, int dst_stride_yuy2, - int width, int height) { +int ARGBToYUY2(const uint8* src_argb, + int src_stride_argb, + uint8* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { int y; - void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, uint8* dst_u, + uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C; + const uint8* src_v, uint8* dst_yuy2, int width) = + I422ToYUY2Row_C; - if (!src_argb || !dst_yuy2 || - width <= 0 || height == 0) { + if (!src_argb || !dst_yuy2 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -495,8 +570,7 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, dst_stride_yuy2 = -dst_stride_yuy2; } // Coalesce rows. 
- if (src_stride_argb == width * 4 && - dst_stride_yuy2 == width * 2) { + if (src_stride_argb == width * 4 && dst_stride_yuy2 == width * 2) { width *= height; height = 1; src_stride_argb = dst_stride_yuy2 = 0; @@ -537,6 +611,22 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; @@ -553,6 +643,30 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + ARGBToYRow = ARGBToYRow_Any_DSPR2; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_DSPR2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + ARGBToUVRow = ARGBToUVRow_Any_DSPR2; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_DSPR2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToYUY2Row = I422ToYUY2Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_MSA; + } + } +#endif { // Allocate a rows of yuv. @@ -575,19 +689,22 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, // Convert ARGB to UYVY. LIBYUV_API -int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, - uint8* dst_uyvy, int dst_stride_uyvy, - int width, int height) { +int ARGBToUYVY(const uint8* src_argb, + int src_stride_argb, + uint8* dst_uyvy, + int dst_stride_uyvy, + int width, + int height) { int y; - void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, uint8* dst_u, + uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C; + const uint8* src_v, uint8* dst_uyvy, int width) = + I422ToUYVYRow_C; - if (!src_argb || !dst_uyvy || - width <= 0 || height == 0) { + if (!src_argb || !dst_uyvy || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -597,8 +714,7 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, dst_stride_uyvy = -dst_stride_uyvy; } // Coalesce rows. 
- if (src_stride_argb == width * 4 && - dst_stride_uyvy == width * 2) { + if (src_stride_argb == width * 4 && dst_stride_uyvy == width * 2) { width *= height; height = 1; src_stride_argb = dst_stride_uyvy = 0; @@ -639,6 +755,22 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif #if defined(HAS_I422TOUYVYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; @@ -655,6 +787,30 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + ARGBToYRow = ARGBToYRow_Any_DSPR2; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_DSPR2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + ARGBToUVRow = ARGBToUVRow_Any_DSPR2; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_DSPR2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif { // Allocate a rows of yuv. @@ -677,9 +833,12 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, // Convert ARGB to I400. LIBYUV_API -int ARGBToI400(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - int width, int height) { +int ARGBToI400(const uint8* src_argb, + int src_stride_argb, + uint8* dst_y, + int dst_stride_y, + int width, + int height) { int y; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = ARGBToYRow_C; @@ -692,8 +851,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_y == width) { + if (src_stride_argb == width * 4 && dst_stride_y == width) { width *= height; height = 1; src_stride_argb = dst_stride_y = 0; @@ -722,6 +880,22 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + ARGBToYRow = ARGBToYRow_Any_DSPR2; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_DSPR2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToYRow(src_argb, dst_y, width); @@ -732,26 +906,29 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, } // Shuffle table for converting ARGB to RGBA. -static uvec8 kShuffleMaskARGBToRGBA = { - 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u -}; +static uvec8 kShuffleMaskARGBToRGBA = {3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, + 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u}; // Convert ARGB to RGBA. 
LIBYUV_API -int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height) { - return ARGBShuffle(src_argb, src_stride_argb, - dst_rgba, dst_stride_rgba, - (const uint8*)(&kShuffleMaskARGBToRGBA), - width, height); +int ARGBToRGBA(const uint8* src_argb, + int src_stride_argb, + uint8* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba, + (const uint8*)(&kShuffleMaskARGBToRGBA), width, height); } // Convert ARGB To RGB24. LIBYUV_API -int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb24, int dst_stride_rgb24, - int width, int height) { +int ARGBToRGB24(const uint8* src_argb, + int src_stride_argb, + uint8* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { int y; void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int width) = ARGBToRGB24Row_C; @@ -764,8 +941,7 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_rgb24 == width * 3) { + if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) { width *= height; height = 1; src_stride_argb = dst_stride_rgb24 = 0; @@ -786,6 +962,14 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB24Row(src_argb, dst_rgb24, width); @@ -797,9 +981,12 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, // Convert ARGB To RAW. LIBYUV_API -int ARGBToRAW(const uint8* src_argb, int src_stride_argb, - uint8* dst_raw, int dst_stride_raw, - int width, int height) { +int ARGBToRAW(const uint8* src_argb, + int src_stride_argb, + uint8* dst_raw, + int dst_stride_raw, + int width, + int height) { int y; void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int width) = ARGBToRAWRow_C; @@ -812,8 +999,7 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_raw == width * 3) { + if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) { width *= height; height = 1; src_stride_argb = dst_stride_raw = 0; @@ -834,6 +1020,14 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORAWROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRAWRow = ARGBToRAWRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRAWRow(src_argb, dst_raw, width); @@ -845,20 +1039,22 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, // Ordered 8x8 dither for 888 to 565. Values from 0 to 7. static const uint8 kDither565_4x4[16] = { - 0, 4, 1, 5, - 6, 2, 7, 3, - 1, 5, 0, 4, - 7, 3, 6, 2, + 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, }; // Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). 
LIBYUV_API -int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb565, int dst_stride_rgb565, - const uint8* dither4x4, int width, int height) { +int ARGBToRGB565Dither(const uint8* src_argb, + int src_stride_argb, + uint8* dst_rgb565, + int dst_stride_rgb565, + const uint8* dither4x4, + int width, + int height) { int y; void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) = ARGBToRGB565DitherRow_C; + const uint32 dither4, int width) = + ARGBToRGB565DitherRow_C; if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { return -1; } @@ -894,9 +1090,19 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; + } + } +#endif + for (y = 0; y < height; ++y) { ARGBToRGB565DitherRow(src_argb, dst_rgb565, - *(uint32*)(dither4x4 + ((y & 3) << 2)), width); + *(uint32*)(dither4x4 + ((y & 3) << 2)), + width); /* NOLINT */ src_argb += src_stride_argb; dst_rgb565 += dst_stride_rgb565; } @@ -906,9 +1112,12 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, // Convert ARGB To RGB565. // TODO(fbarchard): Consider using dither function low level with zeros. LIBYUV_API -int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb565, int dst_stride_rgb565, - int width, int height) { +int ARGBToRGB565(const uint8* src_argb, + int src_stride_argb, + uint8* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { int y; void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int width) = ARGBToRGB565Row_C; @@ -921,8 +1130,7 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_rgb565 == width * 2) { + if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) { width *= height; height = 1; src_stride_argb = dst_stride_rgb565 = 0; @@ -951,6 +1159,14 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB565Row(src_argb, dst_rgb565, width); @@ -962,9 +1178,12 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, // Convert ARGB To ARGB1555. LIBYUV_API -int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb1555, int dst_stride_argb1555, - int width, int height) { +int ARGBToARGB1555(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb1555, + int dst_stride_argb1555, + int width, + int height) { int y; void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int width) = ARGBToARGB1555Row_C; @@ -977,8 +1196,7 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. 
- if (src_stride_argb == width * 4 && - dst_stride_argb1555 == width * 2) { + if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) { width *= height; height = 1; src_stride_argb = dst_stride_argb1555 = 0; @@ -1007,6 +1225,14 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOARGB1555ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToARGB1555Row(src_argb, dst_argb1555, width); @@ -1018,9 +1244,12 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, // Convert ARGB To ARGB4444. LIBYUV_API -int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb4444, int dst_stride_argb4444, - int width, int height) { +int ARGBToARGB4444(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb4444, + int dst_stride_argb4444, + int width, + int height) { int y; void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int width) = ARGBToARGB4444Row_C; @@ -1033,8 +1262,7 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb4444 == width * 2) { + if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) { width *= height; height = 1; src_stride_argb = dst_stride_argb4444 = 0; @@ -1063,6 +1291,14 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOARGB4444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToARGB4444Row(src_argb, dst_argb4444, width); @@ -1074,19 +1310,22 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, // Convert ARGB to J420. (JPeg full range I420). LIBYUV_API -int ARGBToJ420(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGBToJ420(const uint8* src_argb, + int src_stride_argb, + uint8* dst_yj, + int dst_stride_yj, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int y; void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) = ARGBToYJRow_C; - if (!src_argb || - !dst_yj || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1129,6 +1368,22 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVJRow = ARGBToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width); @@ -1148,19 +1403,22 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, // Convert ARGB to J422. (JPeg full range I422). 
LIBYUV_API -int ARGBToJ422(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGBToJ422(const uint8* src_argb, + int src_stride_argb, + uint8* dst_yj, + int dst_stride_yj, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int y; void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) = ARGBToYJRow_C; - if (!src_argb || - !dst_yj || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1170,10 +1428,8 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_yj == width && - dst_stride_u * 2 == width && - dst_stride_v * 2 == width) { + if (src_stride_argb == width * 4 && dst_stride_yj == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width) { width *= height; height = 1; src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0; @@ -1212,6 +1468,22 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVJRow = ARGBToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); @@ -1226,9 +1498,12 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb, // Convert ARGB to J400. LIBYUV_API -int ARGBToJ400(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - int width, int height) { +int ARGBToJ400(const uint8* src_argb, + int src_stride_argb, + uint8* dst_yj, + int dst_stride_yj, + int width, + int height) { int y; void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) = ARGBToYJRow_C; @@ -1241,8 +1516,7 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. 
- if (src_stride_argb == width * 4 && - dst_stride_yj == width) { + if (src_stride_argb == width * 4 && dst_stride_yj == width) { width *= height; height = 1; src_stride_argb = dst_stride_yj = 0; @@ -1271,6 +1545,14 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToYJRow(src_argb, dst_yj, width); diff --git a/files/source/convert_jpeg.cc b/files/source/convert_jpeg.cc index 90f550a2..216a9f26 100644 --- a/files/source/convert_jpeg.cc +++ b/files/source/convert_jpeg.cc @@ -37,13 +37,9 @@ static void JpegCopyI420(void* opaque, const int* strides, int rows) { I420Buffers* dest = (I420Buffers*)(opaque); - I420Copy(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->y, dest->y_stride, - dest->u, dest->u_stride, - dest->v, dest->v_stride, - dest->w, rows); + I420Copy(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, + dest->v_stride, dest->w, rows); dest->y += rows * dest->y_stride; dest->u += ((rows + 1) >> 1) * dest->u_stride; dest->v += ((rows + 1) >> 1) * dest->v_stride; @@ -55,13 +51,9 @@ static void JpegI422ToI420(void* opaque, const int* strides, int rows) { I420Buffers* dest = (I420Buffers*)(opaque); - I422ToI420(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->y, dest->y_stride, - dest->u, dest->u_stride, - dest->v, dest->v_stride, - dest->w, rows); + I422ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, + dest->v_stride, dest->w, rows); dest->y += rows * dest->y_stride; dest->u += ((rows + 1) >> 1) * dest->u_stride; dest->v += ((rows + 1) >> 1) * dest->v_stride; @@ -73,31 +65,9 @@ static void JpegI444ToI420(void* opaque, const int* strides, int rows) { I420Buffers* dest = (I420Buffers*)(opaque); - I444ToI420(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->y, dest->y_stride, - dest->u, dest->u_stride, - dest->v, dest->v_stride, - dest->w, rows); - dest->y += rows * dest->y_stride; - dest->u += ((rows + 1) >> 1) * dest->u_stride; - dest->v += ((rows + 1) >> 1) * dest->v_stride; - dest->h -= rows; -} - -static void JpegI411ToI420(void* opaque, - const uint8* const* data, - const int* strides, - int rows) { - I420Buffers* dest = (I420Buffers*)(opaque); - I411ToI420(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->y, dest->y_stride, - dest->u, dest->u_stride, - dest->v, dest->v_stride, - dest->w, rows); + I444ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, + dest->v_stride, dest->w, rows); dest->y += rows * dest->y_stride; dest->u += ((rows + 1) >> 1) * dest->u_stride; dest->v += ((rows + 1) >> 1) * dest->v_stride; @@ -109,11 +79,8 @@ static void JpegI400ToI420(void* opaque, const int* strides, int rows) { I420Buffers* dest = (I420Buffers*)(opaque); - I400ToI420(data[0], strides[0], - dest->y, dest->y_stride, - dest->u, dest->u_stride, - dest->v, dest->v_stride, - dest->w, rows); + I400ToI420(data[0], strides[0], dest->y, dest->y_stride, dest->u, + dest->u_stride, dest->v, dest->v_stride, dest->w, rows); dest->y += rows * dest->y_stride; dest->u += ((rows + 1) >> 1) * dest->u_stride; dest->v += ((rows + 1) >> 1) * 
dest->v_stride; @@ -122,8 +89,7 @@ static void JpegI400ToI420(void* opaque, // Query size of MJPG in pixels. LIBYUV_API -int MJPGSize(const uint8* sample, size_t sample_size, - int* width, int* height) { +int MJPGSize(const uint8* sample, size_t sample_size, int* width, int* height) { MJpegDecoder mjpeg_decoder; LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); if (ret) { @@ -139,11 +105,16 @@ int MJPGSize(const uint8* sample, size_t sample_size, LIBYUV_API int MJPGToI420(const uint8* sample, size_t sample_size, - uint8* y, int y_stride, - uint8* u, int u_stride, - uint8* v, int v_stride, - int w, int h, - int dw, int dh) { + uint8* y, + int y_stride, + uint8* u, + int u_stride, + uint8* v, + int v_stride, + int w, + int h, + int dw, + int dh) { if (sample_size == kUnknownDataSize) { // ERROR: MJPEG frame size unknown return -1; @@ -152,17 +123,16 @@ int MJPGToI420(const uint8* sample, // TODO(fbarchard): Port MJpeg to C. MJpegDecoder mjpeg_decoder; LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); - if (ret && (mjpeg_decoder.GetWidth() != w || - mjpeg_decoder.GetHeight() != h)) { + if (ret && + (mjpeg_decoder.GetWidth() != w || mjpeg_decoder.GetHeight() != h)) { // ERROR: MJPEG frame has unexpected dimensions mjpeg_decoder.UnloadFrame(); return 1; // runtime failure } if (ret) { - I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh }; + I420Buffers bufs = {y, y_stride, u, u_stride, v, v_stride, dw, dh}; // YUV420 - if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && + if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && mjpeg_decoder.GetVertSampFactor(0) == 2 && mjpeg_decoder.GetHorizSampFactor(0) == 2 && @@ -171,7 +141,7 @@ int MJPGToI420(const uint8* sample, mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh); - // YUV422 + // YUV422 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && @@ -182,7 +152,7 @@ int MJPGToI420(const uint8* sample, mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh); - // YUV444 + // YUV444 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && @@ -193,18 +163,7 @@ int MJPGToI420(const uint8* sample, mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh); - // YUV411 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 4 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh); - // YUV400 + // YUV400 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceGrayscale && mjpeg_decoder.GetNumComponents() == 1 && @@ -213,7 +172,7 @@ int MJPGToI420(const uint8* sample, ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh); } else { // TODO(fbarchard): Implement conversion for any other colorspace/sample - // factors that occur in practice. 
411 is supported by libjpeg + // factors that occur in practice. // ERROR: Unable to convert MJPEG frame because format is not supported mjpeg_decoder.UnloadFrame(); return 1; @@ -231,57 +190,34 @@ struct ARGBBuffers { }; static void JpegI420ToARGB(void* opaque, - const uint8* const* data, - const int* strides, - int rows) { - ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I420ToARGB(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->argb, dest->argb_stride, - dest->w, rows); - dest->argb += rows * dest->argb_stride; - dest->h -= rows; -} - -static void JpegI422ToARGB(void* opaque, const uint8* const* data, const int* strides, int rows) { ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I422ToARGB(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->argb, dest->argb_stride, - dest->w, rows); + I420ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->argb, dest->argb_stride, dest->w, rows); dest->argb += rows * dest->argb_stride; dest->h -= rows; } -static void JpegI444ToARGB(void* opaque, +static void JpegI422ToARGB(void* opaque, const uint8* const* data, const int* strides, int rows) { ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I444ToARGB(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->argb, dest->argb_stride, - dest->w, rows); + I422ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->argb, dest->argb_stride, dest->w, rows); dest->argb += rows * dest->argb_stride; dest->h -= rows; } -static void JpegI411ToARGB(void* opaque, +static void JpegI444ToARGB(void* opaque, const uint8* const* data, const int* strides, int rows) { ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I411ToARGB(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->argb, dest->argb_stride, - dest->w, rows); + I444ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->argb, dest->argb_stride, dest->w, rows); dest->argb += rows * dest->argb_stride; dest->h -= rows; } @@ -291,9 +227,7 @@ static void JpegI400ToARGB(void* opaque, const int* strides, int rows) { ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I400ToARGB(data[0], strides[0], - dest->argb, dest->argb_stride, - dest->w, rows); + I400ToARGB(data[0], strides[0], dest->argb, dest->argb_stride, dest->w, rows); dest->argb += rows * dest->argb_stride; dest->h -= rows; } @@ -303,9 +237,12 @@ static void JpegI400ToARGB(void* opaque, LIBYUV_API int MJPGToARGB(const uint8* sample, size_t sample_size, - uint8* argb, int argb_stride, - int w, int h, - int dw, int dh) { + uint8* argb, + int argb_stride, + int w, + int h, + int dw, + int dh) { if (sample_size == kUnknownDataSize) { // ERROR: MJPEG frame size unknown return -1; @@ -314,17 +251,16 @@ int MJPGToARGB(const uint8* sample, // TODO(fbarchard): Port MJpeg to C. 
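The converter callbacks above all share one contract with DecodeToCallback: each invocation delivers `rows` freshly decoded scanlines per component, and the callback both converts them and advances its own output cursors so the next invocation appends below. A sketch of a custom ARGB callback under that assumed contract:

typedef struct {
  uint8_t* argb;
  int argb_stride;
  int w, h;
} ARGBBuffers;

static void MyJpegToARGB(void* opaque, const uint8_t* const* data,
                         const int* strides, int rows) {
  ARGBBuffers* dest = (ARGBBuffers*)opaque;
  // ... convert `rows` scanlines from data[]/strides[] into dest->argb ...
  dest->argb += rows * dest->argb_stride;  // move the write cursor down
  dest->h -= rows;                         // scanlines still expected
}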
MJpegDecoder mjpeg_decoder; LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); - if (ret && (mjpeg_decoder.GetWidth() != w || - mjpeg_decoder.GetHeight() != h)) { + if (ret && + (mjpeg_decoder.GetWidth() != w || mjpeg_decoder.GetHeight() != h)) { // ERROR: MJPEG frame has unexpected dimensions mjpeg_decoder.UnloadFrame(); return 1; // runtime failure } if (ret) { - ARGBBuffers bufs = { argb, argb_stride, dw, dh }; + ARGBBuffers bufs = {argb, argb_stride, dw, dh}; // YUV420 - if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && + if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && mjpeg_decoder.GetVertSampFactor(0) == 2 && mjpeg_decoder.GetHorizSampFactor(0) == 2 && @@ -333,7 +269,7 @@ int MJPGToARGB(const uint8* sample, mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh); - // YUV422 + // YUV422 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && @@ -344,7 +280,7 @@ int MJPGToARGB(const uint8* sample, mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh); - // YUV444 + // YUV444 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && @@ -355,18 +291,7 @@ int MJPGToARGB(const uint8* sample, mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh); - // YUV411 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 4 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh); - // YUV400 + // YUV400 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceGrayscale && mjpeg_decoder.GetNumComponents() == 1 && @@ -375,7 +300,7 @@ int MJPGToARGB(const uint8* sample, ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh); } else { // TODO(fbarchard): Implement conversion for any other colorspace/sample - // factors that occur in practice. 411 is supported by libjpeg + // factors that occur in practice. // ERROR: Unable to convert MJPEG frame because format is not supported mjpeg_decoder.UnloadFrame(); return 1; diff --git a/files/source/convert_to_argb.cc b/files/source/convert_to_argb.cc index bccb34c3..63a5104b 100644 --- a/files/source/convert_to_argb.cc +++ b/files/source/convert_to_argb.cc @@ -29,11 +29,16 @@ extern "C" { // sample_size is measured in bytes and is the size of the frame. // With MJPEG it is the compressed size of the frame. 
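Both MJPG entry points use the same three-way return convention, visible in the hunks above: -1 for parameter problems detected before decoding (for example kUnknownDataSize), 1 for runtime failures such as mismatched dimensions or an unsupported subsampling, and 0 for success. A caller sketch, assuming the destination planes are already allocated:

int r = MJPGToI420(sample, sample_size,
                   y, y_stride, u, u_stride, v, v_stride,
                   w, h, dw, dh);
if (r == -1) {
  // Invalid arguments; nothing was decoded.
} else if (r == 1) {
  // Runtime failure: wrong dimensions or unsupported colorspace.
} else {
  // r == 0: frame decoded into the I420 planes.
}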
LIBYUV_API -int ConvertToARGB(const uint8* sample, size_t sample_size, - uint8* crop_argb, int argb_stride, - int crop_x, int crop_y, - int src_width, int src_height, - int crop_width, int crop_height, +int ConvertToARGB(const uint8* sample, + size_t sample_size, + uint8* crop_argb, + int argb_stride, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, enum RotationMode rotation, uint32 fourcc) { uint32 format = CanonicalFourCC(fourcc); @@ -49,16 +54,15 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, // and then rotate the ARGB to the final destination buffer. // For in-place conversion, if destination crop_argb is same as source sample, // also enable temporary buffer. - LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) || - crop_argb == sample; + LIBYUV_BOOL need_buf = + (rotation && format != FOURCC_ARGB) || crop_argb == sample; uint8* dest_argb = crop_argb; int dest_argb_stride = argb_stride; uint8* rotate_buffer = NULL; int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; - if (crop_argb == NULL || sample == NULL || - src_width <= 0 || crop_width <= 0 || - src_height == 0 || crop_height == 0) { + if (crop_argb == NULL || sample == NULL || src_width <= 0 || + crop_width <= 0 || src_height == 0 || crop_height == 0) { return -1; } if (src_height < 0) { @@ -67,7 +71,7 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, if (need_buf) { int argb_size = crop_width * 4 * abs_crop_height; - rotate_buffer = (uint8*)malloc(argb_size); + rotate_buffer = (uint8*)malloc(argb_size); /* NOLINT */ if (!rotate_buffer) { return 1; // Out of memory runtime error. } @@ -79,102 +83,85 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, // Single plane formats case FOURCC_YUY2: src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = YUY2ToARGB(src, aligned_src_width * 2, - crop_argb, argb_stride, + r = YUY2ToARGB(src, aligned_src_width * 2, crop_argb, argb_stride, crop_width, inv_crop_height); break; case FOURCC_UYVY: src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = UYVYToARGB(src, aligned_src_width * 2, - crop_argb, argb_stride, + r = UYVYToARGB(src, aligned_src_width * 2, crop_argb, argb_stride, crop_width, inv_crop_height); break; case FOURCC_24BG: src = sample + (src_width * crop_y + crop_x) * 3; - r = RGB24ToARGB(src, src_width * 3, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = RGB24ToARGB(src, src_width * 3, crop_argb, argb_stride, crop_width, + inv_crop_height); break; case FOURCC_RAW: src = sample + (src_width * crop_y + crop_x) * 3; - r = RAWToARGB(src, src_width * 3, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = RAWToARGB(src, src_width * 3, crop_argb, argb_stride, crop_width, + inv_crop_height); break; case FOURCC_ARGB: - if (!need_buf && !rotation ) { + if (!need_buf && !rotation) { src = sample + (src_width * crop_y + crop_x) * 4; - r = ARGBToARGB(src, src_width * 4, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = ARGBToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width, + inv_crop_height); } break; case FOURCC_BGRA: src = sample + (src_width * crop_y + crop_x) * 4; - r = BGRAToARGB(src, src_width * 4, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = BGRAToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width, + inv_crop_height); break; case FOURCC_ABGR: src = sample + (src_width * crop_y + crop_x) * 4; - r = ABGRToARGB(src, src_width * 4, - crop_argb, argb_stride, - crop_width, 
inv_crop_height); + r = ABGRToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width, + inv_crop_height); break; case FOURCC_RGBA: src = sample + (src_width * crop_y + crop_x) * 4; - r = RGBAToARGB(src, src_width * 4, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = RGBAToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width, + inv_crop_height); break; case FOURCC_RGBP: src = sample + (src_width * crop_y + crop_x) * 2; - r = RGB565ToARGB(src, src_width * 2, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = RGB565ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width, + inv_crop_height); break; case FOURCC_RGBO: src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB1555ToARGB(src, src_width * 2, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = ARGB1555ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width, + inv_crop_height); break; case FOURCC_R444: src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB4444ToARGB(src, src_width * 2, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = ARGB4444ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width, + inv_crop_height); break; case FOURCC_I400: src = sample + src_width * crop_y + crop_x; - r = I400ToARGB(src, src_width, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = I400ToARGB(src, src_width, crop_argb, argb_stride, crop_width, + inv_crop_height); break; // Biplanar formats case FOURCC_NV12: src = sample + (src_width * crop_y + crop_x); src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; - r = NV12ToARGB(src, src_width, - src_uv, aligned_src_width, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, crop_argb, + argb_stride, crop_width, inv_crop_height); break; case FOURCC_NV21: src = sample + (src_width * crop_y + crop_x); src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; // Call NV12 but with u and v parameters swapped. 
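All of the crop arithmetic in this switch follows two rules. Packed formats offset by crop_y whole rows plus crop_x pixels, scaled by bytes per pixel; biplanar formats first skip the full-size Y plane, then step into the half-height interleaved UV plane. A sketch using the same variable names as the surrounding code:

// Packed formats: bpp is 2 (YUY2/RGB565/...), 3 (RGB24/RAW) or 4 (ARGB/...).
const uint8_t* src = sample + (src_width * crop_y + crop_x) * bpp;

// NV12: full-size Y plane, then interleaved UV at half vertical resolution.
const uint8_t* src_y  = sample + src_width * crop_y + crop_x;
const uint8_t* src_uv = sample + src_width * src_height      // past Y plane
                      + (crop_y / 2) * aligned_src_width     // half-res rows
                      + (crop_x / 2) * 2;                    // whole UV pairs

The NV21 case that follows reuses the same layout; only the meaning of the two bytes in each UV pair is swapped.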
- r = NV21ToARGB(src, src_width, - src_uv, aligned_src_width, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, crop_argb, + argb_stride, crop_width, inv_crop_height); break; case FOURCC_M420: src = sample + (src_width * crop_y) * 12 / 8 + crop_x; - r = M420ToARGB(src, src_width, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = M420ToARGB(src, src_width, crop_argb, argb_stride, crop_width, + inv_crop_height); break; // Triplanar formats case FOURCC_I420: @@ -186,20 +173,17 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, int halfheight = (abs_src_height + 1) / 2; if (format == FOURCC_YV12) { src_v = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; + (halfwidth * crop_y + crop_x) / 2; src_u = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; } else { src_u = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; + (halfwidth * crop_y + crop_x) / 2; src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; } - r = I420ToARGB(src_y, src_width, - src_u, halfwidth, - src_v, halfwidth, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + crop_argb, argb_stride, crop_width, inv_crop_height); break; } @@ -210,14 +194,11 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, int halfwidth = (src_width + 1) / 2; int halfheight = (abs_src_height + 1) / 2; src_u = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; + (halfwidth * crop_y + crop_x) / 2; src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; - r = J420ToARGB(src_y, src_width, - src_u, halfwidth, - src_v, halfwidth, - crop_argb, argb_stride, - crop_width, inv_crop_height); + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + crop_argb, argb_stride, crop_width, inv_crop_height); break; } @@ -228,21 +209,18 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, const uint8* src_v; int halfwidth = (src_width + 1) / 2; if (format == FOURCC_YV16) { - src_v = sample + src_width * abs_src_height + - halfwidth * crop_y + crop_x / 2; + src_v = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; src_u = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; + halfwidth * (abs_src_height + crop_y) + crop_x / 2; } else { - src_u = sample + src_width * abs_src_height + - halfwidth * crop_y + crop_x / 2; + src_u = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; src_v = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; + halfwidth * (abs_src_height + crop_y) + crop_x / 2; } - r = I422ToARGB(src_y, src_width, - src_u, halfwidth, - src_v, halfwidth, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + crop_argb, argb_stride, crop_width, inv_crop_height); break; } case FOURCC_I444: @@ -257,32 +235,14 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; } - r = 
I444ToARGB(src_y, src_width, - src_u, src_width, - src_v, src_width, - crop_argb, argb_stride, - crop_width, inv_crop_height); - break; - } - case FOURCC_I411: { - int quarterwidth = (src_width + 3) / 4; - const uint8* src_y = sample + src_width * crop_y + crop_x; - const uint8* src_u = sample + src_width * abs_src_height + - quarterwidth * crop_y + crop_x / 4; - const uint8* src_v = sample + src_width * abs_src_height + - quarterwidth * (abs_src_height + crop_y) + crop_x / 4; - r = I411ToARGB(src_y, src_width, - src_u, quarterwidth, - src_v, quarterwidth, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, + crop_argb, argb_stride, crop_width, inv_crop_height); break; } #ifdef HAVE_JPEG case FOURCC_MJPG: - r = MJPGToARGB(sample, sample_size, - crop_argb, argb_stride, - src_width, abs_src_height, crop_width, inv_crop_height); + r = MJPGToARGB(sample, sample_size, crop_argb, argb_stride, src_width, + abs_src_height, crop_width, inv_crop_height); break; #endif default: @@ -291,16 +251,14 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, if (need_buf) { if (!r) { - r = ARGBRotate(crop_argb, argb_stride, - dest_argb, dest_argb_stride, + r = ARGBRotate(crop_argb, argb_stride, dest_argb, dest_argb_stride, crop_width, abs_crop_height, rotation); } free(rotate_buffer); } else if (rotation) { src = sample + (src_width * crop_y + crop_x) * 4; - r = ARGBRotate(src, src_width * 4, - crop_argb, argb_stride, - crop_width, inv_crop_height, rotation); + r = ARGBRotate(src, src_width * 4, crop_argb, argb_stride, crop_width, + inv_crop_height, rotation); } return r; diff --git a/files/source/convert_to_i420.cc b/files/source/convert_to_i420.cc index e5f307c4..a50689db 100644 --- a/files/source/convert_to_i420.cc +++ b/files/source/convert_to_i420.cc @@ -27,12 +27,18 @@ extern "C" { LIBYUV_API int ConvertToI420(const uint8* sample, size_t sample_size, - uint8* y, int y_stride, - uint8* u, int u_stride, - uint8* v, int v_stride, - int crop_x, int crop_y, - int src_width, int src_height, - int crop_width, int crop_height, + uint8* y, + int y_stride, + uint8* u, + int u_stride, + uint8* v, + int v_stride, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, enum RotationMode rotation, uint32 fourcc) { uint32 format = CanonicalFourCC(fourcc); @@ -43,9 +49,10 @@ int ConvertToI420(const uint8* sample, // TODO(nisse): Why allow crop_height < 0? const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; int r = 0; - LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 && - format != FOURCC_NV12 && format != FOURCC_NV21 && - format != FOURCC_YV12) || y == sample; + LIBYUV_BOOL need_buf = + (rotation && format != FOURCC_I420 && format != FOURCC_NV12 && + format != FOURCC_NV21 && format != FOURCC_YV12) || + y == sample; uint8* tmp_y = y; uint8* tmp_u = u; uint8* tmp_v = v; @@ -56,8 +63,7 @@ int ConvertToI420(const uint8* sample, const int inv_crop_height = (src_height < 0) ? 
-abs_crop_height : abs_crop_height; - if (!y || !u || !v || !sample || - src_width <= 0 || crop_width <= 0 || + if (!y || !u || !v || !sample || src_width <= 0 || crop_width <= 0 || src_height == 0 || crop_height == 0) { return -1; } @@ -70,7 +76,7 @@ int ConvertToI420(const uint8* sample, if (need_buf) { int y_size = crop_width * abs_crop_height; int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2); - rotate_buffer = (uint8*)malloc(y_size + uv_size * 2); + rotate_buffer = (uint8*)malloc(y_size + uv_size * 2); /* NOLINT */ if (!rotate_buffer) { return 1; // Out of memory runtime error. } @@ -85,130 +91,85 @@ int ConvertToI420(const uint8* sample, // Single plane formats case FOURCC_YUY2: src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = YUY2ToI420(src, aligned_src_width * 2, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = YUY2ToI420(src, aligned_src_width * 2, y, y_stride, u, u_stride, v, + v_stride, crop_width, inv_crop_height); break; case FOURCC_UYVY: src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = UYVYToI420(src, aligned_src_width * 2, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = UYVYToI420(src, aligned_src_width * 2, y, y_stride, u, u_stride, v, + v_stride, crop_width, inv_crop_height); break; case FOURCC_RGBP: src = sample + (src_width * crop_y + crop_x) * 2; - r = RGB565ToI420(src, src_width * 2, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = RGB565ToI420(src, src_width * 2, y, y_stride, u, u_stride, v, + v_stride, crop_width, inv_crop_height); break; case FOURCC_RGBO: src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB1555ToI420(src, src_width * 2, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = ARGB1555ToI420(src, src_width * 2, y, y_stride, u, u_stride, v, + v_stride, crop_width, inv_crop_height); break; case FOURCC_R444: src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB4444ToI420(src, src_width * 2, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = ARGB4444ToI420(src, src_width * 2, y, y_stride, u, u_stride, v, + v_stride, crop_width, inv_crop_height); break; case FOURCC_24BG: src = sample + (src_width * crop_y + crop_x) * 3; - r = RGB24ToI420(src, src_width * 3, - y, y_stride, - u, u_stride, - v, v_stride, + r = RGB24ToI420(src, src_width * 3, y, y_stride, u, u_stride, v, v_stride, crop_width, inv_crop_height); break; case FOURCC_RAW: src = sample + (src_width * crop_y + crop_x) * 3; - r = RAWToI420(src, src_width * 3, - y, y_stride, - u, u_stride, - v, v_stride, + r = RAWToI420(src, src_width * 3, y, y_stride, u, u_stride, v, v_stride, crop_width, inv_crop_height); break; case FOURCC_ARGB: src = sample + (src_width * crop_y + crop_x) * 4; - r = ARGBToI420(src, src_width * 4, - y, y_stride, - u, u_stride, - v, v_stride, + r = ARGBToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride, crop_width, inv_crop_height); break; case FOURCC_BGRA: src = sample + (src_width * crop_y + crop_x) * 4; - r = BGRAToI420(src, src_width * 4, - y, y_stride, - u, u_stride, - v, v_stride, + r = BGRAToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride, crop_width, inv_crop_height); break; case FOURCC_ABGR: src = sample + (src_width * crop_y + crop_x) * 4; - r = ABGRToI420(src, src_width * 4, - y, y_stride, - u, u_stride, - v, v_stride, + r = ABGRToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride, crop_width, inv_crop_height); 
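From the caller's side, every case in this switch is reached through a single FourCC-keyed entry point. A usage sketch, assuming the destination planes are sized for the source frame:

// Convert a YUY2 camera frame to I420, no crop, no rotation.
int r = ConvertToI420(sample, sample_size,
                      y, y_stride, u, u_stride, v, v_stride,
                      0, 0,                      // crop_x, crop_y
                      src_width, src_height,     // full source size
                      src_width, src_height,     // crop = whole frame
                      kRotate0, FOURCC_YUY2);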
break; case FOURCC_RGBA: src = sample + (src_width * crop_y + crop_x) * 4; - r = RGBAToI420(src, src_width * 4, - y, y_stride, - u, u_stride, - v, v_stride, + r = RGBAToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride, crop_width, inv_crop_height); break; case FOURCC_I400: src = sample + src_width * crop_y + crop_x; - r = I400ToI420(src, src_width, - y, y_stride, - u, u_stride, - v, v_stride, + r = I400ToI420(src, src_width, y, y_stride, u, u_stride, v, v_stride, crop_width, inv_crop_height); break; // Biplanar formats case FOURCC_NV12: src = sample + (src_width * crop_y + crop_x); src_uv = sample + (src_width * src_height) + - ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); - r = NV12ToI420Rotate(src, src_width, - src_uv, aligned_src_width, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height, rotation); + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); + r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, y, + y_stride, u, u_stride, v, v_stride, crop_width, + inv_crop_height, rotation); break; case FOURCC_NV21: src = sample + (src_width * crop_y + crop_x); src_uv = sample + (src_width * src_height) + - ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); // Call NV12 but with u and v parameters swapped. - r = NV12ToI420Rotate(src, src_width, - src_uv, aligned_src_width, - y, y_stride, - v, v_stride, - u, u_stride, - crop_width, inv_crop_height, rotation); + r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, y, + y_stride, v, v_stride, u, u_stride, crop_width, + inv_crop_height, rotation); break; case FOURCC_M420: src = sample + (src_width * crop_y) * 12 / 8 + crop_x; - r = M420ToI420(src, src_width, - y, y_stride, - u, u_stride, - v, v_stride, + r = M420ToI420(src, src_width, y, y_stride, u, u_stride, v, v_stride, crop_width, inv_crop_height); break; // Triplanar formats @@ -221,22 +182,18 @@ int ConvertToI420(const uint8* sample, int halfheight = (abs_src_height + 1) / 2; if (format == FOURCC_YV12) { src_v = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; + (halfwidth * crop_y + crop_x) / 2; src_u = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; } else { src_u = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; + (halfwidth * crop_y + crop_x) / 2; src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; } - r = I420Rotate(src_y, src_width, - src_u, halfwidth, - src_v, halfwidth, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height, rotation); + r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth, y, + y_stride, u, u_stride, v, v_stride, crop_width, + inv_crop_height, rotation); break; } case FOURCC_I422: @@ -246,23 +203,19 @@ int ConvertToI420(const uint8* sample, const uint8* src_v; int halfwidth = (src_width + 1) / 2; if (format == FOURCC_YV16) { - src_v = sample + src_width * abs_src_height + - halfwidth * crop_y + crop_x / 2; + src_v = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; src_u = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; + halfwidth * (abs_src_height + crop_y) + crop_x / 2; } else { - src_u = sample + src_width * abs_src_height + - halfwidth * crop_y + crop_x / 2; + src_u = 
sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; src_v = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; + halfwidth * (abs_src_height + crop_y) + crop_x / 2; } - r = I422ToI420(src_y, src_width, - src_u, halfwidth, - src_v, halfwidth, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth, y, + y_stride, u, u_stride, v, v_stride, crop_width, + inv_crop_height); break; } case FOURCC_I444: @@ -277,37 +230,14 @@ int ConvertToI420(const uint8* sample, src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; } - r = I444ToI420(src_y, src_width, - src_u, src_width, - src_v, src_width, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); - break; - } - case FOURCC_I411: { - int quarterwidth = (src_width + 3) / 4; - const uint8* src_y = sample + src_width * crop_y + crop_x; - const uint8* src_u = sample + src_width * abs_src_height + - quarterwidth * crop_y + crop_x / 4; - const uint8* src_v = sample + src_width * abs_src_height + - quarterwidth * (abs_src_height + crop_y) + crop_x / 4; - r = I411ToI420(src_y, src_width, - src_u, quarterwidth, - src_v, quarterwidth, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width, y, + y_stride, u, u_stride, v, v_stride, crop_width, + inv_crop_height); break; } #ifdef HAVE_JPEG case FOURCC_MJPG: - r = MJPGToI420(sample, sample_size, - y, y_stride, - u, u_stride, - v, v_stride, + r = MJPGToI420(sample, sample_size, y, y_stride, u, u_stride, v, v_stride, src_width, abs_src_height, crop_width, inv_crop_height); break; #endif @@ -317,13 +247,9 @@ int ConvertToI420(const uint8* sample, if (need_buf) { if (!r) { - r = I420Rotate(y, y_stride, - u, u_stride, - v, v_stride, - tmp_y, tmp_y_stride, - tmp_u, tmp_u_stride, - tmp_v, tmp_v_stride, - crop_width, abs_crop_height, rotation); + r = I420Rotate(y, y_stride, u, u_stride, v, v_stride, tmp_y, tmp_y_stride, + tmp_u, tmp_u_stride, tmp_v, tmp_v_stride, crop_width, + abs_crop_height, rotation); } free(rotate_buffer); } diff --git a/files/source/cpu_id.cc b/files/source/cpu_id.cc index 84927ebc..9ff93263 100644 --- a/files/source/cpu_id.cc +++ b/files/source/cpu_id.cc @@ -13,7 +13,7 @@ #if defined(_MSC_VER) #include <intrin.h> // For __cpuidex() #endif -#if !defined(__pnacl__) && !defined(__CLR_VER) && \ +#if !defined(__pnacl__) && !defined(__CLR_VER) && \ !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \ defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) #include <immintrin.h> // For _xgetbv() @@ -44,8 +44,8 @@ extern "C" { #endif // Low level cpuid for X86. -#if (defined(_M_IX86) || defined(_M_X64) || \ - defined(__i386__) || defined(__x86_64__)) && \ +#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ + defined(__x86_64__)) && \ !defined(__pnacl__) && !defined(__CLR_VER) LIBYUV_API void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { @@ -68,24 +68,24 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { if (info_ecx == 0) { __cpuid((int*)(cpu_info), info_eax); } else { - cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0; + cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0u; } #endif // GCC version uses inline x86 assembly. 
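The closing hunk of ConvertToI420 shows how the scratch-buffer path resolves: when need_buf was taken, y/u/v were redirected into rotate_buffer earlier while tmp_y/tmp_u/tmp_v kept the caller's real pointers, so the final step rotates scratch into the destination and releases the scratch. The pattern, reduced:

if (need_buf) {
  if (!r) {
    // y/u/v point into rotate_buffer here; tmp_* are the caller's planes.
    r = I420Rotate(y, y_stride, u, u_stride, v, v_stride,
                   tmp_y, tmp_y_stride, tmp_u, tmp_u_stride,
                   tmp_v, tmp_v_stride,
                   crop_width, abs_crop_height, rotation);
  }
  free(rotate_buffer);  // always release the scratch planes
}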
#else // defined(_MSC_VER) uint32 info_ebx, info_edx; - asm volatile ( -#if defined( __i386__) && defined(__PIC__) - // Preserve ebx for fpic 32 bit. - "mov %%ebx, %%edi \n" - "cpuid \n" - "xchg %%edi, %%ebx \n" - : "=D" (info_ebx), + asm volatile( +#if defined(__i386__) && defined(__PIC__) + // Preserve ebx for fpic 32 bit. + "mov %%ebx, %%edi \n" + "cpuid \n" + "xchg %%edi, %%ebx \n" + : "=D"(info_ebx), #else - "cpuid \n" - : "=b" (info_ebx), + "cpuid \n" + : "=b"(info_ebx), #endif // defined( __i386__) && defined(__PIC__) - "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx)); + "+a"(info_eax), "+c"(info_ecx), "=d"(info_edx)); cpu_info[0] = info_eax; cpu_info[1] = info_ebx; cpu_info[2] = info_ecx; @@ -95,6 +95,8 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { #else // (defined(_M_IX86) || defined(_M_X64) ... LIBYUV_API void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) { + (void)eax; + (void)ecx; cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; } #endif @@ -111,20 +113,22 @@ void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) { #if defined(_M_IX86) && (_MSC_VER < 1900) #pragma optimize("g", off) #endif -#if (defined(_M_IX86) || defined(_M_X64) || \ - defined(__i386__) || defined(__x86_64__)) && \ +#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ + defined(__x86_64__)) && \ !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__) -#define HAS_XGETBV // X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. int GetXCR0() { uint32 xcr0 = 0u; #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required. #elif defined(__i386__) || defined(__x86_64__) - asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx"); + asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx"); #endif // defined(__i386__) || defined(__x86_64__) return xcr0; } +#else +// xgetbv unavailable to query for OSSave support. Return 0. +#define GetXCR0() 0 #endif // defined(_M_IX86) || defined(_M_X64) .. // Return optimization to previous setting. #if defined(_M_IX86) && (_MSC_VER < 1900) @@ -133,8 +137,7 @@ int GetXCR0() { // based on libvpx arm_cpudetect.c // For Arm, but public to allow testing on any CPU -LIBYUV_API SAFEBUFFERS -int ArmCpuCaps(const char* cpuinfo_name) { +LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { char cpuinfo_line[512]; FILE* f = fopen(cpuinfo_name, "r"); if (!f) { @@ -161,6 +164,38 @@ int ArmCpuCaps(const char* cpuinfo_name) { return 0; } +LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name, + const char ase[]) { + char cpuinfo_line[512]; + int len = (int)strlen(ase); + FILE* f = fopen(cpuinfo_name, "r"); + if (!f) { + // ase enabled if /proc/cpuinfo is unavailable. + if (strcmp(ase, " msa") == 0) { + return kCpuHasMSA; + } + if (strcmp(ase, " dspr2") == 0) { + return kCpuHasDSPR2; + } + } + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) { + char* p = strstr(cpuinfo_line, ase); + if (p && (p[len] == ' ' || p[len] == '\n')) { + fclose(f); + if (strcmp(ase, " msa") == 0) { + return kCpuHasMSA; + } + if (strcmp(ase, " dspr2") == 0) { + return kCpuHasDSPR2; + } + } + } + } + fclose(f); + return 0; +} + // CPU detect function for SIMD instruction sets. LIBYUV_API int cpu_info_ = 0; // cpu_info is not initialized yet. 
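Putting CpuId and GetXCR0 together: AVX may only be reported when CPUID advertises AVX, XSAVE and OSXSAVE (the 0x1c000000 mask on leaf-1 ECX) and xgetbv confirms the OS context-switches XMM and YMM state; AVX-512 further needs the opmask/ZMM state bits. A sketch, with standard integer types, of the check InitCpuFlags performs below:

uint32_t info[4] = {0, 0, 0, 0};
CpuId(1, 0, info);  // leaf 1, subleaf 0: feature bits in ECX/EDX
int cpu_has_avx = (info[2] & 0x1c000000) == 0x1c000000;  // AVX+XSAVE+OSXSAVE
uint32_t xcr0 = GetXCR0();           // 0 where xgetbv is unavailable
int os_saves_ymm = (xcr0 & 6) == 6;  // XMM (bit 1) and YMM (bit 2) saved
int os_saves_zmm = (xcr0 & 0xe0) == 0xe0;  // opmask/ZMM state (bits 5..7)
int can_use_avx = cpu_has_avx && os_saves_ymm;
int can_use_avx512 = can_use_avx && os_saves_zmm;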
@@ -184,39 +219,35 @@ static LIBYUV_BOOL TestEnv(const char*) { } #endif -LIBYUV_API SAFEBUFFERS -int InitCpuFlags(void) { - // TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized. +LIBYUV_API SAFEBUFFERS int InitCpuFlags(void) { int cpu_info = 0; #if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86) - uint32 cpu_info0[4] = { 0, 0, 0, 0 }; - uint32 cpu_info1[4] = { 0, 0, 0, 0 }; - uint32 cpu_info7[4] = { 0, 0, 0, 0 }; + uint32 cpu_info0[4] = {0, 0, 0, 0}; + uint32 cpu_info1[4] = {0, 0, 0, 0}; + uint32 cpu_info7[4] = {0, 0, 0, 0}; CpuId(0, 0, cpu_info0); CpuId(1, 0, cpu_info1); if (cpu_info0[0] >= 7) { CpuId(7, 0, cpu_info7); } - cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | + cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | - ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) | - ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | - kCpuHasX86; + ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0); -#ifdef HAS_XGETBV - // AVX requires CPU has AVX, XSAVE and OSXSave for xgetbv + // AVX requires OS saves YMM registers. if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave ((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers - cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX; + cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | + ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | + ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0); // Detect AVX512bw if ((GetXCR0() & 0xe0) == 0xe0) { cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0; } } -#endif // Environment variable overrides for testing. if (TestEnv("LIBYUV_DISABLE_X86")) { @@ -249,15 +280,25 @@ int InitCpuFlags(void) { if (TestEnv("LIBYUV_DISABLE_AVX3")) { cpu_info &= ~kCpuHasAVX3; } + if (TestEnv("LIBYUV_DISABLE_F16C")) { + cpu_info &= ~kCpuHasF16C; + } + #endif #if defined(__mips__) && defined(__linux__) #if defined(__mips_dspr2) cpu_info |= kCpuHasDSPR2; #endif +#if defined(__mips_msa) + cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa"); +#endif cpu_info |= kCpuHasMIPS; if (getenv("LIBYUV_DISABLE_DSPR2")) { cpu_info &= ~kCpuHasDSPR2; } + if (getenv("LIBYUV_DISABLE_MSA")) { + cpu_info &= ~kCpuHasMSA; + } #endif #if defined(__arm__) || defined(__aarch64__) // gcc -mfpu=neon defines __ARM_NEON__ @@ -283,7 +324,7 @@ int InitCpuFlags(void) { if (TestEnv("LIBYUV_DISABLE_ASM")) { cpu_info = 0; } - cpu_info |= kCpuInitialized; + cpu_info |= kCpuInitialized; cpu_info_ = cpu_info; return cpu_info; } diff --git a/files/source/mjpeg_decoder.cc b/files/source/mjpeg_decoder.cc index 50818418..b43c008b 100644 --- a/files/source/mjpeg_decoder.cc +++ b/files/source/mjpeg_decoder.cc @@ -21,7 +21,7 @@ #if defined(_MSC_VER) // disable warning 4324: structure was padded due to __declspec(align()) -#pragma warning(disable:4324) +#pragma warning(disable : 4324) #endif #endif @@ -62,6 +62,7 @@ void init_source(jpeg_decompress_struct* cinfo); void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes); // NOLINT void term_source(jpeg_decompress_struct* cinfo); void ErrorHandler(jpeg_common_struct* cinfo); +void OutputHandler(jpeg_common_struct* cinfo); MJpegDecoder::MJpegDecoder() : has_scanline_padding_(LIBYUV_FALSE), @@ -77,6 +78,7 @@ MJpegDecoder::MJpegDecoder() decompress_struct_->err = jpeg_std_error(&error_mgr_->base); // Override standard exit()-based error handler. 
error_mgr_->base.error_exit = &ErrorHandler; + error_mgr_->base.output_message = &OutputHandler; #endif decompress_struct_->client_data = NULL; source_mgr_->init_source = &init_source; @@ -127,7 +129,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) { if (scanlines_[i]) { delete scanlines_[i]; } - scanlines_[i] = new uint8* [scanlines_size]; + scanlines_[i] = new uint8*[scanlines_size]; scanlines_sizes_[i] = scanlines_size; } @@ -193,13 +195,11 @@ int MJpegDecoder::GetVertSampFactor(int component) { } int MJpegDecoder::GetHorizSubSampFactor(int component) { - return decompress_struct_->max_h_samp_factor / - GetHorizSampFactor(component); + return decompress_struct_->max_h_samp_factor / GetHorizSampFactor(component); } int MJpegDecoder::GetVertSubSampFactor(int component) { - return decompress_struct_->max_v_samp_factor / - GetVertSampFactor(component); + return decompress_struct_->max_v_samp_factor / GetVertSampFactor(component); } int MJpegDecoder::GetImageScanlinesPerImcuRow() { @@ -243,10 +243,10 @@ LIBYUV_BOOL MJpegDecoder::UnloadFrame() { } // TODO(fbarchard): Allow rectangle to be specified: x, y, width, height. -LIBYUV_BOOL MJpegDecoder::DecodeToBuffers( - uint8** planes, int dst_width, int dst_height) { - if (dst_width != GetWidth() || - dst_height > GetHeight()) { +LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8** planes, + int dst_width, + int dst_height) { + if (dst_width != GetWidth() || dst_height > GetHeight()) { // ERROR: Bad dimensions return LIBYUV_FALSE; } @@ -287,14 +287,13 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers( for (int i = 0; i < num_outbufs_; ++i) { // TODO(fbarchard): Compute skip to avoid this assert(skip % GetVertSubSampFactor(i) == 0); - int rows_to_skip = - DivideAndRoundDown(skip, GetVertSubSampFactor(i)); - int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) - - rows_to_skip; + int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int scanlines_to_copy = + GetComponentScanlinesPerImcuRow(i) - rows_to_skip; int data_to_skip = rows_to_skip * GetComponentStride(i); - CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), - planes[i], GetComponentWidth(i), - GetComponentWidth(i), scanlines_to_copy); + CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), planes[i], + GetComponentWidth(i), GetComponentWidth(i), + scanlines_to_copy); planes[i] += scanlines_to_copy * GetComponentWidth(i); } lines_left -= (GetImageScanlinesPerImcuRow() - skip); @@ -303,16 +302,15 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers( // Read full MCUs but cropped horizontally for (; lines_left > GetImageScanlinesPerImcuRow(); - lines_left -= GetImageScanlinesPerImcuRow()) { + lines_left -= GetImageScanlinesPerImcuRow()) { if (!DecodeImcuRow()) { FinishDecode(); return LIBYUV_FALSE; } for (int i = 0; i < num_outbufs_; ++i) { int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i); - CopyPlane(databuf_[i], GetComponentStride(i), - planes[i], GetComponentWidth(i), - GetComponentWidth(i), scanlines_to_copy); + CopyPlane(databuf_[i], GetComponentStride(i), planes[i], + GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy); planes[i] += scanlines_to_copy * GetComponentWidth(i); } } @@ -326,19 +324,19 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers( for (int i = 0; i < num_outbufs_; ++i) { int scanlines_to_copy = DivideAndRoundUp(lines_left, GetVertSubSampFactor(i)); - CopyPlane(databuf_[i], GetComponentStride(i), - planes[i], GetComponentWidth(i), - GetComponentWidth(i), scanlines_to_copy); + CopyPlane(databuf_[i], 
GetComponentStride(i), planes[i], + GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy); planes[i] += scanlines_to_copy * GetComponentWidth(i); } } return FinishDecode(); } -LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque, - int dst_width, int dst_height) { - if (dst_width != GetWidth() || - dst_height > GetHeight()) { +LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, + void* opaque, + int dst_width, + int dst_height) { + if (dst_width != GetWidth() || dst_height > GetHeight()) { // ERROR: Bad dimensions return LIBYUV_FALSE; } @@ -393,7 +391,7 @@ LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque, } // Read full MCUs until we get to the crop point. for (; lines_left >= GetImageScanlinesPerImcuRow(); - lines_left -= GetImageScanlinesPerImcuRow()) { + lines_left -= GetImageScanlinesPerImcuRow()) { if (!DecodeImcuRow()) { FinishDecode(); return LIBYUV_FALSE; @@ -433,22 +431,22 @@ void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT } void term_source(j_decompress_ptr cinfo) { - // Nothing to do. + (void)cinfo; // Nothing to do. } #ifdef HAVE_SETJMP void ErrorHandler(j_common_ptr cinfo) { - // This is called when a jpeglib command experiences an error. Unfortunately - // jpeglib's error handling model is not very flexible, because it expects the - // error handler to not return--i.e., it wants the program to terminate. To - // recover from errors we use setjmp() as shown in their example. setjmp() is - // C's implementation for the "call with current continuation" functionality - // seen in some functional programming languages. - // A formatted message can be output, but is unsafe for release. +// This is called when a jpeglib command experiences an error. Unfortunately +// jpeglib's error handling model is not very flexible, because it expects the +// error handler to not return--i.e., it wants the program to terminate. To +// recover from errors we use setjmp() as shown in their example. setjmp() is +// C's implementation for the "call with current continuation" functionality +// seen in some functional programming languages. +// A formatted message can be output, but is unsafe for release. #ifdef DEBUG char buf[JMSG_LENGTH_MAX]; (*cinfo->err->format_message)(cinfo, buf); - // ERROR: Error in jpeglib: buf +// ERROR: Error in jpeglib: buf #endif SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err); @@ -456,7 +454,13 @@ void ErrorHandler(j_common_ptr cinfo) { // and causes it to return (for a second time) with value 1. longjmp(mgr->setjmp_buffer, 1); } -#endif + +// Suppress fprintf warnings. +void OutputHandler(j_common_ptr cinfo) { + (void)cinfo; +} + +#endif // HAVE_SETJMP void MJpegDecoder::AllocOutputBuffers(int num_outbufs) { if (num_outbufs != num_outbufs_) { @@ -465,9 +469,9 @@ void MJpegDecoder::AllocOutputBuffers(int num_outbufs) { // it. 
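The error-handling comment above deserves a concrete shape: jpeglib's error_exit must not return, so the decoder longjmps out of the failing jpeg_* call back to a setjmp armed at the call site. A reduced sketch of the mechanism (SetJmpErrorMgr mirrors the struct this file uses; jpeglib headers assumed available):

#include <stdio.h>   // jpeglib.h depends on FILE
#include <setjmp.h>
#include <jpeglib.h>

struct SetJmpErrorMgr {
  struct jpeg_error_mgr base;  // must be first so jpeglib can use it
  jmp_buf setjmp_buffer;
};

void MyErrorExit(j_common_ptr cinfo) {
  struct SetJmpErrorMgr* mgr = (struct SetJmpErrorMgr*)cinfo->err;
  longjmp(mgr->setjmp_buffer, 1);  // never returns into jpeglib
}

// At the decode site:
//   if (setjmp(mgr.setjmp_buffer)) { /* decode failed; clean up */ }
//   else { jpeg_read_header(&cinfo, TRUE); ... }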
DestroyOutputBuffers(); - scanlines_ = new uint8** [num_outbufs]; + scanlines_ = new uint8**[num_outbufs]; scanlines_sizes_ = new int[num_outbufs]; - databuf_ = new uint8* [num_outbufs]; + databuf_ = new uint8*[num_outbufs]; databuf_strides_ = new int[num_outbufs]; for (int i = 0; i < num_outbufs; ++i) { @@ -483,13 +487,13 @@ void MJpegDecoder::AllocOutputBuffers(int num_outbufs) { void MJpegDecoder::DestroyOutputBuffers() { for (int i = 0; i < num_outbufs_; ++i) { - delete [] scanlines_[i]; - delete [] databuf_[i]; + delete[] scanlines_[i]; + delete[] databuf_[i]; } - delete [] scanlines_; - delete [] databuf_; - delete [] scanlines_sizes_; - delete [] databuf_strides_; + delete[] scanlines_; + delete[] databuf_; + delete[] scanlines_sizes_; + delete[] databuf_strides_; scanlines_ = NULL; databuf_ = NULL; scanlines_sizes_ = NULL; @@ -535,26 +539,26 @@ void MJpegDecoder::SetScanlinePointers(uint8** data) { inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() { return (unsigned int)(GetImageScanlinesPerImcuRow()) == - jpeg_read_raw_data(decompress_struct_, - scanlines_, - GetImageScanlinesPerImcuRow()); + jpeg_read_raw_data(decompress_struct_, scanlines_, + GetImageScanlinesPerImcuRow()); } // The helper function which recognizes the jpeg sub-sampling type. JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper( - int* subsample_x, int* subsample_y, int number_of_components) { + int* subsample_x, + int* subsample_y, + int number_of_components) { if (number_of_components == 3) { // Color images. - if (subsample_x[0] == 1 && subsample_y[0] == 1 && - subsample_x[1] == 2 && subsample_y[1] == 2 && - subsample_x[2] == 2 && subsample_y[2] == 2) { + if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 && + subsample_y[1] == 2 && subsample_x[2] == 2 && subsample_y[2] == 2) { return kJpegYuv420; } else if (subsample_x[0] == 1 && subsample_y[0] == 1 && - subsample_x[1] == 2 && subsample_y[1] == 1 && - subsample_x[2] == 2 && subsample_y[2] == 1) { + subsample_x[1] == 2 && subsample_y[1] == 1 && + subsample_x[2] == 2 && subsample_y[2] == 1) { return kJpegYuv422; } else if (subsample_x[0] == 1 && subsample_y[0] == 1 && - subsample_x[1] == 1 && subsample_y[1] == 1 && - subsample_x[2] == 1 && subsample_y[2] == 1) { + subsample_x[1] == 1 && subsample_y[1] == 1 && + subsample_x[2] == 1 && subsample_y[2] == 1) { return kJpegYuv444; } } else if (number_of_components == 1) { // Grey-scale images. @@ -567,4 +571,3 @@ JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper( } // namespace libyuv #endif // HAVE_JPEG - diff --git a/files/source/mjpeg_validate.cc b/files/source/mjpeg_validate.cc index 9c488320..1a17dd72 100644 --- a/files/source/mjpeg_validate.cc +++ b/files/source/mjpeg_validate.cc @@ -24,7 +24,7 @@ static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) { const uint8* it = sample; while (it < end) { // TODO(fbarchard): scan for 0xd9 instead. 
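The memchr-based loop being reformatted here is the usual fast path for finding a JPEG End-Of-Image marker (FF D9): jump between 0xff bytes instead of testing every byte, then inspect the byte that follows. A standalone version, assuming sample_size >= 2:

#include <stdint.h>
#include <string.h>

// Return 1 if an FF D9 (End-Of-Image) marker appears in the sample.
static int HasEOI(const uint8_t* sample, size_t sample_size) {
  const uint8_t* end = sample + sample_size - 1;  // leave room for it[1]
  const uint8_t* it = sample;
  while (it < end) {
    it = (const uint8_t*)memchr(it, 0xff, end - it);
    if (it == NULL) {
      break;
    }
    if (it[1] == 0xd9) {
      return 1;
    }
    ++it;  // step past this 0xff and keep scanning
  }
  return 0;
}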
- it = static_cast<const uint8 *>(memchr(it, 0xff, end - it)); + it = static_cast<const uint8*>(memchr(it, 0xff, end - it)); if (it == NULL) { break; } @@ -68,4 +68,3 @@ LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) { } // extern "C" } // namespace libyuv #endif - diff --git a/files/source/planar_functions.cc b/files/source/planar_functions.cc index 237ab683..b8a53e85 100644 --- a/files/source/planar_functions.cc +++ b/files/source/planar_functions.cc @@ -26,14 +26,22 @@ extern "C" { // Copy a plane of data LIBYUV_API -void CopyPlane(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height) { +void CopyPlane(const uint8* src_y, + int src_stride_y, + uint8* dst_y, + int dst_stride_y, + int width, + int height) { int y; void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } // Coalesce rows. - if (src_stride_y == width && - dst_stride_y == width) { + if (src_stride_y == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y = dst_stride_y = 0; @@ -76,15 +84,19 @@ void CopyPlane(const uint8* src_y, int src_stride_y, } } +// TODO(fbarchard): Consider support for negative height. +// TODO(fbarchard): Consider stride measured in bytes. LIBYUV_API -void CopyPlane_16(const uint16* src_y, int src_stride_y, - uint16* dst_y, int dst_stride_y, - int width, int height) { +void CopyPlane_16(const uint16* src_y, + int src_stride_y, + uint16* dst_y, + int dst_stride_y, + int width, + int height) { int y; void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C; // Coalesce rows. - if (src_stride_y == width && - dst_stride_y == width) { + if (src_stride_y == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y = dst_stride_y = 0; @@ -120,17 +132,22 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y, // Copy I422. LIBYUV_API -int I422Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I422Copy(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int halfwidth = (width + 1) >> 1; - if (!src_y || !src_u || !src_v || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -143,7 +160,10 @@ int I422Copy(const uint8* src_y, int src_stride_y, src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height); CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height); return 0; @@ -151,16 +171,21 @@ int I422Copy(const uint8* src_y, int src_stride_y, // Copy I444. 
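The relaxed argument checks in I422Copy above (and in I444Copy below) make the luma plane optional: when dst_y is NULL the Y CopyPlane is skipped, so a caller can refresh just the chroma planes. Usage sketch:

// Copy only the U and V planes; the destination's Y plane is untouched.
I422Copy(NULL, 0,                       // src_y ignored when dst_y is NULL
         src_u, src_stride_u, src_v, src_stride_v,
         NULL, 0,                       // dst_y == NULL: skip luma
         dst_u, dst_stride_u, dst_v, dst_stride_v,
         width, height);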
LIBYUV_API -int I444Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - if (!src_y || !src_u || !src_v || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { +int I444Copy(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -174,7 +199,9 @@ int I444Copy(const uint8* src_y, int src_stride_y, src_stride_v = -src_stride_v; } - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height); CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height); return 0; @@ -182,9 +209,12 @@ int I444Copy(const uint8* src_y, int src_stride_y, // Copy I400. LIBYUV_API -int I400ToI400(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height) { +int I400ToI400(const uint8* src_y, + int src_stride_y, + uint8* dst_y, + int dst_stride_y, + int width, + int height) { if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } @@ -200,11 +230,20 @@ int I400ToI400(const uint8* src_y, int src_stride_y, // Convert I420 to I400. LIBYUV_API -int I420ToI400(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - int width, int height) { +int I420ToI400(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + int width, + int height) { + (void)src_u; + (void)src_stride_u; + (void)src_v; + (void)src_stride_v; if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } @@ -214,14 +253,159 @@ int I420ToI400(const uint8* src_y, int src_stride_y, src_y = src_y + (height - 1) * src_stride_y; src_stride_y = -src_stride_y; } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); return 0; } +// Support function for NV12 etc UV channels. +// Width and height are plane sizes (typically half pixel width). +LIBYUV_API +void SplitUVPlane(const uint8* src_uv, + int src_stride_uv, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) = SplitUVRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_u = dst_u + (height - 1) * dst_stride_u; + dst_v = dst_v + (height - 1) * dst_stride_v; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + // Coalesce rows. 
+ if (src_stride_uv == width * 2 && dst_stride_u == width && + dst_stride_v == width) { + width *= height; + height = 1; + src_stride_uv = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_SPLITUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SplitUVRow = SplitUVRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_SSE2; + } + } +#endif +#if defined(HAS_SPLITUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitUVRow = SplitUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitUVRow = SplitUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_NEON; + } + } +#endif +#if defined(HAS_SPLITUVROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_u, 4) && + IS_ALIGNED(dst_stride_u, 4) && IS_ALIGNED(dst_v, 4) && + IS_ALIGNED(dst_stride_v, 4)) { + SplitUVRow = SplitUVRow_Any_DSPR2; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_DSPR2; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Copy a row of UV. + SplitUVRow(src_uv, dst_u, dst_v, width); + dst_u += dst_stride_u; + dst_v += dst_stride_v; + src_uv += src_stride_uv; + } +} + +LIBYUV_API +void MergeUVPlane(const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + void (*MergeUVRow)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) = MergeUVRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_uv = dst_uv + (height - 1) * dst_stride_uv; + dst_stride_uv = -dst_stride_uv; + } + // Coalesce rows. + if (src_stride_u == width && src_stride_v == width && + dst_stride_uv == width * 2) { + width *= height; + height = 1; + src_stride_u = src_stride_v = dst_stride_uv = 0; + } +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeUVRow = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + MergeUVRow = MergeUVRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + MergeUVRow = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow = MergeUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MergeUVRow = MergeUVRow_NEON; + } + } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow = MergeUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + MergeUVRow = MergeUVRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Merge a row of U and V into a row of UV. + MergeUVRow(src_u, src_v, dst_uv, width); + src_u += src_stride_u; + src_v += src_stride_v; + dst_uv += dst_stride_uv; + } +} + // Mirror a plane of data. -void MirrorPlane(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height) { +void MirrorPlane(const uint8* src_y, + int src_stride_y, + uint8* dst_y, + int dst_stride_y, + int width, + int height) { int y; void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; // Negative height means invert the image. @@ -256,12 +440,20 @@ void MirrorPlane(const uint8* src_y, int src_stride_y, #endif // TODO(fbarchard): Make Mirror on mips handle unaligned memory.
#if defined(HAS_MIRRORROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) { + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_y, 4) && + IS_ALIGNED(src_stride_y, 4) && IS_ALIGNED(dst_y, 4) && + IS_ALIGNED(dst_stride_y, 4)) { MirrorRow = MirrorRow_DSPR2; } #endif +#if defined(HAS_MIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MirrorRow = MirrorRow_Any_MSA; + if (IS_ALIGNED(width, 64)) { + MirrorRow = MirrorRow_MSA; + } + } +#endif // Mirror plane for (y = 0; y < height; ++y) { @@ -273,17 +465,24 @@ void MirrorPlane(const uint8* src_y, int src_stride_y, // Convert YUY2 to I422. LIBYUV_API -int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int YUY2ToI422(const uint8* src_yuy2, + int src_stride_yuy2, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*YUY2ToUV422Row)(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) = - YUY2ToUV422Row_C; + void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, + int width) = YUY2ToUV422Row_C; void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) = YUY2ToYRow_C; + if (!src_yuy2 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } // Negative height means invert the image. if (height < 0) { height = -height; @@ -291,10 +490,9 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, src_stride_yuy2 = -src_stride_yuy2; } // Coalesce rows. - if (src_stride_yuy2 == width * 2 && - dst_stride_y == width && - dst_stride_u * 2 == width && - dst_stride_v * 2 == width) { + if (src_stride_yuy2 == width * 2 && dst_stride_y == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width && + width * height <= 32768) { width *= height; height = 1; src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0; @@ -322,15 +520,23 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, #if defined(HAS_YUY2TOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { YUY2ToYRow = YUY2ToYRow_Any_NEON; - if (width >= 16) { - YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; - } + YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; if (IS_ALIGNED(width, 16)) { YUY2ToYRow = YUY2ToYRow_NEON; YUY2ToUV422Row = YUY2ToUV422Row_NEON; } } #endif +#if defined(HAS_YUY2TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + YUY2ToUV422Row = YUY2ToUV422Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); @@ -345,17 +551,24 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, // Convert UYVY to I422. 
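The YUY2ToI422 hunk above shows the dispatch ladder this file uses everywhere: start from the portable C row, take the _Any_ SIMD variant when the CPU flag is set, and the full-width variant when the width meets its alignment; UYVYToI422 below repeats it. Reduced to its shape in a sketch, where the SomeRow_* functions are hypothetical stand-ins (TestCpuFlag and kCpuHasNEON are real libyuv APIs from libyuv/cpu_id.h; the alignment macro is redefined here so the sketch is self-contained):

    #include "libyuv/cpu_id.h" /* TestCpuFlag, kCpuHasNEON */

    typedef unsigned char uint8;
    typedef void (*RowFunc)(const uint8* src, uint8* dst, int width);

    #define IS_ALIGNED_SKETCH(v, a) (!((v) & ((a)-1)))

    /* Hypothetical row functions, declared only to show the pattern. */
    void SomeRow_C(const uint8* src, uint8* dst, int width);
    void SomeRow_Any_NEON(const uint8* src, uint8* dst, int width);
    void SomeRow_NEON(const uint8* src, uint8* dst, int width);

    static RowFunc ChooseRowFunc(int width) {
      RowFunc row = SomeRow_C; /* always-correct fallback */
      if (TestCpuFlag(kCpuHasNEON)) {
        row = SomeRow_Any_NEON; /* SIMD body plus C tail, any width */
        if (IS_ALIGNED_SKETCH(width, 16)) {
          row = SomeRow_NEON; /* pure SIMD, widths a multiple of 16 */
        }
      }
      return row;
    }

The same hunks also add a width * height <= 32768 cap on row coalescing for these two converters, bounding how wide the coalesced row may get.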
LIBYUV_API -int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int UYVYToI422(const uint8* src_uyvy, + int src_stride_uyvy, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*UYVYToUV422Row)(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) = - UYVYToUV422Row_C; - void (*UYVYToYRow)(const uint8* src_uyvy, - uint8* dst_y, int width) = UYVYToYRow_C; + void (*UYVYToUV422Row)(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, + int width) = UYVYToUV422Row_C; + void (*UYVYToYRow)(const uint8* src_uyvy, uint8* dst_y, int width) = + UYVYToYRow_C; + if (!src_uyvy || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } // Negative height means invert the image. if (height < 0) { height = -height; @@ -363,10 +576,9 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, src_stride_uyvy = -src_stride_uyvy; } // Coalesce rows. - if (src_stride_uyvy == width * 2 && - dst_stride_y == width && - dst_stride_u * 2 == width && - dst_stride_v * 2 == width) { + if (src_stride_uyvy == width * 2 && dst_stride_y == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width && + width * height <= 32768) { width *= height; height = 1; src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0; @@ -394,15 +606,23 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, #if defined(HAS_UYVYTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { UYVYToYRow = UYVYToYRow_Any_NEON; - if (width >= 16) { - UYVYToUV422Row = UYVYToUV422Row_Any_NEON; - } + UYVYToUV422Row = UYVYToUV422Row_Any_NEON; if (IS_ALIGNED(width, 16)) { UYVYToYRow = UYVYToYRow_NEON; UYVYToUV422Row = UYVYToUV422Row_NEON; } } #endif +#if defined(HAS_UYVYTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToYRow = UYVYToYRow_Any_MSA; + UYVYToUV422Row = UYVYToUV422Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_MSA; + UYVYToUV422Row = UYVYToUV422Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { UYVYToUV422Row(src_uyvy, dst_u, dst_v, width); @@ -415,13 +635,82 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, return 0; } +// Convert YUY2 to Y. +LIBYUV_API +int YUY2ToY(const uint8* src_yuy2, + int src_stride_yuy2, + uint8* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) = + YUY2ToYRow_C; + if (!src_yuy2 || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + // Coalesce rows. 
+ if (src_stride_yuy2 == width * 2 && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_y = 0; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + } + } +#endif +#if defined(HAS_YUY2TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + YUY2ToYRow(src_yuy2, dst_y, width); + src_yuy2 += src_stride_yuy2; + dst_y += dst_stride_y; + } + return 0; +} + // Mirror I400 with optional flipping LIBYUV_API -int I400Mirror(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height) { - if (!src_y || !dst_y || - width <= 0 || height == 0) { +int I400Mirror(const uint8* src_y, + int src_stride_y, + uint8* dst_y, + int dst_stride_y, + int width, + int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -437,17 +726,24 @@ int I400Mirror(const uint8* src_y, int src_stride_y, // Mirror I420 with optional flipping LIBYUV_API -int I420Mirror(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I420Mirror(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -472,9 +768,12 @@ int I420Mirror(const uint8* src_y, int src_stride_y, // ARGB mirror. LIBYUV_API -int ARGBMirror(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBMirror(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) = ARGBMirrorRow_C; @@ -511,6 +810,14 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBMIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBMirrorRow = ARGBMirrorRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBMirrorRow = ARGBMirrorRow_MSA; + } + } +#endif // Mirror plane for (y = 0; y < height; ++y) { @@ -544,10 +851,14 @@ ARGBBlendRow GetARGBBlend() { // Alpha Blend 2 ARGB images and store to destination. 
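ARGBBlend below drives the row returned by GetARGBBlend (named in the hunk header above). The blend assumes a premultiplied foreground, the convention restated before ARGBAttenuate later in this diff, so only the background needs scaling by the remaining alpha. A per-channel scalar sketch with illustrative rounding, not libyuv's fixed-point arithmetic:

    typedef unsigned char uint8;

    /* Premultiplied source-over: fg already carries its alpha. */
    static uint8 BlendChannelSketch(uint8 fg, uint8 bg, uint8 fg_alpha) {
      return (uint8)(fg + (bg * (255 - fg_alpha) + 127) / 255);
    }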
LIBYUV_API -int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBBlend(const uint8* src_argb0, + int src_stride_argb0, + const uint8* src_argb1, + int src_stride_argb1, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, uint8* dst_argb, int width) = GetARGBBlend(); @@ -561,8 +872,7 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. - if (src_stride_argb0 == width * 4 && - src_stride_argb1 == width * 4 && + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; @@ -580,14 +890,20 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, // Alpha Blend plane and store to destination. LIBYUV_API -int BlendPlane(const uint8* src_y0, int src_stride_y0, - const uint8* src_y1, int src_stride_y1, - const uint8* alpha, int alpha_stride, - uint8* dst_y, int dst_stride_y, - int width, int height) { +int BlendPlane(const uint8* src_y0, + int src_stride_y0, + const uint8* src_y1, + int src_stride_y1, + const uint8* alpha, + int alpha_stride, + uint8* dst_y, + int dst_stride_y, + int width, + int height) { int y; void (*BlendPlaneRow)(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C; + const uint8* alpha, uint8* dst, int width) = + BlendPlaneRow_C; if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) { return -1; } @@ -599,10 +915,8 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0, } // Coalesce rows for Y plane. - if (src_stride_y0 == width && - src_stride_y1 == width && - alpha_stride == width && - dst_stride_y == width) { + if (src_stride_y0 == width && src_stride_y1 == width && + alpha_stride == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0; @@ -610,7 +924,7 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0, #if defined(HAS_BLENDPLANEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - BlendPlaneRow = BlendPlaneRow_Any_SSSE3; + BlendPlaneRow = BlendPlaneRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { BlendPlaneRow = BlendPlaneRow_SSSE3; } @@ -618,7 +932,7 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0, #endif #if defined(HAS_BLENDPLANEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - BlendPlaneRow = BlendPlaneRow_Any_AVX2; + BlendPlaneRow = BlendPlaneRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { BlendPlaneRow = BlendPlaneRow_AVX2; } @@ -638,22 +952,34 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0, #define MAXTWIDTH 2048 // Alpha Blend YUV images and store to destination. 
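I420Blend below blends planar YUV under an explicit full-resolution alpha plane: the Y plane goes through BlendPlane above, while for the half-resolution U and V planes the alpha plane is first box-averaged 2x2 per chroma sample (the ScaleRowDown2Box_C default visible in the next hunk). What a BlendPlaneRow computes, as a scalar sketch with illustrative rounding:

    typedef unsigned char uint8;

    static void BlendPlaneRowSketch(const uint8* src0, const uint8* src1,
                                    const uint8* alpha, uint8* dst,
                                    int width) {
      int x;
      for (x = 0; x < width; ++x) {
        int a = alpha[x]; /* 255 selects src0, 0 selects src1 */
        dst[x] = (uint8)((src0[x] * a + src1[x] * (255 - a) + 127) / 255);
      }
    }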
LIBYUV_API -int I420Blend(const uint8* src_y0, int src_stride_y0, - const uint8* src_u0, int src_stride_u0, - const uint8* src_v0, int src_stride_v0, - const uint8* src_y1, int src_stride_y1, - const uint8* src_u1, int src_stride_u1, - const uint8* src_v1, int src_stride_v1, - const uint8* alpha, int alpha_stride, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I420Blend(const uint8* src_y0, + int src_stride_y0, + const uint8* src_u0, + int src_stride_u0, + const uint8* src_v0, + int src_stride_v0, + const uint8* src_y1, + int src_stride_y1, + const uint8* src_u1, + int src_stride_u1, + const uint8* src_v1, + int src_stride_v1, + const uint8* alpha, + int alpha_stride, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height) { int y; // Half width/height for UV. int halfwidth = (width + 1) >> 1; void (*BlendPlaneRow)(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C; + const uint8* alpha, uint8* dst, int width) = + BlendPlaneRow_C; void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C; if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 || @@ -669,11 +995,8 @@ int I420Blend(const uint8* src_y0, int src_stride_y0, } // Blend Y plane. - BlendPlane(src_y0, src_stride_y0, - src_y1, src_stride_y1, - alpha, alpha_stride, - dst_y, dst_stride_y, - width, height); + BlendPlane(src_y0, src_stride_y0, src_y1, src_stride_y1, alpha, alpha_stride, + dst_y, dst_stride_y, width, height); #if defined(HAS_BLENDPLANEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { @@ -753,10 +1076,14 @@ int I420Blend(const uint8* src_y0, int src_stride_y0, // Multiply 2 ARGB images and store to destination. LIBYUV_API -int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBMultiply(const uint8* src_argb0, + int src_stride_argb0, + const uint8* src_argb1, + int src_stride_argb1, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst, int width) = ARGBMultiplyRow_C; @@ -770,8 +1097,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. - if (src_stride_argb0 == width * 4 && - src_stride_argb1 == width * 4 && + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; @@ -801,6 +1127,14 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, } } #endif +#if defined(HAS_ARGBMULTIPLYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA; + if (IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_MSA; + } + } +#endif // Multiply plane for (y = 0; y < height; ++y) { @@ -814,10 +1148,14 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, // Add 2 ARGB images and store to destination. 
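ARGBAdd below, like ARGBMultiply above and ARGBSubtract after it, is a per-byte arithmetic pass over all four channels with a dispatched row function; the add saturates at 255. A scalar sketch of the add row (the real rows work many pixels per instruction):

    typedef unsigned char uint8;

    static void ARGBAddRowSketch(const uint8* src0, const uint8* src1,
                                 uint8* dst, int width) {
      int i;
      for (i = 0; i < width * 4; ++i) { /* 4 bytes per ARGB pixel */
        int s = src0[i] + src1[i];
        dst[i] = (uint8)(s > 255 ? 255 : s);
      }
    }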
LIBYUV_API -int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBAdd(const uint8* src_argb0, + int src_stride_argb0, + const uint8* src_argb1, + int src_stride_argb1, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst, int width) = ARGBAddRow_C; @@ -831,8 +1169,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. - if (src_stride_argb0 == width * 4 && - src_stride_argb1 == width * 4 && + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; @@ -867,6 +1204,14 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, } } #endif +#if defined(HAS_ARGBADDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAddRow = ARGBAddRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAddRow = ARGBAddRow_MSA; + } + } +#endif // Add plane for (y = 0; y < height; ++y) { @@ -880,10 +1225,14 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, // Subtract 2 ARGB images and store to destination. LIBYUV_API -int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBSubtract(const uint8* src_argb0, + int src_stride_argb0, + const uint8* src_argb1, + int src_stride_argb1, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst, int width) = ARGBSubtractRow_C; @@ -897,8 +1246,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. 
- if (src_stride_argb0 == width * 4 && - src_stride_argb1 == width * 4 && + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; @@ -928,6 +1276,14 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, } } #endif +#if defined(HAS_ARGBSUBTRACTROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBSubtractRow = ARGBSubtractRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_MSA; + } + } +#endif // Subtract plane for (y = 0; y < height; ++y) { @@ -939,21 +1295,23 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, return 0; } // Convert I422 to RGBA with matrix -static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, +static int I422ToRGBAMatrix(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_rgba, + int dst_stride_rgba, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I422ToRGBARow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToRGBARow_C; - if (!src_y || !src_u || !src_v || !dst_rgba || - width <= 0 || height == 0) { + void (*I422ToRGBARow)(const uint8* y_buf, const uint8* u_buf, + const uint8* v_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -995,6 +1353,14 @@ static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y, I422ToRGBARow = I422ToRGBARow_DSPR2; } #endif +#if defined(HAS_I422TORGBAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGBARow = I422ToRGBARow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); @@ -1008,48 +1374,55 @@ static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y, // Convert I422 to RGBA. LIBYUV_API -int I422ToRGBA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height) { - return I422ToRGBAMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_rgba, dst_stride_rgba, - &kYuvI601Constants, - width, height); +int I422ToRGBA(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgba, dst_stride_rgba, + &kYuvI601Constants, width, height); } // Convert I422 to BGRA. 
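Both wrappers above route through I422ToRGBAMatrix, and the yuvconstants argument is what selects the color space. With the default kYuvI601Constants the per-pixel math is the familiar studio-swing BT.601 transform, sketched below in floating point for readability; the real rows use fixed-point SIMD constants, so low bits differ. As the next hunk shows, passing the planes as V,U with kYvuI601Constants is all it takes to get BGRA out of the same worker.

    typedef unsigned char uint8;

    static uint8 ClampToByte(float v) {
      return (uint8)(v < 0.0f ? 0.0f : (v > 255.0f ? 255.0f : v));
    }

    /* BT.601, 16..235 luma swing; float form for clarity only. */
    static void YuvPixelSketch(uint8 y, uint8 u, uint8 v,
                               uint8* r, uint8* g, uint8* b) {
      float yf = 1.164f * ((int)y - 16);
      *r = ClampToByte(yf + 1.596f * ((int)v - 128));
      *g = ClampToByte(yf - 0.813f * ((int)v - 128) -
                       0.391f * ((int)u - 128));
      *b = ClampToByte(yf + 2.018f * ((int)u - 128));
    }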
LIBYUV_API -int I422ToBGRA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_bgra, int dst_stride_bgra, - int width, int height) { - return I422ToRGBAMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_bgra, dst_stride_bgra, +int I422ToBGRA(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_bgra, + int dst_stride_bgra, + int width, + int height) { + return I422ToRGBAMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_bgra, dst_stride_bgra, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert NV12 to RGB565. LIBYUV_API -int NV12ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_rgb565, int dst_stride_rgb565, - int width, int height) { +int NV12ToRGB565(const uint8* src_y, + int src_stride_y, + const uint8* src_uv, + int src_stride_uv, + uint8* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { int y; - void (*NV12ToRGB565Row)(const uint8* y_buf, - const uint8* uv_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = NV12ToRGB565Row_C; - if (!src_y || !src_uv || !dst_rgb565 || - width <= 0 || height == 0) { + void (*NV12ToRGB565Row)( + const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C; + if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1082,6 +1455,14 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_NV12TORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB565Row = NV12ToRGB565Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width); @@ -1096,14 +1477,16 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, // Convert RAW to RGB24. LIBYUV_API -int RAWToRGB24(const uint8* src_raw, int src_stride_raw, - uint8* dst_rgb24, int dst_stride_rgb24, - int width, int height) { +int RAWToRGB24(const uint8* src_raw, + int src_stride_raw, + uint8* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { int y; void (*RAWToRGB24Row)(const uint8* src_rgb, uint8* dst_rgb24, int width) = RAWToRGB24Row_C; - if (!src_raw || !dst_rgb24 || - width <= 0 || height == 0) { + if (!src_raw || !dst_rgb24 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1113,8 +1496,7 @@ int RAWToRGB24(const uint8* src_raw, int src_stride_raw, src_stride_raw = -src_stride_raw; } // Coalesce rows. 
- if (src_stride_raw == width * 3 && - dst_stride_rgb24 == width * 3) { + if (src_stride_raw == width * 3 && dst_stride_rgb24 == width * 3) { width *= height; height = 1; src_stride_raw = dst_stride_rgb24 = 0; @@ -1135,6 +1517,14 @@ int RAWToRGB24(const uint8* src_raw, int src_stride_raw, } } #endif +#if defined(HAS_RAWTORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToRGB24Row = RAWToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToRGB24Row = RAWToRGB24Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { RAWToRGB24Row(src_raw, dst_rgb24, width); @@ -1145,11 +1535,13 @@ int RAWToRGB24(const uint8* src_raw, int src_stride_raw, } LIBYUV_API -void SetPlane(uint8* dst_y, int dst_stride_y, - int width, int height, +void SetPlane(uint8* dst_y, + int dst_stride_y, + int width, + int height, uint32 value) { int y; - void (*SetRow)(uint8* dst, uint8 value, int width) = SetRow_C; + void (*SetRow)(uint8 * dst, uint8 value, int width) = SetRow_C; if (height < 0) { height = -height; dst_y = dst_y + (height - 1) * dst_stride_y; @@ -1192,22 +1584,26 @@ void SetPlane(uint8* dst_y, int dst_stride_y, // Draw a rectangle into I420 LIBYUV_API -int I420Rect(uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int x, int y, - int width, int height, - int value_y, int value_u, int value_v) { +int I420Rect(uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int x, + int y, + int width, + int height, + int value_y, + int value_u, + int value_v) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; uint8* start_y = dst_y + y * dst_stride_y + x; uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); - if (!dst_y || !dst_u || !dst_v || - width <= 0 || height == 0 || - x < 0 || y < 0 || - value_y < 0 || value_y > 255 || - value_u < 0 || value_u > 255 || + if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 || + y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 || value_v < 0 || value_v > 255) { return -1; } @@ -1220,15 +1616,16 @@ int I420Rect(uint8* dst_y, int dst_stride_y, // Draw a rectangle into ARGB LIBYUV_API -int ARGBRect(uint8* dst_argb, int dst_stride_argb, - int dst_x, int dst_y, - int width, int height, +int ARGBRect(uint8* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height, uint32 value) { int y; - void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int width) = ARGBSetRow_C; - if (!dst_argb || - width <= 0 || height == 0 || - dst_x < 0 || dst_y < 0) { + void (*ARGBSetRow)(uint8 * dst_argb, uint32 value, int width) = ARGBSetRow_C; + if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) { return -1; } if (height < 0) { @@ -1257,6 +1654,14 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb, ARGBSetRow = ARGBSetRow_X86; } #endif +#if defined(HAS_ARGBSETROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBSetRow = ARGBSetRow_Any_MSA; + if (IS_ALIGNED(width, 4)) { + ARGBSetRow = ARGBSetRow_MSA; + } + } +#endif // Set plane for (y = 0; y < height; ++y) { @@ -1280,12 +1685,15 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb, // f is foreground pixel premultiplied by alpha LIBYUV_API -int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBAttenuate(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int width, + 
int height) { int y; - void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, - int width) = ARGBAttenuateRow_C; + void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, int width) = + ARGBAttenuateRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1295,8 +1703,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -1325,6 +1732,14 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBAttenuateRow(src_argb, dst_argb, width); @@ -1336,9 +1751,12 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, // Convert preattenuated ARGB to unattenuated ARGB. LIBYUV_API -int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBUnattenuate(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb, int width) = ARGBUnattenuateRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1351,8 +1769,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -1373,7 +1790,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, } } #endif -// TODO(fbarchard): Neon version. + // TODO(fbarchard): Neon version. for (y = 0; y < height; ++y) { ARGBUnattenuateRow(src_argb, dst_argb, width); @@ -1385,12 +1802,15 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, // Convert ARGB to Grayed ARGB. LIBYUV_API -int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBGrayTo(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, - int width) = ARGBGrayRow_C; + void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, int width) = + ARGBGrayRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1400,8 +1820,7 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows.
- if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -1416,6 +1835,11 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, ARGBGrayRow = ARGBGrayRow_NEON; } #endif +#if defined(HAS_ARGBGRAYROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_MSA; + } +#endif for (y = 0; y < height; ++y) { ARGBGrayRow(src_argb, dst_argb, width); @@ -1427,12 +1851,15 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, // Make a rectangle of ARGB gray scale. LIBYUV_API -int ARGBGray(uint8* dst_argb, int dst_stride_argb, - int dst_x, int dst_y, - int width, int height) { +int ARGBGray(uint8* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height) { int y; - void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, - int width) = ARGBGrayRow_C; + void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, int width) = + ARGBGrayRow_C; uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; @@ -1453,6 +1880,12 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb, ARGBGrayRow = ARGBGrayRow_NEON; } #endif +#if defined(HAS_ARGBGRAYROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_MSA; + } +#endif + for (y = 0; y < height; ++y) { ARGBGrayRow(dst, dst, width); dst += dst_stride_argb; @@ -1462,10 +1895,14 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb, // Make a rectangle of ARGB Sepia tone. LIBYUV_API -int ARGBSepia(uint8* dst_argb, int dst_stride_argb, - int dst_x, int dst_y, int width, int height) { +int ARGBSepia(uint8* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height) { int y; - void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C; + void (*ARGBSepiaRow)(uint8 * dst_argb, int width) = ARGBSepiaRow_C; uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; @@ -1486,6 +1923,12 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb, ARGBSepiaRow = ARGBSepiaRow_NEON; } #endif +#if defined(HAS_ARGBSEPIAROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBSepiaRow = ARGBSepiaRow_MSA; + } +#endif + for (y = 0; y < height; ++y) { ARGBSepiaRow(dst, width); dst += dst_stride_argb; @@ -1496,13 +1939,17 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb, // Apply a 4x4 matrix to each ARGB pixel. // Note: Normally for shading, but can be used to swizzle or invert. LIBYUV_API -int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, +int ARGBColorMatrix(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, const int8* matrix_argb, - int width, int height) { + int width, + int height) { int y; void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) = ARGBColorMatrixRow_C; + const int8* matrix_argb, int width) = + ARGBColorMatrixRow_C; if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) { return -1; } @@ -1512,8 +1959,7 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. 
- if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -1539,13 +1985,17 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, // Apply a 4x3 matrix to each ARGB pixel. // Deprecated. LIBYUV_API -int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, +int RGBColorMatrix(uint8* dst_argb, + int dst_stride_argb, const int8* matrix_rgb, - int dst_x, int dst_y, int width, int height) { + int dst_x, + int dst_y, + int width, + int height) { SIMD_ALIGNED(int8 matrix_argb[16]); uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || - dst_x < 0 || dst_y < 0) { + if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || dst_x < 0 || + dst_y < 0) { return -1; } @@ -1565,23 +2015,26 @@ int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0; matrix_argb[15] = 64; // 1.0 - return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb, - dst, dst_stride_argb, - &matrix_argb[0], width, height); + return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb, dst, + dst_stride_argb, &matrix_argb[0], width, height); } // Apply a color table to each ARGB pixel. // Table contains 256 ARGB values. LIBYUV_API -int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, +int ARGBColorTable(uint8* dst_argb, + int dst_stride_argb, const uint8* table_argb, - int dst_x, int dst_y, int width, int height) { + int dst_x, + int dst_y, + int width, + int height) { int y; - void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb, + void (*ARGBColorTableRow)(uint8 * dst_argb, const uint8* table_argb, int width) = ARGBColorTableRow_C; uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || !table_argb || width <= 0 || height <= 0 || - dst_x < 0 || dst_y < 0) { + if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || + dst_y < 0) { return -1; } // Coalesce rows. @@ -1605,15 +2058,19 @@ int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, // Apply a color table to each ARGB pixel but preserve destination alpha. // Table contains 256 ARGB values. LIBYUV_API -int RGBColorTable(uint8* dst_argb, int dst_stride_argb, +int RGBColorTable(uint8* dst_argb, + int dst_stride_argb, const uint8* table_argb, - int dst_x, int dst_y, int width, int height) { + int dst_x, + int dst_y, + int width, + int height) { int y; - void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb, + void (*RGBColorTableRow)(uint8 * dst_argb, const uint8* table_argb, int width) = RGBColorTableRow_C; uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || !table_argb || width <= 0 || height <= 0 || - dst_x < 0 || dst_y < 0) { + if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || + dst_y < 0) { return -1; } // Coalesce rows. @@ -1644,11 +2101,17 @@ int RGBColorTable(uint8* dst_argb, int dst_stride_argb, // Caveat - although SSE2 saturates, the C function does not and should be used // with care if doing anything but quantization.
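ARGBQuantize below posterizes each color channel and leaves alpha alone. Per channel the row computes a bucket index with a 16.16 fixed-point scale (roughly 65536 / interval_size) and rebuilds the value from interval_size and interval_offset. A scalar sketch; in line with the caveat above, nothing here saturates:

    typedef unsigned char uint8;

    static uint8 QuantizeChannelSketch(int v, int scale, int interval_size,
                                       int interval_offset) {
      /* scale is 16.16 fixed point; deliberately no clamping. */
      return (uint8)((v * scale >> 16) * interval_size + interval_offset);
    }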
LIBYUV_API -int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, - int scale, int interval_size, int interval_offset, - int dst_x, int dst_y, int width, int height) { +int ARGBQuantize(uint8* dst_argb, + int dst_stride_argb, + int scale, + int interval_size, + int interval_offset, + int dst_x, + int dst_y, + int width, + int height) { int y; - void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size, + void (*ARGBQuantizeRow)(uint8 * dst_argb, int scale, int interval_size, int interval_offset, int width) = ARGBQuantizeRow_C; uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 || @@ -1681,12 +2144,16 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, // Computes table of cumulative sum for image where the value is the sum // of all values above and to the left of the entry. Used by ARGBBlur. LIBYUV_API -int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, - int32* dst_cumsum, int dst_stride32_cumsum, - int width, int height) { +int ARGBComputeCumulativeSum(const uint8* src_argb, + int src_stride_argb, + int32* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height) { int y; void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C; + const int32* previous_cumsum, int width) = + ComputeCumulativeSumRow_C; int32* previous_cumsum = dst_cumsum; if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) { return -1; @@ -1711,15 +2178,22 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, // aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory // as the buffer is treated as circular. LIBYUV_API -int ARGBBlur(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int32* dst_cumsum, int dst_stride32_cumsum, - int width, int height, int radius) { +int ARGBBlur(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int32* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height, + int radius) { int y; - void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum, - const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C; + void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width) = + ComputeCumulativeSumRow_C; void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft, - int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C; + int width, int area, uint8* dst, + int count) = CumulativeSumToAverageRow_C; int32* cumsum_bot_row; int32* max_cumsum_bot_row; int32* cumsum_top_row; @@ -1749,9 +2223,8 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb, #endif // Compute enough CumulativeSum for first row to be blurred. After this // one row of CumulativeSum is updated at a time. - ARGBComputeCumulativeSum(src_argb, src_stride_argb, - dst_cumsum, dst_stride32_cumsum, - width, radius); + ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum, + dst_stride32_cumsum, width, radius); src_argb = src_argb + radius * src_stride_argb; cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum]; @@ -1789,24 +2262,24 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb, // Left clipped. 
for (x = 0; x < radius + 1; ++x) { - CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, - boxwidth, area, &dst_argb[x * 4], 1); + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area, + &dst_argb[x * 4], 1); area += (bot_y - top_y); boxwidth += 4; } // Middle unclipped. n = (width - 1) - radius - x + 1; - CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, - boxwidth, area, &dst_argb[x * 4], n); + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area, + &dst_argb[x * 4], n); // Right clipped. for (x += n; x <= width - 1; ++x) { area -= (bot_y - top_y); boxwidth -= 4; CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4, - cumsum_bot_row + (x - radius - 1) * 4, - boxwidth, area, &dst_argb[x * 4], 1); + cumsum_bot_row + (x - radius - 1) * 4, boxwidth, + area, &dst_argb[x * 4], 1); } dst_argb += dst_stride_argb; } @@ -1815,12 +2288,16 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb, // Multiply ARGB image by a specified ARGB value. LIBYUV_API -int ARGBShade(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height, uint32 value) { +int ARGBShade(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height, + uint32 value) { int y; - void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb, - int width, uint32 value) = ARGBShadeRow_C; + void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) = ARGBShadeRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) { return -1; } @@ -1830,8 +2307,7 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -1846,6 +2322,11 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, ARGBShadeRow = ARGBShadeRow_NEON; } #endif +#if defined(HAS_ARGBSHADEROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) { + ARGBShadeRow = ARGBShadeRow_MSA; + } +#endif for (y = 0; y < height; ++y) { ARGBShadeRow(src_argb, dst_argb, width, value); @@ -1857,12 +2338,17 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, // Interpolate 2 planes by specified amount (0 to 255). LIBYUV_API -int InterpolatePlane(const uint8* src0, int src_stride0, - const uint8* src1, int src_stride1, - uint8* dst, int dst_stride, - int width, int height, int interpolation) { +int InterpolatePlane(const uint8* src0, + int src_stride0, + const uint8* src1, + int src_stride1, + uint8* dst, + int dst_stride, + int width, + int height, + int interpolation) { int y; - void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; if (!src0 || !src1 || !dst || width <= 0 || height == 0) { @@ -1875,9 +2361,7 @@ int InterpolatePlane(const uint8* src0, int src_stride0, dst_stride = -dst_stride; } // Coalesce rows. 
- if (src_stride0 == width && - src_stride1 == width && - dst_stride == width) { + if (src_stride0 == width && src_stride1 == width && dst_stride == width) { width *= height; height = 1; src_stride0 = src_stride1 = dst_stride = 0; @@ -1907,14 +2391,21 @@ int InterpolatePlane(const uint8* src0, int src_stride0, } #endif #if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src0, 4) && IS_ALIGNED(src_stride0, 4) && - IS_ALIGNED(src1, 4) && IS_ALIGNED(src_stride1, 4) && - IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4) && - IS_ALIGNED(width, 4)) { + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src0, 4) && + IS_ALIGNED(src_stride0, 4) && IS_ALIGNED(src1, 4) && + IS_ALIGNED(src_stride1, 4) && IS_ALIGNED(dst, 4) && + IS_ALIGNED(dst_stride, 4) && IS_ALIGNED(width, 4)) { InterpolateRow = InterpolateRow_DSPR2; } #endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { InterpolateRow(dst, src0, src1 - src0, width, interpolation); @@ -1927,61 +2418,71 @@ int InterpolatePlane(const uint8* src0, int src_stride0, // Interpolate 2 ARGB images by specified amount (0 to 255). LIBYUV_API -int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height, int interpolation) { - return InterpolatePlane(src_argb0, src_stride_argb0, - src_argb1, src_stride_argb1, - dst_argb, dst_stride_argb, +int ARGBInterpolate(const uint8* src_argb0, + int src_stride_argb0, + const uint8* src_argb1, + int src_stride_argb1, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height, + int interpolation) { + return InterpolatePlane(src_argb0, src_stride_argb0, src_argb1, + src_stride_argb1, dst_argb, dst_stride_argb, width * 4, height, interpolation); } // Interpolate 2 YUV images by specified amount (0 to 255). 
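I420Interpolate below applies InterpolatePlane to Y at full size and to U and V at half size, and ARGBInterpolate above reuses the plane version by treating the ARGB surface as a plane four times wider, which works because the mix is channel-independent. The per-pixel math behind an InterpolateRow, rounding illustrative:

    typedef unsigned char uint8;

    static void InterpolateRowSketch(uint8* dst, const uint8* src0,
                                     const uint8* src1, int width,
                                     int fraction) {
      int f1 = fraction; /* 0..255: 0 copies src0, 255 is nearly src1 */
      int f0 = 256 - f1;
      int x;
      for (x = 0; x < width; ++x) {
        dst[x] = (uint8)((src0[x] * f0 + src1[x] * f1) >> 8);
      }
    }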
LIBYUV_API -int I420Interpolate(const uint8* src0_y, int src0_stride_y, - const uint8* src0_u, int src0_stride_u, - const uint8* src0_v, int src0_stride_v, - const uint8* src1_y, int src1_stride_y, - const uint8* src1_u, int src1_stride_u, - const uint8* src1_v, int src1_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height, int interpolation) { +int I420Interpolate(const uint8* src0_y, + int src0_stride_y, + const uint8* src0_u, + int src0_stride_u, + const uint8* src0_v, + int src0_stride_v, + const uint8* src1_y, + int src1_stride_y, + const uint8* src1_u, + int src1_stride_u, + const uint8* src1_v, + int src1_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height, + int interpolation) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src0_y || !src0_u || !src0_v || - !src1_y || !src1_u || !src1_v || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src0_y || !src0_u || !src0_v || !src1_y || !src1_u || !src1_v || + !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } - InterpolatePlane(src0_y, src0_stride_y, - src1_y, src1_stride_y, - dst_y, dst_stride_y, - width, height, interpolation); - InterpolatePlane(src0_u, src0_stride_u, - src1_u, src1_stride_u, - dst_u, dst_stride_u, - halfwidth, halfheight, interpolation); - InterpolatePlane(src0_v, src0_stride_v, - src1_v, src1_stride_v, - dst_v, dst_stride_v, - halfwidth, halfheight, interpolation); + InterpolatePlane(src0_y, src0_stride_y, src1_y, src1_stride_y, dst_y, + dst_stride_y, width, height, interpolation); + InterpolatePlane(src0_u, src0_stride_u, src1_u, src1_stride_u, dst_u, + dst_stride_u, halfwidth, halfheight, interpolation); + InterpolatePlane(src0_v, src0_stride_v, src1_v, src1_stride_v, dst_v, + dst_stride_v, halfwidth, halfheight, interpolation); return 0; } // Shuffle ARGB channel order. e.g. BGRA to ARGB. LIBYUV_API -int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_argb, int dst_stride_argb, - const uint8* shuffler, int width, int height) { +int ARGBShuffle(const uint8* src_bgra, + int src_stride_bgra, + uint8* dst_argb, + int dst_stride_argb, + const uint8* shuffler, + int width, + int height) { int y; void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb, const uint8* shuffler, int width) = ARGBShuffleRow_C; - if (!src_bgra || !dst_argb || - width <= 0 || height == 0) { + if (!src_bgra || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1991,8 +2492,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, src_stride_bgra = -src_stride_bgra; } // Coalesce rows. - if (src_stride_bgra == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_bgra = dst_stride_argb = 0; @@ -2029,6 +2529,14 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, } } #endif +#if defined(HAS_ARGBSHUFFLEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBShuffleRow = ARGBShuffleRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBShuffleRow = ARGBShuffleRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBShuffleRow(src_bgra, dst_argb, shuffler, width); @@ -2039,28 +2547,32 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, } // Sobel ARGB effect. 
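The ARGBSobelize worker that follows converts ARGB to luma with ARGBToYJRow, runs SobelXRow and SobelYRow over a sliding three-row window, and hands the two gradient planes to a combine row (SobelRow here, SobelToPlaneRow and SobelXYRow in the later hunks). The math is the standard 3x3 Sobel kernel; a scalar sketch of the X direction and the combine step, details illustrative:

    typedef unsigned char uint8;

    static int Clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

    /* Horizontal gradient: [-1 0 1; -2 0 2; -1 0 1] at column i. */
    static uint8 SobelXSketch(const uint8* row0, const uint8* row1,
                              const uint8* row2, int i) {
      int g = (row0[i] - row0[i + 2]) + 2 * (row1[i] - row1[i + 2]) +
              (row2[i] - row2[i + 2]);
      return (uint8)Clamp255(g < 0 ? -g : g);
    }

    /* Combine: sum of absolute gradients, clamped to a byte. */
    static uint8 SobelCombineSketch(uint8 gx, uint8 gy) {
      return (uint8)Clamp255(gx + gy);
    }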
-static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height, +static int ARGBSobelize(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height, void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst, int width)) { + uint8* dst, + int width)) { int y; void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int width) = ARGBToYJRow_C; - void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) = SobelYRow_C; + void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, + int width) = SobelYRow_C; void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, uint8* dst_sobely, int width) = SobelXRow_C; const int kEdge = 16; // Extra pixels at start of row for extrude/align. - if (!src_argb || !dst_argb || width <= 0 || height == 0) { + if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; + src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } @@ -2088,6 +2600,14 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif #if defined(HAS_SOBELYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { @@ -2159,9 +2679,12 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, // Sobel ARGB effect. LIBYUV_API -int ARGBSobel(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBSobel(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) = SobelRow_C; #if defined(HAS_SOBELROW_SSE2) @@ -2180,15 +2703,26 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_SOBELROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelRow = SobelRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + SobelRow = SobelRow_MSA; + } + } +#endif return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height, SobelRow); } // Sobel ARGB effect with planar output. LIBYUV_API -int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - int width, int height) { +int ARGBSobelToPlane(const uint8* src_argb, + int src_stride_argb, + uint8* dst_y, + int dst_stride_y, + int width, + int height) { void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_, int width) = SobelToPlaneRow_C; #if defined(HAS_SOBELTOPLANEROW_SSE2) @@ -2207,16 +2741,27 @@ int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, } } #endif - return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, - width, height, SobelToPlaneRow); +#if defined(HAS_SOBELTOPLANEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelToPlaneRow = SobelToPlaneRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SobelToPlaneRow = SobelToPlaneRow_MSA; + } + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width, + height, SobelToPlaneRow); } // SobelXY ARGB effect. 
// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel. LIBYUV_API -int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBSobelXY(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) = SobelXYRow_C; #if defined(HAS_SOBELXYROW_SSE2) @@ -2235,32 +2780,41 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_SOBELXYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelXYRow = SobelXYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + SobelXYRow = SobelXYRow_MSA; + } + } +#endif return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height, SobelXYRow); } // Apply a 4x4 polynomial to each ARGB pixel. LIBYUV_API -int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, +int ARGBPolynomial(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, const float* poly, - int width, int height) { + int width, + int height) { int y; - void (*ARGBPolynomialRow)(const uint8* src_argb, - uint8* dst_argb, const float* poly, - int width) = ARGBPolynomialRow_C; + void (*ARGBPolynomialRow)(const uint8* src_argb, uint8* dst_argb, + const float* poly, int width) = ARGBPolynomialRow_C; if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; + src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -2285,28 +2839,103 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, return 0; } +// Convert plane of 16 bit shorts to half floats. +// Source values are multiplied by scale before storing as half float. +LIBYUV_API +int HalfFloatPlane(const uint16* src_y, + int src_stride_y, + uint16* dst_y, + int dst_stride_y, + float scale, + int width, + int height) { + int y; + void (*HalfFloatRow)(const uint16* src, uint16* dst, float scale, int width) = + HalfFloatRow_C; + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + src_stride_y >>= 1; + dst_stride_y >>= 1; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_HALFFLOATROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + HalfFloatRow = HalfFloatRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + HalfFloatRow = HalfFloatRow_SSE2; + } + } +#endif +#if defined(HAS_HALFFLOATROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HalfFloatRow = HalfFloatRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + HalfFloatRow = HalfFloatRow_AVX2; + } + } +#endif +#if defined(HAS_HALFFLOATROW_F16C) + if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) { + HalfFloatRow = + (scale == 1.0f) ? 
HalfFloat1Row_Any_F16C : HalfFloatRow_Any_F16C; + if (IS_ALIGNED(width, 16)) { + HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_F16C : HalfFloatRow_F16C; + } + } +#endif +#if defined(HAS_HALFFLOATROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + HalfFloatRow = + (scale == 1.0f) ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + HalfFloatRow(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } + return 0; +} + // Apply a lumacolortable to each ARGB pixel. LIBYUV_API -int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, +int ARGBLumaColorTable(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, const uint8* luma, - int width, int height) { + int width, + int height) { int y; - void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb, - int width, const uint8* luma, const uint32 lumacoeff) = - ARGBLumaColorTableRow_C; + void (*ARGBLumaColorTableRow)( + const uint8* src_argb, uint8* dst_argb, int width, const uint8* luma, + const uint32 lumacoeff) = ARGBLumaColorTableRow_C; if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; + src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -2327,9 +2956,12 @@ int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, // Copy Alpha from one ARGB image to another. LIBYUV_API -int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBCopyAlpha(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) = ARGBCopyAlphaRow_C; @@ -2343,8 +2975,7 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -2376,9 +3007,12 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, // Extract just the alpha channel from ARGB. 
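HalfFloatPlane is a new entry point in this revision; note that it halves both strides internally (>> 1), so callers pass byte strides even though the planes hold uint16 samples. A hedged usage sketch follows; the 640x480 dimensions and the 1.0f/65535.0f normalizing scale are illustrative assumptions, not values from this CL:

    void ToHalf(void) {  // hypothetical caller
      static uint16 src[640 * 480];  // 16-bit input samples
      static uint16 dst[640 * 480];  // half-float bit patterns on output
      // Strides are byte strides (2 bytes per sample); HalfFloatPlane
      // shifts them right by 1 before stepping row to row.
      HalfFloatPlane(src, 640 * 2, dst, 640 * 2, 1.0f / 65535.0f, 640, 480);
    }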
LIBYUV_API -int ARGBExtractAlpha(const uint8* src_argb, int src_stride, - uint8* dst_a, int dst_stride, - int width, int height) { +int ARGBExtractAlpha(const uint8* src_argb, + int src_stride, + uint8* dst_a, + int dst_stride, + int width, + int height) { if (!src_argb || !dst_a || width <= 0 || height == 0) { return -1; } @@ -2394,7 +3028,7 @@ int ARGBExtractAlpha(const uint8* src_argb, int src_stride, height = 1; src_stride = dst_stride = 0; } - void (*ARGBExtractAlphaRow)(const uint8 *src_argb, uint8 *dst_a, int width) = + void (*ARGBExtractAlphaRow)(const uint8* src_argb, uint8* dst_a, int width) = ARGBExtractAlphaRow_C; #if defined(HAS_ARGBEXTRACTALPHAROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { @@ -2402,6 +3036,12 @@ int ARGBExtractAlpha(const uint8* src_argb, int src_stride, : ARGBExtractAlphaRow_Any_SSE2; } #endif +#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 32) ? ARGBExtractAlphaRow_AVX2 + : ARGBExtractAlphaRow_Any_AVX2; + } +#endif #if defined(HAS_ARGBEXTRACTALPHAROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON @@ -2419,9 +3059,12 @@ int ARGBExtractAlpha(const uint8* src_argb, int src_stride, // Copy a planar Y channel to the alpha channel of a destination ARGB image. LIBYUV_API -int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBCopyYToAlpha(const uint8* src_y, + int src_stride_y, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) = ARGBCopyYToAlphaRow_C; @@ -2435,8 +3078,7 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, src_stride_y = -src_stride_y; } // Coalesce rows. - if (src_stride_y == width && - dst_stride_argb == width * 4) { + if (src_stride_y == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_y = dst_stride_argb = 0; @@ -2470,20 +3112,22 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, // directly. A SplitUVRow_Odd function could copy the remaining chroma. LIBYUV_API -int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { +int YUY2ToNV12(const uint8* src_yuy2, + int src_stride_yuy2, + uint8* dst_y, + int dst_stride_y, + uint8* dst_uv, + int dst_stride_uv, + int width, + int height) { int y; int halfwidth = (width + 1) >> 1; void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) = SplitUVRow_C; - void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - if (!src_yuy2 || - !dst_y || !dst_uv || - width <= 0 || height == 0) { + if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
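That comment marks the idiom used by every function in this file: a negative height requests a vertically flipped result, implemented once up front by re-aiming the source pointer at the last row and negating the stride. In isolation, over a generic byte plane:

    // Sketch of the negative-height idiom (generic plane).
    if (height < 0) {
      height = -height;
      src = src + (height - 1) * src_stride;  // point at the last row
      src_stride = -src_stride;               // walk upward through rows
    }

Every subsequent row access is unchanged; only the traversal direction differs.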
@@ -2540,6 +3184,14 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, } } #endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif { int awidth = halfwidth * 2; @@ -2568,20 +3220,22 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, } LIBYUV_API -int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { +int UYVYToNV12(const uint8* src_uyvy, + int src_stride_uyvy, + uint8* dst_y, + int dst_stride_y, + uint8* dst_uv, + int dst_stride_uv, + int width, + int height) { int y; int halfwidth = (width + 1) >> 1; void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) = SplitUVRow_C; - void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - if (!src_uyvy || - !dst_y || !dst_uv || - width <= 0 || height == 0) { + if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -2638,6 +3292,14 @@ int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, } } #endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif { int awidth = halfwidth * 2; diff --git a/files/source/rotate.cc b/files/source/rotate.cc index 01ea5c40..277c53b2 100644 --- a/files/source/rotate.cc +++ b/files/source/rotate.cc @@ -22,12 +22,20 @@ extern "C" { #endif LIBYUV_API -void TransposePlane(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { +void TransposePlane(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width, + int height) { int i = height; - void (*TransposeWx8)(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) = TransposeWx8_C; +#if defined(HAS_TRANSPOSEWX16_MSA) + void (*TransposeWx16)(const uint8* src, int src_stride, uint8* dst, + int dst_stride, int width) = TransposeWx16_C; +#else + void (*TransposeWx8)(const uint8* src, int src_stride, uint8* dst, + int dst_stride, int width) = TransposeWx8_C; +#endif #if defined(HAS_TRANSPOSEWX8_NEON) if (TestCpuFlag(kCpuHasNEON)) { TransposeWx8 = TransposeWx8_NEON; @@ -51,22 +59,40 @@ void TransposePlane(const uint8* src, int src_stride, #endif #if defined(HAS_TRANSPOSEWX8_DSPR2) if (TestCpuFlag(kCpuHasDSPR2)) { - if (IS_ALIGNED(width, 4) && - IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + if (IS_ALIGNED(width, 4) && IS_ALIGNED(src, 4) && + IS_ALIGNED(src_stride, 4)) { TransposeWx8 = TransposeWx8_Fast_DSPR2; } else { TransposeWx8 = TransposeWx8_DSPR2; } } #endif +#if defined(HAS_TRANSPOSEWX16_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + TransposeWx16 = TransposeWx16_Any_MSA; + if (IS_ALIGNED(width, 16)) { + TransposeWx16 = TransposeWx16_MSA; + } + } +#endif +#if defined(HAS_TRANSPOSEWX16_MSA) + // Work across the source in 16x16 tiles + while (i >= 16) { + TransposeWx16(src, src_stride, dst, dst_stride, width); + src += 16 * src_stride; // Go down 16 rows. + dst += 16; // Move over 16 columns. 
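// Each pass transposes one width x 16 strip of the plane; rows left over
// when the height is not a multiple of 16 fall through to the scalar
// TransposeWxH_C call below.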
+ i -= 16; + } +#else // Work across the source in 8x8 tiles while (i >= 8) { TransposeWx8(src, src_stride, dst, dst_stride, width); - src += 8 * src_stride; // Go down 8 rows. - dst += 8; // Move over 8 columns. + src += 8 * src_stride; // Go down 8 rows. + dst += 8; // Move over 8 columns. i -= 8; } +#endif if (i > 0) { TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); @@ -74,9 +100,12 @@ void TransposePlane(const uint8* src, int src_stride, } LIBYUV_API -void RotatePlane90(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { +void RotatePlane90(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width, + int height) { // Rotate by 90 is a transpose with the source read // from bottom to top. So set the source pointer to the end // of the buffer and flip the sign of the source stride. @@ -86,9 +115,12 @@ void RotatePlane90(const uint8* src, int src_stride, } LIBYUV_API -void RotatePlane270(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { +void RotatePlane270(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width, + int height) { // Rotate by 270 is a transpose with the destination written // from bottom to top. So set the destination pointer to the end // of the buffer and flip the sign of the destination stride. @@ -98,9 +130,12 @@ void RotatePlane270(const uint8* src, int src_stride, } LIBYUV_API -void RotatePlane180(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { +void RotatePlane180(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width, + int height) { // Swap first and last row and mirror the content. Uses a temporary row. align_buffer_64(row, width); const uint8* src_bot = src + src_stride * (height - 1); @@ -135,12 +170,20 @@ void RotatePlane180(const uint8* src, int src_stride, #endif // TODO(fbarchard): Mirror on mips handle unaligned memory. #if defined(HAS_MIRRORROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) { + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src, 4) && + IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst, 4) && + IS_ALIGNED(dst_stride, 4)) { MirrorRow = MirrorRow_DSPR2; } #endif +#if defined(HAS_MIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MirrorRow = MirrorRow_Any_MSA; + if (IS_ALIGNED(width, 64)) { + MirrorRow = MirrorRow_MSA; + } + } +#endif #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width, 32) ? 
CopyRow_SSE2 : CopyRow_Any_SSE2; @@ -181,15 +224,24 @@ void RotatePlane180(const uint8* src, int src_stride, } LIBYUV_API -void TransposeUV(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height) { +void TransposeUV(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width, + int height) { int i = height; - void (*TransposeUVWx8)(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, +#if defined(HAS_TRANSPOSEUVWX16_MSA) + void (*TransposeUVWx16)(const uint8* src, int src_stride, uint8* dst_a, + int dst_stride_a, uint8* dst_b, int dst_stride_b, + int width) = TransposeUVWx16_C; +#else + void (*TransposeUVWx8)(const uint8* src, int src_stride, uint8* dst_a, + int dst_stride_a, uint8* dst_b, int dst_stride_b, int width) = TransposeUVWx8_C; +#endif #if defined(HAS_TRANSPOSEUVWX8_NEON) if (TestCpuFlag(kCpuHasNEON)) { TransposeUVWx8 = TransposeUVWx8_NEON; @@ -204,68 +256,92 @@ void TransposeUV(const uint8* src, int src_stride, } #endif #if defined(HAS_TRANSPOSEUVWX8_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) && - IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) && IS_ALIGNED(src, 4) && + IS_ALIGNED(src_stride, 4)) { TransposeUVWx8 = TransposeUVWx8_DSPR2; } #endif +#if defined(HAS_TRANSPOSEUVWX16_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + TransposeUVWx16 = TransposeUVWx16_Any_MSA; + if (IS_ALIGNED(width, 8)) { + TransposeUVWx16 = TransposeUVWx16_MSA; + } + } +#endif +#if defined(HAS_TRANSPOSEUVWX16_MSA) + // Work through the source in 16x16 tiles. + while (i >= 16) { + TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width); + src += 16 * src_stride; // Go down 16 rows. + dst_a += 16; // Move over 16 columns. + dst_b += 16; // Move over 16 columns. + i -= 16; + } +#else // Work through the source in 8x8 tiles. while (i >= 8) { - TransposeUVWx8(src, src_stride, - dst_a, dst_stride_a, - dst_b, dst_stride_b, + TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width); - src += 8 * src_stride; // Go down 8 rows. - dst_a += 8; // Move over 8 columns. - dst_b += 8; // Move over 8 columns. + src += 8 * src_stride; // Go down 8 rows. + dst_a += 8; // Move over 8 columns. + dst_b += 8; // Move over 8 columns.
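// Each pass transposes 8 rows of interleaved UV pairs into 8 new columns of
// each deinterleaved output plane; leftover rows fall through to
// TransposeUVWxH_C below.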
i -= 8; } +#endif if (i > 0) { - TransposeUVWxH_C(src, src_stride, - dst_a, dst_stride_a, - dst_b, dst_stride_b, + TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, i); } } LIBYUV_API -void RotateUV90(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height) { +void RotateUV90(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width, + int height) { src += src_stride * (height - 1); src_stride = -src_stride; - TransposeUV(src, src_stride, - dst_a, dst_stride_a, - dst_b, dst_stride_b, - width, height); + TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, + height); } LIBYUV_API -void RotateUV270(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height) { +void RotateUV270(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width, + int height) { dst_a += dst_stride_a * (width - 1); dst_b += dst_stride_b * (width - 1); dst_stride_a = -dst_stride_a; dst_stride_b = -dst_stride_b; - TransposeUV(src, src_stride, - dst_a, dst_stride_a, - dst_b, dst_stride_b, - width, height); + TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, + height); } // Rotate 180 is a horizontal and vertical flip. LIBYUV_API -void RotateUV180(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height) { +void RotateUV180(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width, + int height) { int i; void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) = MirrorUVRow_C; @@ -280,8 +356,8 @@ void RotateUV180(const uint8* src, int src_stride, } #endif #if defined(HAS_MIRRORUVROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src, 4) && + IS_ALIGNED(src_stride, 4)) { MirrorUVRow = MirrorUVRow_DSPR2; } #endif @@ -298,9 +374,12 @@ void RotateUV180(const uint8* src, int src_stride, } LIBYUV_API -int RotatePlane(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height, +int RotatePlane(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width, + int height, enum RotationMode mode) { if (!src || width <= 0 || height == 0 || !dst) { return -1; @@ -316,24 +395,16 @@ int RotatePlane(const uint8* src, int src_stride, switch (mode) { case kRotate0: // copy frame - CopyPlane(src, src_stride, - dst, dst_stride, - width, height); + CopyPlane(src, src_stride, dst, dst_stride, width, height); return 0; case kRotate90: - RotatePlane90(src, src_stride, - dst, dst_stride, - width, height); + RotatePlane90(src, src_stride, dst, dst_stride, width, height); return 0; case kRotate270: - RotatePlane270(src, src_stride, - dst, dst_stride, - width, height); + RotatePlane270(src, src_stride, dst, dst_stride, width, height); return 0; case kRotate180: - RotatePlane180(src, src_stride, - dst, dst_stride, - width, height); + RotatePlane180(src, src_stride, dst, dst_stride, width, height); return 0; default: break; @@ -342,18 +413,25 @@ int RotatePlane(const uint8* src, int src_stride, } LIBYUV_API -int I420Rotate(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int 
src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height, +int I420Rotate(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height, enum RotationMode mode) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || - !dst_y || !dst_u || !dst_v) { + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v) { return -1; } @@ -372,45 +450,29 @@ int I420Rotate(const uint8* src_y, int src_stride_y, switch (mode) { case kRotate0: // copy frame - return I420Copy(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height); + return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height); case kRotate90: - RotatePlane90(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotatePlane90(src_u, src_stride_u, - dst_u, dst_stride_u, - halfwidth, halfheight); - RotatePlane90(src_v, src_stride_v, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); return 0; case kRotate270: - RotatePlane270(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotatePlane270(src_u, src_stride_u, - dst_u, dst_stride_u, - halfwidth, halfheight); - RotatePlane270(src_v, src_stride_v, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); return 0; case kRotate180: - RotatePlane180(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotatePlane180(src_u, src_stride_u, - dst_u, dst_stride_u, - halfwidth, halfheight); - RotatePlane180(src_v, src_stride_v, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); return 0; default: break; @@ -419,17 +481,23 @@ int I420Rotate(const uint8* src_y, int src_stride_y, } LIBYUV_API -int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height, +int NV12ToI420Rotate(const uint8* src_y, + int src_stride_y, + const uint8* src_uv, + int src_stride_uv, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int width, + int height, enum RotationMode mode) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_y || !src_uv || width <= 0 || height == 0 || - !dst_y || !dst_u || !dst_v) { + if (!src_y || !src_uv || width <= 0 || height == 0 
|| !dst_y || !dst_u || + !dst_v) { return -1; } @@ -446,38 +514,23 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, switch (mode) { case kRotate0: // copy frame - return NV12ToI420(src_y, src_stride_y, - src_uv, src_stride_uv, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, + return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height); case kRotate90: - RotatePlane90(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotateUV90(src_uv, src_stride_uv, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); return 0; case kRotate270: - RotatePlane270(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotateUV270(src_uv, src_stride_uv, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); return 0; case kRotate180: - RotatePlane180(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotateUV180(src_uv, src_stride_uv, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); return 0; default: break; diff --git a/files/source/rotate_any.cc b/files/source/rotate_any.cc index 31a74c31..562096b9 100644 --- a/files/source/rotate_any.cc +++ b/files/source/rotate_any.cc @@ -18,16 +18,16 @@ namespace libyuv { extern "C" { #endif -#define TANY(NAMEANY, TPOS_SIMD, MASK) \ - void NAMEANY(const uint8* src, int src_stride, \ - uint8* dst, int dst_stride, int width) { \ - int r = width & MASK; \ - int n = width - r; \ - if (n > 0) { \ - TPOS_SIMD(src, src_stride, dst, dst_stride, n); \ - } \ - TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);\ - } +#define TANY(NAMEANY, TPOS_SIMD, MASK) \ + void NAMEANY(const uint8* src, int src_stride, uint8* dst, int dst_stride, \ + int width) { \ + int r = width & MASK; \ + int n = width - r; \ + if (n > 0) { \ + TPOS_SIMD(src, src_stride, dst, dst_stride, n); \ + } \ + TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \ + } #ifdef HAS_TRANSPOSEWX8_NEON TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7) @@ -41,22 +41,22 @@ TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15) #ifdef HAS_TRANSPOSEWX8_DSPR2 TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7) #endif +#ifdef HAS_TRANSPOSEWX16_MSA +TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15) +#endif #undef TANY #define TUVANY(NAMEANY, TPOS_SIMD, MASK) \ - void NAMEANY(const uint8* src, int src_stride, \ - uint8* dst_a, int dst_stride_a, \ - uint8* dst_b, int dst_stride_b, int width) { \ - int r = width & MASK; \ - int n = width - r; \ - if (n > 0) { \ - TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, \ - n); \ - } \ - TransposeUVWx8_C(src + n * 2, src_stride, \ - dst_a + n * dst_stride_a, dst_stride_a, \ - dst_b + n * dst_stride_b, dst_stride_b, r); \ - } + void NAMEANY(const uint8* src, int src_stride, uint8* dst_a, \ + int dst_stride_a, uint8* dst_b, int dst_stride_b, int width) { \ + int r = width & MASK; \ + int 
n = width - r; \ + if (n > 0) { \ + TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \ + } \ + TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a, \ + dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \ + } #ifdef HAS_TRANSPOSEUVWX8_NEON TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7) @@ -67,14 +67,12 @@ TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7) #ifdef HAS_TRANSPOSEUVWX8_DSPR2 TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7) #endif +#ifdef HAS_TRANSPOSEUVWX16_MSA +TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7) +#endif #undef TUVANY #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif - - - - - diff --git a/files/source/rotate_argb.cc b/files/source/rotate_argb.cc index 787c0ad1..b458d8fa 100644 --- a/files/source/rotate_argb.cc +++ b/files/source/rotate_argb.cc @@ -22,29 +22,44 @@ extern "C" { // ARGBScale has a function to copy pixels to a row, striding each source // pixel by a constant. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || \ - (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || \ + (defined(__x86_64__) && !defined(__native_client__)) || \ + defined(__i386__)) #define HAS_SCALEARGBROWDOWNEVEN_SSE2 -void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride, - int src_stepx, uint8* dst_ptr, int dst_width); +void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, + int src_stride, + int src_stepx, + uint8* dst_ptr, + int dst_width); #endif #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) #define HAS_SCALEARGBROWDOWNEVEN_NEON -void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride, - int src_stepx, uint8* dst_ptr, int dst_width); +void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, + int src_stride, + int src_stepx, + uint8* dst_ptr, + int dst_width); #endif -void ScaleARGBRowDownEven_C(const uint8* src_ptr, int, - int src_stepx, uint8* dst_ptr, int dst_width); +void ScaleARGBRowDownEven_C(const uint8* src_ptr, + int, + int src_stepx, + uint8* dst_ptr, + int dst_width); -static void ARGBTranspose(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width, int height) { +static void ARGBTranspose(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width, + int height) { int i; int src_pixel_step = src_stride >> 2; void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride, - int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C; + int src_step, uint8* dst_ptr, int dst_width) = + ScaleARGBRowDownEven_C; #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest. ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2; @@ -63,8 +78,12 @@ static void ARGBTranspose(const uint8* src, int src_stride, } } -void ARGBRotate90(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width, int height) { +void ARGBRotate90(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width, + int height) { // Rotate by 90 is a ARGBTranspose with the source read // from bottom to top. So set the source pointer to the end // of the buffer and flip the sign of the source stride. 
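The reduction spelled out in that comment is the same one RotatePlane90 and RotatePlane270 use earlier in this CL: rotation is a transpose plus a reversed traversal on one side. Condensed, for the plane case (both fragments mirror code shown above):

    // Rotate 90: read the source bottom-up, then transpose.
    src += src_stride * (height - 1);
    src_stride = -src_stride;
    TransposePlane(src, src_stride, dst, dst_stride, width, height);

    // Rotate 270: transpose, but write the destination bottom-up.
    // The transposed output has 'width' rows, hence width - 1.
    dst += dst_stride * (width - 1);
    dst_stride = -dst_stride;
    TransposePlane(src, src_stride, dst, dst_stride, width, height);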
@@ -73,8 +92,12 @@ void ARGBRotate90(const uint8* src, int src_stride, ARGBTranspose(src, src_stride, dst, dst_stride, width, height); } -void ARGBRotate270(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width, int height) { +void ARGBRotate270(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width, + int height) { // Rotate by 270 is a ARGBTranspose with the destination written // from bottom to top. So set the destination pointer to the end // of the buffer and flip the sign of the destination stride. @@ -83,8 +106,12 @@ void ARGBRotate270(const uint8* src, int src_stride, ARGBTranspose(src, src_stride, dst, dst_stride, width, height); } -void ARGBRotate180(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width, int height) { +void ARGBRotate180(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width, + int height) { // Swap first and last row and mirror the content. Uses a temporary row. align_buffer_64(row, width * 4); const uint8* src_bot = src + src_stride * (height - 1); @@ -118,6 +145,14 @@ void ARGBRotate180(const uint8* src, int src_stride, } } #endif +#if defined(HAS_ARGBMIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBMirrorRow = ARGBMirrorRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBMirrorRow = ARGBMirrorRow_MSA; + } + } +#endif #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; @@ -146,9 +181,9 @@ void ARGBRotate180(const uint8* src, int src_stride, // Odd height will harmlessly mirror the middle row twice. for (y = 0; y < half_height; ++y) { - ARGBMirrorRow(src, row, width); // Mirror first row into a buffer + ARGBMirrorRow(src, row, width); // Mirror first row into a buffer ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row - CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last + CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last src += src_stride; dst += dst_stride; src_bot -= src_stride; @@ -158,8 +193,12 @@ void ARGBRotate180(const uint8* src, int src_stride, } LIBYUV_API -int ARGBRotate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, int width, int height, +int ARGBRotate(const uint8* src_argb, + int src_stride_argb, + uint8* dst_argb, + int dst_stride_argb, + int width, + int height, enum RotationMode mode) { if (!src_argb || width <= 0 || height == 0 || !dst_argb) { return -1; @@ -175,23 +214,19 @@ int ARGBRotate(const uint8* src_argb, int src_stride_argb, switch (mode) { case kRotate0: // copy frame - return ARGBCopy(src_argb, src_stride_argb, - dst_argb, dst_stride_argb, + return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height); case kRotate90: - ARGBRotate90(src_argb, src_stride_argb, - dst_argb, dst_stride_argb, - width, height); + ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); return 0; case kRotate270: - ARGBRotate270(src_argb, src_stride_argb, - dst_argb, dst_stride_argb, - width, height); + ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); return 0; case kRotate180: - ARGBRotate180(src_argb, src_stride_argb, - dst_argb, dst_stride_argb, - width, height); + ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); return 0; default: break; diff --git a/files/source/rotate_common.cc b/files/source/rotate_common.cc index b33a9a0c..cdd231fa 100644 --- 
a/files/source/rotate_common.cc +++ b/files/source/rotate_common.cc @@ -16,8 +16,11 @@ namespace libyuv { extern "C" { #endif -void TransposeWx8_C(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { +void TransposeWx8_C(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width) { int i; for (i = 0; i < width; ++i) { dst[0] = src[0 * src_stride]; @@ -33,9 +36,13 @@ void TransposeWx8_C(const uint8* src, int src_stride, } } -void TransposeUVWx8_C(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width) { +void TransposeUVWx8_C(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width) { int i; for (i = 0; i < width; ++i) { dst_a[0] = src[0 * src_stride + 0]; @@ -60,9 +67,12 @@ void TransposeUVWx8_C(const uint8* src, int src_stride, } } -void TransposeWxH_C(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { +void TransposeWxH_C(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width, + int height) { int i; for (i = 0; i < width; ++i) { int j; @@ -72,10 +82,14 @@ void TransposeWxH_C(const uint8* src, int src_stride, } } -void TransposeUVWxH_C(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height) { +void TransposeUVWxH_C(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width, + int height) { int i; for (i = 0; i < width * 2; i += 2) { int j; diff --git a/files/source/rotate_mips.cc b/files/source/rotate_dspr2.cc index 1e8ce251..2dce9107 100644 --- a/files/source/rotate_mips.cc +++ b/files/source/rotate_dspr2.cc @@ -18,18 +18,20 @@ namespace libyuv { extern "C" { #endif -#if !defined(LIBYUV_DISABLE_MIPS) && \ - defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) - -void TransposeWx8_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { - __asm__ __volatile__ ( +#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \ + (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32) + +void TransposeWx8_DSPR2(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width) { + __asm__ __volatile__( ".set push \n" ".set noreorder \n" - "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 - "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 - "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 "addu $t3, $t2, %[src_stride] \n" "addu $t5, $t4, %[src_stride] \n" "addu $t6, $t2, $t4 \n" @@ -38,8 +40,8 @@ void TransposeWx8_DSPR2(const uint8* src, int src_stride, "or $t0, $t0, $t1 \n" "bnez $t0, 11f \n" " subu $t7, $t9, %[src_stride] \n" -//dst + dst_stride word aligned - "1: \n" + // dst + dst_stride word aligned + "1: \n" "lbu $t0, 0(%[src]) \n" "lbux $t1, %[src_stride](%[src]) \n" "lbux $t8, $t2(%[src]) \n" @@ -65,8 +67,8 @@ void TransposeWx8_DSPR2(const uint8* src, int src_stride, "bnez %[width], 1b \n" " addu %[dst], %[dst], %[dst_stride] \n" "b 2f \n" -//dst + dst_stride unaligned - "11: \n" + // dst + dst_stride unaligned + "11: \n" "lbu $t0, 0(%[src]) \n" "lbux $t1, %[src_stride](%[src]) \n" "lbux $t8, $t2(%[src]) \n" @@ -92,23 +94,20 @@ void TransposeWx8_DSPR2(const uint8* src, int src_stride, "swr $s1, 
4(%[dst]) \n" "swl $s1, 7(%[dst]) \n" "bnez %[width], 11b \n" - "addu %[dst], %[dst], %[dst_stride] \n" - "2: \n" + "addu %[dst], %[dst], %[dst_stride] \n" + "2: \n" ".set pop \n" - :[src] "+r" (src), - [dst] "+r" (dst), - [width] "+r" (width) - :[src_stride] "r" (src_stride), - [dst_stride] "r" (dst_stride) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9", - "s0", "s1" - ); + : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width) + : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1"); } -void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { - __asm__ __volatile__ ( +void TransposeWx8_Fast_DSPR2(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width) { + __asm__ __volatile__( ".set noat \n" ".set push \n" ".set noreorder \n" @@ -126,67 +125,67 @@ void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride, "or $t0, $t0, $t1 \n" "bnez $t0, 11f \n" " subu $t7, $t9, %[src_stride] \n" -//dst + dst_stride word aligned + // dst + dst_stride word aligned "1: \n" "lw $t0, 0(%[src]) \n" "lwx $t1, %[src_stride](%[src]) \n" "lwx $t8, $t2(%[src]) \n" "lwx $t9, $t3(%[src]) \n" -// t0 = | 30 | 20 | 10 | 00 | -// t1 = | 31 | 21 | 11 | 01 | -// t8 = | 32 | 22 | 12 | 02 | -// t9 = | 33 | 23 | 13 | 03 | + // t0 = | 30 | 20 | 10 | 00 | + // t1 = | 31 | 21 | 11 | 01 | + // t8 = | 32 | 22 | 12 | 02 | + // t9 = | 33 | 23 | 13 | 03 | "precr.qb.ph $s0, $t1, $t0 \n" "precr.qb.ph $s1, $t9, $t8 \n" "precrq.qb.ph $s2, $t1, $t0 \n" "precrq.qb.ph $s3, $t9, $t8 \n" - // s0 = | 21 | 01 | 20 | 00 | - // s1 = | 23 | 03 | 22 | 02 | - // s2 = | 31 | 11 | 30 | 10 | - // s3 = | 33 | 13 | 32 | 12 | + // s0 = | 21 | 01 | 20 | 00 | + // s1 = | 23 | 03 | 22 | 02 | + // s2 = | 31 | 11 | 30 | 10 | + // s3 = | 33 | 13 | 32 | 12 | "precr.qb.ph $s4, $s1, $s0 \n" "precrq.qb.ph $s5, $s1, $s0 \n" "precr.qb.ph $s6, $s3, $s2 \n" "precrq.qb.ph $s7, $s3, $s2 \n" - // s4 = | 03 | 02 | 01 | 00 | - // s5 = | 23 | 22 | 21 | 20 | - // s6 = | 13 | 12 | 11 | 10 | - // s7 = | 33 | 32 | 31 | 30 | + // s4 = | 03 | 02 | 01 | 00 | + // s5 = | 23 | 22 | 21 | 20 | + // s6 = | 13 | 12 | 11 | 10 | + // s7 = | 33 | 32 | 31 | 30 | "lwx $t0, $t4(%[src]) \n" "lwx $t1, $t5(%[src]) \n" "lwx $t8, $t6(%[src]) \n" "lwx $t9, $t7(%[src]) \n" -// t0 = | 34 | 24 | 14 | 04 | -// t1 = | 35 | 25 | 15 | 05 | -// t8 = | 36 | 26 | 16 | 06 | -// t9 = | 37 | 27 | 17 | 07 | + // t0 = | 34 | 24 | 14 | 04 | + // t1 = | 35 | 25 | 15 | 05 | + // t8 = | 36 | 26 | 16 | 06 | + // t9 = | 37 | 27 | 17 | 07 | "precr.qb.ph $s0, $t1, $t0 \n" "precr.qb.ph $s1, $t9, $t8 \n" "precrq.qb.ph $s2, $t1, $t0 \n" "precrq.qb.ph $s3, $t9, $t8 \n" - // s0 = | 25 | 05 | 24 | 04 | - // s1 = | 27 | 07 | 26 | 06 | - // s2 = | 35 | 15 | 34 | 14 | - // s3 = | 37 | 17 | 36 | 16 | + // s0 = | 25 | 05 | 24 | 04 | + // s1 = | 27 | 07 | 26 | 06 | + // s2 = | 35 | 15 | 34 | 14 | + // s3 = | 37 | 17 | 36 | 16 | "precr.qb.ph $t0, $s1, $s0 \n" "precrq.qb.ph $t1, $s1, $s0 \n" "precr.qb.ph $t8, $s3, $s2 \n" "precrq.qb.ph $t9, $s3, $s2 \n" - // t0 = | 07 | 06 | 05 | 04 | - // t1 = | 27 | 26 | 25 | 24 | - // t8 = | 17 | 16 | 15 | 14 | - // t9 = | 37 | 36 | 35 | 34 | + // t0 = | 07 | 06 | 05 | 04 | + // t1 = | 27 | 26 | 25 | 24 | + // t8 = | 17 | 16 | 15 | 14 | + // t9 = | 37 | 36 | 35 | 34 | "addu $s0, %[dst], %[dst_stride] \n" "addu $s1, $s0, %[dst_stride] \n" @@ -207,67 +206,67 @@ void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride, 
"bnez $AT, 1b \n" " addu %[dst], $s2, %[dst_stride] \n" "b 2f \n" -//dst + dst_stride unaligned + // dst + dst_stride unaligned "11: \n" "lw $t0, 0(%[src]) \n" "lwx $t1, %[src_stride](%[src]) \n" "lwx $t8, $t2(%[src]) \n" "lwx $t9, $t3(%[src]) \n" -// t0 = | 30 | 20 | 10 | 00 | -// t1 = | 31 | 21 | 11 | 01 | -// t8 = | 32 | 22 | 12 | 02 | -// t9 = | 33 | 23 | 13 | 03 | + // t0 = | 30 | 20 | 10 | 00 | + // t1 = | 31 | 21 | 11 | 01 | + // t8 = | 32 | 22 | 12 | 02 | + // t9 = | 33 | 23 | 13 | 03 | "precr.qb.ph $s0, $t1, $t0 \n" "precr.qb.ph $s1, $t9, $t8 \n" "precrq.qb.ph $s2, $t1, $t0 \n" "precrq.qb.ph $s3, $t9, $t8 \n" - // s0 = | 21 | 01 | 20 | 00 | - // s1 = | 23 | 03 | 22 | 02 | - // s2 = | 31 | 11 | 30 | 10 | - // s3 = | 33 | 13 | 32 | 12 | + // s0 = | 21 | 01 | 20 | 00 | + // s1 = | 23 | 03 | 22 | 02 | + // s2 = | 31 | 11 | 30 | 10 | + // s3 = | 33 | 13 | 32 | 12 | "precr.qb.ph $s4, $s1, $s0 \n" "precrq.qb.ph $s5, $s1, $s0 \n" "precr.qb.ph $s6, $s3, $s2 \n" "precrq.qb.ph $s7, $s3, $s2 \n" - // s4 = | 03 | 02 | 01 | 00 | - // s5 = | 23 | 22 | 21 | 20 | - // s6 = | 13 | 12 | 11 | 10 | - // s7 = | 33 | 32 | 31 | 30 | + // s4 = | 03 | 02 | 01 | 00 | + // s5 = | 23 | 22 | 21 | 20 | + // s6 = | 13 | 12 | 11 | 10 | + // s7 = | 33 | 32 | 31 | 30 | "lwx $t0, $t4(%[src]) \n" "lwx $t1, $t5(%[src]) \n" "lwx $t8, $t6(%[src]) \n" "lwx $t9, $t7(%[src]) \n" -// t0 = | 34 | 24 | 14 | 04 | -// t1 = | 35 | 25 | 15 | 05 | -// t8 = | 36 | 26 | 16 | 06 | -// t9 = | 37 | 27 | 17 | 07 | + // t0 = | 34 | 24 | 14 | 04 | + // t1 = | 35 | 25 | 15 | 05 | + // t8 = | 36 | 26 | 16 | 06 | + // t9 = | 37 | 27 | 17 | 07 | "precr.qb.ph $s0, $t1, $t0 \n" "precr.qb.ph $s1, $t9, $t8 \n" "precrq.qb.ph $s2, $t1, $t0 \n" "precrq.qb.ph $s3, $t9, $t8 \n" - // s0 = | 25 | 05 | 24 | 04 | - // s1 = | 27 | 07 | 26 | 06 | - // s2 = | 35 | 15 | 34 | 14 | - // s3 = | 37 | 17 | 36 | 16 | + // s0 = | 25 | 05 | 24 | 04 | + // s1 = | 27 | 07 | 26 | 06 | + // s2 = | 35 | 15 | 34 | 14 | + // s3 = | 37 | 17 | 36 | 16 | "precr.qb.ph $t0, $s1, $s0 \n" "precrq.qb.ph $t1, $s1, $s0 \n" "precr.qb.ph $t8, $s3, $s2 \n" "precrq.qb.ph $t9, $s3, $s2 \n" - // t0 = | 07 | 06 | 05 | 04 | - // t1 = | 27 | 26 | 25 | 24 | - // t8 = | 17 | 16 | 15 | 14 | - // t9 = | 37 | 36 | 35 | 34 | + // t0 = | 07 | 06 | 05 | 04 | + // t1 = | 27 | 26 | 25 | 24 | + // t8 = | 17 | 16 | 15 | 14 | + // t9 = | 37 | 36 | 35 | 34 | "addu $s0, %[dst], %[dst_stride] \n" "addu $s1, $s0, %[dst_stride] \n" @@ -298,34 +297,33 @@ void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride, "2: \n" ".set pop \n" ".set at \n" - :[src] "+r" (src), - [dst] "+r" (dst), - [width] "+r" (width) - :[src_stride] "r" (src_stride), - [dst_stride] "r" (dst_stride) - : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", - "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7" - ); + : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width) + : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1", + "s2", "s3", "s4", "s5", "s6", "s7"); } -void TransposeUVWx8_DSPR2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, +void TransposeUVWx8_DSPR2(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, int width) { - __asm__ __volatile__ ( + __asm__ __volatile__( ".set push \n" ".set noreorder \n" "beqz %[width], 2f \n" - " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 - "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 - 
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 "addu $t3, $t2, %[src_stride] \n" "addu $t5, $t4, %[src_stride] \n" "addu $t6, $t2, $t4 \n" "subu $t7, $t9, %[src_stride] \n" "srl $t1, %[width], 1 \n" -// check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b + // check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b "andi $t0, %[dst_a], 0x3 \n" "andi $t8, %[dst_b], 0x3 \n" "or $t0, $t0, $t8 \n" @@ -335,52 +333,52 @@ void TransposeUVWx8_DSPR2(const uint8* src, int src_stride, "or $t0, $t0, $t8 \n" "bnez $t0, 11f \n" " nop \n" -// dst + dst_stride word aligned (both, a & b dst addresses) - "1: \n" - "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| - "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| + // dst + dst_stride word aligned (both, a & b dst addresses) + "1: \n" + "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| + "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| "addu $s5, %[dst_a], %[dst_stride_a] \n" - "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| - "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| + "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| + "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| "addu $s6, %[dst_b], %[dst_stride_b] \n" - "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| - "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| - "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| - "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| + "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| + "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| + "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| "sll $t0, $t0, 16 \n" - "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| + "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| "sll $t9, $t9, 16 \n" - "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| + "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| "sw $s3, 0($s5) \n" "sw $s4, 0($s6) \n" - "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| - "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| + "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| - "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| - "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| - "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| - "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| + "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| + "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| + "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| + "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| "sw $s3, 0(%[dst_a]) \n" "sw $s4, 0(%[dst_b]) \n" - "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| - "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| - "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| - "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| + "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| + "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| + "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| "sll $t0, $t0, 16 \n" - "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| + "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| "sll $t9, $t9, 16 \n" - "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| + "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| "sw $s3, 4($s5) \n" "sw $s4, 4($s6) \n" - "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| - "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| + "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| "addiu %[src], 4 \n" "addiu $t1, -1 \n" @@ -394,59 +392,59 @@ 
void TransposeUVWx8_DSPR2(const uint8* src, int src_stride, "b 2f \n" " nop \n" -// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned - "11: \n" - "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| - "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| + // dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned + "11: \n" + "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| + "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| "addu $s5, %[dst_a], %[dst_stride_a] \n" - "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| - "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| + "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| + "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| "addu $s6, %[dst_b], %[dst_stride_b] \n" - "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| - "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| - "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| - "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| + "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| + "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| + "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| "sll $t0, $t0, 16 \n" - "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| + "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| "sll $t9, $t9, 16 \n" - "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| + "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| "swr $s3, 0($s5) \n" "swl $s3, 3($s5) \n" "swr $s4, 0($s6) \n" "swl $s4, 3($s6) \n" - "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| - "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| + "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| - "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| - "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| - "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| - "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| + "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| + "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| + "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| + "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| "swr $s3, 0(%[dst_a]) \n" "swl $s3, 3(%[dst_a]) \n" "swr $s4, 0(%[dst_b]) \n" "swl $s4, 3(%[dst_b]) \n" - "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| - "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| - "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| - "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| + "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| + "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| + "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| "sll $t0, $t0, 16 \n" - "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| + "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| "sll $t9, $t9, 16 \n" - "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| + "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| "swr $s3, 4($s5) \n" "swl $s3, 7($s5) \n" "swr $s4, 4($s6) \n" "swl $s4, 7($s6) \n" - "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| - "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| + "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| "addiu %[src], 4 \n" "addiu $t1, -1 \n" @@ -462,18 +460,11 @@ void TransposeUVWx8_DSPR2(const uint8* src, int src_stride, "2: \n" ".set pop \n" - : [src] "+r" (src), - [dst_a] "+r" (dst_a), - [dst_b] "+r" (dst_b), - [width] "+r" (width), - [src_stride] "+r" (src_stride) - : [dst_stride_a] "r" (dst_stride_a), - [dst_stride_b] "r" (dst_stride_b) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9", - "s0", "s1", "s2", "s3", - "s4", "s5", "s6" - ); + : [src] "+r"(src), [dst_a] "+r"(dst_a), [dst_b] "+r"(dst_b), + [width] "+r"(width), 
[src_stride] "+r"(src_stride) + : [dst_stride_a] "r"(dst_stride_a), [dst_stride_b] "r"(dst_stride_b) + : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1", + "s2", "s3", "s4", "s5", "s6"); } #endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) diff --git a/files/source/rotate_gcc.cc b/files/source/rotate_gcc.cc index cbe870ca..85b41dd8 100644 --- a/files/source/rotate_gcc.cc +++ b/files/source/rotate_gcc.cc @@ -22,342 +22,348 @@ extern "C" { // Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit. #if defined(HAS_TRANSPOSEWX8_SSSE3) -void TransposeWx8_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { - asm volatile ( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "movq (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "movq (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movq (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "movq (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movq (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "lea 0x8(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "neg %3 \n" - // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. - "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "sub $0x8,%2 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void TransposeWx8_SSSE3(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. 
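// The three "bit swap" rounds interleave progressively wider elements:
// punpcklbw pairs bytes, punpcklwd pairs 16-bit halves, and punpckldq
// pairs 32-bit quads; together they realize the 8x8 byte transpose.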
+ LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movq (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "movq (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movq (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "movq (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movq (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "lea 0x8(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "sub $0x8,%2 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // defined(HAS_TRANSPOSEWX8_SSSE3) // Transpose 16x8. 64 bit #if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) -void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { - asm volatile ( - // Read in the data from the source pointer. - // First round of bit swap. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqu (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm8,%%xmm9 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "palignr $0x8,%%xmm9,%%xmm9 \n" - "movdqu (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm2,%%xmm10 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm10 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm10,%%xmm11 \n" - "movdqu (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "movdqu (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm4,%%xmm12 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm12 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movdqa %%xmm12,%%xmm13 \n" - "movdqu (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movdqu (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm6,%%xmm14 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "punpckhbw %%xmm7,%%xmm14 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "movdqa %%xmm14,%%xmm15 \n" - "lea 0x10(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "neg %3 \n" - // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "punpcklwd %%xmm10,%%xmm8 \n" - "punpcklwd %%xmm11,%%xmm9 \n" - "movdqa %%xmm8,%%xmm10 \n" - "movdqa %%xmm9,%%xmm11 \n" - "palignr $0x8,%%xmm10,%%xmm10 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "punpcklwd %%xmm14,%%xmm12 \n" - "punpcklwd %%xmm15,%%xmm13 \n" - "movdqa %%xmm12,%%xmm14 \n" - "movdqa %%xmm13,%%xmm15 \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - // Third round of bit swap. - // Write to the destination pointer. 
- "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm12,%%xmm8 \n" - "movq %%xmm8,(%1) \n" - "movdqa %%xmm8,%%xmm12 \n" - "palignr $0x8,%%xmm12,%%xmm12 \n" - "movq %%xmm12,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm14,%%xmm10 \n" - "movdqa %%xmm10,%%xmm14 \n" - "movq %%xmm10,(%1) \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "punpckldq %%xmm13,%%xmm9 \n" - "movq %%xmm14,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm9,%%xmm13 \n" - "movq %%xmm9,(%1) \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movq %%xmm13,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm15,%%xmm11 \n" - "movq %%xmm11,(%1) \n" - "movdqa %%xmm11,%%xmm15 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "sub $0x10,%2 \n" - "movq %%xmm15,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" - ); +void TransposeWx8_Fast_SSSE3(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqu (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm8,%%xmm9 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "palignr $0x8,%%xmm9,%%xmm9 \n" + "movdqu (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm2,%%xmm10 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm10 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm10,%%xmm11 \n" + "movdqu (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "movdqu (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm4,%%xmm12 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm12 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movdqa %%xmm12,%%xmm13 \n" + "movdqu (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movdqu (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm6,%%xmm14 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "punpckhbw %%xmm7,%%xmm14 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "movdqa %%xmm14,%%xmm15 \n" + "lea 0x10(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "neg %3 \n" + // Second round of bit swap. 
+ "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm10,%%xmm8 \n" + "punpcklwd %%xmm11,%%xmm9 \n" + "movdqa %%xmm8,%%xmm10 \n" + "movdqa %%xmm9,%%xmm11 \n" + "palignr $0x8,%%xmm10,%%xmm10 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "punpcklwd %%xmm14,%%xmm12 \n" + "punpcklwd %%xmm15,%%xmm13 \n" + "movdqa %%xmm12,%%xmm14 \n" + "movdqa %%xmm13,%%xmm15 \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + // Third round of bit swap. + // Write to the destination pointer. + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm12,%%xmm8 \n" + "movq %%xmm8,(%1) \n" + "movdqa %%xmm8,%%xmm12 \n" + "palignr $0x8,%%xmm12,%%xmm12 \n" + "movq %%xmm12,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm14,%%xmm10 \n" + "movdqa %%xmm10,%%xmm14 \n" + "movq %%xmm10,(%1) \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "punpckldq %%xmm13,%%xmm9 \n" + "movq %%xmm14,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm9,%%xmm13 \n" + "movq %%xmm9,(%1) \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movq %%xmm13,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm15,%%xmm11 \n" + "movq %%xmm11,(%1) \n" + "movdqa %%xmm11,%%xmm15 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "sub $0x10,%2 \n" + "movq %%xmm15,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + "xmm15"); } #endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3) // Transpose UV 8x8. 64 bit. #if defined(HAS_TRANSPOSEUVWX8_SSE2) -void TransposeUVWx8_SSE2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width) { - asm volatile ( - // Read in the data from the source pointer. - // First round of bit swap. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%4),%%xmm1 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqa %%xmm8,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu (%0,%4),%%xmm3 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm8 \n" - "movdqa %%xmm8,%%xmm3 \n" - "movdqu (%0),%%xmm4 \n" - "movdqu (%0,%4),%%xmm5 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm4,%%xmm8 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm8 \n" - "movdqa %%xmm8,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu (%0,%4),%%xmm7 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm6,%%xmm8 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %4 \n" - "lea 0x10(%0,%4,8),%0 \n" - "punpckhbw %%xmm7,%%xmm8 \n" - "movdqa %%xmm8,%%xmm7 \n" - "neg %4 \n" - // Second round of bit swap. - "movdqa %%xmm0,%%xmm8 \n" - "movdqa %%xmm1,%%xmm9 \n" - "punpckhwd %%xmm2,%%xmm8 \n" - "punpckhwd %%xmm3,%%xmm9 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm8,%%xmm2 \n" - "movdqa %%xmm9,%%xmm3 \n" - "movdqa %%xmm4,%%xmm8 \n" - "movdqa %%xmm5,%%xmm9 \n" - "punpckhwd %%xmm6,%%xmm8 \n" - "punpckhwd %%xmm7,%%xmm9 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm8,%%xmm6 \n" - "movdqa %%xmm9,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. - "movdqa %%xmm0,%%xmm8 \n" - "punpckldq %%xmm4,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" // Write back U channel - "movhpd %%xmm0,(%2) \n" // Write back V channel - "punpckhdq %%xmm4,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movlpd %%xmm2,(%1) \n" - "movhpd %%xmm2,(%2) \n" - "punpckhdq %%xmm6,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm1,%%xmm8 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movlpd %%xmm1,(%1) \n" - "movhpd %%xmm1,(%2) \n" - "punpckhdq %%xmm5,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm3,%%xmm8 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movlpd %%xmm3,(%1) \n" - "movhpd %%xmm3,(%2) \n" - "punpckhdq %%xmm7,%%xmm8 \n" - "sub $0x8,%3 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_a), // %1 - "+r"(dst_b), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(src_stride)), // %4 - "r"((intptr_t)(dst_stride_a)), // %5 - "r"((intptr_t)(dst_stride_b)) // %6 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9" - ); +void TransposeUVWx8_SSE2(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. 
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%4),%%xmm1 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa %%xmm8,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu (%0,%4),%%xmm3 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm8 \n" + "movdqa %%xmm8,%%xmm3 \n" + "movdqu (%0),%%xmm4 \n" + "movdqu (%0,%4),%%xmm5 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm4,%%xmm8 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm8 \n" + "movdqa %%xmm8,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu (%0,%4),%%xmm7 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm6,%%xmm8 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %4 \n" + "lea 0x10(%0,%4,8),%0 \n" + "punpckhbw %%xmm7,%%xmm8 \n" + "movdqa %%xmm8,%%xmm7 \n" + "neg %4 \n" + // Second round of bit swap. + "movdqa %%xmm0,%%xmm8 \n" + "movdqa %%xmm1,%%xmm9 \n" + "punpckhwd %%xmm2,%%xmm8 \n" + "punpckhwd %%xmm3,%%xmm9 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm8,%%xmm2 \n" + "movdqa %%xmm9,%%xmm3 \n" + "movdqa %%xmm4,%%xmm8 \n" + "movdqa %%xmm5,%%xmm9 \n" + "punpckhwd %%xmm6,%%xmm8 \n" + "punpckhwd %%xmm7,%%xmm9 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm8,%%xmm6 \n" + "movdqa %%xmm9,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. + "movdqa %%xmm0,%%xmm8 \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" // Write back U channel + "movhpd %%xmm0,(%2) \n" // Write back V channel + "punpckhdq %%xmm4,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movlpd %%xmm2,(%1) \n" + "movhpd %%xmm2,(%2) \n" + "punpckhdq %%xmm6,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm1,%%xmm8 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movlpd %%xmm1,(%1) \n" + "movhpd %%xmm1,(%2) \n" + "punpckhdq %%xmm5,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm3,%%xmm8 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movlpd %%xmm3,(%1) \n" + "movhpd %%xmm3,(%2) \n" + "punpckhdq %%xmm7,%%xmm8 \n" + "sub $0x8,%3 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(src_stride)), // %4 + "r"((intptr_t)(dst_stride_a)), // %5 + "r"((intptr_t)(dst_stride_b)) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9"); } #endif // defined(HAS_TRANSPOSEUVWX8_SSE2) #endif // defined(__x86_64__) || defined(__i386__) diff --git a/files/source/rotate_msa.cc b/files/source/rotate_msa.cc new file mode 100644 index 00000000..8907765a --- /dev/null +++ b/files/source/rotate_msa.cc @@ -0,0 +1,250 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/rotate_row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define ILVRL_B(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_b((v16i8)in1, (v16i8)in0); \ + out1 = (v16u8)__msa_ilvl_b((v16i8)in1, (v16i8)in0); \ + out2 = (v16u8)__msa_ilvr_b((v16i8)in3, (v16i8)in2); \ + out3 = (v16u8)__msa_ilvl_b((v16i8)in3, (v16i8)in2); \ + } + +#define ILVRL_H(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_h((v8i16)in1, (v8i16)in0); \ + out1 = (v16u8)__msa_ilvl_h((v8i16)in1, (v8i16)in0); \ + out2 = (v16u8)__msa_ilvr_h((v8i16)in3, (v8i16)in2); \ + out3 = (v16u8)__msa_ilvl_h((v8i16)in3, (v8i16)in2); \ + } + +#define ILVRL_W(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_w((v4i32)in1, (v4i32)in0); \ + out1 = (v16u8)__msa_ilvl_w((v4i32)in1, (v4i32)in0); \ + out2 = (v16u8)__msa_ilvr_w((v4i32)in3, (v4i32)in2); \ + out3 = (v16u8)__msa_ilvl_w((v4i32)in3, (v4i32)in2); \ + } + +#define ILVRL_D(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_d((v2i64)in1, (v2i64)in0); \ + out1 = (v16u8)__msa_ilvl_d((v2i64)in1, (v2i64)in0); \ + out2 = (v16u8)__msa_ilvr_d((v2i64)in3, (v2i64)in2); \ + out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \ + } + +void TransposeWx16_C(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width) { + TransposeWx8_C(src, src_stride, dst, dst_stride, width); + TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride, + width); +} + +void TransposeUVWx16_C(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width) { + TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width); + TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8), + dst_stride_a, (dst_b + 8), dst_stride_b, width); +} + +void TransposeWx16_MSA(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width) { + int x; + const uint8* s; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; + + for (x = 0; x < width; x += 16) { + s = src; + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); + ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, 
vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); + ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += dst_stride * 4; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); + ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += dst_stride * 4; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); + ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += dst_stride * 4; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); + ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + src += 16; + dst += dst_stride * 4; + } +} + +void TransposeUVWx16_MSA(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int width) { + int x; + const uint8* s; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; + + for (x = 0; x < width; x += 8) { + s = src; + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); + ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, 
(v4i32)reg0); + ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); + ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); + ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); + ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + src += 16; + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/files/source/rotate_neon.cc b/files/source/rotate_neon.cc index 1c22b472..41ec34ec 100644 --- a/files/source/rotate_neon.cc +++ b/files/source/rotate_neon.cc @@ -21,11 +21,13 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ !defined(__aarch64__) -static uvec8 kVTbl4x4Transpose = - { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; +static uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; -void TransposeWx8_NEON(const uint8* src, int src_stride, - uint8* dst, int dst_stride, +void TransposeWx8_NEON(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, int width) { const uint8* src_temp; asm volatile ( @@ -240,12 +242,15 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, ); } -static uvec8 kVTbl4x4TransposeDi = - { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 }; +static uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11, + 4, 12, 5, 13, 6, 14, 7, 15}; -void TransposeUVWx8_NEON(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, +void TransposeUVWx8_NEON(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, int width) { const uint8* src_temp; asm volatile ( diff --git a/files/source/rotate_neon64.cc b/files/source/rotate_neon64.cc index 1ab448f3..3cf17930 100644 --- a/files/source/rotate_neon64.cc +++ b/files/source/rotate_neon64.cc @@ -21,13 +21,16 @@ extern "C" { // This module is for GCC Neon armv8 64 bit. #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) -static uvec8 kVTbl4x4Transpose = - { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; - -void TransposeWx8_NEON(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { +static uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; + +void TransposeWx8_NEON(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width) { const uint8* src_temp; - int64 width64 = (int64) width; // Work around clang 3.4 warning. + int64 width64 = (int64)width; // Work around clang 3.4 warning. asm volatile ( // loops are on blocks of 8. 
loop will stop when // counter gets to or below 0. starting the counter @@ -247,16 +250,19 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, ); } -static uint8 kVTbl4x4TransposeDi[32] = - { 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54, - 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55}; +static uint8 kVTbl4x4TransposeDi[32] = { + 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54, + 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55}; -void TransposeUVWx8_NEON(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, +void TransposeUVWx8_NEON(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, int width) { const uint8* src_temp; - int64 width64 = (int64) width; // Work around clang 3.4 warning. + int64 width64 = (int64)width; // Work around clang 3.4 warning. asm volatile ( // loops are on blocks of 8. loop will stop when // counter gets to or below 0. starting the counter diff --git a/files/source/rotate_win.cc b/files/source/rotate_win.cc index 1300fc0f..201643e7 100644 --- a/files/source/rotate_win.cc +++ b/files/source/rotate_win.cc @@ -19,15 +19,17 @@ extern "C" { // This module is for 32 bit Visual C x86 and clangcl #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) -__declspec(naked) -void TransposeWx8_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { +__declspec(naked) void TransposeWx8_SSSE3(const uint8* src, + int src_stride, + uint8* dst, + int dst_stride, + int width) { __asm { push edi push esi push ebp - mov eax, [esp + 12 + 4] // src - mov edi, [esp + 12 + 8] // src_stride + mov eax, [esp + 12 + 4] // src + mov edi, [esp + 12 + 8] // src_stride mov edx, [esp + 12 + 12] // dst mov esi, [esp + 12 + 16] // dst_stride mov ecx, [esp + 12 + 20] // width @@ -110,18 +112,20 @@ void TransposeWx8_SSSE3(const uint8* src, int src_stride, } } -__declspec(naked) -void TransposeUVWx8_SSE2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int w) { +__declspec(naked) void TransposeUVWx8_SSE2(const uint8* src, + int src_stride, + uint8* dst_a, + int dst_stride_a, + uint8* dst_b, + int dst_stride_b, + int w) { __asm { push ebx push esi push edi push ebp - mov eax, [esp + 16 + 4] // src - mov edi, [esp + 16 + 8] // src_stride + mov eax, [esp + 16 + 4] // src + mov edi, [esp + 16 + 8] // src_stride mov edx, [esp + 16 + 12] // dst_a mov esi, [esp + 16 + 16] // dst_stride_a mov ebx, [esp + 16 + 20] // dst_b @@ -134,8 +138,8 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, align 4 convertloop: - // Read in the data from the source pointer. - // First round of bit swap. + // Read in the data from the source pointer. + // First round of bit swap. movdqu xmm0, [eax] movdqu xmm1, [eax + edi] lea eax, [eax + 2 * edi] @@ -162,7 +166,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, lea eax, [eax + 2 * edi] movdqu [esp], xmm5 // backup xmm5 neg edi - movdqa xmm5, xmm6 // use xmm5 as temp register. + movdqa xmm5, xmm6 // use xmm5 as temp register. punpcklbw xmm6, xmm7 punpckhbw xmm5, xmm7 movdqa xmm7, xmm5 @@ -183,7 +187,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, movdqa xmm6, xmm5 movdqu xmm5, [esp] // restore xmm5 movdqu [esp], xmm6 // backup xmm6 - movdqa xmm6, xmm5 // use xmm6 as temp register. + movdqa xmm6, xmm5 // use xmm6 as temp register. 
punpcklwd xmm5, xmm7 punpckhwd xmm6, xmm7 movdqa xmm7, xmm6 @@ -200,7 +204,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, lea edx, [edx + 2 * esi] movhpd qword ptr [ebx + ebp], xmm4 lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm2 // use xmm0 as the temp register. + movdqa xmm0, xmm2 // use xmm0 as the temp register. punpckldq xmm2, xmm6 movlpd qword ptr [edx], xmm2 movhpd qword ptr [ebx], xmm2 @@ -209,7 +213,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, lea edx, [edx + 2 * esi] movhpd qword ptr [ebx + ebp], xmm0 lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm1 // use xmm0 as the temp register. + movdqa xmm0, xmm1 // use xmm0 as the temp register. punpckldq xmm1, xmm5 movlpd qword ptr [edx], xmm1 movhpd qword ptr [ebx], xmm1 @@ -218,7 +222,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, lea edx, [edx + 2 * esi] movhpd qword ptr [ebx + ebp], xmm0 lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm3 // use xmm0 as the temp register. + movdqa xmm0, xmm3 // use xmm0 as the temp register. punpckldq xmm3, xmm7 movlpd qword ptr [edx], xmm3 movhpd qword ptr [ebx], xmm3 diff --git a/files/source/row_any.cc b/files/source/row_any.cc index 494164fd..74a6621f 100644 --- a/files/source/row_any.cc +++ b/files/source/row_any.cc @@ -23,26 +23,26 @@ extern "C" { #define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift)) // Any 4 planes to 1 with yuvconstants -#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ - const uint8* a_buf, uint8* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 5]); \ - memset(temp, 0, 64 * 4); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 192, a_buf + n, r); \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ - yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ - SS(r, DUVSHIFT) * BPP); \ - } +#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ + const uint8* a_buf, uint8* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8 temp[64 * 5]); \ + memset(temp, 0, 64 * 4); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 192, a_buf + n, r); \ + ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ + yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ + SS(r, DUVSHIFT) * BPP); \ + } #ifdef HAS_I422ALPHATOARGBROW_SSSE3 ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7) @@ -53,26 +53,29 @@ ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15) #ifdef HAS_I422ALPHATOARGBROW_NEON ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7) #endif +#ifdef HAS_I422ALPHATOARGBROW_MSA +ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7) +#endif #undef ANY41C 
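Every ANY* macro in this file wraps a fixed-width SIMD kernel so it can serve any row width: the kernel runs at full speed over the largest multiple-of-(MASK + 1) prefix, and the ragged tail is staged through a zeroed, SIMD-aligned temp buffer for one more padded kernel pass. The shape is easier to see outside the macro machinery. Below is a rough standalone sketch of the same pattern, assuming a hypothetical one-byte-per-pixel op; InvertRow_SIMD, its 16-pixel granularity, and all names are illustrative stand-ins, not libyuv API.

/* Standalone sketch of the any-width wrapper pattern used by the ANY*
 * macros. InvertRow_SIMD is a hypothetical stand-in for a kernel that
 * requires width % 16 == 0. */
#include <string.h>

typedef unsigned char uint8;

static void InvertRow_SIMD(const uint8* src, uint8* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = 255 - src[x];  /* pretend this is a 16-pixel SIMD body */
  }
}

void InvertRow_Any(const uint8* src, uint8* dst, int width) {
  uint8 temp[128 * 2];      /* SIMD_ALIGNED in the real macros */
  memset(temp, 0, 128);     /* zero the staging area for msan */
  int r = width & 15;       /* leftover pixels: width & MASK */
  int n = width & ~15;      /* multiple-of-16 prefix: width & ~MASK */
  if (n > 0) {
    InvertRow_SIMD(src, dst, n);          /* bulk of the row, full speed */
  }
  memcpy(temp, src + n, r);               /* stage the tail, zero padded */
  InvertRow_SIMD(temp, temp + 128, 16);   /* one full-width pass over it */
  memcpy(dst + n, temp + 128, r);         /* copy back only r real pixels */
}

The multi-plane variants (ANY41C, ANY31, ANY21) stage each input plane at a fixed 64-byte offset inside temp, which is why the staging buffer is carved into 64- or 128-byte lanes rather than sized per call.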
// Any 3 planes to 1. -#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ - uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 4]); \ - memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ - SS(r, DUVSHIFT) * BPP); \ - } +#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ + uint8* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8 temp[64 * 4]); \ + memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ + SS(r, DUVSHIFT) * BPP); \ + } #ifdef HAS_I422TOYUY2ROW_SSE2 ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15) ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15) @@ -80,9 +83,15 @@ ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15) #ifdef HAS_I422TOYUY2ROW_NEON ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) #endif +#ifdef HAS_I422TOYUY2ROW_MSA +ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31) +#endif #ifdef HAS_I422TOUYVYROW_NEON ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) #endif +#ifdef HAS_I422TOUYVYROW_MSA +ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31) +#endif #ifdef HAS_BLENDPLANEROW_AVX2 ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31) #endif @@ -95,35 +104,31 @@ ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7) // on arm that subsamples 444 to 422 internally. 
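The odd-width replication note above becomes concrete arithmetic in the ANY31C form that follows: SS(r, UVSHIFT) rounds the leftover pixel count up to whole chroma subsamples, and for odd widths the last staged chroma byte is duplicated so a kernel consuming U/V in pairs never reads the zeroed pad. A worked example with small numbers (the width chosen here is illustrative):

/* ANY31C with UVSHIFT = 1 (4:2:2), DUVSHIFT = 0, BPP = 4, MASK = 7,
 * called with width = 13:
 *   SS(w, s) = (w + (1 << s) - 1) >> s        -- round up to subsamples
 *   n = 13 & ~7 = 8   pixels handled directly by the SIMD kernel
 *   r = 13 & 7  = 5   luma bytes staged at temp
 *   SS(5, 1)    = 3   chroma bytes staged at temp + 64 and temp + 128
 * width is odd, so the replication step repeats the last chroma sample:
 *   temp[64 + 3]  = temp[64 + 2];   -- last U repeated
 *   temp[128 + 3] = temp[128 + 2];  -- last V repeated
 * The final memcpy writes SS(5, 0) * 4 = 20 ARGB bytes back out. */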
// Any 3 planes to 1 with yuvconstants #define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ - uint8* dst_ptr, const struct YuvConstants* yuvconstants, \ - int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 4]); \ - memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - if (width & 1) { \ - temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \ - temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ - } \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, \ - yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ - SS(r, DUVSHIFT) * BPP); \ - } + void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ + uint8* dst_ptr, const struct YuvConstants* yuvconstants, \ + int width) { \ + SIMD_ALIGNED(uint8 temp[64 * 4]); \ + memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + if (width & 1) { \ + temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \ + temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ + } \ + ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ + SS(r, DUVSHIFT) * BPP); \ + } #ifdef HAS_I422TOARGBROW_SSSE3 ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7) #endif -#ifdef HAS_I411TOARGBROW_SSSE3 -ANY31C(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7) -#endif #ifdef HAS_I444TOARGBROW_SSSE3 ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7) @@ -144,9 +149,6 @@ ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15) #ifdef HAS_I444TOARGBROW_AVX2 ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15) #endif -#ifdef HAS_I411TOARGBROW_AVX2 -ANY31C(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15) -#endif #ifdef HAS_I422TOARGB4444ROW_AVX2 ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7) #endif @@ -159,32 +161,46 @@ ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7) #ifdef HAS_I422TOARGBROW_NEON ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7) ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7) -ANY31C(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7) ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7) ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7) ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7) ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7) ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7) #endif +#ifdef HAS_I422TOARGBROW_DSPR2 +ANY31C(I444ToARGBRow_Any_DSPR2, I444ToARGBRow_DSPR2, 0, 0, 4, 7) +ANY31C(I422ToARGBRow_Any_DSPR2, I422ToARGBRow_DSPR2, 1, 0, 4, 7) +ANY31C(I422ToARGB4444Row_Any_DSPR2, I422ToARGB4444Row_DSPR2, 1, 0, 2, 7) +ANY31C(I422ToARGB1555Row_Any_DSPR2, 
I422ToARGB1555Row_DSPR2, 1, 0, 2, 7) +#endif +#ifdef HAS_I422TOARGBROW_MSA +ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7) +ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7) +ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7) +ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15) +ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7) +ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7) +ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) +#endif #undef ANY31C // Any 2 planes to 1. -#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \ - uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ - } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ - SS(r, UVSHIFT) * SBPP2); \ - ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ + void NAMEANY(const uint8* y_buf, const uint8* uv_buf, uint8* dst_ptr, \ + int width) { \ + SIMD_ALIGNED(uint8 temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ + } \ + memcpy(temp, y_buf + n * SBPP, r * SBPP); \ + memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } // Merge functions. #ifdef HAS_MERGEUVROW_SSE2 @@ -196,6 +212,9 @@ ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31) #ifdef HAS_MERGEUVROW_NEON ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15) #endif +#ifdef HAS_MERGEUVROW_MSA +ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15) +#endif // Math functions. 
#ifdef HAS_ARGBMULTIPLYROW_SSE2 @@ -225,44 +244,61 @@ ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7) #ifdef HAS_ARGBSUBTRACTROW_NEON ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7) #endif +#ifdef HAS_ARGBMULTIPLYROW_MSA +ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBADDROW_MSA +ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_MSA +ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7) +#endif #ifdef HAS_SOBELROW_SSE2 ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15) #endif #ifdef HAS_SOBELROW_NEON ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7) #endif +#ifdef HAS_SOBELROW_MSA +ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15) +#endif #ifdef HAS_SOBELTOPLANEROW_SSE2 ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15) #endif #ifdef HAS_SOBELTOPLANEROW_NEON ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15) #endif +#ifdef HAS_SOBELTOPLANEROW_MSA +ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31) +#endif #ifdef HAS_SOBELXYROW_SSE2 ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15) #endif #ifdef HAS_SOBELXYROW_NEON ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7) #endif +#ifdef HAS_SOBELXYROW_MSA +ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15) +#endif #undef ANY21 // Any 2 planes to 1 with yuvconstants -#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \ - uint8* dst_ptr, const struct YuvConstants* yuvconstants, \ - int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ - SS(r, UVSHIFT) * SBPP2); \ - ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ + void NAMEANY(const uint8* y_buf, const uint8* uv_buf, uint8* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8 temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n * SBPP, r * SBPP); \ + memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } // Biplanar to RGB. 
#ifdef HAS_NV12TOARGBROW_SSSE3 @@ -274,6 +310,12 @@ ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15) #ifdef HAS_NV12TOARGBROW_NEON ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7) #endif +#ifdef HAS_NV12TOARGBROW_DSPR2 +ANY21C(NV12ToARGBRow_Any_DSPR2, NV12ToARGBRow_DSPR2, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV12TOARGBROW_MSA +ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7) +#endif #ifdef HAS_NV21TOARGBROW_SSSE3 ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7) #endif @@ -283,6 +325,9 @@ ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15) #ifdef HAS_NV21TOARGBROW_NEON ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7) #endif +#ifdef HAS_NV21TOARGBROW_MSA +ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7) +#endif #ifdef HAS_NV12TORGB565ROW_SSSE3 ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7) #endif @@ -292,22 +337,25 @@ ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15) #ifdef HAS_NV12TORGB565ROW_NEON ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7) #endif +#ifdef HAS_NV12TORGB565ROW_MSA +ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7) +#endif #undef ANY21C // Any 1 to 1. -#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[128 * 2]); \ - memset(temp, 0, 128); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8 temp[128 * 2]); \ + memset(temp, 0, 128); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } #ifdef HAS_COPYROW_AVX ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63) @@ -372,9 +420,21 @@ ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7) ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7) ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7) #endif +#if defined(HAS_ARGBTORGB24ROW_MSA) +ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15) +ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15) +ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7) +ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7) +ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15) +ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15) +#endif #if defined(HAS_RAWTORGB24ROW_NEON) ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7) #endif +#if defined(HAS_RAWTORGB24ROW_MSA) +ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15) +#endif #ifdef HAS_ARGBTOYROW_AVX2 ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) #endif @@ -403,30 +463,57 @@ ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15) #ifdef HAS_ARGBTOYROW_NEON ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7) #endif 
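For a concrete instance of the ANY11 table parameters, the ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7) line above hand-expands to the following (spelled out here only for readability; the real definition stays in the macro):

void ARGBToYRow_Any_NEON(const uint8* src_ptr, uint8* dst_ptr, int width) {
  SIMD_ALIGNED(uint8 temp[128 * 2]);
  memset(temp, 0, 128);                  /* for YUY2 and msan */
  int r = width & 7;                     /* MASK = 7: NEON does 8 pixels */
  int n = width & ~7;
  if (n > 0) {
    ARGBToYRow_NEON(src_ptr, dst_ptr, n);
  }
  memcpy(temp, src_ptr + n * 4, r * 4);  /* UVSHIFT = 0, SBPP = 4: ARGB in */
  ARGBToYRow_NEON(temp, temp + 128, 8);  /* MASK + 1 */
  memcpy(dst_ptr + n, temp + 128, r);    /* BPP = 1: one Y byte per pixel */
}

The MSA instantiations added in this CL follow the same table, only with larger masks (15 or 31) because those kernels consume more pixels per iteration.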
+#ifdef HAS_ARGBTOYROW_MSA +ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15) +#endif #ifdef HAS_ARGBTOYJROW_NEON ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7) #endif +#ifdef HAS_ARGBTOYJROW_MSA +ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15) +#endif #ifdef HAS_BGRATOYROW_NEON ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7) #endif +#ifdef HAS_BGRATOYROW_MSA +ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15) +#endif #ifdef HAS_ABGRTOYROW_NEON ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7) #endif +#ifdef HAS_ABGRTOYROW_MSA +ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7) +#endif #ifdef HAS_RGBATOYROW_NEON ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7) #endif +#ifdef HAS_RGBATOYROW_MSA +ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15) +#endif #ifdef HAS_RGB24TOYROW_NEON ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7) #endif +#ifdef HAS_RGB24TOYROW_MSA +ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15) +#endif #ifdef HAS_RAWTOYROW_NEON ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7) #endif +#ifdef HAS_RAWTOYROW_MSA +ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15) +#endif #ifdef HAS_RGB565TOYROW_NEON ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7) #endif +#ifdef HAS_RGB565TOYROW_MSA +ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15) +#endif #ifdef HAS_ARGB1555TOYROW_NEON ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7) #endif +#ifdef HAS_ARGB1555TOYROW_MSA +ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15) +#endif #ifdef HAS_ARGB4444TOYROW_NEON ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7) #endif @@ -434,23 +521,71 @@ ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7) ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15) #endif #ifdef HAS_UYVYTOYROW_NEON -ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15) +ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15) +#endif +#ifdef HAS_YUY2TOYROW_MSA +ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31) +#endif +#ifdef HAS_UYVYTOYROW_MSA +ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31) #endif #ifdef HAS_RGB24TOARGBROW_NEON ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7) #endif +#ifdef HAS_RGB24TOARGBROW_MSA +ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15) +#endif #ifdef HAS_RAWTOARGBROW_NEON ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7) #endif +#ifdef HAS_RAWTOARGBROW_MSA +ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15) +#endif #ifdef HAS_RGB565TOARGBROW_NEON ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7) #endif +#ifdef HAS_RGB565TOARGBROW_MSA +ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15) +#endif #ifdef HAS_ARGB1555TOARGBROW_NEON ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7) #endif +#ifdef HAS_ARGB1555TOARGBROW_MSA +ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15) +#endif #ifdef HAS_ARGB4444TOARGBROW_NEON ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7) #endif +#ifdef HAS_RGB24TOARGBROW_DSPR2 +ANY11(RGB24ToARGBRow_Any_DSPR2, RGB24ToARGBRow_DSPR2, 0, 3, 4, 7) +#endif +#ifdef HAS_RAWTOARGBROW_DSPR2 +ANY11(RAWToARGBRow_Any_DSPR2, RAWToARGBRow_DSPR2, 0, 3, 4, 7) +#endif +#ifdef HAS_RGB565TOARGBROW_DSPR2 +ANY11(RGB565ToARGBRow_Any_DSPR2, RGB565ToARGBRow_DSPR2, 0, 2, 4, 7) +#endif +#ifdef HAS_ARGB1555TOARGBROW_DSPR2 +ANY11(ARGB1555ToARGBRow_Any_DSPR2, ARGB1555ToARGBRow_DSPR2, 0, 2, 4, 7) +#endif +#ifdef 
HAS_ARGB4444TOARGBROW_DSPR2 +ANY11(ARGB4444ToARGBRow_Any_DSPR2, ARGB4444ToARGBRow_DSPR2, 0, 2, 4, 7) +#endif +#ifdef HAS_BGRATOYROW_DSPR2 +ANY11(BGRAToYRow_Any_DSPR2, BGRAToYRow_DSPR2, 0, 4, 1, 7) +#endif +#ifdef HAS_ARGBTOYROW_DSPR2 +ANY11(ARGBToYRow_Any_DSPR2, ARGBToYRow_DSPR2, 0, 4, 1, 7) +#endif +#ifdef HAS_ABGRTOYROW_DSPR2 +ANY11(ABGRToYRow_Any_DSPR2, ABGRToYRow_DSPR2, 0, 4, 1, 7) +#endif +#ifdef HAS_RGBATOYROW_DSPR2 +ANY11(RGBAToYRow_Any_DSPR2, RGBAToYRow_DSPR2, 0, 4, 1, 7) +#endif +#ifdef HAS_ARGB4444TOARGBROW_MSA +ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15) +#endif #ifdef HAS_ARGBATTENUATEROW_SSSE3 ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3) #endif @@ -466,29 +601,35 @@ ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7) #ifdef HAS_ARGBATTENUATEROW_NEON ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7) #endif +#ifdef HAS_ARGBATTENUATEROW_MSA +ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7) +#endif #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7) #endif +#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 +ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 32) +#endif #ifdef HAS_ARGBEXTRACTALPHAROW_NEON ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15) #endif #undef ANY11 // Any 1 to 1 blended. Destination is read, modify, write. -#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[128 * 2]); \ - memset(temp, 0, 128 * 2); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 128, dst_ptr + n * BPP, r * BPP); \ - ANY_SIMD(temp, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8 temp[128 * 2]); \ + memset(temp, 0, 128 * 2); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 128, dst_ptr + n * BPP, r * BPP); \ + ANY_SIMD(temp, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } #ifdef HAS_ARGBCOPYALPHAROW_AVX2 ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15) @@ -505,32 +646,51 @@ ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) #undef ANY11B // Any 1 to 1 with parameter. 
-#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \ - T shuffler, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 2]); \ - memset(temp, 0, 64); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, shuffler, n); \ - } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - ANY_SIMD(temp, temp + 64, shuffler, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ - } +#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, T shuffler, int width) { \ + SIMD_ALIGNED(uint8 temp[64 * 2]); \ + memset(temp, 0, 64); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, shuffler, n); \ + } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp, temp + 64, shuffler, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ + } #if defined(HAS_ARGBTORGB565DITHERROW_SSE2) -ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2, - const uint32, 4, 2, 3) +ANY11P(ARGBToRGB565DitherRow_Any_SSE2, + ARGBToRGB565DitherRow_SSE2, + const uint32, + 4, + 2, + 3) #endif #if defined(HAS_ARGBTORGB565DITHERROW_AVX2) -ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2, - const uint32, 4, 2, 7) +ANY11P(ARGBToRGB565DitherRow_Any_AVX2, + ARGBToRGB565DitherRow_AVX2, + const uint32, + 4, + 2, + 7) #endif #if defined(HAS_ARGBTORGB565DITHERROW_NEON) -ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON, - const uint32, 4, 2, 7) +ANY11P(ARGBToRGB565DitherRow_Any_NEON, + ARGBToRGB565DitherRow_NEON, + const uint32, + 4, + 2, + 7) +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) +ANY11P(ARGBToRGB565DitherRow_Any_MSA, + ARGBToRGB565DitherRow_MSA, + const uint32, + 4, + 2, + 7) #endif #ifdef HAS_ARGBSHUFFLEROW_SSE2 ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3) @@ -544,23 +704,58 @@ ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15) #ifdef HAS_ARGBSHUFFLEROW_NEON ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3) #endif +#ifdef HAS_ARGBSHUFFLEROW_MSA +ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8*, 4, 4, 7) +#endif #undef ANY11P +// Any 1 to 1 with parameter and shorts. BPP measures in shorts. 
+#define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ + void NAMEANY(const uint16* src_ptr, uint16* dst_ptr, T shuffler, \ + int width) { \ + SIMD_ALIGNED(uint16 temp[32 * 2]); \ + memset(temp, 0, 64); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, shuffler, n); \ + } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp, temp + 64, shuffler, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ + } + +#ifdef HAS_HALFFLOATROW_SSE2 +ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 1, 1, 7) +#endif +#ifdef HAS_HALFFLOATROW_AVX2 +ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 1, 1, 15) +#endif +#ifdef HAS_HALFFLOATROW_F16C +ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 1, 1, 15) +ANY11P16(HalfFloat1Row_Any_F16C, HalfFloat1Row_F16C, float, 1, 1, 15) +#endif +#ifdef HAS_HALFFLOATROW_NEON +ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 1, 1, 7) +ANY11P16(HalfFloat1Row_Any_NEON, HalfFloat1Row_NEON, float, 1, 1, 7) +#endif +#undef ANY11P16 + // Any 1 to 1 with yuvconstants -#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8 temp[128 * 2]); \ - memset(temp, 0, 128); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8 temp[128 * 2]); \ + memset(temp, 0, 128); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } #if defined(HAS_YUY2TOARGBROW_SSSE3) ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15) ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15) @@ -573,25 +768,28 @@ ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31) ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7) ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7) #endif +#if defined(HAS_YUY2TOARGBROW_MSA) +ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7) +ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7) +#endif #undef ANY11C // Any 1 to 1 interpolate. Takes 2 rows of source via stride. 
#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ - void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \ - ptrdiff_t src_stride_ptr, int width, \ - int source_y_fraction) { \ - SIMD_ALIGNED(uint8 temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ - } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \ - ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } + void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, \ + int width, int source_y_fraction) { \ + SIMD_ALIGNED(uint8 temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ + } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } #ifdef HAS_INTERPOLATEROW_AVX2 ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31) @@ -605,22 +803,25 @@ ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) #ifdef HAS_INTERPOLATEROW_DSPR2 ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3) #endif +#ifdef HAS_INTERPOLATEROW_MSA +ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31) +#endif #undef ANY11T // Any 1 to 1 mirror. -#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 2]); \ - memset(temp, 0, 64); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr, r * BPP); \ - ANY_SIMD(temp, temp + 64, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \ - } +#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8 temp[64 * 2]); \ + memset(temp, 0, 64); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr, r* BPP); \ + ANY_SIMD(temp, temp + 64, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \ + } #ifdef HAS_MIRRORROW_AVX2 ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) @@ -631,6 +832,9 @@ ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) #ifdef HAS_MIRRORROW_NEON ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15) #endif +#ifdef HAS_MIRRORROW_MSA +ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63) +#endif #ifdef HAS_ARGBMIRRORROW_AVX2 ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) #endif @@ -640,20 +844,23 @@ ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3) #ifdef HAS_ARGBMIRRORROW_NEON ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3) #endif +#ifdef HAS_ARGBMIRRORROW_MSA +ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) +#endif #undef ANY11M // Any 1 plane. 
(memset) -#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ - void NAMEANY(uint8* dst_ptr, T v32, int width) { \ - SIMD_ALIGNED(uint8 temp[64]); \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(dst_ptr, v32, n); \ - } \ - ANY_SIMD(temp, v32, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp, r * BPP); \ - } +#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ + void NAMEANY(uint8* dst_ptr, T v32, int width) { \ + SIMD_ALIGNED(uint8 temp[64]); \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, v32, n); \ + } \ + ANY_SIMD(temp, v32, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp, r * BPP); \ + } #ifdef HAS_SETROW_X86 ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3) @@ -664,43 +871,26 @@ ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15) #ifdef HAS_ARGBSETROW_NEON ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3) #endif +#ifdef HAS_ARGBSETROW_MSA +ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32, 4, 3) +#endif #undef ANY1 // Any 1 to 2. Outputs UV planes. -#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\ - SIMD_ALIGNED(uint8 temp[128 * 3]); \ - memset(temp, 0, 128); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_u, dst_v, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - /* repeat last 4 bytes for 422 subsampler */ \ - if ((width & 1) && BPP == 4 && DUVSHIFT == 1) { \ - memcpy(temp + SS(r, UVSHIFT) * BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - } \ - /* repeat last 4 - 12 bytes for 411 subsampler */ \ - if (((width & 3) == 1) && BPP == 4 && DUVSHIFT == 2) { \ - memcpy(temp + SS(r, UVSHIFT) * BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - memcpy(temp + SS(r, UVSHIFT) * BPP + BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP, BPP * 2); \ - } \ - if (((width & 3) == 2) && BPP == 4 && DUVSHIFT == 2) { \ - memcpy(temp + SS(r, UVSHIFT) * BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP * 2, BPP * 2); \ - } \ - if (((width & 3) == 3) && BPP == 4 && DUVSHIFT == 2) { \ - memcpy(temp + SS(r, UVSHIFT) * BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - } \ - ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ - memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ - memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \ - } +#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ + void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) { \ + SIMD_ALIGNED(uint8 temp[128 * 3]); \ + memset(temp, 0, 128); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_u, dst_v, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ + memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ + memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \ + } #ifdef HAS_SPLITUVROW_SSE2 ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15) @@ -727,37 +917,41 @@ ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15) #endif #ifdef HAS_YUY2TOUV422ROW_NEON ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7) -ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31) ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15) ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15) #endif +#ifdef HAS_YUY2TOUV422ROW_MSA 
+ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15) +ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31) +ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31) +#endif #undef ANY12 // Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. // 128 byte row allows for 32 avx ARGB pixels. -#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, int src_stride_ptr, \ - uint8* dst_u, uint8* dst_v, int width) { \ - SIMD_ALIGNED(uint8 temp[128 * 4]); \ - memset(temp, 0, 128 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ - SS(r, UVSHIFT) * BPP); \ - if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */\ - memcpy(temp + SS(r, UVSHIFT) * BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ - temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - } \ - ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \ - memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \ - memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \ - } +#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, int src_stride_ptr, uint8* dst_u, \ + uint8* dst_v, int width) { \ + SIMD_ALIGNED(uint8 temp[128 * 4]); \ + memset(temp, 0, 128 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ + SS(r, UVSHIFT) * BPP); \ + if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ + memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ + BPP); \ + memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ + temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ + } \ + ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \ + memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \ + memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \ + } #ifdef HAS_ARGBTOUVROW_AVX2 ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) @@ -783,30 +977,57 @@ ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15) #ifdef HAS_ARGBTOUVROW_NEON ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) #endif +#ifdef HAS_ARGBTOUVROW_MSA +ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31) +#endif #ifdef HAS_ARGBTOUVJROW_NEON ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15) #endif +#ifdef HAS_ARGBTOUVJROW_MSA +ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31) +#endif #ifdef HAS_BGRATOUVROW_NEON ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15) #endif +#ifdef HAS_BGRATOUVROW_MSA +ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31) +#endif #ifdef HAS_ABGRTOUVROW_NEON ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15) #endif +#ifdef HAS_ABGRTOUVROW_MSA +ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31) +#endif #ifdef HAS_RGBATOUVROW_NEON ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15) #endif +#ifdef HAS_RGBATOUVROW_MSA +ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31) +#endif #ifdef HAS_RGB24TOUVROW_NEON ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15) #endif +#ifdef HAS_RGB24TOUVROW_MSA +ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 
0, 3, 15) +#endif #ifdef HAS_RAWTOUVROW_NEON ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15) #endif +#ifdef HAS_RAWTOUVROW_MSA +ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15) +#endif #ifdef HAS_RGB565TOUVROW_NEON ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15) #endif +#ifdef HAS_RGB565TOUVROW_MSA +ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15) +#endif #ifdef HAS_ARGB1555TOUVROW_NEON ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15) #endif +#ifdef HAS_ARGB1555TOUVROW_MSA +ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15) +#endif #ifdef HAS_ARGB4444TOUVROW_NEON ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15) #endif @@ -816,6 +1037,24 @@ ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15) #ifdef HAS_UYVYTOUVROW_NEON ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15) #endif +#ifdef HAS_BGRATOUVROW_DSPR2 +ANY12S(BGRAToUVRow_Any_DSPR2, BGRAToUVRow_DSPR2, 0, 4, 15) +#endif +#ifdef HAS_ABGRTOUVROW_DSPR2 +ANY12S(ABGRToUVRow_Any_DSPR2, ABGRToUVRow_DSPR2, 0, 4, 15) +#endif +#ifdef HAS_RGBATOUVROW_DSPR2 +ANY12S(RGBAToUVRow_Any_DSPR2, RGBAToUVRow_DSPR2, 0, 4, 15) +#endif +#ifdef HAS_ARGBTOUVROW_DSPR2 +ANY12S(ARGBToUVRow_Any_DSPR2, ARGBToUVRow_DSPR2, 0, 4, 15) +#endif +#ifdef HAS_YUY2TOUVROW_MSA +ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31) +#endif +#ifdef HAS_UYVYTOUVROW_MSA +ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31) +#endif #undef ANY12S #ifdef __cplusplus diff --git a/files/source/row_common.cc b/files/source/row_common.cc index 32d2f686..bf953eef 100644 --- a/files/source/row_common.cc +++ b/files/source/row_common.cc @@ -40,7 +40,7 @@ static __inline uint32 Abs(int32 v) { int m = v >> 31; return (v + m) ^ m; } -#else // USE_BRANCHLESS +#else // USE_BRANCHLESS static __inline int32 clamp0(int32 v) { return (v < 0) ? 0 : v; } @@ -129,7 +129,8 @@ void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) { } } -void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb, +void ARGB1555ToARGBRow_C(const uint8* src_argb1555, + uint8* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { @@ -146,7 +147,8 @@ void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb, } } -void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb, +void ARGB4444ToARGBRow_C(const uint8* src_argb4444, + uint8* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { @@ -200,8 +202,8 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { uint8 b1 = src_argb[4] >> 3; uint8 g1 = src_argb[5] >> 2; uint8 r1 = src_argb[6] >> 3; - WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | - (b1 << 16) | (g1 << 21) | (r1 << 27)); + WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | + (r1 << 27)); dst_rgb += 4; src_argb += 8; } @@ -221,8 +223,10 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { // endian will not affect order of the original matrix. But the dither4 // will containing the first pixel in the lower byte for little endian // or the upper byte for big endian. 
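In other words, dither4 carries one row of a 4x4 ordered-dither matrix, one byte per pixel, and each output pixel adds the byte selected by x & 3 before the 5/6/5 truncation. A scalar sketch of that per-pixel step, using the same clamp255 helper this file relies on (an illustration of the math, not libyuv's API):

  #include <stdint.h>

  static int clamp255(int v) { return (v > 255) ? 255 : ((v < 0) ? 0 : v); }

  /* One ARGB pixel -> RGB565 with ordered dither; x is the pixel column. */
  static uint16_t DitherPixel565(const uint8_t* argb, uint32_t dither4, int x) {
    int d = ((const uint8_t*)&dither4)[x & 3]; /* byte select, as in the C row */
    uint16_t b = clamp255(argb[0] + d) >> 3;
    uint16_t g = clamp255(argb[1] + d) >> 2;
    uint16_t r = clamp255(argb[2] + d) >> 3;
    return (uint16_t)(b | (g << 5) | (r << 11));
  }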
-void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) { +void ARGBToRGB565DitherRow_C(const uint8* src_argb, + uint8* dst_rgb, + const uint32 dither4, + int width) { int x; for (x = 0; x < width - 1; x += 2) { int dither0 = ((const unsigned char*)(&dither4))[x & 3]; @@ -233,8 +237,8 @@ void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, uint8 b1 = clamp255(src_argb[4] + dither1) >> 3; uint8 g1 = clamp255(src_argb[5] + dither1) >> 2; uint8 r1 = clamp255(src_argb[6] + dither1) >> 3; - WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | - (b1 << 16) | (g1 << 21) | (r1 << 27)); + WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | + (r1 << 27)); dst_rgb += 4; src_argb += 8; } @@ -258,9 +262,8 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { uint8 g1 = src_argb[5] >> 3; uint8 r1 = src_argb[6] >> 3; uint8 a1 = src_argb[7] >> 7; - *(uint32*)(dst_rgb) = - b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) | - (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31); + *(uint32*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) | + (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31); dst_rgb += 4; src_argb += 8; } @@ -269,8 +272,7 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { uint8 g0 = src_argb[1] >> 3; uint8 r0 = src_argb[2] >> 3; uint8 a0 = src_argb[3] >> 7; - *(uint16*)(dst_rgb) = - b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); + *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); } } @@ -285,9 +287,8 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { uint8 g1 = src_argb[5] >> 4; uint8 r1 = src_argb[6] >> 4; uint8 a1 = src_argb[7] >> 4; - *(uint32*)(dst_rgb) = - b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | - (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28); + *(uint32*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | (b1 << 16) | + (g1 << 20) | (r1 << 24) | (a1 << 28); dst_rgb += 4; src_argb += 8; } @@ -296,13 +297,12 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { uint8 g0 = src_argb[1] >> 4; uint8 r0 = src_argb[2] >> 4; uint8 a0 = src_argb[3] >> 4; - *(uint16*)(dst_rgb) = - b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); + *(uint16*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); } } static __inline int RGBToY(uint8 r, uint8 g, uint8 b) { - return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; + return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; } static __inline int RGBToU(uint8 r, uint8 g, uint8 b) { @@ -312,41 +312,45 @@ static __inline int RGBToV(uint8 r, uint8 g, uint8 b) { return (112 * r - 94 * g - 18 * b + 0x8080) >> 8; } -#define MAKEROWY(NAME, R, G, B, BPP) \ -void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ -} \ -void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \ - uint8* dst_u, uint8* dst_v, int width) { \ - const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] + \ - src_rgb1[B] + src_rgb1[B + BPP]) >> 2; \ - uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] + \ - src_rgb1[G] + src_rgb1[G + BPP]) >> 2; \ - uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] + \ - src_rgb1[R] + src_rgb1[R + BPP]) >> 2; \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += 
BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ - uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ - uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - } \ -} +// ARGBToY_C and ARGBToUV_C +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \ + uint8* dst_u, uint8* dst_v, int width) { \ + const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP]) >> \ + 2; \ + uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP]) >> \ + 2; \ + uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP]) >> \ + 2; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ + uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ + uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + } \ + } MAKEROWY(ARGB, 2, 1, 0, 4) MAKEROWY(BGRA, 1, 2, 3, 4) @@ -382,7 +386,7 @@ MAKEROWY(RAW, 0, 1, 2, 3) // r 0.50000 * 255 = 127.5 = 127 static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) { - return (38 * r + 75 * g + 15 * b + 64) >> 7; + return (38 * r + 75 * g + 15 * b + 64) >> 7; } static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) { @@ -394,41 +398,42 @@ static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) { #define AVGB(a, b) (((a) + (b) + 1) >> 1) -#define MAKEROWYJ(NAME, R, G, B, BPP) \ -void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ -} \ -void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \ - uint8* dst_u, uint8* dst_v, int width) { \ - const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ - AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ - uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ - AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ - uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ - AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]); \ - uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]); \ - uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - } \ -} +// ARGBToYJ_C and ARGBToUVJ_C +#define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8* src_rgb0, int 
src_stride_rgb, \ + uint8* dst_u, uint8* dst_v, int width) { \ + const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ + AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ + uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ + AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ + uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ + AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]); \ + uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]); \ + uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + } \ + } MAKEROWYJ(ARGB, 2, 1, 0, 4) #undef MAKEROWYJ @@ -478,8 +483,11 @@ void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) { } } -void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width) { +void RGB565ToUVRow_C(const uint8* src_rgb565, + int src_stride_rgb565, + uint8* dst_u, + uint8* dst_v, + int width) { const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565; int x; for (x = 0; x < width - 1; x += 2) { @@ -525,8 +533,11 @@ void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, } } -void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width) { +void ARGB1555ToUVRow_C(const uint8* src_argb1555, + int src_stride_argb1555, + uint8* dst_u, + uint8* dst_v, + int width) { const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555; int x; for (x = 0; x < width - 1; x += 2) { @@ -573,8 +584,11 @@ void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, } } -void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width) { +void ARGB4444ToUVRow_C(const uint8* src_argb4444, + int src_stride_argb4444, + uint8* dst_u, + uint8* dst_v, + int width) { const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444; int x; for (x = 0; x < width - 1; x += 2) { @@ -622,7 +636,9 @@ void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, } void ARGBToUV444Row_C(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width) { + uint8* dst_u, + uint8* dst_v, + int width) { int x; for (x = 0; x < width; ++x) { uint8 ab = src_argb[0]; @@ -636,41 +652,6 @@ void ARGBToUV444Row_C(const uint8* src_argb, } } -void ARGBToUV411Row_C(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width) { - int x; - for (x = 0; x < width - 3; x += 4) { - uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2; - uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2; - uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - src_argb += 16; - dst_u += 1; - dst_v += 1; - } - // Odd width handling mimics 'any' function which replicates last pixel. 
- if ((width & 3) == 3) { - uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[8]) >> 2; - uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[9]) >> 2; - uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[10]) >> 2; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - } else if ((width & 3) == 2) { - uint8 ab = (src_argb[0] + src_argb[4]) >> 1; - uint8 ag = (src_argb[1] + src_argb[5]) >> 1; - uint8 ar = (src_argb[2] + src_argb[6]) >> 1; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - } else if ((width & 3) == 1) { - uint8 ab = src_argb[0]; - uint8 ag = src_argb[1]; - uint8 ar = src_argb[2]; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - } -} - void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { @@ -702,22 +683,28 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) { // Apply color matrix to a row of image. Matrix is signed. // TODO(fbarchard): Consider adding rounding (+32). -void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) { +void ARGBColorMatrixRow_C(const uint8* src_argb, + uint8* dst_argb, + const int8* matrix_argb, + int width) { int x; for (x = 0; x < width; ++x) { int b = src_argb[0]; int g = src_argb[1]; int r = src_argb[2]; int a = src_argb[3]; - int sb = (b * matrix_argb[0] + g * matrix_argb[1] + - r * matrix_argb[2] + a * matrix_argb[3]) >> 6; - int sg = (b * matrix_argb[4] + g * matrix_argb[5] + - r * matrix_argb[6] + a * matrix_argb[7]) >> 6; - int sr = (b * matrix_argb[8] + g * matrix_argb[9] + - r * matrix_argb[10] + a * matrix_argb[11]) >> 6; - int sa = (b * matrix_argb[12] + g * matrix_argb[13] + - r * matrix_argb[14] + a * matrix_argb[15]) >> 6; + int sb = (b * matrix_argb[0] + g * matrix_argb[1] + r * matrix_argb[2] + + a * matrix_argb[3]) >> + 6; + int sg = (b * matrix_argb[4] + g * matrix_argb[5] + r * matrix_argb[6] + + a * matrix_argb[7]) >> + 6; + int sr = (b * matrix_argb[8] + g * matrix_argb[9] + r * matrix_argb[10] + + a * matrix_argb[11]) >> + 6; + int sa = (b * matrix_argb[12] + g * matrix_argb[13] + r * matrix_argb[14] + + a * matrix_argb[15]) >> + 6; dst_argb[0] = Clamp(sb); dst_argb[1] = Clamp(sg); dst_argb[2] = Clamp(sr); @@ -757,8 +744,11 @@ void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { } } -void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width) { +void ARGBQuantizeRow_C(uint8* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { int x; for (x = 0; x < width; ++x) { int b = dst_argb[0]; @@ -772,9 +762,11 @@ void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, } #define REPEAT8(v) (v) | ((v) << 8) -#define SHADE(f, v) v * f >> 24 +#define SHADE(f, v) v* f >> 24 -void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, +void ARGBShadeRow_C(const uint8* src_argb, + uint8* dst_argb, + int width, uint32 value) { const uint32 b_scale = REPEAT8(value & 0xff); const uint32 g_scale = REPEAT8((value >> 8) & 0xff); @@ -799,10 +791,12 @@ void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, #undef SHADE #define REPEAT8(v) (v) | ((v) << 8) -#define SHADE(f, v) v * f >> 16 +#define SHADE(f, v) v* f >> 16 -void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBMultiplyRow_C(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { 
int i; for (i = 0; i < width; ++i) { const uint32 b = REPEAT8(src_argb0[0]); @@ -827,8 +821,10 @@ void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1, #define SHADE(f, v) clamp255(v + f) -void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBAddRow_C(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { const int b = src_argb0[0]; @@ -852,8 +848,10 @@ void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1, #define SHADE(f, v) clamp0(f - v) -void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBSubtractRow_C(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { const int b = src_argb0[0]; @@ -876,8 +874,11 @@ void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1, #undef SHADE // Sobel functions which mimics SSSE3. -void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, - uint8* dst_sobelx, int width) { +void SobelXRow_C(const uint8* src_y0, + const uint8* src_y1, + const uint8* src_y2, + uint8* dst_sobelx, + int width) { int i; for (i = 0; i < width; ++i) { int a = src_y0[i]; @@ -894,8 +895,10 @@ void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, } } -void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { +void SobelYRow_C(const uint8* src_y0, + const uint8* src_y1, + uint8* dst_sobely, + int width) { int i; for (i = 0; i < width; ++i) { int a = src_y0[i + 0]; @@ -912,8 +915,10 @@ void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, } } -void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { +void SobelRow_C(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { int r = src_sobelx[i]; @@ -927,8 +932,10 @@ void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely, } } -void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width) { +void SobelToPlaneRow_C(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_y, + int width) { int i; for (i = 0; i < width; ++i) { int r = src_sobelx[i]; @@ -938,8 +945,10 @@ void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely, } } -void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { +void SobelXYRow_C(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { int r = src_sobelx[i]; @@ -974,75 +983,69 @@ void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { // B = (Y - 16) * 1.164 - U * -2.018 // Y contribution to R,G,B. Scale and bias. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ // U and V contributions to R,G,B. #define UB -128 /* max(-128, round(-2.018 * 64)) */ -#define UG 25 /* round(0.391 * 64) */ -#define VG 52 /* round(0.813 * 64) */ +#define UG 25 /* round(0.391 * 64) */ +#define VG 52 /* round(0.813 * 64) */ #define VR -102 /* round(-1.596 * 64) */ // Bias values to subtract 16 from Y and 128 from U and V. 
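These defines are the 6-bit fixed-point form of the matrix above: each coefficient is scaled by 64, and Y additionally by 256 * 256 / 257 so that multiplying by y * 0x0101 (which expands 8-bit Y to 16 bits) lands in the right scale. A small self-checking sketch of the B channel as the C reference YuvPixel further down computes it, with the constants inlined, so this illustrates the arithmetic rather than libyuv's API:

  #include <assert.h>
  #include <stdint.h>

  enum { kYG = 18997, kYGB = -1160, kUB = -128, kBB = kUB * 128 + kYGB };

  static uint8_t Clamp(int32_t v) {
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }

  /* B = 1.164 * (Y - 16) + 2.018 * (U - 128), in fixed point. The -16 and
     rounding live in kYGB (folded into kBB); 2.018 is clamped to 2.0 by
     kUB = -128. */
  static uint8_t YuvToB(uint8_t y, uint8_t u) {
    uint32_t y1 = (uint32_t)(y * 0x0101 * kYG) >> 16; /* Y * 1.164 * 64 */
    return Clamp((-(u * kUB) + (int32_t)y1 + kBB) >> 6);
  }

  int main(void) {
    assert(YuvToB(235, 128) == 255); /* video white -> full blue */
    assert(YuvToB(16, 128) == 0);    /* video black -> zero */
    return 0;
  }

Worked through by hand for Y = 235, U = 128: y1 = (235 * 257 * 18997) >> 16 = 16346 + 17544 - 16384 ... i.e. B = (16384 + 17506 - 17544) >> 6 = 16346 >> 6 = 255, so limited-range white maps to full scale.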
-#define BB (UB * 128 + YGB) +#define BB (UB * 128 + YGB) #define BG (UG * 128 + VG * 128 + YGB) -#define BR (VR * 128 + YGB) - -#if defined(__aarch64__) -const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; -const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; -#elif defined(__arm__) -const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { - { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 }, - { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; -const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { - { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 }, - { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; +#define BR (VR * 128 + YGB) + +#if defined(__aarch64__) // 64 bit arm +const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +#elif defined(__arm__) // 32 bit arm +const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { + {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, + {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { + {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, + {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #else -const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { - { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, - UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, - { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, - { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, - 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; -const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { - { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, - VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, - { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, - { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, - 0, UB, 0, UB, 0, UB, 0, UB, 0, 
UB, 0, UB, 0, UB, 0, UB }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; +const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { + {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; +const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { + {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, + {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, + {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; #endif #undef BB @@ -1062,74 +1065,68 @@ const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { // Y contribution to R,G,B. Scale and bias. #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ -#define YGB 32 /* 64 / 2 */ +#define YGB 32 /* 64 / 2 */ // U and V contributions to R,G,B. #define UB -113 /* round(-1.77200 * 64) */ -#define UG 22 /* round(0.34414 * 64) */ -#define VG 46 /* round(0.71414 * 64) */ -#define VR -90 /* round(-1.40200 * 64) */ +#define UG 22 /* round(0.34414 * 64) */ +#define VG 46 /* round(0.71414 * 64) */ +#define VR -90 /* round(-1.40200 * 64) */ // Bias values to round, and subtract 128 from U and V. 
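The JPEG variant is the full-range version of the same math: YG = 16320 = round(1.000 * 64 * 256 * 256 / 257) makes the Y term an identity instead of the 1.164 studio-range expansion, and YGB = 32 is only the rounding half-unit. Plugging Y = 255, U = 128 into the same B-channel formula as the sketch above gives y1 = (255 * 257 * 16320) >> 16 = 16319 and BB = -113 * 128 + 32 = -14432, so B = (-(128 * -113) + 16319 - 14432) >> 6 = 16351 >> 6 = 255: full-range white survives with no 16..235 squeeze.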
-#define BB (UB * 128 + YGB) +#define BB (UB * 128 + YGB) #define BG (UG * 128 + VG * 128 + YGB) -#define BR (VR * 128 + YGB) +#define BR (VR * 128 + YGB) #if defined(__aarch64__) -const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; -const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; +const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #elif defined(__arm__) -const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { - { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 }, - { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; -const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { - { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 }, - { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; +const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { + {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, + {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { + {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, + {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #else -const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { - { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, - UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, - { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, - { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, - 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; -const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { - { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, - VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, - { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, - { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, - 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, 
BR, BR, BR }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; +const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { + {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; +const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { + {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, + {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, + {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; #endif #undef BB @@ -1143,81 +1140,76 @@ const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { #undef YG // BT.709 YUV to RGB reference -// * R = Y - V * -1.28033 -// * G = Y - U * 0.21482 - V * 0.38059 -// * B = Y - U * -2.12798 +// R = (Y - 16) * 1.164 - V * -1.793 +// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533 +// B = (Y - 16) * 1.164 - U * -2.112 +// See also http://www.equasys.de/colorconversion.html // Y contribution to R,G,B. Scale and bias. -#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ -#define YGB 32 /* 64 / 2 */ +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ -// TODO(fbarchard): Find way to express 2.12 instead of 2.0. +// TODO(fbarchard): Find way to express 2.112 instead of 2.0. // U and V contributions to R,G,B. -#define UB -128 /* max(-128, round(-2.12798 * 64)) */ -#define UG 14 /* round(0.21482 * 64) */ -#define VG 24 /* round(0.38059 * 64) */ -#define VR -82 /* round(-1.28033 * 64) */ +#define UB -128 /* max(-128, round(-2.112 * 64)) */ +#define UG 14 /* round(0.213 * 64) */ +#define VG 34 /* round(0.533 * 64) */ +#define VR -115 /* round(-1.793 * 64) */ // Bias values to round, and subtract 128 from U and V. 
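This hunk corrects the BT.709 constants, not just the formatting: the old comment quoted a differently normalized reference (with YG at the 1.000 scale), while the replacement derives everything from the studio-range matrix it now cites. The new values follow directly from it: 0.213 * 64 = 13.6 rounds to UG 14, 0.533 * 64 = 34.1 to VG 34, 1.793 * 64 = 114.8 to VR -115, and 2.112 * 64 = 135.2 clamps at UB -128 (hence the TODO about expressing 2.112), with YG/YGB switching to the same 1.164 scale-and-bias used by the BT.601 block.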
-#define BB (UB * 128 + YGB) +#define BB (UB * 128 + YGB) #define BG (UG * 128 + VG * 128 + YGB) -#define BR (VR * 128 + YGB) +#define BR (VR * 128 + YGB) #if defined(__aarch64__) -const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; -const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; +const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #elif defined(__arm__) -const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { - { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 }, - { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; -const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { - { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 }, - { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; +const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { + {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, + {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; +const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { + {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, + {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #else -const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { - { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, - UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, - { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, - { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, - 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; -const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { - { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, - VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, - { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, - { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, - 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, 
BR, BR, BR }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; +const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { + {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; +const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { + {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, + {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, + {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; #endif #undef BB @@ -1231,8 +1223,12 @@ const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { #undef YG // C reference code that mimics the YUV assembly. -static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, - uint8* b, uint8* g, uint8* r, +static __inline void YuvPixel(uint8 y, + uint8 u, + uint8 v, + uint8* b, + uint8* g, + uint8* r, const struct YuvConstants* yuvconstants) { #if defined(__aarch64__) int ub = -yuvconstants->kUVToRB[0]; @@ -1264,13 +1260,13 @@ static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, #endif uint32 y1 = (uint32)(y * 0x0101 * yg) >> 16; - *b = Clamp((int32)(-(u * ub ) + y1 + bb) >> 6); + *b = Clamp((int32)(-(u * ub) + y1 + bb) >> 6); *g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6); - *r = Clamp((int32)(-( v * vr) + y1 + br) >> 6); + *r = Clamp((int32)(-(v * vr) + y1 + br) >> 6); } // Y contribution to R,G,B. Scale and bias. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ // C reference code that mimics the YUV assembly. @@ -1310,8 +1306,8 @@ void I444ToARGBRow_C(const uint8* src_y, rgb_buf += 8; // Advance 2 pixels. 
} if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -1324,8 +1320,8 @@ void I444ToARGBRow_C(const uint8* src_y, int width) { int x; for (x = 0; x < width; ++x) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; src_y += 1; src_u += 1; @@ -1344,11 +1340,11 @@ void I422ToARGBRow_C(const uint8* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_u += 1; @@ -1356,8 +1352,8 @@ void I422ToARGBRow_C(const uint8* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -1371,11 +1367,11 @@ void I422AlphaToARGBRow_C(const uint8* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = src_a[0]; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = src_a[1]; src_y += 2; src_u += 1; @@ -1384,8 +1380,8 @@ void I422AlphaToARGBRow_C(const uint8* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = src_a[0]; } } @@ -1398,18 +1394,18 @@ void I422ToRGB24Row_C(const uint8* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 3, rgb_buf + 4, rgb_buf + 5, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); src_y += 2; src_u += 1; src_v += 1; rgb_buf += 6; // Advance 2 pixels. 
} if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); } } @@ -1435,8 +1431,8 @@ void I422ToARGB4444Row_C(const uint8* src_y, b1 = b1 >> 4; g1 = g1 >> 4; r1 = r1 >> 4; - *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | - (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000; + *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) | + (g1 << 20) | (r1 << 24) | 0xf000f000; src_y += 2; src_u += 1; src_v += 1; @@ -1447,8 +1443,7 @@ void I422ToARGB4444Row_C(const uint8* src_y, b0 = b0 >> 4; g0 = g0 >> 4; r0 = r0 >> 4; - *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | - 0xf000; + *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000; } } @@ -1474,8 +1469,8 @@ void I422ToARGB1555Row_C(const uint8* src_y, b1 = b1 >> 3; g1 = g1 >> 3; r1 = r1 >> 3; - *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | - (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000; + *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) | + (g1 << 21) | (r1 << 26) | 0x80008000; src_y += 2; src_u += 1; src_v += 1; @@ -1486,8 +1481,7 @@ void I422ToARGB1555Row_C(const uint8* src_y, b0 = b0 >> 3; g0 = g0 >> 3; r0 = r0 >> 3; - *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | - 0x8000; + *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000; } } @@ -1513,8 +1507,8 @@ void I422ToRGB565Row_C(const uint8* src_y, b1 = b1 >> 3; g1 = g1 >> 2; r1 = r1 >> 3; - *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | - (b1 << 16) | (g1 << 21) | (r1 << 27); + *(uint32*)(dst_rgb565) = + b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27); src_y += 2; src_u += 1; src_v += 1; @@ -1529,48 +1523,6 @@ void I422ToRGB565Row_C(const uint8* src_y, } } -void I411ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 3; x += 4) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - YuvPixel(src_y[2], src_u[0], src_v[0], - rgb_buf + 8, rgb_buf + 9, rgb_buf + 10, yuvconstants); - rgb_buf[11] = 255; - YuvPixel(src_y[3], src_u[0], src_v[0], - rgb_buf + 12, rgb_buf + 13, rgb_buf + 14, yuvconstants); - rgb_buf[15] = 255; - src_y += 4; - src_u += 1; - src_v += 1; - rgb_buf += 16; // Advance 4 pixels. - } - if (width & 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - src_y += 2; - rgb_buf += 8; // Advance 2 pixels. 
- } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - } -} - void NV12ToARGBRow_C(const uint8* src_y, const uint8* src_uv, uint8* rgb_buf, @@ -1578,19 +1530,19 @@ void NV12ToARGBRow_C(const uint8* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_y[1], src_uv[0], src_uv[1], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_uv += 2; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -1602,19 +1554,19 @@ void NV21ToARGBRow_C(const uint8* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_vu[1], src_vu[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_y[1], src_vu[1], src_vu[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_vu += 2; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_vu[1], src_vu[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -1640,8 +1592,8 @@ void NV12ToRGB565Row_C(const uint8* src_y, b1 = b1 >> 3; g1 = g1 >> 2; r1 = r1 >> 3; - *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | - (b1 << 16) | (g1 << 21) | (r1 << 27); + *(uint32*)(dst_rgb565) = + b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27); src_y += 2; src_uv += 2; dst_rgb565 += 4; // Advance 2 pixels. @@ -1661,18 +1613,18 @@ void YUY2ToARGBRow_C(const uint8* src_yuy2, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_yuy2 += 4; rgb_buf += 8; // Advance 2 pixels. 
} if (width & 1) { - YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -1683,18 +1635,18 @@ void UYVYToARGBRow_C(const uint8* src_uyvy, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_uyvy += 4; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -1707,11 +1659,11 @@ void I422ToRGBARow_C(const uint8* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, + rgb_buf + 3, yuvconstants); rgb_buf[0] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 5, rgb_buf + 6, rgb_buf + 7, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6, + rgb_buf + 7, yuvconstants); rgb_buf[4] = 255; src_y += 2; src_u += 1; @@ -1719,8 +1671,8 @@ void I422ToRGBARow_C(const uint8* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, + rgb_buf + 3, yuvconstants); rgb_buf[0] = 255; } } @@ -1800,7 +1752,9 @@ void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { } } -void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_C(const uint8* src_u, + const uint8* src_v, + uint8* dst_uv, int width) { int x; for (x = 0; x < width - 1; x += 2) { @@ -1837,8 +1791,11 @@ void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) { } // Filter 2 rows of YUY2 UV's (422) into U and V (420). -void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +void YUY2ToUVRow_C(const uint8* src_yuy2, + int src_stride_yuy2, + uint8* dst_u, + uint8* dst_v, + int width) { // Output a row of UV values, filtering 2 rows of YUY2. int x; for (x = 0; x < width; x += 2) { @@ -1852,7 +1809,9 @@ void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2, // Copy row of YUY2 UV's (422) into U and V (422). void YUY2ToUV422Row_C(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) { + uint8* dst_u, + uint8* dst_v, + int width) { // Output a row of UV values. int x; for (x = 0; x < width; x += 2) { @@ -1879,8 +1838,11 @@ void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) { } // Filter 2 rows of UYVY UV's (422) into U and V (420). 
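The 420 chroma rows below do the vertical half of the 4:2:0 downsample: each output U and V is the rounded average of the co-sited bytes from the current row and the row one stride below. A sketch of the approach for UYVY, assuming the byte order the C reference uses (U0 Y0 V0 Y1 per 2 pixels):

  #include <stdint.h>

  /* 422 -> 420 chroma: average each U/V byte with the one a row below. */
  static void UyvyToUV420_Sketch(const uint8_t* uyvy, int stride,
                                 uint8_t* dst_u, uint8_t* dst_v, int width) {
    int x;
    for (x = 0; x < width; x += 2) { /* one U,V pair per 2 pixels */
      dst_u[0] = (uint8_t)((uyvy[0] + uyvy[stride + 0] + 1) >> 1);
      dst_v[0] = (uint8_t)((uyvy[2] + uyvy[stride + 2] + 1) >> 1);
      uyvy += 4;
      dst_u += 1;
      dst_v += 1;
    }
  }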
-void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +void UYVYToUVRow_C(const uint8* src_uyvy, + int src_stride_uyvy, + uint8* dst_u, + uint8* dst_v, + int width) { // Output a row of UV values. int x; for (x = 0; x < width; x += 2) { @@ -1894,7 +1856,9 @@ void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy, // Copy row of UYVY UV's (422) into U and V (422). void UYVYToUV422Row_C(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) { + uint8* dst_u, + uint8* dst_v, + int width) { // Output a row of UV values. int x; for (x = 0; x < width; x += 2) { @@ -1925,8 +1889,10 @@ void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) { // Blend src_argb0 over src_argb1 and store to dst_argb. // dst_argb may be src_argb0 or src_argb1. // This code mimics the SSSE3 version for better testability. -void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBBlendRow_C(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { int x; for (x = 0; x < width - 1; x += 2) { uint32 fb = src_argb0[0]; @@ -1973,9 +1939,12 @@ void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, } #undef BLEND -#define UBLEND(f, b, a) (((a) * f) + ((255 - a) * b) + 255) >> 8 -void BlendPlaneRow_C(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) { +#define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8 +void BlendPlaneRow_C(const uint8* src0, + const uint8* src1, + const uint8* alpha, + uint8* dst, + int width) { int x; for (x = 0; x < width - 1; x += 2) { dst[0] = UBLEND(src0[0], src1[0], alpha[0]); @@ -2039,38 +2008,43 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { // 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower. 
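Concretely, each entry of the table that follows packs two 8.8 fixed-point values into one uint32: the upper half is always 0x0100 (1.0) and the lower half approximates 1/a, with a = 0, 1, and 255 special-cased (lower half 0, saturated 0xffff, and exactly 0x0100 so that full alpha unattenuates to an identity). A hedged sketch of how such an entry is built and consumed, under my reading of ARGBUnattenuateRow_C:

#include <stdint.h>

/* T(a) for 2 <= a <= 254: 1.0 in the top 16 bits, 1/a in 8.8 below. */
static uint32_t InvEntry(uint32_t a) {
  return 0x01000000u + 0x10000u / a;
}

/* Undo premultiplied alpha on one channel: c / (a/255) ~= c * inv in 8.8. */
static uint8_t Unattenuate(uint8_t c, uint32_t entry) {
  uint32_t inv = entry & 0xffffu;       /* low 8.8 reciprocal */
  uint32_t v = ((uint32_t)c * inv) >> 8;
  return (uint8_t)(v > 255 ? 255 : v);  /* saturate; inv can exceed 1.0 */
}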
#define T(a) 0x01000000 + (0x10000 / a) const uint32 fixed_invtbl8[256] = { - 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07), - T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f), - T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17), - T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f), - T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), - T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), - T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), - T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f), - T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47), - T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f), - T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57), - T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), - T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), - T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), - T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77), - T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f), - T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87), - T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f), - T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), - T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), - T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), - T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf), - T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7), - T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf), - T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7), - T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), - T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), - T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), - T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7), - T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef), - T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7), - T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 }; + 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), + T(0x07), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), + T(0x0e), T(0x0f), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), + T(0x15), T(0x16), T(0x17), T(0x18), T(0x19), T(0x1a), T(0x1b), + T(0x1c), T(0x1d), T(0x1e), T(0x1f), T(0x20), T(0x21), T(0x22), + T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), T(0x28), T(0x29), + T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), T(0x30), + T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), + T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), + T(0x3f), T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), + T(0x46), T(0x47), T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), + T(0x4d), T(0x4e), T(0x4f), T(0x50), T(0x51), T(0x52), T(0x53), + T(0x54), T(0x55), T(0x56), T(0x57), T(0x58), T(0x59), T(0x5a), + T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), T(0x60), T(0x61), + T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), T(0x68), + T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), + T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), 
T(0x75), T(0x76), + T(0x77), T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), + T(0x7e), T(0x7f), T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), + T(0x85), T(0x86), T(0x87), T(0x88), T(0x89), T(0x8a), T(0x8b), + T(0x8c), T(0x8d), T(0x8e), T(0x8f), T(0x90), T(0x91), T(0x92), + T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), T(0x98), T(0x99), + T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), T(0xa0), + T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), + T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), + T(0xaf), T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), + T(0xb6), T(0xb7), T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), + T(0xbd), T(0xbe), T(0xbf), T(0xc0), T(0xc1), T(0xc2), T(0xc3), + T(0xc4), T(0xc5), T(0xc6), T(0xc7), T(0xc8), T(0xc9), T(0xca), + T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), T(0xd0), T(0xd1), + T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), T(0xd8), + T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), + T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), + T(0xe7), T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), + T(0xee), T(0xef), T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), + T(0xf5), T(0xf6), T(0xf7), T(0xf8), T(0xf9), T(0xfa), T(0xfb), + T(0xfc), T(0xfd), T(0xfe), 0x01000100}; #undef T void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { @@ -2094,8 +2068,10 @@ void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { } } -void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width) { +void ComputeCumulativeSumRow_C(const uint8* row, + int32* cumsum, + const int32* previous_cumsum, + int width) { int32 row_sum[4] = {0, 0, 0, 0}; int x; for (x = 0; x < width; ++x) { @@ -2103,15 +2079,19 @@ void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, row_sum[1] += row[x * 4 + 1]; row_sum[2] += row[x * 4 + 2]; row_sum[3] += row[x * 4 + 3]; - cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0]; - cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1]; - cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2]; - cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3]; + cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0]; + cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1]; + cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2]; + cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3]; } } -void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl, - int w, int area, uint8* dst, int count) { +void CumulativeSumToAverageRow_C(const int32* tl, + const int32* bl, + int w, + int area, + uint8* dst, + int count) { float ooa = 1.0f / area; int i; for (i = 0; i < count; ++i) { @@ -2127,8 +2107,11 @@ void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl, // Copy pixels from rotated source to destination row with a slope. LIBYUV_API -void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width) { +void ARGBAffineRow_C(const uint8* src_argb, + int src_argb_stride, + uint8* dst_argb, + const float* uv_dudv, + int width) { int i; // Render a row of pixels from source into a buffer. 
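For context on the loop that follows: uv_dudv packs {u0, v0, du, dv}, so the row starts sampling the source at (u0, v0) and advances by (du, dv) per output pixel; a rotation or shear costs two float adds per pixel. A scalar restatement (assuming the usual upstream initialization of uv[] from uv_dudv[0..1], which this hunk elides):

#include <stdint.h>

static void AffineRowSketch(const uint8_t* src_argb, int src_stride,
                            uint8_t* dst_argb, const float uv_dudv[4],
                            int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  int i;
  for (i = 0; i < width; ++i) {
    /* Truncate (u, v) to integers and fetch one ARGB pixel. */
    const uint8_t* p = src_argb + (int)v * src_stride + (int)u * 4;
    dst_argb[0] = p[0];
    dst_argb[1] = p[1];
    dst_argb[2] = p[2];
    dst_argb[3] = p[3];
    dst_argb += 4;
    u += uv_dudv[2];  /* step along the rotated/sheared source row */
    v += uv_dudv[3];
  }
}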
float uv[2]; @@ -2138,8 +2121,7 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, int x = (int)(uv[0]); int y = (int)(uv[1]); *(uint32*)(dst_argb) = - *(const uint32*)(src_argb + y * src_argb_stride + - x * 4); + *(const uint32*)(src_argb + y * src_argb_stride + x * 4); dst_argb += 4; uv[0] += uv_dudv[2]; uv[1] += uv_dudv[3]; @@ -2147,16 +2129,20 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, } // Blend 2 rows into 1. -static void HalfRow_C(const uint8* src_uv, ptrdiff_t src_uv_stride, - uint8* dst_uv, int width) { +static void HalfRow_C(const uint8* src_uv, + ptrdiff_t src_uv_stride, + uint8* dst_uv, + int width) { int x; for (x = 0; x < width; ++x) { dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; } } -static void HalfRow_16_C(const uint16* src_uv, ptrdiff_t src_uv_stride, - uint16* dst_uv, int width) { +static void HalfRow_16_C(const uint16* src_uv, + ptrdiff_t src_uv_stride, + uint16* dst_uv, + int width) { int x; for (x = 0; x < width; ++x) { dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; @@ -2164,10 +2150,12 @@ static void HalfRow_16_C(const uint16* src_uv, ptrdiff_t src_uv_stride, } // C version 2x2 -> 2x1. -void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, +void InterpolateRow_C(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, - int width, int source_y_fraction) { - int y1_fraction = source_y_fraction ; + int width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; const uint8* src_ptr1 = src_ptr + src_stride; int x; @@ -2194,9 +2182,11 @@ void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, } } -void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, +void InterpolateRow_16_C(uint16* dst_ptr, + const uint16* src_ptr, ptrdiff_t src_stride, - int width, int source_y_fraction) { + int width, + int source_y_fraction) { int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; const uint16* src_ptr1 = src_ptr + src_stride; @@ -2222,8 +2212,10 @@ void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, } // Use first 4 shuffler values to reorder ARGB channels. -void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { +void ARGBShuffleRow_C(const uint8* src_argb, + uint8* dst_argb, + const uint8* shuffler, + int width) { int index0 = shuffler[0]; int index1 = shuffler[1]; int index2 = shuffler[2]; @@ -2248,7 +2240,8 @@ void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, void I422ToYUY2Row_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* dst_frame, int width) { + uint8* dst_frame, + int width) { int x; for (x = 0; x < width - 1; x += 2) { dst_frame[0] = src_y[0]; @@ -2271,7 +2264,8 @@ void I422ToYUY2Row_C(const uint8* src_y, void I422ToUYVYRow_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* dst_frame, int width) { + uint8* dst_frame, + int width) { int x; for (x = 0; x < width - 1; x += 2) { dst_frame[0] = src_u[0]; @@ -2291,7 +2285,6 @@ void I422ToUYVYRow_C(const uint8* src_y, } } - void ARGBPolynomialRow_C(const uint8* src_argb, uint8* dst_argb, const float* poly, @@ -2332,8 +2325,30 @@ void ARGBPolynomialRow_C(const uint8* src_argb, } } -void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, - const uint8* luma, uint32 lumacoeff) { +// Samples assumed to be unsigned in low 9, 10 or 12 bits. Scale factor +// adjust the source integer range to the half float range desired. 
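Worked example of the scale factor: for 10-bit samples (0..1023), scale = 1.0f / 1023.0f maps the range to half-float 0.0..1.0, while scale = 1.0f keeps the integer values as-is (my reading of the comment above; the CL does not spell it out). The conversion itself is the exponent-rebias trick described in the next comment block; a self-contained equivalent of HalfFloatRow_C's inner step, using memcpy instead of the pointer cast:

#include <stdint.h>
#include <string.h>

static uint16_t FloatToHalfTruncate(float value) {
  /* 1.9259299444e-34f is 2^-112: multiplying subtracts 112 from the
     exponent, the bias difference between binary32 (127) and binary16 (15). */
  float biased = value * 1.9259299444e-34f;
  uint32_t bits;
  memcpy(&bits, &biased, sizeof(bits));
  /* >> 13 drops the 13 extra mantissa bits (23 - 10), leaving the half's
     mantissa and rebiased exponent in the low 16 bits. Truncates, no round. */
  return (uint16_t)(bits >> 13);
}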
+ +// This magic constant is 2^-112. Multiplying by this +// is the same as subtracting 112 from the exponent, which +// is the difference in exponent bias between 32-bit and +// 16-bit floats. Once we've done this subtraction, we can +// simply extract the low bits of the exponent and the high +// bits of the mantissa from our float and we're done. + +void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width) { + int i; + float mult = 1.9259299444e-34f * scale; + for (i = 0; i < width; ++i) { + float value = src[i] * mult; + dst[i] = (uint16)((*(uint32_t*)&value) >> 13); + } +} + +void ARGBLumaColorTableRow_C(const uint8* src_argb, + uint8* dst_argb, + int width, + const uint8* luma, + uint32 lumacoeff) { uint32 bc = lumacoeff & 0xff; uint32 gc = (lumacoeff >> 8) & 0xff; uint32 rc = (lumacoeff >> 16) & 0xff; @@ -2341,15 +2356,17 @@ void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, int i; for (i = 0; i < width - 1; i += 2) { // Luminance in rows, color values in columns. - const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + - src_argb[2] * rc) & 0x7F00u) + luma; + const uint8* luma0 = + ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) + + luma; const uint8* luma1; dst_argb[0] = luma0[src_argb[0]]; dst_argb[1] = luma0[src_argb[1]]; dst_argb[2] = luma0[src_argb[2]]; dst_argb[3] = src_argb[3]; - luma1 = ((src_argb[4] * bc + src_argb[5] * gc + - src_argb[6] * rc) & 0x7F00u) + luma; + luma1 = + ((src_argb[4] * bc + src_argb[5] * gc + src_argb[6] * rc) & 0x7F00u) + + luma; dst_argb[4] = luma1[src_argb[4]]; dst_argb[5] = luma1[src_argb[5]]; dst_argb[6] = luma1[src_argb[6]]; @@ -2359,8 +2376,9 @@ void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, } if (width & 1) { // Luminance in rows, color values in columns. - const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + - src_argb[2] * rc) & 0x7F00u) + luma; + const uint8* luma0 = + ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) + + luma; dst_argb[0] = luma0[src_argb[0]]; dst_argb[1] = luma0[src_argb[1]]; dst_argb[2] = luma0[src_argb[2]]; @@ -2504,7 +2522,7 @@ void I422ToRGB565Row_AVX2(const uint8* src_y, uint8* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2530,7 +2548,7 @@ void I422ToARGB1555Row_AVX2(const uint8* src_y, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2556,7 +2574,7 @@ void I422ToARGB4444Row_AVX2(const uint8* src_y, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2576,13 +2594,13 @@ void I422ToARGB4444Row_AVX2(const uint8* src_y, #if defined(HAS_I422TORGB24ROW_AVX2) void I422ToRGB24Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2604,7 +2622,7 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); diff --git a/files/source/row_dspr2.cc b/files/source/row_dspr2.cc new file mode 100644 index 00000000..466dd5d9 --- /dev/null +++ b/files/source/row_dspr2.cc @@ -0,0 +1,1721 @@ +/* + * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// The following are available on Mips platforms: +#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips__) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) + +#ifdef HAS_COPYROW_MIPS +void CopyRow_MIPS(const uint8* src, uint8* dst, int count) { + __asm__ __volatile__( + ".set noreorder \n" + ".set noat \n" + "slti $at, %[count], 8 \n" + "bne $at ,$zero, $last8 \n" + "xor $t8, %[src], %[dst] \n" + "andi $t8, $t8, 0x3 \n" + + "bne $t8, $zero, unaligned \n" + "negu $a3, %[dst] \n" + // make dst/src aligned + "andi $a3, $a3, 0x3 \n" + "beq $a3, $zero, $chk16w \n" + // word-aligned now count is the remining bytes count + "subu %[count], %[count], $a3 \n" + + "lwr $t8, 0(%[src]) \n" + "addu %[src], %[src], $a3 \n" + "swr $t8, 0(%[dst]) \n" + "addu %[dst], %[dst], $a3 \n" + + // Now the dst/src are mutually word-aligned with word-aligned addresses + "$chk16w: \n" + "andi $t8, %[count], 0x3f \n" // whole 64-B chunks? 
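The structure of CopyRow_MIPS is easier to see stripped of the prefetch scheduling: peel bytes until dst is word aligned (this aligned path is only taken when src and dst agree mod 4), stream 64-byte chunks, then drain one 32-byte chunk, whole words, and single bytes. A C control-flow sketch of the aligned path (the pref 0 / pref 30 pairs and the lwr/lwl unaligned path below have no portable equivalent here):

#include <stdint.h>
#include <string.h>

static void CopyRowSketch(const uint8_t* src, uint8_t* dst, int count) {
  while (count > 0 && ((uintptr_t)dst & 3)) {  /* align dst to a word */
    *dst++ = *src++;
    --count;
  }
  while (count >= 64) {  /* "$loop16w": 16 words per iteration */
    memcpy(dst, src, 64);
    src += 64; dst += 64; count -= 64;
  }
  if (count >= 32) {  /* "chk8w": at most one 8-word chunk remains */
    memcpy(dst, src, 32);
    src += 32; dst += 32; count -= 32;
  }
  while (count >= 4) {  /* "$wordCopy_loop" */
    memcpy(dst, src, 4);
    src += 4; dst += 4; count -= 4;
  }
  while (count-- > 0) *dst++ = *src++;  /* "$last8" byte tail */
}

The pref 30 ("prepare for store") hints let the cache allocate destination lines without reading them first; the $t9 = end - 160 guard keeps those hints from touching memory past the end of the buffer.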
+ // t8 is the byte count after 64-byte chunks + "beq %[count], $t8, chk8w \n" + // There will be at most 1 32-byte chunk after it + "subu $a3, %[count], $t8 \n" // the reminder + // Here a3 counts bytes in 16w chunks + "addu $a3, %[dst], $a3 \n" + // Now a3 is the final dst after 64-byte chunks + "addu $t0, %[dst], %[count] \n" + // t0 is the "past the end" address + + // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be + // past + // the "t0-32" address + // This means: for x=128 the last "safe" a1 address is "t0-160" + // Alternatively, for x=64 the last "safe" a1 address is "t0-96" + // we will use "pref 30,128(a1)", so "t0-160" is the limit + "subu $t9, $t0, 160 \n" + // t9 is the "last safe pref 30,128(a1)" address + "pref 0, 0(%[src]) \n" // first line of src + "pref 0, 32(%[src]) \n" // second line of src + "pref 0, 64(%[src]) \n" + "pref 30, 32(%[dst]) \n" + // In case the a1 > t9 don't use "pref 30" at all + "sgtu $v1, %[dst], $t9 \n" + "bgtz $v1, $loop16w \n" + "nop \n" + // otherwise, start with using pref30 + "pref 30, 64(%[dst]) \n" + "$loop16w: \n" + "pref 0, 96(%[src]) \n" + "lw $t0, 0(%[src]) \n" + "bgtz $v1, $skip_pref30_96 \n" // skip + "lw $t1, 4(%[src]) \n" + "pref 30, 96(%[dst]) \n" // continue + "$skip_pref30_96: \n" + "lw $t2, 8(%[src]) \n" + "lw $t3, 12(%[src]) \n" + "lw $t4, 16(%[src]) \n" + "lw $t5, 20(%[src]) \n" + "lw $t6, 24(%[src]) \n" + "lw $t7, 28(%[src]) \n" + "pref 0, 128(%[src]) \n" + // bring the next lines of src, addr 128 + "sw $t0, 0(%[dst]) \n" + "sw $t1, 4(%[dst]) \n" + "sw $t2, 8(%[dst]) \n" + "sw $t3, 12(%[dst]) \n" + "sw $t4, 16(%[dst]) \n" + "sw $t5, 20(%[dst]) \n" + "sw $t6, 24(%[dst]) \n" + "sw $t7, 28(%[dst]) \n" + "lw $t0, 32(%[src]) \n" + "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1) + "lw $t1, 36(%[src]) \n" + "pref 30, 128(%[dst]) \n" // set dest, addr 128 + "$skip_pref30_128: \n" + "lw $t2, 40(%[src]) \n" + "lw $t3, 44(%[src]) \n" + "lw $t4, 48(%[src]) \n" + "lw $t5, 52(%[src]) \n" + "lw $t6, 56(%[src]) \n" + "lw $t7, 60(%[src]) \n" + "pref 0, 160(%[src]) \n" + // bring the next lines of src, addr 160 + "sw $t0, 32(%[dst]) \n" + "sw $t1, 36(%[dst]) \n" + "sw $t2, 40(%[dst]) \n" + "sw $t3, 44(%[dst]) \n" + "sw $t4, 48(%[dst]) \n" + "sw $t5, 52(%[dst]) \n" + "sw $t6, 56(%[dst]) \n" + "sw $t7, 60(%[dst]) \n" + + "addiu %[dst], %[dst], 64 \n" // adding 64 to dest + "sgtu $v1, %[dst], $t9 \n" + "bne %[dst], $a3, $loop16w \n" + " addiu %[src], %[src], 64 \n" // adding 64 to src + "move %[count], $t8 \n" + + // Here we have src and dest word-aligned but less than 64-bytes to go + + "chk8w: \n" + "pref 0, 0x0(%[src]) \n" + "andi $t8, %[count], 0x1f \n" // 32-byte chunk? 
+ // the t8 is the reminder count past 32-bytes + "beq %[count], $t8, chk1w \n" + // count=t8,no 32-byte chunk + " nop \n" + + "lw $t0, 0(%[src]) \n" + "lw $t1, 4(%[src]) \n" + "lw $t2, 8(%[src]) \n" + "lw $t3, 12(%[src]) \n" + "lw $t4, 16(%[src]) \n" + "lw $t5, 20(%[src]) \n" + "lw $t6, 24(%[src]) \n" + "lw $t7, 28(%[src]) \n" + "addiu %[src], %[src], 32 \n" + + "sw $t0, 0(%[dst]) \n" + "sw $t1, 4(%[dst]) \n" + "sw $t2, 8(%[dst]) \n" + "sw $t3, 12(%[dst]) \n" + "sw $t4, 16(%[dst]) \n" + "sw $t5, 20(%[dst]) \n" + "sw $t6, 24(%[dst]) \n" + "sw $t7, 28(%[dst]) \n" + "addiu %[dst], %[dst], 32 \n" + + "chk1w: \n" + "andi %[count], $t8, 0x3 \n" + // now count is the reminder past 1w chunks + "beq %[count], $t8, $last8 \n" + " subu $a3, $t8, %[count] \n" + // a3 is count of bytes in 1w chunks + "addu $a3, %[dst], $a3 \n" + // now a3 is the dst address past the 1w chunks + // copying in words (4-byte chunks) + "$wordCopy_loop: \n" + "lw $t3, 0(%[src]) \n" + // the first t3 may be equal t0 ... optimize? + "addiu %[src], %[src],4 \n" + "addiu %[dst], %[dst],4 \n" + "bne %[dst], $a3,$wordCopy_loop \n" + " sw $t3, -4(%[dst]) \n" + + // For the last (<8) bytes + "$last8: \n" + "blez %[count], leave \n" + " addu $a3, %[dst], %[count] \n" // a3 -last dst address + "$last8loop: \n" + "lb $v1, 0(%[src]) \n" + "addiu %[src], %[src], 1 \n" + "addiu %[dst], %[dst], 1 \n" + "bne %[dst], $a3, $last8loop \n" + " sb $v1, -1(%[dst]) \n" + + "leave: \n" + " j $ra \n" + " nop \n" + + // + // UNALIGNED case + // + + "unaligned: \n" + // got here with a3="negu a1" + "andi $a3, $a3, 0x3 \n" // a1 is word aligned? + "beqz $a3, $ua_chk16w \n" + " subu %[count], %[count], $a3 \n" + // bytes left after initial a3 bytes + "lwr $v1, 0(%[src]) \n" + "lwl $v1, 3(%[src]) \n" + "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3 + "swr $v1, 0(%[dst]) \n" + "addu %[dst], %[dst], $a3 \n" + // below the dst will be word aligned (NOTE1) + "$ua_chk16w: \n" + "andi $t8, %[count], 0x3f \n" // whole 64-B chunks? 
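The unaligned path that follows leans on the MIPS lwr/lwl pair: two loads that each fill part of a register so that together they assemble a 32-bit word from any address. A portable restatement of what each lwr/lwl couple in the loop below computes:

#include <stdint.h>
#include <string.h>

/* Equivalent of the little-endian pair
     lwr $t, 0(src)   // bytes from src up to the next word boundary
     lwl $t, 3(src)   // the remaining high bytes
   for an arbitrarily aligned src. */
static uint32_t LoadUnalignedWord(const uint8_t* src) {
  uint32_t w;
  memcpy(&w, src, sizeof(w));
  return w;
}

The stores stay aligned (plain sw) because the prologue already advanced dst to a word boundary, which is what its "NOTE1" comment is pointing out.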
+ // t8 is the byte count after 64-byte chunks + "beq %[count], $t8, ua_chk8w \n" + // if a2==t8, no 64-byte chunks + // There will be at most 1 32-byte chunk after it + "subu $a3, %[count], $t8 \n" // the reminder + // Here a3 counts bytes in 16w chunks + "addu $a3, %[dst], $a3 \n" + // Now a3 is the final dst after 64-byte chunks + "addu $t0, %[dst], %[count] \n" // t0 "past the end" + "subu $t9, $t0, 160 \n" + // t9 is the "last safe pref 30,128(a1)" address + "pref 0, 0(%[src]) \n" // first line of src + "pref 0, 32(%[src]) \n" // second line addr 32 + "pref 0, 64(%[src]) \n" + "pref 30, 32(%[dst]) \n" + // safe, as we have at least 64 bytes ahead + // In case the a1 > t9 don't use "pref 30" at all + "sgtu $v1, %[dst], $t9 \n" + "bgtz $v1, $ua_loop16w \n" + // skip "pref 30,64(a1)" for too short arrays + " nop \n" + // otherwise, start with using pref30 + "pref 30, 64(%[dst]) \n" + "$ua_loop16w: \n" + "pref 0, 96(%[src]) \n" + "lwr $t0, 0(%[src]) \n" + "lwl $t0, 3(%[src]) \n" + "lwr $t1, 4(%[src]) \n" + "bgtz $v1, $ua_skip_pref30_96 \n" + " lwl $t1, 7(%[src]) \n" + "pref 30, 96(%[dst]) \n" + // continue setting up the dest, addr 96 + "$ua_skip_pref30_96: \n" + "lwr $t2, 8(%[src]) \n" + "lwl $t2, 11(%[src]) \n" + "lwr $t3, 12(%[src]) \n" + "lwl $t3, 15(%[src]) \n" + "lwr $t4, 16(%[src]) \n" + "lwl $t4, 19(%[src]) \n" + "lwr $t5, 20(%[src]) \n" + "lwl $t5, 23(%[src]) \n" + "lwr $t6, 24(%[src]) \n" + "lwl $t6, 27(%[src]) \n" + "lwr $t7, 28(%[src]) \n" + "lwl $t7, 31(%[src]) \n" + "pref 0, 128(%[src]) \n" + // bring the next lines of src, addr 128 + "sw $t0, 0(%[dst]) \n" + "sw $t1, 4(%[dst]) \n" + "sw $t2, 8(%[dst]) \n" + "sw $t3, 12(%[dst]) \n" + "sw $t4, 16(%[dst]) \n" + "sw $t5, 20(%[dst]) \n" + "sw $t6, 24(%[dst]) \n" + "sw $t7, 28(%[dst]) \n" + "lwr $t0, 32(%[src]) \n" + "lwl $t0, 35(%[src]) \n" + "lwr $t1, 36(%[src]) \n" + "bgtz $v1, ua_skip_pref30_128 \n" + " lwl $t1, 39(%[src]) \n" + "pref 30, 128(%[dst]) \n" + // continue setting up the dest, addr 128 + "ua_skip_pref30_128: \n" + + "lwr $t2, 40(%[src]) \n" + "lwl $t2, 43(%[src]) \n" + "lwr $t3, 44(%[src]) \n" + "lwl $t3, 47(%[src]) \n" + "lwr $t4, 48(%[src]) \n" + "lwl $t4, 51(%[src]) \n" + "lwr $t5, 52(%[src]) \n" + "lwl $t5, 55(%[src]) \n" + "lwr $t6, 56(%[src]) \n" + "lwl $t6, 59(%[src]) \n" + "lwr $t7, 60(%[src]) \n" + "lwl $t7, 63(%[src]) \n" + "pref 0, 160(%[src]) \n" + // bring the next lines of src, addr 160 + "sw $t0, 32(%[dst]) \n" + "sw $t1, 36(%[dst]) \n" + "sw $t2, 40(%[dst]) \n" + "sw $t3, 44(%[dst]) \n" + "sw $t4, 48(%[dst]) \n" + "sw $t5, 52(%[dst]) \n" + "sw $t6, 56(%[dst]) \n" + "sw $t7, 60(%[dst]) \n" + + "addiu %[dst],%[dst],64 \n" // adding 64 to dest + "sgtu $v1,%[dst],$t9 \n" + "bne %[dst],$a3,$ua_loop16w \n" + " addiu %[src],%[src],64 \n" // adding 64 to src + "move %[count],$t8 \n" + + // Here we have src and dest word-aligned but less than 64-bytes to go + + "ua_chk8w: \n" + "pref 0, 0x0(%[src]) \n" + "andi $t8, %[count], 0x1f \n" // 32-byte chunk? 
+ // the t8 is the reminder count + "beq %[count], $t8, $ua_chk1w \n" + // when count==t8, no 32-byte chunk + + "lwr $t0, 0(%[src]) \n" + "lwl $t0, 3(%[src]) \n" + "lwr $t1, 4(%[src]) \n" + "lwl $t1, 7(%[src]) \n" + "lwr $t2, 8(%[src]) \n" + "lwl $t2, 11(%[src]) \n" + "lwr $t3, 12(%[src]) \n" + "lwl $t3, 15(%[src]) \n" + "lwr $t4, 16(%[src]) \n" + "lwl $t4, 19(%[src]) \n" + "lwr $t5, 20(%[src]) \n" + "lwl $t5, 23(%[src]) \n" + "lwr $t6, 24(%[src]) \n" + "lwl $t6, 27(%[src]) \n" + "lwr $t7, 28(%[src]) \n" + "lwl $t7, 31(%[src]) \n" + "addiu %[src], %[src], 32 \n" + + "sw $t0, 0(%[dst]) \n" + "sw $t1, 4(%[dst]) \n" + "sw $t2, 8(%[dst]) \n" + "sw $t3, 12(%[dst]) \n" + "sw $t4, 16(%[dst]) \n" + "sw $t5, 20(%[dst]) \n" + "sw $t6, 24(%[dst]) \n" + "sw $t7, 28(%[dst]) \n" + "addiu %[dst], %[dst], 32 \n" + + "$ua_chk1w: \n" + "andi %[count], $t8, 0x3 \n" + // now count is the reminder past 1w chunks + "beq %[count], $t8, ua_smallCopy \n" + "subu $a3, $t8, %[count] \n" + // a3 is count of bytes in 1w chunks + "addu $a3, %[dst], $a3 \n" + // now a3 is the dst address past the 1w chunks + + // copying in words (4-byte chunks) + "$ua_wordCopy_loop: \n" + "lwr $v1, 0(%[src]) \n" + "lwl $v1, 3(%[src]) \n" + "addiu %[src], %[src], 4 \n" + "addiu %[dst], %[dst], 4 \n" + // note: dst=a1 is word aligned here, see NOTE1 + "bne %[dst], $a3, $ua_wordCopy_loop \n" + " sw $v1,-4(%[dst]) \n" + + // Now less than 4 bytes (value in count) left to copy + "ua_smallCopy: \n" + "beqz %[count], leave \n" + " addu $a3, %[dst], %[count] \n" // a3 = last dst address + "$ua_smallCopy_loop: \n" + "lb $v1, 0(%[src]) \n" + "addiu %[src], %[src], 1 \n" + "addiu %[dst], %[dst], 1 \n" + "bne %[dst],$a3,$ua_smallCopy_loop \n" + " sb $v1, -1(%[dst]) \n" + + "j $ra \n" + " nop \n" + ".set at \n" + ".set reorder \n" + : [dst] "+r"(dst), [src] "+r"(src) + : [count] "r"(count) + : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "a3", "v1", + "at"); +} +#endif // HAS_COPYROW_MIPS + +// DSPR2 functions +#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \ + (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32) && \ + (__mips_isa_rev < 6) + +void SplitUVRow_DSPR2(const uint8* src_uv, + uint8* dst_u, + uint8* dst_v, + int width) { + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + "srl $t4, %[width], 4 \n" // multiplies of 16 + "blez $t4, 2f \n" + " andi %[width], %[width], 0xf \n" // residual + + "1: \n" + "addiu $t4, $t4, -1 \n" + "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0 + "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2 + "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4 + "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6 + "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8 + "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 | + // U10 + "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 | + // U12 + "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 | + // U14 + "addiu %[src_uv], %[src_uv], 32 \n" + "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0 + "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0 + "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4 + "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4 + "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8 + "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8 + "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | + // V12 + "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | + // U12 + "sw $t9, 0(%[dst_v]) \n" + "sw $t0, 0(%[dst_u]) \n" + "sw $t1, 4(%[dst_v]) \n" + "sw $t2, 4(%[dst_u]) \n" + "sw $t3, 8(%[dst_v]) \n" + "sw $t5, 8(%[dst_u]) \n" + "sw $t6, 
12(%[dst_v]) \n" + "sw $t7, 12(%[dst_u]) \n" + "addiu %[dst_v], %[dst_v], 16 \n" + "bgtz $t4, 1b \n" + " addiu %[dst_u], %[dst_u], 16 \n" + + "beqz %[width], 3f \n" + " nop \n" + + "2: \n" + "lbu $t0, 0(%[src_uv]) \n" + "lbu $t1, 1(%[src_uv]) \n" + "addiu %[src_uv], %[src_uv], 2 \n" + "addiu %[width], %[width], -1 \n" + "sb $t0, 0(%[dst_u]) \n" + "sb $t1, 0(%[dst_v]) \n" + "addiu %[dst_u], %[dst_u], 1 \n" + "bgtz %[width], 2b \n" + " addiu %[dst_v], %[dst_v], 1 \n" + + "3: \n" + ".set pop \n" + : [src_uv] "+r"(src_uv), [width] "+r"(width), [dst_u] "+r"(dst_u), + [dst_v] "+r"(dst_v) + : + : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9"); +} + +void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) { + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + + "srl $t4, %[width], 4 \n" // multiplies of 16 + "andi $t5, %[width], 0xf \n" + "blez $t4, 2f \n" + " addu %[src], %[src], %[width] \n" // src += width + + "1: \n" + "lw $t0, -16(%[src]) \n" // |3|2|1|0| + "lw $t1, -12(%[src]) \n" // |7|6|5|4| + "lw $t2, -8(%[src]) \n" // |11|10|9|8| + "lw $t3, -4(%[src]) \n" // |15|14|13|12| + "wsbh $t0, $t0 \n" // |2|3|0|1| + "wsbh $t1, $t1 \n" // |6|7|4|5| + "wsbh $t2, $t2 \n" // |10|11|8|9| + "wsbh $t3, $t3 \n" // |14|15|12|13| + "rotr $t0, $t0, 16 \n" // |0|1|2|3| + "rotr $t1, $t1, 16 \n" // |4|5|6|7| + "rotr $t2, $t2, 16 \n" // |8|9|10|11| + "rotr $t3, $t3, 16 \n" // |12|13|14|15| + "addiu %[src], %[src], -16 \n" + "addiu $t4, $t4, -1 \n" + "sw $t3, 0(%[dst]) \n" // |15|14|13|12| + "sw $t2, 4(%[dst]) \n" // |11|10|9|8| + "sw $t1, 8(%[dst]) \n" // |7|6|5|4| + "sw $t0, 12(%[dst]) \n" // |3|2|1|0| + "bgtz $t4, 1b \n" + " addiu %[dst], %[dst], 16 \n" + "beqz $t5, 3f \n" + " nop \n" + + "2: \n" + "lbu $t0, -1(%[src]) \n" + "addiu $t5, $t5, -1 \n" + "addiu %[src], %[src], -1 \n" + "sb $t0, 0(%[dst]) \n" + "bgez $t5, 2b \n" + " addiu %[dst], %[dst], 1 \n" + + "3: \n" + ".set pop \n" + : [src] "+r"(src), [dst] "+r"(dst) + : [width] "r"(width) + : "t0", "t1", "t2", "t3", "t4", "t5"); +} + +void MirrorUVRow_DSPR2(const uint8* src_uv, + uint8* dst_u, + uint8* dst_v, + int width) { + int x; + int y; + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + + "addu $t4, %[width], %[width] \n" + "srl %[x], %[width], 4 \n" + "andi %[y], %[width], 0xf \n" + "blez %[x], 2f \n" + " addu %[src_uv], %[src_uv], $t4 \n" + + "1: \n" + "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0| + "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4| + "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8| + "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12| + "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16| + "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20| + "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24| + "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28| + + "rotr $t0, $t0, 16 \n" // |1|0|3|2| + "rotr $t1, $t1, 16 \n" // |5|4|7|6| + "rotr $t2, $t2, 16 \n" // |9|8|11|10| + "rotr $t3, $t3, 16 \n" // |13|12|15|14| + "rotr $t4, $t4, 16 \n" // |17|16|19|18| + "rotr $t6, $t6, 16 \n" // |21|20|23|22| + "rotr $t7, $t7, 16 \n" // |25|24|27|26| + "rotr $t8, $t8, 16 \n" // |29|28|31|30| + "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6| + "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7| + "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14| + "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15| + "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22| + "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23| + "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30| + "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31| + "addiu %[src_uv], %[src_uv], -32 \n" + "addiu %[x], %[x], -1 \n" + "swr $t4, 0(%[dst_u]) 
\n" + "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24| + "swr $t6, 0(%[dst_v]) \n" + "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25| + "swr $t2, 4(%[dst_u]) \n" + "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16| + "swr $t3, 4(%[dst_v]) \n" + "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17| + "swr $t0, 8(%[dst_u]) \n" + "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8| + "swr $t1, 8(%[dst_v]) \n" + "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9| + "swr $t9, 12(%[dst_u]) \n" + "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0| + "swr $t5, 12(%[dst_v]) \n" + "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1| + "addiu %[dst_v], %[dst_v], 16 \n" + "bgtz %[x], 1b \n" + " addiu %[dst_u], %[dst_u], 16 \n" + "beqz %[y], 3f \n" + " nop \n" + "b 2f \n" + " nop \n" + + "2: \n" + "lbu $t0, -2(%[src_uv]) \n" + "lbu $t1, -1(%[src_uv]) \n" + "addiu %[src_uv], %[src_uv], -2 \n" + "addiu %[y], %[y], -1 \n" + "sb $t0, 0(%[dst_u]) \n" + "sb $t1, 0(%[dst_v]) \n" + "addiu %[dst_u], %[dst_u], 1 \n" + "bgtz %[y], 2b \n" + " addiu %[dst_v], %[dst_v], 1 \n" + + "3: \n" + ".set pop \n" + : [src_uv] "+r"(src_uv), [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v), + [x] "=&r"(x), [y] "=&r"(y) + : [width] "r"(width) + : "t0", "t1", "t2", "t3", "t4", "t5", "t7", "t8", "t9"); +} + +void I422ToARGBRow_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint32 tmp_ub = yuvconstants->kUVToB[0]; + uint32 tmp_ug = yuvconstants->kUVToG[0]; + uint32 tmp_vg = yuvconstants->kUVToG[1]; + uint32 tmp_vr = yuvconstants->kUVToR[1]; + uint32 tmp_bb = yuvconstants->kUVBiasB[0]; + uint32 tmp_bg = yuvconstants->kUVBiasG[0]; + uint32 tmp_br = yuvconstants->kUVBiasR[0]; + uint32 yg = yuvconstants->kYToRgb[0]; + uint32 tmp_yg; + uint32 tmp_mask = 0x7fff7fff; + tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff); + tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff); + tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff); + tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff); + tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001; + tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff); + tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff); + tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001; + yg = yg * 0x0101; + + for (x = 0; x < width - 1; x += 2) { + uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5; + uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9; + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + "lbu %[tmp_t7], 0(%[src_y]) \n" + "lbu %[tmp_t1], 1(%[src_y]) \n" + "mul %[tmp_t7], %[tmp_t7], %[yg] \n" + "mul %[tmp_t1], %[tmp_t1], %[yg] \n" + "lbu %[tmp_t2], 0(%[src_u]) \n" + "lbu %[tmp_t3], 0(%[src_v]) \n" + "replv.ph %[tmp_t2], %[tmp_t2] \n" + "replv.ph %[tmp_t3], %[tmp_t3] \n" + "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n" + "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n" + "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n" + "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n" + "srl %[tmp_t7], %[tmp_t7], 16 \n" + "ins %[tmp_t1], %[tmp_t7], 0, 16 \n" + "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n" + "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n" + "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n" + "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n" + "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n" + "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n" + "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n" + "shra.ph %[tmp_t7], %[tmp_t7], 6 \n" + "shra.ph %[tmp_t8], %[tmp_t8], 6 \n" + "shra.ph %[tmp_t9], %[tmp_t9], 6 \n" + "shll_s.ph 
%[tmp_t7], %[tmp_t7], 7 \n" + "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n" + "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n" + "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n" + "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n" + "precrq.ph.w %[tmp_t9], %[tmp_t8], %[tmp_t7] \n" + "ins %[tmp_t7], %[tmp_t8], 16, 16 \n" + "precr.qb.ph %[tmp_t8], %[tmp_t9], %[tmp_t7] \n" + "precrq.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n" + "sw %[tmp_t8], 0(%[rgb_buf]) \n" + "sw %[tmp_t7], 4(%[rgb_buf]) \n" + ".set pop \n" + : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), + [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4), + [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6), + [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9) + : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v), + [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), [yg] "r"(yg), + [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb), + [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg), + [rgb_buf] "r"(rgb_buf), [tmp_mask] "r"(tmp_mask)); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 4 pixels. + } +} + +// Bilinear filter 8x2 -> 8x1 +void InterpolateRow_DSPR2(uint8* dst_ptr, + const uint8* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y0_fraction = 256 - source_y_fraction; + const uint8* src_ptr1 = src_ptr + src_stride; + + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + + "replv.ph $t0, %[y0_fraction] \n" + "replv.ph $t1, %[source_y_fraction] \n" + + "1: \n" + "lw $t2, 0(%[src_ptr]) \n" + "lw $t3, 0(%[src_ptr1]) \n" + "lw $t4, 4(%[src_ptr]) \n" + "lw $t5, 4(%[src_ptr1]) \n" + "muleu_s.ph.qbl $t6, $t2, $t0 \n" + "muleu_s.ph.qbr $t7, $t2, $t0 \n" + "muleu_s.ph.qbl $t8, $t3, $t1 \n" + "muleu_s.ph.qbr $t9, $t3, $t1 \n" + "muleu_s.ph.qbl $t2, $t4, $t0 \n" + "muleu_s.ph.qbr $t3, $t4, $t0 \n" + "muleu_s.ph.qbl $t4, $t5, $t1 \n" + "muleu_s.ph.qbr $t5, $t5, $t1 \n" + "addq.ph $t6, $t6, $t8 \n" + "addq.ph $t7, $t7, $t9 \n" + "addq.ph $t2, $t2, $t4 \n" + "addq.ph $t3, $t3, $t5 \n" + "shra_r.ph $t6, $t6, 8 \n" + "shra_r.ph $t7, $t7, 8 \n" + "shra_r.ph $t2, $t2, 8 \n" + "shra_r.ph $t3, $t3, 8 \n" + "precr.qb.ph $t6, $t6, $t7 \n" + "precr.qb.ph $t2, $t2, $t3 \n" + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[src_ptr1], %[src_ptr1], 8 \n" + "addiu %[dst_width], %[dst_width], -8 \n" + "sw $t6, 0(%[dst_ptr]) \n" + "sw $t2, 4(%[dst_ptr]) \n" + "bgtz %[dst_width], 1b \n" + " addiu %[dst_ptr], %[dst_ptr], 8 \n" + + ".set pop \n" + : [dst_ptr] "+r"(dst_ptr), [src_ptr1] "+r"(src_ptr1), + [src_ptr] "+r"(src_ptr), [dst_width] "+r"(dst_width) + : [source_y_fraction] "r"(source_y_fraction), + [y0_fraction] "r"(y0_fraction), [src_stride] "r"(src_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9"); +} +#include <stdio.h> +void RGB24ToARGBRow_DSPR2(const uint8* src_rgb24, uint8* dst_argb, int width) { + int x; + uint32 tmp_mask = 0xff; + uint32 tmp_t1; + for (x = 0; x < (width - 1); ++x) { + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + "ulw %[tmp_t1], 0(%[src_rgb24]) \n" + "addiu %[dst_argb], %[dst_argb], 4 \n" + "addiu %[src_rgb24], %[src_rgb24], 3 \n" + "ins %[tmp_t1], %[tmp_mask], 24, 8 \n" + "sw %[tmp_t1], -4(%[dst_argb]) \n" + ".set pop \n" + : [src_rgb24] "+r"(src_rgb24), [dst_argb] "+r"(dst_argb), + [tmp_t1] "=&r"(tmp_t1) + : [tmp_mask] "r"(tmp_mask) + : "memory"); + } + uint8 b = src_rgb24[0]; + uint8 g = src_rgb24[1]; + uint8 r = src_rgb24[2]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + 
dst_argb[3] = 255u; +} + +void RAWToARGBRow_DSPR2(const uint8* src_raw, uint8* dst_argb, int width) { + int x; + uint32 tmp_mask = 0xff; + uint32 tmp_t1, tmp_t2; + for (x = 0; x < (width - 1); ++x) { + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + "ulw %[tmp_t1], 0(%[src_raw]) \n" + "addiu %[dst_argb], %[dst_argb], 4 \n" + "addiu %[src_raw], %[src_raw], 3 \n" + "srl %[tmp_t2], %[tmp_t1], 16 \n" + "ins %[tmp_t1], %[tmp_mask], 24, 8 \n" + "ins %[tmp_t1], %[tmp_t1], 16, 8 \n" + "ins %[tmp_t1], %[tmp_t2], 0, 8 \n" + "sw %[tmp_t1], -4(%[dst_argb]) \n" + ".set pop \n" + : [src_raw] "+r"(src_raw), [dst_argb] "+r"(dst_argb), + [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2) + : [tmp_mask] "r"(tmp_mask) + : "memory"); + } + uint8 r = src_raw[0]; + uint8 g = src_raw[1]; + uint8 b = src_raw[2]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = 255u; +} + +void RGB565ToARGBRow_DSPR2(const uint8* src_rgb565, + uint8* dst_argb, + int width) { + int x; + uint32 tmp_mask = 0xff; + uint32 tmp_t1, tmp_t2, tmp_t3; + for (x = 0; x < width; ++x) { + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + "lhu %[tmp_t1], 0(%[src_rgb565]) \n" + "addiu %[dst_argb], %[dst_argb], 4 \n" + "addiu %[src_rgb565], %[src_rgb565], 2 \n" + "sll %[tmp_t2], %[tmp_t1], 8 \n" + "ins %[tmp_t2], %[tmp_mask], 24,8 \n" + "ins %[tmp_t2], %[tmp_t1], 3, 16 \n" + "ins %[tmp_t2], %[tmp_t1], 5, 11 \n" + "srl %[tmp_t3], %[tmp_t1], 9 \n" + "ins %[tmp_t2], %[tmp_t3], 8, 2 \n" + "ins %[tmp_t2], %[tmp_t1], 3, 5 \n" + "srl %[tmp_t3], %[tmp_t1], 2 \n" + "ins %[tmp_t2], %[tmp_t3], 0, 3 \n" + "sw %[tmp_t2], -4(%[dst_argb]) \n" + ".set pop \n" + : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), + [tmp_t3] "=&r"(tmp_t3), [src_rgb565] "+r"(src_rgb565), + [dst_argb] "+r"(dst_argb) + : [tmp_mask] "r"(tmp_mask)); + } +} + +void ARGB1555ToARGBRow_DSPR2(const uint8* src_argb1555, + uint8* dst_argb, + int width) { + int x; + uint32 tmp_t1, tmp_t2, tmp_t3; + for (x = 0; x < width; ++x) { + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + "lh %[tmp_t1], 0(%[src_argb1555]) \n" + "addiu %[dst_argb], %[dst_argb], 4 \n" + "addiu %[src_argb1555], %[src_argb1555], 2 \n" + "sll %[tmp_t2], %[tmp_t1], 9 \n" + "ins %[tmp_t2], %[tmp_t1], 4, 15 \n" + "ins %[tmp_t2], %[tmp_t1], 6, 10 \n" + "srl %[tmp_t3], %[tmp_t1], 7 \n" + "ins %[tmp_t2], %[tmp_t3], 8, 3 \n" + "ins %[tmp_t2], %[tmp_t1], 3, 5 \n" + "srl %[tmp_t3], %[tmp_t1], 2 \n" + "ins %[tmp_t2], %[tmp_t3], 0, 3 \n" + "sw %[tmp_t2], -4(%[dst_argb]) \n" + ".set pop \n" + : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), + [tmp_t3] "=&r"(tmp_t3), [src_argb1555] "+r"(src_argb1555), + [dst_argb] "+r"(dst_argb) + :); + } +} + +void ARGB4444ToARGBRow_DSPR2(const uint8* src_argb4444, + uint8* dst_argb, + int width) { + int x; + uint32 tmp_t1; + for (x = 0; x < width; ++x) { + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + "lh %[tmp_t1], 0(%[src_argb4444]) \n" + "addiu %[dst_argb], %[dst_argb], 4 \n" + "addiu %[src_argb4444], %[src_argb4444], 2 \n" + "ins %[tmp_t1], %[tmp_t1], 16, 16 \n" + "ins %[tmp_t1], %[tmp_t1], 12, 16 \n" + "ins %[tmp_t1], %[tmp_t1], 8, 12 \n" + "ins %[tmp_t1], %[tmp_t1], 4, 8 \n" + "sw %[tmp_t1], -4(%[dst_argb]) \n" + ".set pop \n" + : [src_argb4444] "+r"(src_argb4444), [dst_argb] "+r"(dst_argb), + [tmp_t1] "=&r"(tmp_t1)); + } +} + +void I444ToARGBRow_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint32 tmp_ub = 
yuvconstants->kUVToB[0]; + uint32 tmp_ug = yuvconstants->kUVToG[0]; + uint32 tmp_vg = yuvconstants->kUVToG[1]; + uint32 tmp_vr = yuvconstants->kUVToR[1]; + uint32 tmp_bb = yuvconstants->kUVBiasB[0]; + uint32 tmp_bg = yuvconstants->kUVBiasG[0]; + uint32 tmp_br = yuvconstants->kUVBiasR[0]; + uint32 yg = yuvconstants->kYToRgb[0]; + uint32 tmp_mask = 0x7fff7fff; + uint32 tmp_yg; + + tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff); + tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff); + tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff); + tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff); + tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001; + tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff); + tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff); + tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001; + yg = yg * 0x0101; + + for (x = 0; x < width - 1; x += 2) { + uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5; + uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9; + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + "lbu %[tmp_t7], 0(%[y_buf]) \n" + "lbu %[tmp_t1], 1(%[y_buf]) \n" + "mul %[tmp_t7], %[tmp_t7], %[yg] \n" + "mul %[tmp_t1], %[tmp_t1], %[yg] \n" + "lh %[tmp_t2], 0(%[u_buf]) \n" + "lh %[tmp_t3], 0(%[v_buf]) \n" + "preceu.ph.qbr %[tmp_t2], %[tmp_t2] \n" + "preceu.ph.qbr %[tmp_t3], %[tmp_t3] \n" + "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n" + "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n" + "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n" + "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n" + "srl %[tmp_t7], %[tmp_t7], 16 \n" + "ins %[tmp_t1], %[tmp_t7], 0, 16 \n" + "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n" + "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n" + "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n" + "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n" + "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n" + "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n" + "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n" + "shra.ph %[tmp_t7], %[tmp_t7], 6 \n" + "shra.ph %[tmp_t8], %[tmp_t8], 6 \n" + "shra.ph %[tmp_t9], %[tmp_t9], 6 \n" + "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n" + "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n" + "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n" + "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n" + "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n" + "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n" + "ins %[tmp_t7], %[tmp_t8], 16, 16 \n" + "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n" + "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n" + "sw %[tmp_t8], 0(%[rgb_buf]) \n" + "sw %[tmp_t7], 4(%[rgb_buf]) \n" + ".set pop \n" + : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), + [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4), + [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6), + [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9) + : [y_buf] "r"(y_buf), [yg] "r"(yg), [u_buf] "r"(u_buf), + [v_buf] "r"(v_buf), [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), + [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb), + [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg), + [rgb_buf] "r"(rgb_buf), [tmp_mask] "r"(tmp_mask)); + y_buf += 2; + u_buf += 2; + v_buf += 2; + rgb_buf += 8; // Advance 1 pixel. 
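(As in I422ToARGBRow_DSPR2 above, rgb_buf += 8 advances two ARGB pixels per iteration, the trailing comments notwithstanding; I444 merely loads a distinct U/V per pixel via lh + preceu.ph.qbr instead of replicating one chroma pair with replv.ph.) All of these DSPR2 converters vectorize the same fixed-point math two pixels at a time: Y is scaled through yg in 16.16, the pre-negated U/V coefficients (the ~x + 0x00010001 two's-complement setup above) fold the subtraction into packed adds, and shra 6 / shll_s 7 / precrqu_s together clamp the 6-bit-shifted result to a byte. The scalar shape, matching the reference YuvPixel in row_common.cc as I read it:

#include <stdint.h>

static uint8_t Clamp255(int32_t v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* One pixel of the conversion these kernels perform on pairs; ub/ug/vg/vr,
   bb/bg/br and yg correspond to the struct YuvConstants fields loaded above. */
static void YuvPixelSketch(uint8_t y, uint8_t u, uint8_t v,
                           int ub, int ug, int vg, int vr,
                           int bb, int bg, int br, int yg,
                           uint8_t* b, uint8_t* g, uint8_t* r) {
  uint32_t y1 = ((uint32_t)(y * 0x0101) * (uint32_t)yg) >> 16;
  *b = Clamp255((int32_t)(y1 + bb - u * ub) >> 6);
  *g = Clamp255((int32_t)(y1 + bg - (u * ug + v * vg)) >> 6);
  *r = Clamp255((int32_t)(y1 + br - v * vr) >> 6);
}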
+ } +} + +void I422ToARGB4444Row_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint32 tmp_ub = yuvconstants->kUVToB[0]; + uint32 tmp_ug = yuvconstants->kUVToG[0]; + uint32 tmp_vg = yuvconstants->kUVToG[1]; + uint32 tmp_vr = yuvconstants->kUVToR[1]; + uint32 tmp_bb = yuvconstants->kUVBiasB[0]; + uint32 tmp_bg = yuvconstants->kUVBiasG[0]; + uint32 tmp_br = yuvconstants->kUVBiasR[0]; + uint32 yg = yuvconstants->kYToRgb[0]; + uint32 tmp_yg; + uint32 tmp_mask = 0x7fff7fff; + tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff); + tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff); + tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff); + tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff); + tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001; + tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff); + tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff); + tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001; + yg = yg * 0x0101; + + for (x = 0; x < width - 1; x += 2) { + uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5; + uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9; + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + "lbu %[tmp_t7], 0(%[src_y]) \n" + "lbu %[tmp_t1], 1(%[src_y]) \n" + "mul %[tmp_t7], %[tmp_t7], %[yg] \n" + "mul %[tmp_t1], %[tmp_t1], %[yg] \n" + "lbu %[tmp_t2], 0(%[src_u]) \n" + "lbu %[tmp_t3], 0(%[src_v]) \n" + "replv.ph %[tmp_t2], %[tmp_t2] \n" + "replv.ph %[tmp_t3], %[tmp_t3] \n" + "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n" + "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n" + "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n" + "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n" + "srl %[tmp_t7], %[tmp_t7], 16 \n" + "ins %[tmp_t1], %[tmp_t7], 0, 16 \n" + "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n" + "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n" + "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n" + "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n" + "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n" + "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n" + "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n" + "shra.ph %[tmp_t7], %[tmp_t7], 6 \n" + "shra.ph %[tmp_t8], %[tmp_t8], 6 \n" + "shra.ph %[tmp_t9], %[tmp_t9], 6 \n" + "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n" + "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n" + "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n" + "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n" + "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n" + "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n" + "ins %[tmp_t7], %[tmp_t8], 16, 16 \n" + "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n" + "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n" + "shrl.qb %[tmp_t1], %[tmp_t8], 4 \n" + "shrl.qb %[tmp_t2], %[tmp_t7], 4 \n" + "shrl.ph %[tmp_t8], %[tmp_t1], 4 \n" + "shrl.ph %[tmp_t7], %[tmp_t2], 4 \n" + "or %[tmp_t8], %[tmp_t8], %[tmp_t1] \n" + "or %[tmp_t7], %[tmp_t7], %[tmp_t2] \n" + "precr.qb.ph %[tmp_t8], %[tmp_t7], %[tmp_t8] \n" + "sw %[tmp_t8], 0(%[dst_argb4444]) \n" + ".set pop \n" + : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), + [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4), + [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6), + [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9) + : [dst_argb4444] "r"(dst_argb4444), [yg] "r"(yg), [src_u] "r"(src_u), + [src_v] "r"(src_v), [src_y] "r"(src_y), [tmp_ub] "r"(tmp_ub), + [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), + 
[tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), + [tmp_yg] "r"(tmp_yg), [tmp_mask] "r"(tmp_mask)); + src_y += 2; + src_u += 1; + src_v += 1; + dst_argb4444 += 4; // Advance 2 pixels. + } +} + +void I422ToARGB1555Row_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint32 tmp_ub = yuvconstants->kUVToB[0]; + uint32 tmp_ug = yuvconstants->kUVToG[0]; + uint32 tmp_vg = yuvconstants->kUVToG[1]; + uint32 tmp_vr = yuvconstants->kUVToR[1]; + uint32 tmp_bb = yuvconstants->kUVBiasB[0]; + uint32 tmp_bg = yuvconstants->kUVBiasG[0]; + uint32 tmp_br = yuvconstants->kUVBiasR[0]; + uint32 yg = yuvconstants->kYToRgb[0]; + uint32 tmp_yg; + uint32 tmp_mask = 0x80008000; + tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff); + tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff); + tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff); + tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff); + tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001; + tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff); + tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff); + tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001; + yg = yg * 0x0101; + + for (x = 0; x < width - 1; x += 2) { + uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5; + uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9; + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + "lbu %[tmp_t7], 0(%[src_y]) \n" + "lbu %[tmp_t1], 1(%[src_y]) \n" + "mul %[tmp_t7], %[tmp_t7], %[yg] \n" + "mul %[tmp_t1], %[tmp_t1], %[yg] \n" + "lbu %[tmp_t2], 0(%[src_u]) \n" + "lbu %[tmp_t3], 0(%[src_v]) \n" + "replv.ph %[tmp_t2], %[tmp_t2] \n" + "replv.ph %[tmp_t3], %[tmp_t3] \n" + "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n" + "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n" + "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n" + "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n" + "srl %[tmp_t7], %[tmp_t7], 16 \n" + "ins %[tmp_t1], %[tmp_t7], 0, 16 \n" + "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n" + "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n" + "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n" + "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n" + "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n" + "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n" + "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n" + "shra.ph %[tmp_t7], %[tmp_t7], 6 \n" + "shra.ph %[tmp_t8], %[tmp_t8], 6 \n" + "shra.ph %[tmp_t9], %[tmp_t9], 6 \n" + "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n" + "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n" + "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n" + "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n" + "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n" + "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n" + "ins %[tmp_t7], %[tmp_t8], 16, 16 \n" + "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n" + "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n" + "ins %[tmp_t3], %[tmp_t8], 7, 24 \n" + "ins %[tmp_t3], %[tmp_t8], 10, 16 \n" + "ins %[tmp_t3], %[tmp_t8], 13, 8 \n" + "ins %[tmp_t4], %[tmp_t7], 7, 24 \n" + "ins %[tmp_t4], %[tmp_t7], 10, 16 \n" + "ins %[tmp_t4], %[tmp_t7], 13, 8 \n" + "precrq.ph.w %[tmp_t8], %[tmp_t4], %[tmp_t3] \n" + "or %[tmp_t8], %[tmp_t8], %[tmp_mask]\n" + "sw %[tmp_t8], 0(%[dst_argb1555]) \n" + ".set pop \n" + : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), + [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4), + [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6), + [tmp_t7] "=&r"(tmp_t7), [tmp_t8] 
"=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9) + : [dst_argb1555] "r"(dst_argb1555), [yg] "r"(yg), [src_u] "r"(src_u), + [src_v] "r"(src_v), [src_y] "r"(src_y), [tmp_ub] "r"(tmp_ub), + [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), + [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), + [tmp_yg] "r"(tmp_yg), [tmp_mask] "r"(tmp_mask)); + src_y += 2; + src_u += 1; + src_v += 1; + dst_argb1555 += 4; // Advance 2 pixels. + } +} + +void NV12ToARGBRow_DSPR2(const uint8* src_y, + const uint8* src_uv, + uint8* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint32 tmp_ub = yuvconstants->kUVToB[0]; + uint32 tmp_ug = yuvconstants->kUVToG[0]; + uint32 tmp_vg = yuvconstants->kUVToG[1]; + uint32 tmp_vr = yuvconstants->kUVToR[1]; + uint32 tmp_bb = yuvconstants->kUVBiasB[0]; + uint32 tmp_bg = yuvconstants->kUVBiasG[0]; + uint32 tmp_br = yuvconstants->kUVBiasR[0]; + uint32 yg = yuvconstants->kYToRgb[0]; + uint32 tmp_mask = 0x7fff7fff; + uint32 tmp_yg; + tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff); + tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff); + tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff); + tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff); + tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001; + tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff); + tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff); + tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001; + yg = yg * 0x0101; + + for (x = 0; x < width - 1; x += 2) { + uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5; + uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9; + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + "lbu %[tmp_t7], 0(%[src_y]) \n" + "lbu %[tmp_t1], 1(%[src_y]) \n" + "mul %[tmp_t7], %[tmp_t7], %[yg] \n" + "mul %[tmp_t1], %[tmp_t1], %[yg] \n" + "lbu %[tmp_t2], 0(%[src_uv]) \n" + "lbu %[tmp_t3], 1(%[src_uv]) \n" + "replv.ph %[tmp_t2], %[tmp_t2] \n" + "replv.ph %[tmp_t3], %[tmp_t3] \n" + "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n" + "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n" + "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n" + "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n" + "srl %[tmp_t7], %[tmp_t7], 16 \n" + "ins %[tmp_t1], %[tmp_t7], 0, 16 \n" + "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n" + "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n" + "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n" + "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n" + "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n" + "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n" + "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n" + "shra.ph %[tmp_t7], %[tmp_t7], 6 \n" + "shra.ph %[tmp_t8], %[tmp_t8], 6 \n" + "shra.ph %[tmp_t9], %[tmp_t9], 6 \n" + "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n" + "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n" + "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n" + "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n" + "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n" + "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n" + "ins %[tmp_t7], %[tmp_t8], 16, 16 \n" + "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n" + "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n" + "sw %[tmp_t8], 0(%[rgb_buf]) \n" + "sw %[tmp_t7], 4(%[rgb_buf]) \n" + ".set pop \n" + : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), + [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4), + [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6), + [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9) + : [src_y] "r"(src_y), [src_uv] "r"(src_uv), [yg] 
"r"(yg), + [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), + [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), + [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg), [rgb_buf] "r"(rgb_buf), + [tmp_mask] "r"(tmp_mask)); + + src_y += 2; + src_uv += 2; + rgb_buf += 8; // Advance 2 pixels. + } +} + +void BGRAToUVRow_DSPR2(const uint8* src_rgb0, + int src_stride_rgb, + uint8* dst_u, + uint8* dst_v, + int width) { + const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; + int x; + int const1 = 0xffda0000; + int const2 = 0x0070ffb6; + int const3 = 0x00700000; + int const4 = 0xffeeffa2; + int const5 = 0x100; + for (x = 0; x < width - 1; x += 2) { + int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5; + int tmp_t6, tmp_t7, tmp_t8; + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + "lw %[tmp_t1], 0(%[src_rgb0]) \n" + "lw %[tmp_t2], 4(%[src_rgb0]) \n" + "lw %[tmp_t3], 0(%[src_rgb1]) \n" + "lw %[tmp_t4], 4(%[src_rgb1]) \n" + "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n" + "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n" + "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n" + "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n" + "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n" + "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n" + "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n" + "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n" + "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n" + "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n" + "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n" + "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n" + "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n" + "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n" + "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n" + "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n" + "mult $ac0, %[const5], %[const5] \n" + "mult $ac1, %[const5], %[const5] \n" + "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n" + "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n" + "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n" + "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n" + "extr_r.w %[tmp_t7], $ac0, 9 \n" + "extr_r.w %[tmp_t8], $ac1, 9 \n" + "addiu %[dst_u], %[dst_u], 1 \n" + "addiu %[dst_v], %[dst_v], 1 \n" + "addiu %[src_rgb0], %[src_rgb0], 8 \n" + "addiu %[src_rgb1], %[src_rgb1], 8 \n" + "sb %[tmp_t7], -1(%[dst_u]) \n" + "sb %[tmp_t8], -1(%[dst_v]) \n" + ".set pop \n" + : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), + [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4), + [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6), + [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), + [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1), + [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v) + : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3), + [const4] "r"(const4), [const5] "r"(const5) + : "hi", "lo", "$ac1lo", "$ac1hi"); + } +} + +void BGRAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) { + int x; + int const1 = 0x00420000; + int const2 = 0x00190081; + int const5 = 0x40; + for (x = 0; x < width; x += 4) { + int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5; + int tmp_t6, tmp_t7, tmp_t8; + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + "lw %[tmp_t1], 0(%[src_argb0]) \n" + "lw %[tmp_t2], 4(%[src_argb0]) \n" + "lw %[tmp_t3], 8(%[src_argb0]) \n" + "lw %[tmp_t4], 12(%[src_argb0]) \n" + "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n" + "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n" + "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n" + "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n" + "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n" + "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n" + "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n" + "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n" + "mult $ac0, %[const5], %[const5] \n" + "mult $ac1, 
%[const5], %[const5] \n" + "mult $ac2, %[const5], %[const5] \n" + "mult $ac3, %[const5], %[const5] \n" + "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n" + "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n" + "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n" + "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n" + "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n" + "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n" + "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n" + "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n" + "extr_r.w %[tmp_t1], $ac0, 8 \n" + "extr_r.w %[tmp_t2], $ac1, 8 \n" + "extr_r.w %[tmp_t3], $ac2, 8 \n" + "extr_r.w %[tmp_t4], $ac3, 8 \n" + "addiu %[src_argb0],%[src_argb0], 16 \n" + "addiu %[dst_y], %[dst_y], 4 \n" + "sb %[tmp_t1], -4(%[dst_y]) \n" + "sb %[tmp_t2], -3(%[dst_y]) \n" + "sb %[tmp_t3], -2(%[dst_y]) \n" + "sb %[tmp_t4], -1(%[dst_y]) \n" + ".set pop \n" + : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), + [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4), + [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6), + [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), + [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y) + : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5) + : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", + "$ac3hi"); + } +} + +void ABGRToUVRow_DSPR2(const uint8* src_rgb0, + int src_stride_rgb, + uint8* dst_u, + uint8* dst_v, + int width) { + const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; + int x; + int const1 = 0xffb6ffda; + int const2 = 0x00000070; + int const3 = 0xffa20070; + int const4 = 0x0000ffee; + int const5 = 0x100; + + for (x = 0; x < width - 1; x += 2) { + int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5; + int tmp_t6, tmp_t7, tmp_t8; + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + "lw %[tmp_t1], 0(%[src_rgb0]) \n" + "lw %[tmp_t2], 4(%[src_rgb0]) \n" + "lw %[tmp_t3], 0(%[src_rgb1]) \n" + "lw %[tmp_t4], 4(%[src_rgb1]) \n" + "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n" + "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n" + "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n" + "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n" + "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n" + "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n" + "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n" + "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n" + "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n" + "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n" + "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n" + "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n" + "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n" + "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n" + "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n" + "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n" + "mult $ac0, %[const5], %[const5] \n" + "mult $ac1, %[const5], %[const5] \n" + "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n" + "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n" + "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n" + "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n" + "extr_r.w %[tmp_t7], $ac0, 9 \n" + "extr_r.w %[tmp_t8], $ac1, 9 \n" + "addiu %[dst_u], %[dst_u], 1 \n" + "addiu %[dst_v], %[dst_v], 1 \n" + "addiu %[src_rgb0], %[src_rgb0], 8 \n" + "addiu %[src_rgb1], %[src_rgb1], 8 \n" + "sb %[tmp_t7], -1(%[dst_u]) \n" + "sb %[tmp_t8], -1(%[dst_v]) \n" + ".set pop \n" + : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), + [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4), + [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6), + [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), + [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1), + [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v) + : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3), + [const4] "r"(const4), [const5] "r"(const5) + : "hi", "lo", 
"$ac1lo", "$ac1hi"); + } +} + +void ARGBToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) { + int x; + int const1 = 0x00810019; + int const2 = 0x00000042; + int const5 = 0x40; + for (x = 0; x < width; x += 4) { + int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5; + int tmp_t6, tmp_t7, tmp_t8; + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + "lw %[tmp_t1], 0(%[src_argb0]) \n" + "lw %[tmp_t2], 4(%[src_argb0]) \n" + "lw %[tmp_t3], 8(%[src_argb0]) \n" + "lw %[tmp_t4], 12(%[src_argb0]) \n" + "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n" + "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n" + "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n" + "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n" + "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n" + "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n" + "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n" + "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n" + "mult $ac0, %[const5], %[const5] \n" + "mult $ac1, %[const5], %[const5] \n" + "mult $ac2, %[const5], %[const5] \n" + "mult $ac3, %[const5], %[const5] \n" + "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n" + "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n" + "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n" + "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n" + "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n" + "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n" + "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n" + "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n" + "extr_r.w %[tmp_t1], $ac0, 8 \n" + "extr_r.w %[tmp_t2], $ac1, 8 \n" + "extr_r.w %[tmp_t3], $ac2, 8 \n" + "extr_r.w %[tmp_t4], $ac3, 8 \n" + "addiu %[dst_y], %[dst_y], 4 \n" + "addiu %[src_argb0],%[src_argb0], 16 \n" + "sb %[tmp_t1], -4(%[dst_y]) \n" + "sb %[tmp_t2], -3(%[dst_y]) \n" + "sb %[tmp_t3], -2(%[dst_y]) \n" + "sb %[tmp_t4], -1(%[dst_y]) \n" + ".set pop \n" + : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), + [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4), + [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6), + [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), + [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y) + : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5) + : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", + "$ac3hi"); + } +} + +void ABGRToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) { + int x; + int const1 = 0x00810042; + int const2 = 0x00000019; + int const5 = 0x40; + for (x = 0; x < width; x += 4) { + int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5; + int tmp_t6, tmp_t7, tmp_t8; + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + "lw %[tmp_t1], 0(%[src_argb0]) \n" + "lw %[tmp_t2], 4(%[src_argb0]) \n" + "lw %[tmp_t3], 8(%[src_argb0]) \n" + "lw %[tmp_t4], 12(%[src_argb0]) \n" + "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n" + "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n" + "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n" + "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n" + "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n" + "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n" + "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n" + "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n" + "mult $ac0, %[const5], %[const5] \n" + "mult $ac1, %[const5], %[const5] \n" + "mult $ac2, %[const5], %[const5] \n" + "mult $ac3, %[const5], %[const5] \n" + "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n" + "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n" + "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n" + "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n" + "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n" + "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n" + "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n" + "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n" + "extr_r.w %[tmp_t1], $ac0, 8 \n" + "extr_r.w %[tmp_t2], $ac1, 8 \n" + "extr_r.w %[tmp_t3], $ac2, 8 \n" + 
"extr_r.w %[tmp_t4], $ac3, 8 \n" + "addiu %[src_argb0],%[src_argb0], 16 \n" + "addiu %[dst_y], %[dst_y], 4 \n" + "sb %[tmp_t1], -4(%[dst_y]) \n" + "sb %[tmp_t2], -3(%[dst_y]) \n" + "sb %[tmp_t3], -2(%[dst_y]) \n" + "sb %[tmp_t4], -1(%[dst_y]) \n" + ".set pop \n" + : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), + [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4), + [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6), + [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), + [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y) + : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5) + : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", + "$ac3hi"); + } +} + +void RGBAToUVRow_DSPR2(const uint8* src_rgb0, + int src_stride_rgb, + uint8* dst_u, + uint8* dst_v, + int width) { + const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; + int x; + int const1 = 0xffb60070; + int const2 = 0x0000ffda; + int const3 = 0xffa2ffee; + int const4 = 0x00000070; + int const5 = 0x100; + + for (x = 0; x < width - 1; x += 2) { + int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5; + int tmp_t6, tmp_t7, tmp_t8; + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + "ulw %[tmp_t1], 0+1(%[src_rgb0]) \n" + "ulw %[tmp_t2], 4+1(%[src_rgb0]) \n" + "ulw %[tmp_t3], 0+1(%[src_rgb1]) \n" + "ulw %[tmp_t4], 4+1(%[src_rgb1]) \n" + "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n" + "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n" + "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n" + "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n" + "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n" + "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n" + "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n" + "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n" + "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n" + "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n" + "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n" + "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n" + "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n" + "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n" + "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n" + "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n" + "mult $ac0, %[const5], %[const5] \n" + "mult $ac1, %[const5], %[const5] \n" + "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n" + "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n" + "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n" + "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n" + "extr_r.w %[tmp_t7], $ac0, 9 \n" + "extr_r.w %[tmp_t8], $ac1, 9 \n" + "addiu %[src_rgb0], %[src_rgb0], 8 \n" + "addiu %[src_rgb1], %[src_rgb1], 8 \n" + "addiu %[dst_u], %[dst_u], 1 \n" + "addiu %[dst_v], %[dst_v], 1 \n" + "sb %[tmp_t7], -1(%[dst_u]) \n" + "sb %[tmp_t8], -1(%[dst_v]) \n" + ".set pop \n" + : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), + [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4), + [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6), + [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), + [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1), + [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v) + : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3), + [const4] "r"(const4), [const5] "r"(const5) + : "hi", "lo", "$ac1lo", "$ac1hi"); + } +} + +void RGBAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) { + int x; + int const1 = 0x00420081; + int const2 = 0x00190000; + int const5 = 0x40; + for (x = 0; x < width; x += 4) { + int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5; + int tmp_t6, tmp_t7, tmp_t8; + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + "lw %[tmp_t1], 0(%[src_argb0]) \n" + "lw %[tmp_t2], 4(%[src_argb0]) \n" + "lw %[tmp_t3], 8(%[src_argb0]) \n" + "lw %[tmp_t4], 12(%[src_argb0]) \n" + "preceu.ph.qbl %[tmp_t5], 
%[tmp_t1] \n" + "preceu.ph.qbr %[tmp_t1], %[tmp_t1] \n" + "preceu.ph.qbl %[tmp_t6], %[tmp_t2] \n" + "preceu.ph.qbr %[tmp_t2], %[tmp_t2] \n" + "preceu.ph.qbl %[tmp_t7], %[tmp_t3] \n" + "preceu.ph.qbr %[tmp_t3], %[tmp_t3] \n" + "preceu.ph.qbl %[tmp_t8], %[tmp_t4] \n" + "preceu.ph.qbr %[tmp_t4], %[tmp_t4] \n" + "mult $ac0, %[const5], %[const5] \n" + "mult $ac1, %[const5], %[const5] \n" + "mult $ac2, %[const5], %[const5] \n" + "mult $ac3, %[const5], %[const5] \n" + "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n" + "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n" + "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n" + "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n" + "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n" + "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n" + "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n" + "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n" + "extr_r.w %[tmp_t1], $ac0, 8 \n" + "extr_r.w %[tmp_t2], $ac1, 8 \n" + "extr_r.w %[tmp_t3], $ac2, 8 \n" + "extr_r.w %[tmp_t4], $ac3, 8 \n" + "addiu %[dst_y], %[dst_y], 4 \n" + "addiu %[src_argb0],%[src_argb0], 16 \n" + "sb %[tmp_t1], -4(%[dst_y]) \n" + "sb %[tmp_t2], -3(%[dst_y]) \n" + "sb %[tmp_t3], -2(%[dst_y]) \n" + "sb %[tmp_t4], -1(%[dst_y]) \n" + ".set pop \n" + : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), + [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4), + [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6), + [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), + [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y) + : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5) + : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", + "$ac3hi"); + } +} + +void ARGBToUVRow_DSPR2(const uint8* src_rgb0, + int src_stride_rgb, + uint8* dst_u, + uint8* dst_v, + int width) { + const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; + int x; + int const1 = 0xffb60070; + int const2 = 0x0000ffda; + int const3 = 0xffa2ffee; + int const4 = 0x00000070; + int const5 = 0x100; + + for (x = 0; x < width - 1; x += 2) { + int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5; + int tmp_t6, tmp_t7, tmp_t8; + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + "lw %[tmp_t1], 0(%[src_rgb0]) \n" + "lw %[tmp_t2], 4(%[src_rgb0]) \n" + "lw %[tmp_t3], 0(%[src_rgb1]) \n" + "lw %[tmp_t4], 4(%[src_rgb1]) \n" + "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n" + "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n" + "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n" + "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n" + "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n" + "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n" + "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n" + "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n" + "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n" + "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n" + "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n" + "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n" + "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n" + "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n" + "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n" + "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n" + "mult $ac0, %[const5], %[const5] \n" + "mult $ac1, %[const5], %[const5] \n" + "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n" + "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n" + "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n" + "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n" + "extr_r.w %[tmp_t7], $ac0, 9 \n" + "extr_r.w %[tmp_t8], $ac1, 9 \n" + "addiu %[src_rgb0], %[src_rgb0], 8 \n" + "addiu %[src_rgb1], %[src_rgb1], 8 \n" + "addiu %[dst_u], %[dst_u], 1 \n" + "addiu %[dst_v], %[dst_v], 1 \n" + "sb %[tmp_t7], -1(%[dst_u]) \n" + "sb %[tmp_t8], -1(%[dst_v]) \n" + ".set pop \n" + : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), + [tmp_t3] 
"=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4), + [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6), + [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), + [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1), + [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v) + : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3), + [const4] "r"(const4), [const5] "r"(const5) + : "hi", "lo", "$ac1lo", "$ac1hi"); + } +} + +#endif // __mips_dsp_rev >= 2 + +#endif // defined(__mips__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/files/source/row_gcc.cc b/files/source/row_gcc.cc index 1ac7ef1a..8735070b 100644 --- a/files/source/row_gcc.cc +++ b/files/source/row_gcc.cc @@ -1,4 +1,3 @@ -// VERSION 2 /* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * @@ -23,165 +22,133 @@ extern "C" { #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) // Constants for ARGB -static vec8 kARGBToY = { - 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 -}; +static vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, + 13, 65, 33, 0, 13, 65, 33, 0}; // JPeg full range. -static vec8 kARGBToYJ = { - 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 -}; +static vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, + 15, 75, 38, 0, 15, 75, 38, 0}; #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) -static vec8 kARGBToU = { - 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 -}; +static vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, + 112, -74, -38, 0, 112, -74, -38, 0}; -static vec8 kARGBToUJ = { - 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 -}; +static vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, + 127, -84, -43, 0, 127, -84, -43, 0}; static vec8 kARGBToV = { - -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, }; -static vec8 kARGBToVJ = { - -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 -}; +static vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, + -20, -107, 127, 0, -20, -107, 127, 0}; // Constants for BGRA -static vec8 kBGRAToY = { - 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 -}; +static vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, + 0, 33, 65, 13, 0, 33, 65, 13}; -static vec8 kBGRAToU = { - 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 -}; +static vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, + 0, -38, -74, 112, 0, -38, -74, 112}; -static vec8 kBGRAToV = { - 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 -}; +static vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, + 0, 112, -94, -18, 0, 112, -94, -18}; // Constants for ABGR -static vec8 kABGRToY = { - 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 -}; +static vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, + 33, 65, 13, 0, 33, 65, 13, 0}; -static vec8 kABGRToU = { - -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 -}; +static vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, + -38, -74, 112, 0, -38, -74, 112, 0}; -static vec8 kABGRToV = { - 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 -}; +static vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, + 112, -94, -18, 0, 112, -94, -18, 0}; // Constants for RGBA. 
-static vec8 kRGBAToY = { - 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 -}; +static vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, + 0, 13, 65, 33, 0, 13, 65, 33}; -static vec8 kRGBAToU = { - 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 -}; +static vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, + 0, 112, -74, -38, 0, 112, -74, -38}; -static vec8 kRGBAToV = { - 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 -}; +static vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, + 0, -18, -94, 112, 0, -18, -94, 112}; -static uvec8 kAddY16 = { - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u -}; +static uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; // 7 bit fixed point 0.5. -static vec16 kAddYJ64 = { - 64, 64, 64, 64, 64, 64, 64, 64 -}; +static vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; -static uvec8 kAddUV128 = { - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; +static uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; -static uvec16 kAddUVJ128 = { - 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u -}; +static uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, + 0x8080u, 0x8080u, 0x8080u, 0x8080u}; #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) #ifdef HAS_RGB24TOARGBROW_SSSE3 // Shuffle table for converting RGB24 to ARGB. -static uvec8 kShuffleMaskRGB24ToARGB = { - 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u -}; +static uvec8 kShuffleMaskRGB24ToARGB = {0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, + 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; // Shuffle table for converting RAW to ARGB. -static uvec8 kShuffleMaskRAWToARGB = { - 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u -}; +static uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, + 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; // Shuffle table for converting RAW to RGB24. First 8. static const uvec8 kShuffleMaskRAWToRGB24_0 = { - 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting RAW to RGB24. Middle 8. static const uvec8 kShuffleMaskRAWToRGB24_1 = { - 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting RAW to RGB24. Last 8. static const uvec8 kShuffleMaskRAWToRGB24_2 = { - 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGB to RGB24. static uvec8 kShuffleMaskARGBToRGB24 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u -}; + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGB to RAW. static uvec8 kShuffleMaskARGBToRAW = { - 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u -}; + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. 
First 8 + next 4 static uvec8 kShuffleMaskARGBToRGB24_0 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u -}; + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; // YUY2 shuf 16 Y to 32 Y. -static const lvec8 kShuffleYUY2Y = { - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 -}; +static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, + 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, + 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; // YUY2 shuf 8 UV to 16 UV. -static const lvec8 kShuffleYUY2UV = { - 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15, - 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15 -}; +static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, + 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, + 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; // UYVY shuf 16 Y to 32 Y. -static const lvec8 kShuffleUYVYY = { - 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15, - 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 -}; +static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, + 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, + 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; // UYVY shuf 8 UV to 16 UV. -static const lvec8 kShuffleUYVYUV = { - 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14, - 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 -}; +static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, + 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, + 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; // NV21 shuf 8 VU to 16 UV. static const lvec8 kShuffleNV21 = { - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, }; #endif // HAS_RGB24TOARGBROW_SSSE3 @@ -191,7 +158,7 @@ void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) { "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" "movq " MEMACCESS(0) ",%%xmm0 \n" "lea " MEMLEA(0x8,0) ",%0 \n" "punpcklbw %%xmm0,%%xmm0 \n" @@ -220,7 +187,7 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) { "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" @@ -258,7 +225,7 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) { "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" @@ -296,7 +263,7 @@ void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) { "movdqa %4,%%xmm4 \n" "movdqa %5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x4,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x8,0) ",%%xmm2 \n" @@ -338,7 +305,7 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) { "sub %0,%1 \n" "sub %0,%1 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" @@ -385,7 +352,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) { "sub %0,%1 \n" "sub %0,%1 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" @@ -429,7 +396,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, 
int width) { "sub %0,%1 \n" "sub %0,%1 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm2 \n" "pand %%xmm4,%%xmm0 \n" @@ -461,7 +428,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) { asm volatile ( "movdqa %3,%%xmm6 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" @@ -499,7 +466,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) { asm volatile ( "movdqa %3,%%xmm6 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" @@ -543,7 +510,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) { "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0xb,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" @@ -569,98 +536,99 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) { ); } -void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst, - const uint32 dither4, int width) { - asm volatile ( - "movd %3,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm6 \n" - "movdqa %%xmm6,%%xmm7 \n" - "punpcklwd %%xmm6,%%xmm6 \n" - "punpckhwd %%xmm7,%%xmm7 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "paddusb %%xmm6,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(dither4) // %3 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void ARGBToRGB565DitherRow_SSE2(const uint8* src, + uint8* dst, + const uint32 dither4, + int width) { + asm volatile( + "movd %3,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm6 \n" + "movdqa %%xmm6,%%xmm7 \n" + "punpcklwd %%xmm6,%%xmm6 \n" + "punpckhwd %%xmm7,%%xmm7 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "paddusb %%xmm6,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(dither4) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 -void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst, - const uint32 dither4, int width) { - asm volatile ( - "vbroadcastss %3,%%xmm6 \n" - "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" - "vpermq 
$0xd8,%%ymm6,%%ymm6 \n" - "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" - "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" - "vpsrld $0x1b,%%ymm3,%%ymm3 \n" - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrld $0x1a,%%ymm4,%%ymm4 \n" - "vpslld $0x5,%%ymm4,%%ymm4 \n" - "vpslld $0xb,%%ymm3,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" - "vpsrld $0x5,%%ymm0,%%ymm2 \n" - "vpsrld $0x3,%%ymm0,%%ymm1 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" - "vpand %%ymm4,%%ymm2,%%ymm2 \n" - "vpand %%ymm3,%%ymm1,%%ymm1 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpor %%ymm2,%%ymm1,%%ymm1 \n" - "vpor %%ymm1,%%ymm0,%%ymm0 \n" - "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "lea 0x20(%0),%0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(dither4) // %3 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void ARGBToRGB565DitherRow_AVX2(const uint8* src, + uint8* dst, + const uint32 dither4, + int width) { + asm volatile( + "vbroadcastss %3,%%xmm6 \n" + "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" + "vpermq $0xd8,%%ymm6,%%ymm6 \n" + "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" + "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" + "vpsrld $0x1b,%%ymm3,%%ymm3 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $0x1a,%%ymm4,%%ymm4 \n" + "vpslld $0x5,%%ymm4,%%ymm4 \n" + "vpslld $0xb,%%ymm3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" + "vpsrld $0x5,%%ymm0,%%ymm2 \n" + "vpsrld $0x3,%%ymm0,%%ymm1 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + "vpand %%ymm4,%%ymm2,%%ymm2 \n" + "vpand %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpor %%ymm2,%%ymm1,%%ymm1 \n" + "vpor %%ymm1,%%ymm0,%%ymm0 \n" + "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(dither4) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBTORGB565DITHERROW_AVX2 - void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) { asm volatile ( "pcmpeqb %%xmm4,%%xmm4 \n" @@ -671,8 +639,9 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) { "pslld $0xa,%%xmm6 \n" "pcmpeqb %%xmm7,%%xmm7 \n" "pslld $0xf,%%xmm7 \n" + LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" @@ -708,8 +677,9 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) { "psllw $0xc,%%xmm4 \n" "movdqa %%xmm4,%%xmm3 \n" "psrlw $0x8,%%xmm3 \n" + LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "pand %%xmm3,%%xmm0 \n" @@ -737,8 +707,9 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { asm volatile ( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" @@ -775,8 +746,9 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { asm volatile ( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" @@ -809,9 +781,7 @@ void 
ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { #ifdef HAS_ARGBTOYROW_AVX2 // vpermd for vphaddw + vpackuswb vpermd. -static const lvec32 kPermdARGBToY_AVX = { - 0, 4, 1, 5, 2, 6, 3, 7 -}; +static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { @@ -819,8 +789,9 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" + LABELALIGN - "1: \n" + "1: \n" "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" @@ -860,8 +831,9 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" + LABELALIGN - "1: \n" + "1: \n" "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" @@ -896,15 +868,19 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { #endif // HAS_ARGBTOYJROW_AVX2 #ifdef HAS_ARGBTOUVROW_SSSE3 -void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +void ARGBToUVRow_SSSE3(const uint8* src_argb0, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" + LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" @@ -961,18 +937,21 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, #ifdef HAS_ARGBTOUVROW_AVX2 // vpshufb for vphaddw + vpackuswb packed to shorts. 
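Several constants in this hunk — the kShuffleMask* tables above, kShuffleNV21, and the kShufARGBToUV_AVX table that follows — are operands for pshufb/vpshufb, which selects each output byte by index and zeroes any position whose mask byte has bit 7 set (the 128u entries). A minimal sketch of one 128-bit lane in plain C (the function name is illustrative):

#include <stdint.h>

/* Emulates pshufb for one 16-byte lane:
 * dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 15]. */
static void PshufbLane(const uint8_t src[16], const uint8_t mask[16],
                       uint8_t dst[16]) {
  int i;
  for (i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 15];
  }
}

With kShuffleMaskRGB24ToARGB, for instance, indices 0-2 move a packed RGB triple into place and the surrounding asm then ORs in a 0xff alpha plane built with pcmpeqb/pslld. vpshufb applies the same operation independently to each 128-bit lane, which is why the AVX table repeats its 16-byte pattern twice.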
static const lvec8 kShufARGBToUV_AVX = { - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 -}; -void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; +void ARGBToUVRow_AVX2(const uint8* src_argb0, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" + "sub %1,%2 \n" + LABELALIGN - "1: \n" + "1: \n" "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" @@ -981,7 +960,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2) VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3) - "lea " MEMLEA(0x80,0) ",%0 \n" + "lea " MEMLEA(0x80,0) ",%0 \n" "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" @@ -1004,9 +983,9 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n" VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 @@ -1024,15 +1003,19 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, #endif // HAS_ARGBTOUVROW_AVX2 #ifdef HAS_ARGBTOUVJROW_AVX2 -void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +void ARGBToUVJRow_AVX2(const uint8* src_argb0, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" + "sub %1,%2 \n" + LABELALIGN - "1: \n" + "1: \n" "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" @@ -1085,15 +1068,19 @@ void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, #endif // HAS_ARGBTOUVJROW_AVX2 #ifdef HAS_ARGBTOUVJROW_SSSE3 -void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +void ARGBToUVJRow_SSSE3(const uint8* src_argb0, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" + LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" @@ -1149,15 +1136,18 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, #endif // HAS_ARGBTOUVJROW_SSSE3 #ifdef HAS_ARGBTOUV444ROW_SSSE3 -void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, +void ARGBToUV444Row_SSSE3(const uint8* src_argb, + uint8* dst_u, + uint8* dst_v, int width) { asm volatile ( "movdqa %4,%%xmm3 \n" "movdqa %5,%%xmm4 \n" "movdqa %6,%%xmm5 \n" "sub %1,%2 \n" + LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" @@ -1209,8 +1199,9 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, 
int width) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" + LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" @@ -1239,15 +1230,19 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) { ); } -void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width) { +void BGRAToUVRow_SSSE3(const uint8* src_bgra0, + int src_stride_bgra, + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" + LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" @@ -1304,8 +1299,9 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" + LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" @@ -1338,8 +1334,9 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" + LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" @@ -1368,15 +1365,19 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) { ); } -void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width) { +void ABGRToUVRow_SSSE3(const uint8* src_abgr0, + int src_stride_abgr, + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" + LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" @@ -1429,15 +1430,19 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, ); } -void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width) { +void RGBAToUVRow_SSSE3(const uint8* src_rgba0, + int src_stride_rgba, + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" + LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" @@ -1493,8 +1498,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) // Read 8 UV from 444 -#define READYUV444 \ - "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ +#define READYUV444 \ + "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ "punpcklbw %%xmm1,%%xmm0 \n" \ @@ -1503,8 +1508,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV -#define READYUV422 \ - "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ +#define READYUV422 \ + "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ "punpcklbw %%xmm1,%%xmm0 \n" \ @@ -1514,8 +1519,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "lea " 
MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 \ - "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ +#define READYUVA422 \ + "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ "punpcklbw %%xmm1,%%xmm0 \n" \ @@ -1526,29 +1531,9 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \ "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n" -// Read 2 UV from 411, upsample to 8 UV. -// reading 4 bytes is an msan violation. -// "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" -// MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) -// pinsrw fails with drmemory -// __asm pinsrw xmm0, [esi], 0 /* U */ -// __asm pinsrw xmm1, [esi + edi], 0 /* V */ -#define READYUV411_TEMP \ - "movzwl " MEMACCESS([u_buf]) ",%[temp] \n" \ - "movd %[temp],%%xmm0 \n" \ - MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) " \n" \ - "movd %[temp],%%xmm1 \n" \ - "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "punpckldq %%xmm0,%%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" - // Read 4 UV from NV12, upsample to 8 UV -#define READNV12 \ - "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ +#define READNV12 \ + "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ "punpcklwd %%xmm0,%%xmm0 \n" \ "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ @@ -1556,8 +1541,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" // Read 4 VU from NV21, upsample to 8 UV -#define READNV21 \ - "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ +#define READNV21 \ + "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \ "pshufb %[kShuffleNV21], %%xmm0 \n" \ "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ @@ -1565,24 +1550,24 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. -#define READYUY2 \ - "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \ +#define READYUY2 \ + "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \ "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \ "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n" // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. 
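READYUY2 and READUYVY above (and their _AVX2 variants later in this file) split packed 4:2:2 macropixels with two shuffles: one mask gathers the Y bytes, duplicating each into both halves of a 16-bit lane (effectively Y * 257, which is what the pmulhuw against the Y gain in YUVTORGB expects), and the other gathers the shared U/V pair for both pixels. The byte layouts being picked apart, as a plain-C sketch (helper name illustrative):

#include <stdint.h>

/* One 4:2:2 macropixel is 4 bytes covering 2 pixels:
 *   YUY2: Y0 U Y1 V        UYVY: U Y0 V Y1  */
static void UnpackYUY2Pair(const uint8_t p[4],
                           uint8_t y[2], uint8_t* u, uint8_t* v) {
  y[0] = p[0]; /* kShuffleYUY2Y picks 0,0,2,2,... (Y duplicated) */
  y[1] = p[2];
  *u = p[1];   /* kShuffleYUY2UV picks 1,3,1,3,... so the single */
  *v = p[3];   /* U/V pair is replicated for both pixels.        */
}

For UYVY the same idea shifts by one byte: kShuffleUYVYY picks 1,1,3,3,... and kShuffleUYVYUV picks 0,2,0,2,...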
-#define READUYVY \ - "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \ +#define READUYVY \ + "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \ "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \ "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n" #if defined(__x86_64__) -#define YUVTORGB_SETUP(yuvconstants) \ - "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \ +#define YUVTORGB_SETUP(yuvconstants) \ + "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \ "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \ "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \ "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \ @@ -1590,37 +1575,37 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \ "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n" // Convert 8 pixels: 8 UV and 8 Y -#define YUVTORGB(yuvconstants) \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm3 \n" \ - "movdqa %%xmm11,%%xmm0 \n" \ - "pmaddubsw %%xmm8,%%xmm1 \n" \ - "psubw %%xmm1,%%xmm0 \n" \ - "movdqa %%xmm12,%%xmm1 \n" \ - "pmaddubsw %%xmm9,%%xmm2 \n" \ - "psubw %%xmm2,%%xmm1 \n" \ - "movdqa %%xmm13,%%xmm2 \n" \ - "pmaddubsw %%xmm10,%%xmm3 \n" \ - "psubw %%xmm3,%%xmm2 \n" \ - "pmulhuw %%xmm14,%%xmm4 \n" \ - "paddsw %%xmm4,%%xmm0 \n" \ - "paddsw %%xmm4,%%xmm1 \n" \ - "paddsw %%xmm4,%%xmm2 \n" \ - "psraw $0x6,%%xmm0 \n" \ - "psraw $0x6,%%xmm1 \n" \ - "psraw $0x6,%%xmm2 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "packuswb %%xmm1,%%xmm1 \n" \ - "packuswb %%xmm2,%%xmm2 \n" +#define YUVTORGB(yuvconstants) \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "movdqa %%xmm11,%%xmm0 \n" \ + "pmaddubsw %%xmm8,%%xmm1 \n" \ + "psubw %%xmm1,%%xmm0 \n" \ + "movdqa %%xmm12,%%xmm1 \n" \ + "pmaddubsw %%xmm9,%%xmm2 \n" \ + "psubw %%xmm2,%%xmm1 \n" \ + "movdqa %%xmm13,%%xmm2 \n" \ + "pmaddubsw %%xmm10,%%xmm3 \n" \ + "psubw %%xmm3,%%xmm2 \n" \ + "pmulhuw %%xmm14,%%xmm4 \n" \ + "paddsw %%xmm4,%%xmm0 \n" \ + "paddsw %%xmm4,%%xmm1 \n" \ + "paddsw %%xmm4,%%xmm2 \n" \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" #define YUVTORGB_REGS \ - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", #else #define YUVTORGB_SETUP(yuvconstants) // Convert 8 pixels: 8 UV and 8 Y -#define YUVTORGB(yuvconstants) \ - "movdqa %%xmm0,%%xmm1 \n" \ +#define YUVTORGB(yuvconstants) \ + "movdqa %%xmm0,%%xmm1 \n" \ "movdqa %%xmm0,%%xmm2 \n" \ "movdqa %%xmm0,%%xmm3 \n" \ "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \ @@ -1646,8 +1631,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, #endif // Store 8 ARGB values. -#define STOREARGB \ - "punpcklbw %%xmm1,%%xmm0 \n" \ +#define STOREARGB \ + "punpcklbw %%xmm1,%%xmm0 \n" \ "punpcklbw %%xmm5,%%xmm2 \n" \ "movdqa %%xmm0,%%xmm1 \n" \ "punpcklwd %%xmm2,%%xmm0 \n" \ @@ -1657,8 +1642,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" // Store 8 RGBA values. 
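The YUVTORGB macro above computes each channel as a precomputed bias minus a UV dot product, plus the scaled Y term, using saturating adds and a 6-bit arithmetic shift; the biases, UV coefficients, and Y gain come from the YuvConstants block that YUVTORGB_SETUP caches in xmm8-xmm14 on x86_64. Numerically this is the familiar BT.601 studio-swing conversion. A scalar sketch with the standard 8-bit-shift constants (the SIMD code factors the same transform into 6-bit fixed point with precomputed biases, so these constants are equivalent rather than identical):

#include <stdint.h>

static uint8_t Clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Standard integer BT.601 YUV -> RGB (studio swing). */
static void YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
                     uint8_t* r, uint8_t* g, uint8_t* b) {
  int c = y - 16, d = u - 128, e = v - 128;
  *r = Clamp255((298 * c + 409 * e + 128) >> 8);
  *g = Clamp255((298 * c - 100 * d - 208 * e + 128) >> 8);
  *b = Clamp255((298 * c + 516 * d + 128) >> 8);
}

In the asm the clamp falls out of the saturating paddsw/packuswb sequence rather than explicit compares, and STOREARGB then interleaves the B, G, R and alpha registers into packed pixels with punpcklbw/punpcklwd.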
-#define STORERGBA \ - "pcmpeqb %%xmm5,%%xmm5 \n" \ +#define STORERGBA \ + "pcmpeqb %%xmm5,%%xmm5 \n" \ "punpcklbw %%xmm2,%%xmm1 \n" \ "punpcklbw %%xmm0,%%xmm5 \n" \ "movdqa %%xmm5,%%xmm0 \n" \ @@ -1678,8 +1663,9 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUV444 YUVTORGB(yuvconstants) STOREARGB @@ -1707,8 +1693,9 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" "sub %[u_buf],%[v_buf] \n" + LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB(yuvconstants) "punpcklbw %%xmm1,%%xmm0 \n" @@ -1728,7 +1715,7 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] -#if defined(__i386__) && defined(__pic__) +#if defined(__i386__) [width]"+m"(width) // %[width] #else [width]"+rm"(width) // %[width] @@ -1751,8 +1738,9 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB(yuvconstants) STOREARGB @@ -1777,11 +1765,13 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" + LABELALIGN - "1: \n" + "1: \n" READYUVA422 YUVTORGB(yuvconstants) STOREARGB @@ -1792,7 +1782,7 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, [v_buf]"+r"(v_buf), // %[v_buf] [a_buf]"+r"(a_buf), // %[a_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) && defined(__pic__) +#if defined(__i386__) [width]"+m"(width) // %[width] #else [width]"+rm"(width) // %[width] @@ -1801,55 +1791,22 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, : "memory", "cc", NACL_R14 YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_I422ALPHATOARGBROW_SSSE3 -#ifdef HAS_I411TOARGBROW_SSSE3 -void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int temp; - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READYUV411_TEMP - YUVTORGB(yuvconstants) - STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [temp]"=&r"(temp), // %[temp] -#if defined(__i386__) && defined(__pic__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif - void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, const uint8* uv_buf, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READNV12 YUVTORGB(yuvconstants) STOREARGB @@ -1863,6 +1820,7 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, : "memory", "cc", YUVTORGB_REGS // Does not use r14. 
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, @@ -1870,11 +1828,13 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READNV21 YUVTORGB(yuvconstants) STOREARGB @@ -1889,17 +1849,20 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, : "memory", "cc", YUVTORGB_REGS // Does not use r14. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUY2 YUVTORGB(yuvconstants) STOREARGB @@ -1914,17 +1877,20 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf, : "memory", "cc", YUVTORGB_REGS // Does not use r14. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READUYVY YUVTORGB(yuvconstants) STOREARGB @@ -1939,6 +1905,7 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf, : "memory", "cc", YUVTORGB_REGS // Does not use r14. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, @@ -1951,8 +1918,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB(yuvconstants) STORERGBA @@ -1972,8 +1940,8 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, #endif // HAS_I422TOARGBROW_SSSE3 // Read 16 UV from 444 -#define READYUV444_AVX2 \ - "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \ +#define READYUV444_AVX2 \ + "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \ MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \ "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ @@ -1985,8 +1953,8 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" // Read 8 UV from 422, upsample to 16 UV. -#define READYUV422_AVX2 \ - "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ +#define READYUV422_AVX2 \ + "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ @@ -1998,8 +1966,8 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. -#define READYUVA422_AVX2 \ - "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ +#define READYUVA422_AVX2 \ + "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ @@ -2013,23 +1981,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n" -// Read 4 UV from 411, upsample to 16 UV. 
-#define READYUV411_AVX2 \ - "vmovd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" - // Read 8 UV from NV12, upsample to 16 UV. -#define READNV12_AVX2 \ - "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ +#define READNV12_AVX2 \ + "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ @@ -2039,8 +1993,8 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" // Read 8 VU from NV21, upsample to 16 UV. -#define READNV21_AVX2 \ - "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ +#define READNV21_AVX2 \ + "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \ @@ -2050,53 +2004,57 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. -#define READYUY2_AVX2 \ - "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \ +#define READYUY2_AVX2 \ + "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \ "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \ "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n" // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. 
-#define READUYVY_AVX2 \ - "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \ +#define READUYVY_AVX2 \ + "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \ "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \ "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n" #if defined(__x86_64__) -#define YUVTORGB_SETUP_AVX2(yuvconstants) \ - "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \ +#define YUVTORGB_SETUP_AVX2(yuvconstants) \ + "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \ "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \ "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \ "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \ "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \ "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \ "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n" -#define YUVTORGB_AVX2(yuvconstants) \ - "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ - "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \ - "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \ - "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \ - "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \ - "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \ - "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \ - "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ - "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ - "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ - "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ - "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ - "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ - "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" + +#define YUVTORGB_AVX2(yuvconstants) \ + "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ + "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \ + "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \ + "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \ + "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \ + "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \ + "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \ + "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ + "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ + "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ + "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ + "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ + "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" + #define YUVTORGB_REGS_AVX2 \ - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + #else // Convert 16 pixels: 16 UV and 16 Y. + #define YUVTORGB_SETUP_AVX2(yuvconstants) -#define YUVTORGB_AVX2(yuvconstants) \ - "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \ +#define YUVTORGB_AVX2(yuvconstants) \ + "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \ "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \ "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \ "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \ @@ -2119,8 +2077,8 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, #endif // Store 16 ARGB values. 
-#define STOREARGB_AVX2 \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ +#define STOREARGB_AVX2 \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ @@ -2143,8 +2101,9 @@ void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf, YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUV444_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2163,39 +2122,6 @@ void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf, } #endif // HAS_I444TOARGBROW_AVX2 -#ifdef HAS_I411TOARGBROW_AVX2 -// 16 pixels -// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - READYUV411_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif // HAS_I411TOARGBROW_AVX2 - #if defined(HAS_I422TOARGBROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). @@ -2209,13 +2135,15 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUV422_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] @@ -2233,17 +2161,19 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. 
void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" + LABELALIGN - "1: \n" + "1: \n" READYUVA422_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2255,7 +2185,7 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, [v_buf]"+r"(v_buf), // %[v_buf] [a_buf]"+r"(a_buf), // %[a_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) && defined(__pic__) +#if defined(__i386__) [width]"+m"(width) // %[width] #else [width]"+rm"(width) // %[width] @@ -2264,6 +2194,7 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_I422ALPHATOARGBROW_AVX2 @@ -2280,8 +2211,9 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUV422_AVX2 YUVTORGB_AVX2(yuvconstants) @@ -2318,11 +2250,13 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READNV12_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2337,6 +2271,7 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_NV12TOARGBROW_AVX2 @@ -2348,11 +2283,13 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READNV21_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2368,6 +2305,7 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_NV21TOARGBROW_AVX2 @@ -2378,11 +2316,13 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUY2_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2398,6 +2338,7 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. 
"xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_YUY2TOARGBROW_AVX2 @@ -2408,11 +2349,13 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READUYVY_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2428,6 +2371,7 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_UYVYTOARGBROW_AVX2 @@ -2442,8 +2386,9 @@ void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { "pshufd $0x0,%%xmm3,%%xmm3 \n" "pcmpeqb %%xmm4,%%xmm4 \n" "pslld $0x18,%%xmm4 \n" + LABELALIGN - "1: \n" + "1: \n" // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 "movq " MEMACCESS(0) ",%%xmm0 \n" "lea " MEMLEA(0x8,0) ",%0 \n" @@ -2491,7 +2436,7 @@ void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) { "vpslld $0x18,%%ymm4,%%ymm4 \n" LABELALIGN - "1: \n" + "1: \n" // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 "vmovdqu " MEMACCESS(0) ",%%xmm0 \n" "lea " MEMLEA(0x10,0) ",%0 \n" @@ -2525,16 +2470,16 @@ void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) { #ifdef HAS_MIRRORROW_SSSE3 // Shuffle table for reversing the bytes. -static uvec8 kShuffleMirror = { - 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u -}; +static uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile ( "movdqa %3,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 "pshufb %%xmm5,%%xmm0 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" @@ -2556,8 +2501,9 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile ( "vbroadcastf128 %3,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" "vpermq $0x4e,%%ymm0,%%ymm0 \n" @@ -2578,18 +2524,20 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { #ifdef HAS_MIRRORUVROW_SSSE3 // Shuffle table for reversing the bytes of UV channels. 
-static uvec8 kShuffleMirrorUV = { - 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u -}; -void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, +static uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, + 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; +void MirrorUVRow_SSSE3(const uint8* src, + uint8* dst_u, + uint8* dst_v, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile ( "movdqa %4,%%xmm1 \n" "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n" "sub %1,%2 \n" + LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "lea " MEMLEA(-0x10,0) ",%0 \n" "pshufb %%xmm1,%%xmm0 \n" @@ -2615,8 +2563,9 @@ void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile ( "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n" + LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "pshufd $0x1b,%%xmm0,%%xmm0 \n" "lea " MEMLEA(-0x10,0) ",%0 \n" @@ -2636,15 +2585,14 @@ void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBMIRRORROW_AVX2 // Shuffle table for reversing the bytes. -static const ulvec32 kARGBShuffleMirror_AVX2 = { - 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u -}; +static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile ( "vmovdqu %3,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0 "vmovdqu %%ymm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" @@ -2662,31 +2610,34 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { #endif // HAS_ARGBMIRRORROW_AVX2 #ifdef HAS_SPLITUVROW_AVX2 -void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_AVX2(const uint8* src_uv, + uint8* dst_u, + uint8* dst_v, int width) { asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm2 \n" - "vpsrlw $0x8,%%ymm1,%%ymm3 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2) - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm2 \n" + "vpsrlw $0x8,%%ymm1,%%ymm3 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2) + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -2699,30 +2650,33 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, #endif // HAS_SPLITUVROW_AVX2 #ifdef HAS_SPLITUVROW_SSE2 -void SplitUVRow_SSE2(const uint8* src_uv, uint8* 
dst_u, uint8* dst_v, +void SplitUVRow_SSE2(const uint8* src_uv, + uint8* dst_u, + uint8* dst_v, int width) { asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -2735,25 +2689,28 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, #endif // HAS_SPLITUVROW_SSE2 #ifdef HAS_MERGEUVROW_AVX2 -void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_AVX2(const uint8* src_u, + const uint8* src_v, + uint8* dst_uv, int width) { asm volatile ( - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1 - "lea " MEMLEA(0x20,0) ",%0 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1 + "lea " MEMLEA(0x20,0) ",%0 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n" "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n" "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n" "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n" - "lea " MEMLEA(0x40,2) ",%2 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" + "lea " MEMLEA(0x40,2) ",%2 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -2766,23 +2723,26 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, #endif // HAS_MERGEUVROW_AVX2 #ifdef HAS_MERGEUVROW_SSE2 -void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_SSE2(const uint8* src_u, + const uint8* src_v, + uint8* dst_uv, int width) { asm volatile ( - "sub %0,%1 \n" + "sub %0,%1 \n" + LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm2 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" - "lea " MEMLEA(0x20,2) ",%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu 
(%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" + "lea " MEMLEA(0x20,2) ",%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -2801,8 +2761,9 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { "jne 2f \n" "test $0xf,%1 \n" "jne 2f \n" + LABELALIGN - "1: \n" + "1: \n" "movdqa " MEMACCESS(0) ",%%xmm0 \n" "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" @@ -2812,6 +2773,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { "sub $0x20,%2 \n" "jg 1b \n" "jmp 9f \n" + LABELALIGN "2: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" @@ -2837,7 +2799,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { void CopyRow_AVX(const uint8* src, uint8* dst, int count) { asm volatile ( LABELALIGN - "1: \n" + "1: \n" "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" "lea " MEMLEA(0x40,0) ",%0 \n" @@ -2860,14 +2822,12 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) { // Multiple of 1. void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { size_t width_tmp = (size_t)(width); - asm volatile ( - "rep movsb " MEMMOVESTRING(0,1) " \n" - : "+S"(src), // %0 - "+D"(dst), // %1 - "+c"(width_tmp) // %2 - : - : "memory", "cc" - ); + asm volatile("rep movsb " MEMMOVESTRING(0, 1) " \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc"); } #endif // HAS_COPYROW_ERMS @@ -2879,8 +2839,9 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { "pslld $0x18,%%xmm0 \n" "pcmpeqb %%xmm1,%%xmm1 \n" "psrld $0x8,%%xmm1 \n" + LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm2 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" "lea " MEMLEA(0x20,0) ",%0 \n" @@ -2913,8 +2874,9 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { asm volatile ( "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" "vpsrld $0x8,%%ymm0,%%ymm0 \n" + LABELALIGN - "1: \n" + "1: \n" "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n" "lea " MEMLEA(0x40,0) ",%0 \n" @@ -2939,9 +2901,9 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 // width in pixels void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { - asm volatile ( + asm volatile ( LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ", %%xmm0 \n" "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n" "lea " MEMLEA(0x20, 0) ", %0 \n" @@ -2963,6 +2925,47 @@ void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { } #endif // HAS_ARGBEXTRACTALPHAROW_SSE2 +#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 +static const uvec8 kShuffleAlphaShort_AVX2 = { + 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u, + 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u}; + +void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) { + asm volatile ( + "vmovdqa %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ", %%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20, 0) ", %%ymm1 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu " MEMACCESS2(0x40, 0) ", %%ymm2 \n" + "vmovdqu " MEMACCESS2(0x60, 0) ", %%ymm3 \n" + "lea " MEMLEA(0x80, 0) ", %0 \n" + "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + 
"vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate. + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20, %2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+rm"(width) // %2 + : "m"(kPermdARGBToY_AVX), // %3 + "m"(kShuffleAlphaShort_AVX2) // %4 + : "memory", "cc" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_ARGBEXTRACTALPHAROW_AVX2 + #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 // width in pixels void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { @@ -2971,8 +2974,9 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { "pslld $0x18,%%xmm0 \n" "pcmpeqb %%xmm1,%%xmm1 \n" "psrld $0x8,%%xmm1 \n" + LABELALIGN - "1: \n" + "1: \n" "movq " MEMACCESS(0) ",%%xmm2 \n" "lea " MEMLEA(0x8,0) ",%0 \n" "punpcklbw %%xmm2,%%xmm2 \n" @@ -3007,8 +3011,9 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { asm volatile ( "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" "vpsrld $0x8,%%ymm0,%%ymm0 \n" + LABELALIGN - "1: \n" + "1: \n" "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n" "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n" "lea " MEMLEA(0x10,0) ",%0 \n" @@ -3036,32 +3041,29 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { void SetRow_X86(uint8* dst, uint8 v8, int width) { size_t width_tmp = (size_t)(width >> 2); const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. - asm volatile ( - "rep stosl " MEMSTORESTRING(eax,0) " \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); + asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); } void SetRow_ERMS(uint8* dst, uint8 v8, int width) { size_t width_tmp = (size_t)(width); - asm volatile ( - "rep stosb " MEMSTORESTRING(al,0) " \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v8) // %2 - : "memory", "cc"); + asm volatile("rep stosb " MEMSTORESTRING(al, 0) " \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v8) // %2 + : "memory", "cc"); } void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) { size_t width_tmp = (size_t)(width); - asm volatile ( - "rep stosl " MEMSTORESTRING(eax,0) " \n" - : "+D"(dst_argb), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); + asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n" + : "+D"(dst_argb), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); } #endif // HAS_SETROW_X86 @@ -3070,8 +3072,9 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" @@ -3091,14 +3094,18 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) { ); } -void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, + int stride_yuy2, + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" + LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 @@ -3130,13 +3137,16 @@ void 
YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, } void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) { + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" + LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" @@ -3166,7 +3176,7 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) { asm volatile ( LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" @@ -3186,14 +3196,18 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) { ); } -void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +void UYVYToUVRow_SSE2(const uint8* src_uyvy, + int stride_uyvy, + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" + LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 @@ -3225,13 +3239,16 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, } void UYVYToUV422Row_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) { + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" + LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" @@ -3264,8 +3281,9 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { asm volatile ( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" "lea " MEMLEA(0x40,0) ",%0 \n" @@ -3287,14 +3305,18 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { ); } -void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +void YUY2ToUVRow_AVX2(const uint8* src_yuy2, + int stride_yuy2, + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" + LABELALIGN - "1: \n" + "1: \n" "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 @@ -3327,13 +3349,16 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, } void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) { + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" + LABELALIGN - "1: \n" + "1: \n" "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" "lea " MEMLEA(0x40,0) ",%0 \n" @@ -3366,7 +3391,7 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) { asm volatile ( LABELALIGN - "1: \n" + "1: \n" "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" "lea " MEMLEA(0x40,0) ",%0 \n" @@ -3387,15 +3412,18 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) { , "xmm0", "xmm1", "xmm5" ); } -void 
UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +void UYVYToUVRow_AVX2(const uint8* src_uyvy, + int stride_uyvy, + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 @@ -3428,13 +3456,16 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, } void UYVYToUV422Row_AVX2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) { + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" + LABELALIGN - "1: \n" + "1: \n" "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" "lea " MEMLEA(0x40,0) ",%0 \n" @@ -3467,14 +3498,14 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy, #ifdef HAS_ARGBBLENDROW_SSSE3 // Shuffle table for isolating alpha. -static uvec8 kShuffleAlpha = { - 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, - 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 -}; +static uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; // Blend 8 pixels at a time -void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBBlendRow_SSSE3(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { asm volatile ( "pcmpeqb %%xmm7,%%xmm7 \n" "psrlw $0xf,%%xmm7 \n" @@ -3559,46 +3590,49 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, // =((A2*C2)+(B2*(255-C2))+255)/256 // signed version of math // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "mov $0x807f807f,%%eax \n" - "movd %%eax,%%xmm7 \n" - "pshufd $0x0,%%xmm7,%%xmm7 \n" - "sub %2,%0 \n" - "sub %2,%1 \n" - "sub %2,%3 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movq (%2),%%xmm0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm0 \n" - "movq (%0,%2,1),%%xmm1 \n" - "movq (%1,%2,1),%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm1 \n" - "psubb %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "paddw %%xmm7,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%3,%2,1) \n" - "lea 0x8(%2),%2 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(alpha), // %2 - "+r"(dst), // %3 - "+rm"(width) // %4 - :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7" - ); +void BlendPlaneRow_SSSE3(const uint8* src0, + const uint8* src1, + const uint8* alpha, + uint8* dst, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "mov $0x807f807f,%%eax \n" + "movd %%eax,%%xmm7 \n" + "pshufd $0x0,%%xmm7,%%xmm7 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" + + // 8 pixel loop. 
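Before the loop that follows, the signed identity quoted above in scalar form: biasing both inputs by 128 keeps each product within pmaddubsw's signed 16-bit range, and adding 32768 + 127 undoes the bias and rounds. A sketch, not the shipped code:

#include <stdint.h>

static uint8_t BlendPixel_Sketch(uint8_t a, uint8_t b, uint8_t alpha) {
  int32_t s = (a - 128) * alpha + (b - 128) * (255 - alpha);
  // s + 32768 + 127 == a*alpha + b*(255 - alpha) + 255, the unsigned form.
  return (uint8_t)((s + 32768 + 127) >> 8);
}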
+ LABELALIGN + "1: \n" + "movq (%2),%%xmm0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm0 \n" + "movq (%0,%2,1),%%xmm1 \n" + "movq (%1,%2,1),%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm1 \n" + "psubb %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "paddw %%xmm7,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%3,%2,1) \n" + "lea 0x8(%2),%2 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(alpha), // %2 + "+r"(dst), // %3 + "+rm"(width) // %4 + ::"memory", + "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"); } #endif // HAS_BLENDPLANEROW_SSSE3 @@ -3608,67 +3642,67 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, // =((A2*C2)+(B2*(255-C2))+255)/256 // signed version of math // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsllw $0x8,%%ymm5,%%ymm5 \n" - "mov $0x80808080,%%eax \n" - "vmovd %%eax,%%xmm6 \n" - "vbroadcastss %%xmm6,%%ymm6 \n" - "mov $0x807f807f,%%eax \n" - "vmovd %%eax,%%xmm7 \n" - "vbroadcastss %%xmm7,%%ymm7 \n" - "sub %2,%0 \n" - "sub %2,%1 \n" - "sub %2,%3 \n" - - // 32 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu (%2),%%ymm0 \n" - "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpxor %%ymm5,%%ymm3,%%ymm3 \n" - "vpxor %%ymm5,%%ymm0,%%ymm0 \n" - "vmovdqu (%0,%2,1),%%ymm1 \n" - "vmovdqu (%1,%2,1),%%ymm2 \n" - "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" - "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" - "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" - "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm3,%%ymm3 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%3,%2,1) \n" - "lea 0x20(%2),%2 \n" - "sub $0x20,%4 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(alpha), // %2 - "+r"(dst), // %3 - "+rm"(width) // %4 - :: "memory", "cc", "eax", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void BlendPlaneRow_AVX2(const uint8* src0, + const uint8* src1, + const uint8* alpha, + uint8* dst, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsllw $0x8,%%ymm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm6 \n" + "vbroadcastss %%xmm6,%%ymm6 \n" + "mov $0x807f807f,%%eax \n" + "vmovd %%eax,%%xmm7 \n" + "vbroadcastss %%xmm7,%%ymm7 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" + + // 32 pixel loop. 
+ LABELALIGN + "1: \n" + "vmovdqu (%2),%%ymm0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm5,%%ymm3,%%ymm3 \n" + "vpxor %%ymm5,%%ymm0,%%ymm0 \n" + "vmovdqu (%0,%2,1),%%ymm1 \n" + "vmovdqu (%1,%2,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" + "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm3,%%ymm3 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%3,%2,1) \n" + "lea 0x20(%2),%2 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(alpha), // %2 + "+r"(dst), // %3 + "+rm"(width) // %4 + ::"memory", + "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_BLENDPLANEROW_AVX2 #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha -static uvec8 kShuffleAlpha0 = { - 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u -}; -static uvec8 kShuffleAlpha1 = { - 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, - 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u -}; +static uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, + 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u}; +static uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u}; // Attenuate 4 pixels at a time. void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( @@ -3679,7 +3713,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { // 4 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "pshufb %%xmm4,%%xmm0 \n" "movdqu " MEMACCESS(0) ",%%xmm1 \n" @@ -3714,9 +3748,9 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { #ifdef HAS_ARGBATTENUATEROW_AVX2 // Shuffle table duplicating alpha. -static const uvec8 kShuffleAlpha_AVX2 = { - 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u -}; +static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, + 128u, 128u, 14u, 15u, 14u, 15u, + 14u, 15u, 128u, 128u}; // Attenuate 8 pixels at a time. void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( @@ -3727,7 +3761,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { // 8 pixel loop. LABELALIGN - "1: \n" + "1: \n" "vmovdqu " MEMACCESS(0) ",%%ymm6 \n" "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" @@ -3757,13 +3791,14 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { #ifdef HAS_ARGBUNATTENUATEROW_SSE2 // Unattenuate 4 pixels at a time. -void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, +void ARGBUnattenuateRow_SSE2(const uint8* src_argb, + uint8* dst_argb, int width) { uintptr_t alpha; asm volatile ( // 4 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movzb " MEMACCESS2(0x03,0) ",%3 \n" "punpcklbw %%xmm0,%%xmm0 \n" @@ -3804,10 +3839,10 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBUNATTENUATEROW_AVX2 // Shuffle table duplicating alpha. 
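Per channel, the attenuate/unattenuate pair approximates premultiplying by alpha and undoing it; a rough scalar model (the SIMD paths differ in exact rounding, and the unattenuate code divides via a fixed-point reciprocal rather than an integer divide):

#include <stdint.h>

static uint8_t Attenuate_Sketch(uint8_t ch, uint8_t a) {
  return (uint8_t)((ch * a) >> 8);          // ch * alpha / 256
}

static uint8_t Unattenuate_Sketch(uint8_t ch, uint8_t a) {
  if (a == 0) return 0;
  uint32_t v = ((uint32_t)ch * 255u) / a;   // undo, saturating
  return (uint8_t)(v > 255 ? 255 : v);
}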
static const uvec8 kUnattenShuffleAlpha_AVX2 = { - 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u -}; + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; // Unattenuate 8 pixels at a time. -void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, +void ARGBUnattenuateRow_AVX2(const uint8* src_argb, + uint8* dst_argb, int width) { uintptr_t alpha; asm volatile ( @@ -3816,7 +3851,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, // 8 pixel loop. LABELALIGN - "1: \n" + "1: \n" // replace VPGATHER "movzb " MEMACCESS2(0x03,0) ",%3 \n" MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0 @@ -3879,7 +3914,7 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { // 8 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm0 \n" @@ -3922,17 +3957,14 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 // Constant for ARGB color to sepia tone -static vec8 kARGBToSepiaB = { - 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 -}; +static vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, + 17, 68, 35, 0, 17, 68, 35, 0}; -static vec8 kARGBToSepiaG = { - 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 -}; +static vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, + 22, 88, 45, 0, 22, 88, 45, 0}; -static vec8 kARGBToSepiaR = { - 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 -}; +static vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, + 24, 98, 50, 0, 24, 98, 50, 0}; // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { @@ -3943,7 +3975,7 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { // 8 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" "pmaddubsw %%xmm2,%%xmm0 \n" @@ -3995,8 +4027,10 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 // Tranform 8 ARGB pixels (32 bytes) with color matrix. // Same as Sepia except matrix is provided. -void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) { +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, + uint8* dst_argb, + const int8* matrix_argb, + int width) { asm volatile ( "movdqu " MEMACCESS(3) ",%%xmm5 \n" "pshufd $0x00,%%xmm5,%%xmm2 \n" @@ -4006,7 +4040,7 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, // 8 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" "pmaddubsw %%xmm2,%%xmm0 \n" @@ -4058,8 +4092,11 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBQUANTIZEROW_SSE2 // Quantize 4 ARGB pixels (16 bytes). -void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width) { +void ARGBQuantizeRow_SSE2(uint8* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { asm volatile ( "movd %2,%%xmm2 \n" "movd %3,%%xmm3 \n" @@ -4076,7 +4113,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, // 4 pixel loop. 
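The quantize loop below posterizes each channel: scale (e.g. 65536 / interval_size) selects a bucket and the multiply-add maps the bucket back to a representative level. One byte, sketched:

#include <stdint.h>

static uint8_t Quantize_Sketch(uint8_t v, int scale, int interval_size,
                               int interval_offset) {
  return (uint8_t)(((v * scale) >> 16) * interval_size + interval_offset);
}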
LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "punpcklbw %%xmm5,%%xmm0 \n" "pmulhuw %%xmm2,%%xmm0 \n" @@ -4108,7 +4145,9 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, #ifdef HAS_ARGBSHADEROW_SSE2 // Shade 4 pixels at a time by specified value. -void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, +void ARGBShadeRow_SSE2(const uint8* src_argb, + uint8* dst_argb, + int width, uint32 value) { asm volatile ( "movd %3,%%xmm2 \n" @@ -4117,7 +4156,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, // 4 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -4144,14 +4183,16 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, #ifdef HAS_ARGBMULTIPLYROW_SSE2 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBMultiplyRow_SSE2(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { asm volatile ( - "pxor %%xmm5,%%xmm5 \n" + "pxor %%xmm5,%%xmm5 \n" // 4 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "lea " MEMLEA(0x10,0) ",%0 \n" "movdqu " MEMACCESS(1) ",%%xmm2 \n" @@ -4182,14 +4223,16 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBMultiplyRow_AVX2(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { asm volatile ( "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // 4 pixel loop. LABELALIGN - "1: \n" + "1: \n" "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "vmovdqu " MEMACCESS(1) ",%%ymm3 \n" @@ -4221,12 +4264,14 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBAddRow_SSE2(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { asm volatile ( // 4 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "lea " MEMLEA(0x10,0) ",%0 \n" "movdqu " MEMACCESS(1) ",%%xmm1 \n" @@ -4249,12 +4294,14 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBAddRow_AVX2(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { asm volatile ( // 4 pixel loop. LABELALIGN - "1: \n" + "1: \n" "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" @@ -4277,12 +4324,14 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels, 4 pixels at a time. -void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBSubtractRow_SSE2(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { asm volatile ( // 4 pixel loop. 
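In the multiply loop below, unpacking a register with itself widens each byte to the 16-bit value v * 0x0101, so pmulhuw's implicit >>16 turns the product into an approximate divide by 255. Scalar model:

#include <stdint.h>

static uint8_t Multiply_Sketch(uint8_t a, uint8_t b) {
  return (uint8_t)(((uint32_t)(a * 0x0101) * b) >> 16);  // ~ a * b / 255
}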
LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "lea " MEMLEA(0x10,0) ",%0 \n" "movdqu " MEMACCESS(1) ",%%xmm1 \n" @@ -4305,12 +4354,14 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBSubtractRow_AVX2(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { asm volatile ( // 4 pixel loop. LABELALIGN - "1: \n" + "1: \n" "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" @@ -4318,7 +4369,7 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, "vmovdqu %%ymm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x20,2) ",%2 \n" "sub $0x8,%3 \n" - "jg 1b \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -4336,8 +4387,11 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, // -1 0 1 // -2 0 2 // -1 0 1 -void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width) { +void SobelXRow_SSE2(const uint8* src_y0, + const uint8* src_y1, + const uint8* src_y2, + uint8* dst_sobelx, + int width) { asm volatile ( "sub %0,%1 \n" "sub %0,%2 \n" @@ -4346,7 +4400,7 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, // 8 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movq " MEMACCESS(0) ",%%xmm0 \n" "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n" "punpcklbw %%xmm5,%%xmm0 \n" @@ -4390,8 +4444,10 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, // -1 -2 -1 // 0 0 0 // 1 2 1 -void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { +void SobelYRow_SSE2(const uint8* src_y0, + const uint8* src_y1, + uint8* dst_sobely, + int width) { asm volatile ( "sub %0,%1 \n" "sub %0,%2 \n" @@ -4399,7 +4455,7 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, // 8 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movq " MEMACCESS(0) ",%%xmm0 \n" MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 "punpcklbw %%xmm5,%%xmm0 \n" @@ -4443,8 +4499,10 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, // R = Sobel // G = Sobel // B = Sobel -void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { +void SobelRow_SSE2(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width) { asm volatile ( "sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -4452,7 +4510,7 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // 8 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 "lea " MEMLEA(0x10,0) ",%0 \n" @@ -4490,8 +4548,10 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, #ifdef HAS_SOBELTOPLANEROW_SSE2 // Adds Sobel X and Sobel Y and stores Sobel into a plane. -void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width) { +void SobelToPlaneRow_SSE2(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_y, + int width) { asm volatile ( "sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -4499,7 +4559,7 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // 8 pixel loop. 
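One output of the kernel quoted above in scalar form (the loop below evaluates eight columns at once and saturates with packuswb):

#include <stdint.h>

static uint8_t SobelX_Sketch(const uint8_t* y0, const uint8_t* y1,
                             const uint8_t* y2, int i) {
  int32_t g = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) +
              (y2[i] - y2[i + 2]);
  if (g < 0) g = -g;                        // gradient magnitude
  return (uint8_t)(g > 255 ? 255 : g);
}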
LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 "lea " MEMLEA(0x10,0) ",%0 \n" @@ -4525,15 +4585,17 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // R = Sobel X // G = Sobel // B = Sobel Y -void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { +void SobelXYRow_SSE2(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width) { asm volatile ( "sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" // 8 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 "lea " MEMLEA(0x10,0) ",%0 \n" @@ -4572,8 +4634,10 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 // Creates a table of cumulative sums where each value is a sum of all values // above and to the left of the value, inclusive of the value. -void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width) { +void ComputeCumulativeSumRow_SSE2(const uint8* row, + int32* cumsum, + const int32* previous_cumsum, + int width) { asm volatile ( "pxor %%xmm0,%%xmm0 \n" "pxor %%xmm1,%%xmm1 \n" @@ -4582,9 +4646,9 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, "test $0xf,%1 \n" "jne 49f \n" - // 4 pixel loop \n" + // 4 pixel loop. LABELALIGN - "40: \n" + "40: \n" "movdqu " MEMACCESS(0) ",%%xmm2 \n" "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm2,%%xmm4 \n" @@ -4617,13 +4681,13 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, "sub $0x4,%3 \n" "jge 40b \n" - "49: \n" + "49: \n" "add $0x3,%3 \n" "jl 19f \n" - // 1 pixel loop \n" + // 1 pixel loop. LABELALIGN - "10: \n" + "10: \n" "movd " MEMACCESS(0) ",%%xmm2 \n" "lea " MEMLEA(0x4,0) ",%0 \n" "punpcklbw %%xmm1,%%xmm2 \n" @@ -4637,7 +4701,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, "sub $0x1,%3 \n" "jge 10b \n" - "19: \n" + "19: \n" : "+r"(row), // %0 "+r"(cumsum), // %1 "+r"(previous_cumsum), // %2 @@ -4650,8 +4714,11 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 -void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, - int width, int area, uint8* dst, +void CumulativeSumToAverageRow_SSE2(const int32* topleft, + const int32* botleft, + int width, + int area, + uint8* dst, int count) { asm volatile ( "movd %5,%%xmm5 \n" @@ -4672,7 +4739,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, "cvtps2dq %%xmm5,%%xmm5 \n" "packssdw %%xmm5,%%xmm5 \n" - // 4 pixel small loop \n" + // 4 pixel small loop. LABELALIGN "4: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" @@ -4783,8 +4850,11 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, #ifdef HAS_ARGBAFFINEROW_SSE2 // Copy ARGB pixels from source image with slope to a row of destination. 
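Before the affine routine below, a note on the cumulative-sum pair above: the cumsum table is an integral image, so the sum over any rectangle, and hence a box-blur average, needs only the four corner entries per channel. Sketched for one channel:

#include <stdint.h>

static int32_t BoxSum_Sketch(int32_t topleft, int32_t topright,
                             int32_t botleft, int32_t botright) {
  return botright - botleft - topright + topleft;  // rectangle total
}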
LIBYUV_API -void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* src_dudv, int width) { +void ARGBAffineRow_SSE2(const uint8* src_argb, + int src_argb_stride, + uint8* dst_argb, + const float* src_dudv, + int width) { intptr_t src_argb_stride_temp = src_argb_stride; intptr_t temp; asm volatile ( @@ -4868,8 +4938,10 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, #ifdef HAS_INTERPOLATEROW_SSSE3 // Bilinear filter 16x2 -> 16x1 -void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, +void InterpolateRow_SSSE3(uint8* dst_ptr, + const uint8* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { asm volatile ( "sub %1,%0 \n" @@ -4891,7 +4963,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, // General purpose row blend. LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(1) ",%%xmm0 \n" MEMOPREG(movdqu,0x00,1,4,1,xmm2) "movdqa %%xmm0,%%xmm1 \n" @@ -4949,8 +5021,10 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, #ifdef HAS_INTERPOLATEROW_AVX2 // Bilinear filter 32x2 -> 32x1 -void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, +void InterpolateRow_AVX2(uint8* dst_ptr, + const uint8* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { asm volatile ( "cmp $0x0,%3 \n" @@ -4972,7 +5046,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, // General purpose row blend. LABELALIGN - "1: \n" + "1: \n" "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" MEMOPREG(vmovdqu,0x00,1,4,1,ymm2) "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" @@ -5025,12 +5099,14 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, #ifdef HAS_ARGBSHUFFLEROW_SSSE3 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { +void ARGBShuffleRow_SSSE3(const uint8* src_argb, + uint8* dst_argb, + const uint8* shuffler, + int width) { asm volatile ( "movdqu " MEMACCESS(3) ",%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" @@ -5053,12 +5129,14 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBSHUFFLEROW_AVX2 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { +void ARGBShuffleRow_AVX2(const uint8* src_argb, + uint8* dst_argb, + const uint8* shuffler, + int width) { asm volatile ( "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" "lea " MEMLEA(0x40,0) ",%0 \n" @@ -5082,8 +5160,10 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBSHUFFLEROW_SSE2 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
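A shuffler here is a 16-byte pshufb control mask: the same four byte indices, offset per pixel lane, select the output channel order, which is how one routine covers all four conversions named above. Scalar equivalent, sketched:

#include <stdint.h>

static void ARGBShuffleRow_Sketch(const uint8_t* src, uint8_t* dst,
                                  const uint8_t* shuffler, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x * 4 + 0] = src[x * 4 + shuffler[0]];
    dst[x * 4 + 1] = src[x * 4 + shuffler[1]];
    dst[x * 4 + 2] = src[x * 4 + shuffler[2]];
    dst[x * 4 + 3] = src[x * 4 + shuffler[3]];
  }
}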
-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { +void ARGBShuffleRow_SSE2(const uint8* src_argb, + uint8* dst_argb, + const uint8* shuffler, + int width) { uintptr_t pixel_temp; asm volatile ( "pxor %%xmm5,%%xmm5 \n" @@ -5098,7 +5178,7 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, "je 2103f \n" LABELALIGN - "1: \n" + "1: \n" "movzb " MEMACCESS(4) ",%2 \n" MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 "mov %b2," MEMACCESS(1) " \n" @@ -5204,11 +5284,12 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, void I422ToYUY2Row_SSE2(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* dst_frame, int width) { - asm volatile ( + uint8* dst_frame, + int width) { + asm volatile ( "sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "movq " MEMACCESS(1) ",%%xmm2 \n" MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 "lea " MEMLEA(0x8,1) ",%1 \n" @@ -5239,11 +5320,12 @@ void I422ToYUY2Row_SSE2(const uint8* src_y, void I422ToUYVYRow_SSE2(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* dst_frame, int width) { - asm volatile ( + uint8* dst_frame, + int width) { + asm volatile ( "sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "movq " MEMACCESS(1) ",%%xmm2 \n" MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 "lea " MEMLEA(0x8,1) ",%1 \n" @@ -5272,14 +5354,15 @@ void I422ToUYVYRow_SSE2(const uint8* src_y, #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 void ARGBPolynomialRow_SSE2(const uint8* src_argb, - uint8* dst_argb, const float* poly, + uint8* dst_argb, + const float* poly, int width) { asm volatile ( "pxor %%xmm3,%%xmm3 \n" // 2 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movq " MEMACCESS(0) ",%%xmm0 \n" "lea " MEMLEA(0x8,0) ",%0 \n" "punpcklbw %%xmm3,%%xmm0 \n" @@ -5328,7 +5411,8 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb, #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 void ARGBPolynomialRow_AVX2(const uint8* src_argb, - uint8* dst_argb, const float* poly, + uint8* dst_argb, + const float* poly, int width) { asm volatile ( "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n" @@ -5338,7 +5422,7 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, // 2 pixel loop. LABELALIGN - "1: \n" + "1: \n" "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels "lea " MEMLEA(0x8,0) ",%0 \n" "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats @@ -5366,15 +5450,150 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, } #endif // HAS_ARGBPOLYNOMIALROW_AVX2 +#ifdef HAS_HALFFLOATROW_SSE2 +static float kScaleBias = 1.9259299444e-34f; +void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) { + asm volatile ( + "pshufd $0x0,%3,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "sub %0,%1 \n" + + // 16 pixel loop. 
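Before the loop below, why kScaleBias is 1.9259299444e-34 (2^-112): multiplying by it rebiases a float's exponent from 127 down to the half-float bias of 15, so after psrld $0xd drops 23 - 10 = 13 mantissa bits, the low 16 bits are the IEEE half-precision encoding. A sketch for normal, in-range inputs, ignoring rounding (psrld truncates too):

#include <stdint.h>
#include <string.h>

static uint16_t FloatToHalf_Sketch(float value) {
  float biased = value * 1.9259299444e-34f;  // value * 2^-112
  uint32_t bits;
  memcpy(&bits, &biased, sizeof(bits));      // reinterpret, not convert
  return (uint16_t)(bits >> 13);             // e.g. 1.0f -> 0x3C00
}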
+ LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" // 8 shorts + "add $0x10,%0 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1 + "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats + "punpckhwd %%xmm5,%%xmm3 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "psrld $0xd,%%xmm2 \n" + "psrld $0xd,%%xmm3 \n" + "packssdw %%xmm3,%%xmm2 \n" + MEMOPMEM(movdqu,xmm2,-0x10,0,1,1) + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "x"(scale * kScaleBias) // %3 + : "memory", "cc", + "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_HALFFLOATROW_SSE2 + +#ifdef HAS_HALFFLOATROW_AVX2 +void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { + asm volatile ( + "vbroadcastss %3, %%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts + "add $0x20,%0 \n" + "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates + "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vmulps %%ymm3,%%ymm4,%%ymm3 \n" + "vmulps %%ymm2,%%ymm4,%%ymm2 \n" + "vpsrld $0xd,%%ymm3,%%ymm3 \n" + "vpsrld $0xd,%%ymm2,%%ymm2 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates + MEMOPMEM(vmovdqu,ymm2,-0x20,0,1,1) + "sub $0x10,%2 \n" + "jg 1b \n" + + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "x"(scale * kScaleBias) // %3 + : "memory", "cc", + "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_HALFFLOATROW_AVX2 + +#ifdef HAS_HALFFLOATROW_F16C +void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) { + asm volatile ( + "vbroadcastss %3, %%ymm4 \n" + "sub %0,%1 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints + "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vmulps %%ymm2,%%ymm4,%%ymm2 \n" + "vmulps %%ymm3,%%ymm4,%%ymm3 \n" + "vcvtps2ph $3, %%ymm2, %%xmm2 \n" + "vcvtps2ph $3, %%ymm3, %%xmm3 \n" + MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1) + MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1) + "add $0x20,%0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "x"(scale) // %3 + : "memory", "cc", + "xmm2", "xmm3", "xmm4" + ); +} +#endif // HAS_HALFFLOATROW_F16C + +#ifdef HAS_HALFFLOATROW_F16C +void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) { + asm volatile ( + "sub %0,%1 \n" + // 16 pixel loop. + LABELALIGN + "1: \n" + "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints + "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vcvtps2ph $3, %%ymm2, %%xmm2 \n" + "vcvtps2ph $3, %%ymm3, %%xmm3 \n" + MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1) + MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1) + "add $0x20,%0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", + "xmm2", "xmm3" + ); +} +#endif // HAS_HALFFLOATROW_F16C + #ifdef HAS_ARGBCOLORTABLEROW_X86 // Tranform ARGB pixels with color table. -void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, +void ARGBColorTableRow_X86(uint8* dst_argb, + const uint8* table_argb, int width) { uintptr_t pixel_temp; asm volatile ( // 1 pixel loop. 
LABELALIGN - "1: \n" + "1: \n" "movzb " MEMACCESS(0) ",%1 \n" "lea " MEMLEA(0x4,0) ",%0 \n" MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 @@ -5405,7 +5624,7 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { asm volatile ( // 1 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movzb " MEMACCESS(0) ",%1 \n" "lea " MEMLEA(0x4,0) ",%0 \n" MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 @@ -5428,9 +5647,11 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 // Tranform RGB pixels with luma table. -void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, +void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, + uint8* dst_argb, int width, - const uint8* luma, uint32 lumacoeff) { + const uint8* luma, + uint32 lumacoeff) { uintptr_t pixel_temp; uintptr_t table_temp; asm volatile ( @@ -5442,7 +5663,7 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, // 4 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu " MEMACCESS(2) ",%%xmm0 \n" "pmaddubsw %%xmm3,%%xmm0 \n" "phaddw %%xmm0,%%xmm0 \n" diff --git a/files/source/row_mips.cc b/files/source/row_mips.cc deleted file mode 100644 index 285f0b5a..00000000 --- a/files/source/row_mips.cc +++ /dev/null @@ -1,782 +0,0 @@ -/* - * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// The following are available on Mips platforms: -#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) - -#ifdef HAS_COPYROW_MIPS -void CopyRow_MIPS(const uint8* src, uint8* dst, int count) { - __asm__ __volatile__ ( - ".set noreorder \n" - ".set noat \n" - "slti $at, %[count], 8 \n" - "bne $at ,$zero, $last8 \n" - "xor $t8, %[src], %[dst] \n" - "andi $t8, $t8, 0x3 \n" - - "bne $t8, $zero, unaligned \n" - "negu $a3, %[dst] \n" - // make dst/src aligned - "andi $a3, $a3, 0x3 \n" - "beq $a3, $zero, $chk16w \n" - // word-aligned now count is the remining bytes count - "subu %[count], %[count], $a3 \n" - - "lwr $t8, 0(%[src]) \n" - "addu %[src], %[src], $a3 \n" - "swr $t8, 0(%[dst]) \n" - "addu %[dst], %[dst], $a3 \n" - - // Now the dst/src are mutually word-aligned with word-aligned addresses - "$chk16w: \n" - "andi $t8, %[count], 0x3f \n" // whole 64-B chunks? 
- // t8 is the byte count after 64-byte chunks - "beq %[count], $t8, chk8w \n" - // There will be at most 1 32-byte chunk after it - "subu $a3, %[count], $t8 \n" // the reminder - // Here a3 counts bytes in 16w chunks - "addu $a3, %[dst], $a3 \n" - // Now a3 is the final dst after 64-byte chunks - "addu $t0, %[dst], %[count] \n" - // t0 is the "past the end" address - - // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past - // the "t0-32" address - // This means: for x=128 the last "safe" a1 address is "t0-160" - // Alternatively, for x=64 the last "safe" a1 address is "t0-96" - // we will use "pref 30,128(a1)", so "t0-160" is the limit - "subu $t9, $t0, 160 \n" - // t9 is the "last safe pref 30,128(a1)" address - "pref 0, 0(%[src]) \n" // first line of src - "pref 0, 32(%[src]) \n" // second line of src - "pref 0, 64(%[src]) \n" - "pref 30, 32(%[dst]) \n" - // In case the a1 > t9 don't use "pref 30" at all - "sgtu $v1, %[dst], $t9 \n" - "bgtz $v1, $loop16w \n" - "nop \n" - // otherwise, start with using pref30 - "pref 30, 64(%[dst]) \n" - "$loop16w: \n" - "pref 0, 96(%[src]) \n" - "lw $t0, 0(%[src]) \n" - "bgtz $v1, $skip_pref30_96 \n" // skip - "lw $t1, 4(%[src]) \n" - "pref 30, 96(%[dst]) \n" // continue - "$skip_pref30_96: \n" - "lw $t2, 8(%[src]) \n" - "lw $t3, 12(%[src]) \n" - "lw $t4, 16(%[src]) \n" - "lw $t5, 20(%[src]) \n" - "lw $t6, 24(%[src]) \n" - "lw $t7, 28(%[src]) \n" - "pref 0, 128(%[src]) \n" - // bring the next lines of src, addr 128 - "sw $t0, 0(%[dst]) \n" - "sw $t1, 4(%[dst]) \n" - "sw $t2, 8(%[dst]) \n" - "sw $t3, 12(%[dst]) \n" - "sw $t4, 16(%[dst]) \n" - "sw $t5, 20(%[dst]) \n" - "sw $t6, 24(%[dst]) \n" - "sw $t7, 28(%[dst]) \n" - "lw $t0, 32(%[src]) \n" - "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1) - "lw $t1, 36(%[src]) \n" - "pref 30, 128(%[dst]) \n" // set dest, addr 128 - "$skip_pref30_128: \n" - "lw $t2, 40(%[src]) \n" - "lw $t3, 44(%[src]) \n" - "lw $t4, 48(%[src]) \n" - "lw $t5, 52(%[src]) \n" - "lw $t6, 56(%[src]) \n" - "lw $t7, 60(%[src]) \n" - "pref 0, 160(%[src]) \n" - // bring the next lines of src, addr 160 - "sw $t0, 32(%[dst]) \n" - "sw $t1, 36(%[dst]) \n" - "sw $t2, 40(%[dst]) \n" - "sw $t3, 44(%[dst]) \n" - "sw $t4, 48(%[dst]) \n" - "sw $t5, 52(%[dst]) \n" - "sw $t6, 56(%[dst]) \n" - "sw $t7, 60(%[dst]) \n" - - "addiu %[dst], %[dst], 64 \n" // adding 64 to dest - "sgtu $v1, %[dst], $t9 \n" - "bne %[dst], $a3, $loop16w \n" - " addiu %[src], %[src], 64 \n" // adding 64 to src - "move %[count], $t8 \n" - - // Here we have src and dest word-aligned but less than 64-bytes to go - - "chk8w: \n" - "pref 0, 0x0(%[src]) \n" - "andi $t8, %[count], 0x1f \n" // 32-byte chunk? 
- // the t8 is the reminder count past 32-bytes - "beq %[count], $t8, chk1w \n" - // count=t8,no 32-byte chunk - " nop \n" - - "lw $t0, 0(%[src]) \n" - "lw $t1, 4(%[src]) \n" - "lw $t2, 8(%[src]) \n" - "lw $t3, 12(%[src]) \n" - "lw $t4, 16(%[src]) \n" - "lw $t5, 20(%[src]) \n" - "lw $t6, 24(%[src]) \n" - "lw $t7, 28(%[src]) \n" - "addiu %[src], %[src], 32 \n" - - "sw $t0, 0(%[dst]) \n" - "sw $t1, 4(%[dst]) \n" - "sw $t2, 8(%[dst]) \n" - "sw $t3, 12(%[dst]) \n" - "sw $t4, 16(%[dst]) \n" - "sw $t5, 20(%[dst]) \n" - "sw $t6, 24(%[dst]) \n" - "sw $t7, 28(%[dst]) \n" - "addiu %[dst], %[dst], 32 \n" - - "chk1w: \n" - "andi %[count], $t8, 0x3 \n" - // now count is the reminder past 1w chunks - "beq %[count], $t8, $last8 \n" - " subu $a3, $t8, %[count] \n" - // a3 is count of bytes in 1w chunks - "addu $a3, %[dst], $a3 \n" - // now a3 is the dst address past the 1w chunks - // copying in words (4-byte chunks) - "$wordCopy_loop: \n" - "lw $t3, 0(%[src]) \n" - // the first t3 may be equal t0 ... optimize? - "addiu %[src], %[src],4 \n" - "addiu %[dst], %[dst],4 \n" - "bne %[dst], $a3,$wordCopy_loop \n" - " sw $t3, -4(%[dst]) \n" - - // For the last (<8) bytes - "$last8: \n" - "blez %[count], leave \n" - " addu $a3, %[dst], %[count] \n" // a3 -last dst address - "$last8loop: \n" - "lb $v1, 0(%[src]) \n" - "addiu %[src], %[src], 1 \n" - "addiu %[dst], %[dst], 1 \n" - "bne %[dst], $a3, $last8loop \n" - " sb $v1, -1(%[dst]) \n" - - "leave: \n" - " j $ra \n" - " nop \n" - - // - // UNALIGNED case - // - - "unaligned: \n" - // got here with a3="negu a1" - "andi $a3, $a3, 0x3 \n" // a1 is word aligned? - "beqz $a3, $ua_chk16w \n" - " subu %[count], %[count], $a3 \n" - // bytes left after initial a3 bytes - "lwr $v1, 0(%[src]) \n" - "lwl $v1, 3(%[src]) \n" - "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3 - "swr $v1, 0(%[dst]) \n" - "addu %[dst], %[dst], $a3 \n" - // below the dst will be word aligned (NOTE1) - "$ua_chk16w: \n" - "andi $t8, %[count], 0x3f \n" // whole 64-B chunks? 
- // t8 is the byte count after 64-byte chunks - "beq %[count], $t8, ua_chk8w \n" - // if a2==t8, no 64-byte chunks - // There will be at most 1 32-byte chunk after it - "subu $a3, %[count], $t8 \n" // the reminder - // Here a3 counts bytes in 16w chunks - "addu $a3, %[dst], $a3 \n" - // Now a3 is the final dst after 64-byte chunks - "addu $t0, %[dst], %[count] \n" // t0 "past the end" - "subu $t9, $t0, 160 \n" - // t9 is the "last safe pref 30,128(a1)" address - "pref 0, 0(%[src]) \n" // first line of src - "pref 0, 32(%[src]) \n" // second line addr 32 - "pref 0, 64(%[src]) \n" - "pref 30, 32(%[dst]) \n" - // safe, as we have at least 64 bytes ahead - // In case the a1 > t9 don't use "pref 30" at all - "sgtu $v1, %[dst], $t9 \n" - "bgtz $v1, $ua_loop16w \n" - // skip "pref 30,64(a1)" for too short arrays - " nop \n" - // otherwise, start with using pref30 - "pref 30, 64(%[dst]) \n" - "$ua_loop16w: \n" - "pref 0, 96(%[src]) \n" - "lwr $t0, 0(%[src]) \n" - "lwl $t0, 3(%[src]) \n" - "lwr $t1, 4(%[src]) \n" - "bgtz $v1, $ua_skip_pref30_96 \n" - " lwl $t1, 7(%[src]) \n" - "pref 30, 96(%[dst]) \n" - // continue setting up the dest, addr 96 - "$ua_skip_pref30_96: \n" - "lwr $t2, 8(%[src]) \n" - "lwl $t2, 11(%[src]) \n" - "lwr $t3, 12(%[src]) \n" - "lwl $t3, 15(%[src]) \n" - "lwr $t4, 16(%[src]) \n" - "lwl $t4, 19(%[src]) \n" - "lwr $t5, 20(%[src]) \n" - "lwl $t5, 23(%[src]) \n" - "lwr $t6, 24(%[src]) \n" - "lwl $t6, 27(%[src]) \n" - "lwr $t7, 28(%[src]) \n" - "lwl $t7, 31(%[src]) \n" - "pref 0, 128(%[src]) \n" - // bring the next lines of src, addr 128 - "sw $t0, 0(%[dst]) \n" - "sw $t1, 4(%[dst]) \n" - "sw $t2, 8(%[dst]) \n" - "sw $t3, 12(%[dst]) \n" - "sw $t4, 16(%[dst]) \n" - "sw $t5, 20(%[dst]) \n" - "sw $t6, 24(%[dst]) \n" - "sw $t7, 28(%[dst]) \n" - "lwr $t0, 32(%[src]) \n" - "lwl $t0, 35(%[src]) \n" - "lwr $t1, 36(%[src]) \n" - "bgtz $v1, ua_skip_pref30_128 \n" - " lwl $t1, 39(%[src]) \n" - "pref 30, 128(%[dst]) \n" - // continue setting up the dest, addr 128 - "ua_skip_pref30_128: \n" - - "lwr $t2, 40(%[src]) \n" - "lwl $t2, 43(%[src]) \n" - "lwr $t3, 44(%[src]) \n" - "lwl $t3, 47(%[src]) \n" - "lwr $t4, 48(%[src]) \n" - "lwl $t4, 51(%[src]) \n" - "lwr $t5, 52(%[src]) \n" - "lwl $t5, 55(%[src]) \n" - "lwr $t6, 56(%[src]) \n" - "lwl $t6, 59(%[src]) \n" - "lwr $t7, 60(%[src]) \n" - "lwl $t7, 63(%[src]) \n" - "pref 0, 160(%[src]) \n" - // bring the next lines of src, addr 160 - "sw $t0, 32(%[dst]) \n" - "sw $t1, 36(%[dst]) \n" - "sw $t2, 40(%[dst]) \n" - "sw $t3, 44(%[dst]) \n" - "sw $t4, 48(%[dst]) \n" - "sw $t5, 52(%[dst]) \n" - "sw $t6, 56(%[dst]) \n" - "sw $t7, 60(%[dst]) \n" - - "addiu %[dst],%[dst],64 \n" // adding 64 to dest - "sgtu $v1,%[dst],$t9 \n" - "bne %[dst],$a3,$ua_loop16w \n" - " addiu %[src],%[src],64 \n" // adding 64 to src - "move %[count],$t8 \n" - - // Here we have src and dest word-aligned but less than 64-bytes to go - - "ua_chk8w: \n" - "pref 0, 0x0(%[src]) \n" - "andi $t8, %[count], 0x1f \n" // 32-byte chunk? 
- // the t8 is the reminder count - "beq %[count], $t8, $ua_chk1w \n" - // when count==t8, no 32-byte chunk - - "lwr $t0, 0(%[src]) \n" - "lwl $t0, 3(%[src]) \n" - "lwr $t1, 4(%[src]) \n" - "lwl $t1, 7(%[src]) \n" - "lwr $t2, 8(%[src]) \n" - "lwl $t2, 11(%[src]) \n" - "lwr $t3, 12(%[src]) \n" - "lwl $t3, 15(%[src]) \n" - "lwr $t4, 16(%[src]) \n" - "lwl $t4, 19(%[src]) \n" - "lwr $t5, 20(%[src]) \n" - "lwl $t5, 23(%[src]) \n" - "lwr $t6, 24(%[src]) \n" - "lwl $t6, 27(%[src]) \n" - "lwr $t7, 28(%[src]) \n" - "lwl $t7, 31(%[src]) \n" - "addiu %[src], %[src], 32 \n" - - "sw $t0, 0(%[dst]) \n" - "sw $t1, 4(%[dst]) \n" - "sw $t2, 8(%[dst]) \n" - "sw $t3, 12(%[dst]) \n" - "sw $t4, 16(%[dst]) \n" - "sw $t5, 20(%[dst]) \n" - "sw $t6, 24(%[dst]) \n" - "sw $t7, 28(%[dst]) \n" - "addiu %[dst], %[dst], 32 \n" - - "$ua_chk1w: \n" - "andi %[count], $t8, 0x3 \n" - // now count is the reminder past 1w chunks - "beq %[count], $t8, ua_smallCopy \n" - "subu $a3, $t8, %[count] \n" - // a3 is count of bytes in 1w chunks - "addu $a3, %[dst], $a3 \n" - // now a3 is the dst address past the 1w chunks - - // copying in words (4-byte chunks) - "$ua_wordCopy_loop: \n" - "lwr $v1, 0(%[src]) \n" - "lwl $v1, 3(%[src]) \n" - "addiu %[src], %[src], 4 \n" - "addiu %[dst], %[dst], 4 \n" - // note: dst=a1 is word aligned here, see NOTE1 - "bne %[dst], $a3, $ua_wordCopy_loop \n" - " sw $v1,-4(%[dst]) \n" - - // Now less than 4 bytes (value in count) left to copy - "ua_smallCopy: \n" - "beqz %[count], leave \n" - " addu $a3, %[dst], %[count] \n" // a3 = last dst address - "$ua_smallCopy_loop: \n" - "lb $v1, 0(%[src]) \n" - "addiu %[src], %[src], 1 \n" - "addiu %[dst], %[dst], 1 \n" - "bne %[dst],$a3,$ua_smallCopy_loop \n" - " sb $v1, -1(%[dst]) \n" - - "j $ra \n" - " nop \n" - ".set at \n" - ".set reorder \n" - : [dst] "+r" (dst), [src] "+r" (src) - : [count] "r" (count) - : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", - "t8", "t9", "a3", "v1", "at" - ); -} -#endif // HAS_COPYROW_MIPS - -// DSPR2 functions -#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \ - (__mips_dsp_rev >= 2) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6) - -void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "srl $t4, %[width], 4 \n" // multiplies of 16 - "blez $t4, 2f \n" - " andi %[width], %[width], 0xf \n" // residual - - "1: \n" - "addiu $t4, $t4, -1 \n" - "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0 - "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2 - "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4 - "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6 - "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8 - "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 | U10 - "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 | U12 - "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 | U14 - "addiu %[src_uv], %[src_uv], 32 \n" - "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0 - "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0 - "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4 - "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4 - "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8 - "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8 - "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12 - "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12 - "sw $t9, 0(%[dst_v]) \n" - "sw $t0, 0(%[dst_u]) \n" - "sw $t1, 4(%[dst_v]) \n" - "sw $t2, 4(%[dst_u]) \n" - "sw $t3, 8(%[dst_v]) \n" - "sw $t5, 8(%[dst_u]) \n" - "sw $t6, 12(%[dst_v]) \n" - "sw 
$t7, 12(%[dst_u]) \n" - "addiu %[dst_v], %[dst_v], 16 \n" - "bgtz $t4, 1b \n" - " addiu %[dst_u], %[dst_u], 16 \n" - - "beqz %[width], 3f \n" - " nop \n" - - "2: \n" - "lbu $t0, 0(%[src_uv]) \n" - "lbu $t1, 1(%[src_uv]) \n" - "addiu %[src_uv], %[src_uv], 2 \n" - "addiu %[width], %[width], -1 \n" - "sb $t0, 0(%[dst_u]) \n" - "sb $t1, 0(%[dst_v]) \n" - "addiu %[dst_u], %[dst_u], 1 \n" - "bgtz %[width], 2b \n" - " addiu %[dst_v], %[dst_v], 1 \n" - - "3: \n" - ".set pop \n" - : [src_uv] "+r" (src_uv), - [width] "+r" (width), - [dst_u] "+r" (dst_u), - [dst_v] "+r" (dst_v) - : - : "t0", "t1", "t2", "t3", - "t4", "t5", "t6", "t7", "t8", "t9" - ); -} - -void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "srl $t4, %[width], 4 \n" // multiplies of 16 - "andi $t5, %[width], 0xf \n" - "blez $t4, 2f \n" - " addu %[src], %[src], %[width] \n" // src += width - - "1: \n" - "lw $t0, -16(%[src]) \n" // |3|2|1|0| - "lw $t1, -12(%[src]) \n" // |7|6|5|4| - "lw $t2, -8(%[src]) \n" // |11|10|9|8| - "lw $t3, -4(%[src]) \n" // |15|14|13|12| - "wsbh $t0, $t0 \n" // |2|3|0|1| - "wsbh $t1, $t1 \n" // |6|7|4|5| - "wsbh $t2, $t2 \n" // |10|11|8|9| - "wsbh $t3, $t3 \n" // |14|15|12|13| - "rotr $t0, $t0, 16 \n" // |0|1|2|3| - "rotr $t1, $t1, 16 \n" // |4|5|6|7| - "rotr $t2, $t2, 16 \n" // |8|9|10|11| - "rotr $t3, $t3, 16 \n" // |12|13|14|15| - "addiu %[src], %[src], -16 \n" - "addiu $t4, $t4, -1 \n" - "sw $t3, 0(%[dst]) \n" // |15|14|13|12| - "sw $t2, 4(%[dst]) \n" // |11|10|9|8| - "sw $t1, 8(%[dst]) \n" // |7|6|5|4| - "sw $t0, 12(%[dst]) \n" // |3|2|1|0| - "bgtz $t4, 1b \n" - " addiu %[dst], %[dst], 16 \n" - "beqz $t5, 3f \n" - " nop \n" - - "2: \n" - "lbu $t0, -1(%[src]) \n" - "addiu $t5, $t5, -1 \n" - "addiu %[src], %[src], -1 \n" - "sb $t0, 0(%[dst]) \n" - "bgez $t5, 2b \n" - " addiu %[dst], %[dst], 1 \n" - - "3: \n" - ".set pop \n" - : [src] "+r" (src), [dst] "+r" (dst) - : [width] "r" (width) - : "t0", "t1", "t2", "t3", "t4", "t5" - ); -} - -void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width) { - int x; - int y; - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "addu $t4, %[width], %[width] \n" - "srl %[x], %[width], 4 \n" - "andi %[y], %[width], 0xf \n" - "blez %[x], 2f \n" - " addu %[src_uv], %[src_uv], $t4 \n" - - "1: \n" - "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0| - "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4| - "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8| - "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12| - "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16| - "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20| - "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24| - "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28| - - "rotr $t0, $t0, 16 \n" // |1|0|3|2| - "rotr $t1, $t1, 16 \n" // |5|4|7|6| - "rotr $t2, $t2, 16 \n" // |9|8|11|10| - "rotr $t3, $t3, 16 \n" // |13|12|15|14| - "rotr $t4, $t4, 16 \n" // |17|16|19|18| - "rotr $t6, $t6, 16 \n" // |21|20|23|22| - "rotr $t7, $t7, 16 \n" // |25|24|27|26| - "rotr $t8, $t8, 16 \n" // |29|28|31|30| - "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6| - "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7| - "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14| - "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15| - "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22| - "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23| - "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30| - "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31| - "addiu %[src_uv], %[src_uv], -32 \n" - "addiu %[x], %[x], -1 \n" - "swr $t4, 0(%[dst_u]) \n" - 
"swl $t4, 3(%[dst_u]) \n" // |30|28|26|24| - "swr $t6, 0(%[dst_v]) \n" - "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25| - "swr $t2, 4(%[dst_u]) \n" - "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16| - "swr $t3, 4(%[dst_v]) \n" - "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17| - "swr $t0, 8(%[dst_u]) \n" - "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8| - "swr $t1, 8(%[dst_v]) \n" - "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9| - "swr $t9, 12(%[dst_u]) \n" - "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0| - "swr $t5, 12(%[dst_v]) \n" - "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1| - "addiu %[dst_v], %[dst_v], 16 \n" - "bgtz %[x], 1b \n" - " addiu %[dst_u], %[dst_u], 16 \n" - "beqz %[y], 3f \n" - " nop \n" - "b 2f \n" - " nop \n" - - "2: \n" - "lbu $t0, -2(%[src_uv]) \n" - "lbu $t1, -1(%[src_uv]) \n" - "addiu %[src_uv], %[src_uv], -2 \n" - "addiu %[y], %[y], -1 \n" - "sb $t0, 0(%[dst_u]) \n" - "sb $t1, 0(%[dst_v]) \n" - "addiu %[dst_u], %[dst_u], 1 \n" - "bgtz %[y], 2b \n" - " addiu %[dst_v], %[dst_v], 1 \n" - - "3: \n" - ".set pop \n" - : [src_uv] "+r" (src_uv), - [dst_u] "+r" (dst_u), - [dst_v] "+r" (dst_v), - [x] "=&r" (x), - [y] "=&r" (y) - : [width] "r" (width) - : "t0", "t1", "t2", "t3", "t4", - "t5", "t7", "t8", "t9" - ); -} - -// Convert (4 Y and 2 VU) I422 and arrange RGB values into -// t5 = | 0 | B0 | 0 | b0 | -// t4 = | 0 | B1 | 0 | b1 | -// t9 = | 0 | G0 | 0 | g0 | -// t8 = | 0 | G1 | 0 | g1 | -// t2 = | 0 | R0 | 0 | r0 | -// t1 = | 0 | R1 | 0 | r1 | -#define YUVTORGB \ - "lw $t0, 0(%[y_buf]) \n" \ - "lhu $t1, 0(%[u_buf]) \n" \ - "lhu $t2, 0(%[v_buf]) \n" \ - "preceu.ph.qbr $t1, $t1 \n" \ - "preceu.ph.qbr $t2, $t2 \n" \ - "preceu.ph.qbra $t3, $t0 \n" \ - "preceu.ph.qbla $t0, $t0 \n" \ - "subu.ph $t1, $t1, $s5 \n" \ - "subu.ph $t2, $t2, $s5 \n" \ - "subu.ph $t3, $t3, $s4 \n" \ - "subu.ph $t0, $t0, $s4 \n" \ - "mul.ph $t3, $t3, $s0 \n" \ - "mul.ph $t0, $t0, $s0 \n" \ - "shll.ph $t4, $t1, 0x7 \n" \ - "subu.ph $t4, $t4, $t1 \n" \ - "mul.ph $t6, $t1, $s1 \n" \ - "mul.ph $t1, $t2, $s2 \n" \ - "addq_s.ph $t5, $t4, $t3 \n" \ - "addq_s.ph $t4, $t4, $t0 \n" \ - "shra.ph $t5, $t5, 6 \n" \ - "shra.ph $t4, $t4, 6 \n" \ - "addiu %[u_buf], 2 \n" \ - "addiu %[v_buf], 2 \n" \ - "addu.ph $t6, $t6, $t1 \n" \ - "mul.ph $t1, $t2, $s3 \n" \ - "addu.ph $t9, $t6, $t3 \n" \ - "addu.ph $t8, $t6, $t0 \n" \ - "shra.ph $t9, $t9, 6 \n" \ - "shra.ph $t8, $t8, 6 \n" \ - "addu.ph $t2, $t1, $t3 \n" \ - "addu.ph $t1, $t1, $t0 \n" \ - "shra.ph $t2, $t2, 6 \n" \ - "shra.ph $t1, $t1, 6 \n" \ - "subu.ph $t5, $t5, $s5 \n" \ - "subu.ph $t4, $t4, $s5 \n" \ - "subu.ph $t9, $t9, $s5 \n" \ - "subu.ph $t8, $t8, $s5 \n" \ - "subu.ph $t2, $t2, $s5 \n" \ - "subu.ph $t1, $t1, $s5 \n" \ - "shll_s.ph $t5, $t5, 8 \n" \ - "shll_s.ph $t4, $t4, 8 \n" \ - "shll_s.ph $t9, $t9, 8 \n" \ - "shll_s.ph $t8, $t8, 8 \n" \ - "shll_s.ph $t2, $t2, 8 \n" \ - "shll_s.ph $t1, $t1, 8 \n" \ - "shra.ph $t5, $t5, 8 \n" \ - "shra.ph $t4, $t4, 8 \n" \ - "shra.ph $t9, $t9, 8 \n" \ - "shra.ph $t8, $t8, 8 \n" \ - "shra.ph $t2, $t2, 8 \n" \ - "shra.ph $t1, $t1, 8 \n" \ - "addu.ph $t5, $t5, $s5 \n" \ - "addu.ph $t4, $t4, $s5 \n" \ - "addu.ph $t9, $t9, $s5 \n" \ - "addu.ph $t8, $t8, $s5 \n" \ - "addu.ph $t2, $t2, $s5 \n" \ - "addu.ph $t1, $t1, $s5 \n" - -// TODO(fbarchard): accept yuv conversion constants. 
-void I422ToARGBRow_DSPR2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "beqz %[width], 2f \n" - " repl.ph $s0, 74 \n" // |YG|YG| = |74|74| - "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| - "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| - "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| - "repl.ph $s4, 16 \n" // |0|16|0|16| - "repl.ph $s5, 128 \n" // |128|128| // clipping - "lui $s6, 0xff00 \n" - "ori $s6, 0xff00 \n" // |ff|00|ff|00|ff| - - "1: \n" - YUVTORGB -// Arranging into argb format - "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1| - "precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0| - "addiu %[width], -4 \n" - "precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0| - "precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0| - "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0| - - "addiu %[y_buf], 4 \n" - "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0| - "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0| - "or $t1, $t1, $s6 \n" // |ff|R1|ff|R0| - "or $t2, $t2, $s6 \n" // |ff|r1|ff|r0| - "precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1| - "precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1| - "sll $t9, $t9, 16 \n" - "sll $t8, $t8, 16 \n" - "packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0| - "packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0| -// Store results. - "sw $t2, 0(%[rgb_buf]) \n" - "sw $t0, 4(%[rgb_buf]) \n" - "sw $t1, 8(%[rgb_buf]) \n" - "sw $t3, 12(%[rgb_buf]) \n" - "bnez %[width], 1b \n" - " addiu %[rgb_buf], 16 \n" - "2: \n" - ".set pop \n" - :[y_buf] "+r" (y_buf), - [u_buf] "+r" (u_buf), - [v_buf] "+r" (v_buf), - [width] "+r" (width), - [rgb_buf] "+r" (rgb_buf) - : - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9", - "s0", "s1", "s2", "s3", - "s4", "s5", "s6" - ); -} - -// Bilinear filter 8x2 -> 8x1 -void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - int y0_fraction = 256 - source_y_fraction; - const uint8* src_ptr1 = src_ptr + src_stride; - - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "replv.ph $t0, %[y0_fraction] \n" - "replv.ph $t1, %[source_y_fraction] \n" - - "1: \n" - "lw $t2, 0(%[src_ptr]) \n" - "lw $t3, 0(%[src_ptr1]) \n" - "lw $t4, 4(%[src_ptr]) \n" - "lw $t5, 4(%[src_ptr1]) \n" - "muleu_s.ph.qbl $t6, $t2, $t0 \n" - "muleu_s.ph.qbr $t7, $t2, $t0 \n" - "muleu_s.ph.qbl $t8, $t3, $t1 \n" - "muleu_s.ph.qbr $t9, $t3, $t1 \n" - "muleu_s.ph.qbl $t2, $t4, $t0 \n" - "muleu_s.ph.qbr $t3, $t4, $t0 \n" - "muleu_s.ph.qbl $t4, $t5, $t1 \n" - "muleu_s.ph.qbr $t5, $t5, $t1 \n" - "addq.ph $t6, $t6, $t8 \n" - "addq.ph $t7, $t7, $t9 \n" - "addq.ph $t2, $t2, $t4 \n" - "addq.ph $t3, $t3, $t5 \n" - "shra.ph $t6, $t6, 8 \n" - "shra.ph $t7, $t7, 8 \n" - "shra.ph $t2, $t2, 8 \n" - "shra.ph $t3, $t3, 8 \n" - "precr.qb.ph $t6, $t6, $t7 \n" - "precr.qb.ph $t2, $t2, $t3 \n" - "addiu %[src_ptr], %[src_ptr], 8 \n" - "addiu %[src_ptr1], %[src_ptr1], 8 \n" - "addiu %[dst_width], %[dst_width], -8 \n" - "sw $t6, 0(%[dst_ptr]) \n" - "sw $t2, 4(%[dst_ptr]) \n" - "bgtz %[dst_width], 1b \n" - " addiu %[dst_ptr], %[dst_ptr], 8 \n" - - ".set pop \n" - : [dst_ptr] "+r" (dst_ptr), - [src_ptr1] "+r" (src_ptr1), - [src_ptr] "+r" (src_ptr), - [dst_width] "+r" (dst_width) - : [source_y_fraction] "r" (source_y_fraction), - [y0_fraction] "r" (y0_fraction), - [src_stride] "r" (src_stride) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9" - ); -} -#endif // __mips_dsp_rev >= 
2 - -#endif // defined(__mips__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/files/source/row_msa.cc b/files/source/row_msa.cc new file mode 100644 index 00000000..f79de1c7 --- /dev/null +++ b/files/source/row_msa.cc @@ -0,0 +1,2977 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <string.h> + +#include "libyuv/row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define ALPHA_VAL (-1) + +// Fill YUV -> RGB conversion constants into vectors +#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \ + { \ + ub = __msa_fill_w(yuvconst->kUVToB[0]); \ + vr = __msa_fill_w(yuvconst->kUVToR[1]); \ + ug = __msa_fill_w(yuvconst->kUVToG[0]); \ + vg = __msa_fill_w(yuvconst->kUVToG[1]); \ + bb = __msa_fill_w(yuvconst->kUVBiasB[0]); \ + bg = __msa_fill_w(yuvconst->kUVBiasG[0]); \ + br = __msa_fill_w(yuvconst->kUVBiasR[0]); \ + yg = __msa_fill_w(yuvconst->kYToRgb[0]); \ + } + +// Load YUV 422 pixel data +#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ + { \ + uint64 y_m; \ + uint32 u_m, v_m; \ + v4i32 zero_m = {0}; \ + y_m = LD(psrc_y); \ + u_m = LW(psrc_u); \ + v_m = LW(psrc_v); \ + out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64)y_m); \ + out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32)u_m); \ + out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m); \ + } + +// Clip input vector elements between 0 to 255 +#define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \ + { \ + v4i32 max_m = __msa_ldi_w(0xFF); \ + \ + in0 = __msa_maxi_s_w(in0, 0); \ + in1 = __msa_maxi_s_w(in1, 0); \ + in2 = __msa_maxi_s_w(in2, 0); \ + in3 = __msa_maxi_s_w(in3, 0); \ + in4 = __msa_maxi_s_w(in4, 0); \ + in5 = __msa_maxi_s_w(in5, 0); \ + in0 = __msa_min_s_w(max_m, in0); \ + in1 = __msa_min_s_w(max_m, in1); \ + in2 = __msa_min_s_w(max_m, in2); \ + in3 = __msa_min_s_w(max_m, in3); \ + in4 = __msa_min_s_w(max_m, in4); \ + in5 = __msa_min_s_w(max_m, in5); \ + } + +// Convert 8 pixels of YUV 420 to RGB. 
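Before the conversion macro below, two of the helpers above are worth unpacking: YUVTORGB_SETUP is a lane-wise broadcast of each conversion constant, and CLIP_0TO255 is a lane-wise clamp. A portable model of both, assuming a plain 4-lane struct in place of the MSA v4i32 type (names illustrative):

    #include <stdint.h>

    typedef struct { int32_t lane[4]; } V4I32;  // stand-in for MSA v4i32

    // Model of __msa_fill_w as used by YUVTORGB_SETUP: broadcast one
    // conversion constant into every 32-bit lane.
    static V4I32 FillW(int32_t value) {
      V4I32 r;
      int i;
      for (i = 0; i < 4; ++i) r.lane[i] = value;
      return r;
    }

    // Model of CLIP_0TO255 applied to one vector: clamp every lane to
    // [0, 255], as __msa_maxi_s_w / __msa_min_s_w do above.
    static V4I32 Clip0To255(V4I32 v) {
      int i;
      for (i = 0; i < 4; ++i) {
        if (v.lane[i] < 0) v.lane[i] = 0;
        if (v.lane[i] > 255) v.lane[i] = 255;
      }
      return v;
    }

The YUVTORGB macro that follows builds on both.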
+#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \ + { \ + v8i16 vec0_m, vec1_m; \ + v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ + v4i32 reg5_m, reg6_m, reg7_m; \ + v16i8 zero_m = {0}; \ + \ + vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \ + vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \ + reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \ + reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \ + reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \ + reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \ + reg0_m *= yg; \ + reg1_m *= yg; \ + reg2_m *= ubvr; \ + reg3_m *= ubvr; \ + reg0_m = __msa_srai_w(reg0_m, 16); \ + reg1_m = __msa_srai_w(reg1_m, 16); \ + reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \ + reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \ + reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \ + reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \ + reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \ + reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \ + reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \ + reg5_m = reg0_m - reg5_m; \ + reg6_m = reg1_m - reg6_m; \ + reg2_m = reg0_m - reg2_m; \ + reg3_m = reg1_m - reg3_m; \ + reg7_m = reg0_m - reg7_m; \ + reg4_m = reg1_m - reg4_m; \ + reg5_m += bb; \ + reg6_m += bb; \ + reg7_m += bg; \ + reg4_m += bg; \ + reg2_m += br; \ + reg3_m += br; \ + reg5_m = __msa_srai_w(reg5_m, 6); \ + reg6_m = __msa_srai_w(reg6_m, 6); \ + reg7_m = __msa_srai_w(reg7_m, 6); \ + reg4_m = __msa_srai_w(reg4_m, 6); \ + reg2_m = __msa_srai_w(reg2_m, 6); \ + reg3_m = __msa_srai_w(reg3_m, 6); \ + CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \ + out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \ + out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \ + out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ + } + +// Pack and Store 8 ARGB values. +#define STOREARGB(in0, in1, in2, in3, pdst_argb) \ + { \ + v8i16 vec0_m, vec1_m; \ + v16u8 dst0_m, dst1_m; \ + vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ + vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ + dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \ + dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m); \ + ST_UB2(dst0_m, dst1_m, pdst_argb, 16); \ + } + +// Takes ARGB input and calculates Y. 
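YUVTORGB above leaves 8 clamped B, G and R values in 16-bit lanes; STOREARGB then merges them with the alpha lane into 8 ARGB pixels via two byte interleaves and two halfword interleaves. A scalar picture of the result, assuming the component arrays as inputs (the ARGBTOY macro that follows goes the other direction, from ARGB back to luma):

    #include <stdint.h>

    // Scalar equivalent of STOREARGB: interleave 8 B, G, R, A values into
    // 8 pixels; libyuv ARGB memory order is B, G, R, A.
    static void StoreArgb8(const int16_t* b, const int16_t* g,
                           const int16_t* r, const uint8_t* a,
                           uint8_t* dst_argb) {
      int i;
      for (i = 0; i < 8; ++i) {
        dst_argb[4 * i + 0] = (uint8_t)b[i];
        dst_argb[4 * i + 1] = (uint8_t)g[i];
        dst_argb[4 * i + 2] = (uint8_t)r[i];
        dst_argb[4 * i + 3] = a[i];
      }
    }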
+#define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \ + y_out) \ + { \ + v16u8 vec0_m, vec1_m, vec2_m, vec3_m; \ + v8u16 reg0_m, reg1_m; \ + \ + vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0); \ + vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2); \ + vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0); \ + vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2); \ + reg0_m = __msa_dotp_u_h(vec0_m, const0); \ + reg1_m = __msa_dotp_u_h(vec1_m, const0); \ + reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1); \ + reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1); \ + reg0_m += const2; \ + reg1_m += const2; \ + reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift); \ + reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift); \ + y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ + } + +// Loads current and next row of ARGB input and averages it to calculate U and V +#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3) \ + { \ + v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \ + v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v16u8 vec8_m, vec9_m; \ + v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \ + v8u16 reg8_m, reg9_m; \ + \ + src0_m = (v16u8)__msa_ld_b((v16i8*)s, 0); \ + src1_m = (v16u8)__msa_ld_b((v16i8*)s, 16); \ + src2_m = (v16u8)__msa_ld_b((v16i8*)s, 32); \ + src3_m = (v16u8)__msa_ld_b((v16i8*)s, 48); \ + src4_m = (v16u8)__msa_ld_b((v16i8*)t, 0); \ + src5_m = (v16u8)__msa_ld_b((v16i8*)t, 16); \ + src6_m = (v16u8)__msa_ld_b((v16i8*)t, 32); \ + src7_m = (v16u8)__msa_ld_b((v16i8*)t, 48); \ + vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ + vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ + vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ + vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \ + vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \ + vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \ + vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \ + vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \ + reg0_m = __msa_hadd_u_h(vec0_m, vec0_m); \ + reg1_m = __msa_hadd_u_h(vec1_m, vec1_m); \ + reg2_m = __msa_hadd_u_h(vec2_m, vec2_m); \ + reg3_m = __msa_hadd_u_h(vec3_m, vec3_m); \ + reg4_m = __msa_hadd_u_h(vec4_m, vec4_m); \ + reg5_m = __msa_hadd_u_h(vec5_m, vec5_m); \ + reg6_m = __msa_hadd_u_h(vec6_m, vec6_m); \ + reg7_m = __msa_hadd_u_h(vec7_m, vec7_m); \ + reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \ + reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \ + reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \ + reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \ + argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \ + argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ + src0_m = (v16u8)__msa_ld_b((v16i8*)s, 64); \ + src1_m = (v16u8)__msa_ld_b((v16i8*)s, 80); \ + src2_m = (v16u8)__msa_ld_b((v16i8*)s, 96); \ + src3_m = 
(v16u8)__msa_ld_b((v16i8*)s, 112); \ + src4_m = (v16u8)__msa_ld_b((v16i8*)t, 64); \ + src5_m = (v16u8)__msa_ld_b((v16i8*)t, 80); \ + src6_m = (v16u8)__msa_ld_b((v16i8*)t, 96); \ + src7_m = (v16u8)__msa_ld_b((v16i8*)t, 112); \ + vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ + vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ + vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ + vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \ + vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \ + vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \ + vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \ + vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \ + reg0_m = __msa_hadd_u_h(vec2_m, vec2_m); \ + reg1_m = __msa_hadd_u_h(vec3_m, vec3_m); \ + reg2_m = __msa_hadd_u_h(vec4_m, vec4_m); \ + reg3_m = __msa_hadd_u_h(vec5_m, vec5_m); \ + reg4_m = __msa_hadd_u_h(vec6_m, vec6_m); \ + reg5_m = __msa_hadd_u_h(vec7_m, vec7_m); \ + reg6_m = __msa_hadd_u_h(vec8_m, vec8_m); \ + reg7_m = __msa_hadd_u_h(vec9_m, vec9_m); \ + reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \ + reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \ + reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \ + reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \ + argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \ + argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ + } + +// Takes ARGB input and calculates U and V. 
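READ_ARGB above pulls 32 ARGB pixels from the current row (s) and the row below (t) and box-filters each 2x2 block per channel before the chroma dot products in the ARGBTOUV macro that follows, matching 4:2:0 subsampling. Per output sample, the hadd/pckev/pckod/srai sequence reduces to this scalar form (note the shift truncates rather than rounds):

    #include <stdint.h>

    // One channel of READ_ARGB's 2x2 average: s and t point at the same
    // byte offset in two adjacent rows; the horizontal neighbour is one
    // pixel (4 bytes) away.
    static uint8_t Average2x2(const uint8_t* s, const uint8_t* t) {
      return (uint8_t)((s[0] + s[4] + t[0] + t[4]) >> 2);
    }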
+#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ + shf0, shf1, shf2, shf3, v_out, u_out) \ + { \ + v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8u16 reg0_m, reg1_m, reg2_m, reg3_m; \ + \ + vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0); \ + vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2); \ + vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0); \ + vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2); \ + vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0); \ + vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2); \ + vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0); \ + vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2); \ + reg0_m = __msa_dotp_u_h(vec0_m, const1); \ + reg1_m = __msa_dotp_u_h(vec1_m, const1); \ + reg2_m = __msa_dotp_u_h(vec4_m, const1); \ + reg3_m = __msa_dotp_u_h(vec5_m, const1); \ + reg0_m += const3; \ + reg1_m += const3; \ + reg2_m += const3; \ + reg3_m += const3; \ + reg0_m -= __msa_dotp_u_h(vec2_m, const0); \ + reg1_m -= __msa_dotp_u_h(vec3_m, const0); \ + reg2_m -= __msa_dotp_u_h(vec6_m, const2); \ + reg3_m -= __msa_dotp_u_h(vec7_m, const2); \ + v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m); \ + u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \ + } + +// Load I444 pixel data +#define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ + { \ + uint64 y_m, u_m, v_m; \ + v2i64 zero_m = {0}; \ + y_m = LD(psrc_y); \ + u_m = LD(psrc_u); \ + v_m = LD(psrc_v); \ + out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64)y_m); \ + out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64)u_m); \ + out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64)v_m); \ + } + +void MirrorRow_MSA(const uint8* src, uint8* dst, int width) { + int x; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + src += width - 64; + + for (x = 0; x < width; x += 64) { + LD_UB4(src, 16, src3, src2, src1, src0); + VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); + VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); + ST_UB4(dst0, dst1, dst2, dst3, dst, 16); + dst += 64; + src -= 64; + } +} + +void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width) { + int x; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; + src += width * 4 - 64; + + for (x = 0; x < width; x += 16) { + LD_UB4(src, 16, src3, src2, src1, src0); + VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); + VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); + ST_UB4(dst0, dst1, dst2, dst3, dst, 16); + dst += 64; + src -= 64; + } +} + +void I422ToYUY2Row_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, + int width) { + int x; + v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; + v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3; + + for (x = 0; x < width; x += 32) { + src_u0 = LD_UB(src_u); + src_v0 = LD_UB(src_v); + LD_UB2(src_y, 16, src_y0, src_y1); + ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); + ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1); + ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3); + ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16); + src_u += 16; + src_v += 16; + src_y += 32; + dst_yuy2 += 64; + } +} + +void I422ToUYVYRow_MSA(const uint8* 
src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, + int width) { + int x; + v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; + v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3; + + for (x = 0; x < width; x += 32) { + src_u0 = LD_UB(src_u); + src_v0 = LD_UB(src_v); + LD_UB2(src_y, 16, src_y0, src_y1); + ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); + ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1); + ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3); + ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16); + src_u += 16; + src_v += 16; + src_y += 32; + dst_uyvy += 64; + } +} + +void I422ToARGBRow_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(vec0, vec1, vec2, alpha, rgb_buf); + src_y += 8; + src_u += 4; + src_v += 4; + rgb_buf += 32; + } +} + +void I422ToRGBARow_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(alpha, vec0, vec1, vec2, rgb_buf); + src_y += 8; + src_u += 4; + src_v += 4; + rgb_buf += 32; + } +} + +void I422AlphaToARGBRow_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + const uint8* src_a, + uint8* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int64 data_a; + v16u8 src0, src1, src2, src3; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v4i32 zero = {0}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + data_a = LD(src_a); + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3); + STOREARGB(vec0, vec1, vec2, src3, rgb_buf); + src_y += 8; + src_u += 4; + 
src_v += 4; + src_a += 8; + rgb_buf += 32; + } +} + +void I422ToRGB24Row_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + const struct YuvConstants* yuvconstants, + int32 width) { + int x; + int64 data_u, data_v; + v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 reg0, reg1, reg2, reg3; + v2i64 zero = {0}; + v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10}; + v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10}; + v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10, + 11, 29, 12, 13, 30, 14, 15, 31}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0); + data_u = LD(src_u); + data_v = LD(src_v); + src1 = (v16u8)__msa_insert_d(zero, 0, data_u); + src2 = (v16u8)__msa_insert_d(zero, 0, data_v); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8); + src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec3, vec4, vec5); + reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3); + reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2); + reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2); + ST_UB2(dst0, dst1, rgb_buf, 16); + ST_UB(dst2, (rgb_buf + 32)); + src_y += 16; + src_u += 8; + src_v += 8; + rgb_buf += 48; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. +void I422ToRGB565Row_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec2, vec1); + vec0 = __msa_srai_h(vec0, 3); + vec1 = __msa_srai_h(vec1, 3); + vec2 = __msa_srai_h(vec2, 2); + vec1 = __msa_slli_h(vec1, 11); + vec2 = __msa_slli_h(vec2, 5); + vec0 |= vec1; + dst0 = (v16u8)(vec2 | vec0); + ST_UB(dst0, dst_rgb565); + src_y += 8; + src_u += 4; + src_v += 4; + dst_rgb565 += 16; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. 
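Both TODOs here suggest isolating the upper component bits with an AND mask instead of a shift pair. For reference, the per-pixel packings produced by I422ToRGB565Row_MSA above and the I422ToARGB4444Row_MSA that follows are the standard ones; a scalar sketch of what the lane-wise slli/srai/OR sequences compute:

    #include <stdint.h>

    // RGB565: red in bits 15..11, green in 10..5, blue in 4..0.
    static uint16_t PackRGB565(uint8_t r, uint8_t g, uint8_t b) {
      return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
    }

    // ARGB4444: alpha in bits 15..12, then R, G, B nibbles. The MSA row
    // below ORs in const_0xF000, i.e. alpha is forced fully opaque.
    static uint16_t PackARGB4444(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
      return (uint16_t)(((a >> 4) << 12) | ((r >> 4) << 8) |
                        ((g >> 4) << 4) | (b >> 4));
    }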
+void I422ToARGB4444Row_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0; + v8i16 vec0, vec1, vec2; + v8u16 reg0, reg1, reg2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + reg0 = (v8u16)__msa_srai_h(vec0, 4); + reg1 = (v8u16)__msa_srai_h(vec1, 4); + reg2 = (v8u16)__msa_srai_h(vec2, 4); + reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4); + reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8); + reg1 |= const_0xF000; + reg0 |= reg2; + dst0 = (v16u8)(reg1 | reg0); + ST_UB(dst0, dst_argb4444); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb4444 += 16; + } +} + +void I422ToARGB1555Row_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0; + v8i16 vec0, vec1, vec2; + v8u16 reg0, reg1, reg2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + reg0 = (v8u16)__msa_srai_h(vec0, 3); + reg1 = (v8u16)__msa_srai_h(vec1, 3); + reg2 = (v8u16)__msa_srai_h(vec2, 3); + reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5); + reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10); + reg1 |= const_0x8000; + reg0 |= reg2; + dst0 = (v16u8)(reg1 | reg0); + ST_UB(dst0, dst_argb1555); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb1555 += 16; + } +} + +void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_y, 16); + src_yuy2 += 64; + dst_y += 32; + } +} + +void YUY2ToUVRow_MSA(const uint8* src_yuy2, + int src_stride_yuy2, + uint8* dst_u, + uint8* dst_v, + int width) { + const uint8* src_yuy2_next = src_yuy2 + src_stride_yuy2; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7); + src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); + src3 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); + vec0 = 
__msa_aver_u_b(src0, src2); + vec1 = __msa_aver_u_b(src1, src3); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_yuy2 += 64; + src_yuy2_next += 64; + dst_u += 16; + dst_v += 16; + } +} + +void YUY2ToUV422Row_MSA(const uint8* src_yuy2, + uint8* dst_u, + uint8* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_yuy2 += 64; + dst_u += 16; + dst_v += 16; + } +} + +void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_y, 16); + src_uyvy += 64; + dst_y += 32; + } +} + +void UYVYToUVRow_MSA(const uint8* src_uyvy, + int src_stride_uyvy, + uint8* dst_u, + uint8* dst_v, + int width) { + const uint8* src_uyvy_next = src_uyvy + src_stride_uyvy; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7); + src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); + src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); + vec0 = __msa_aver_u_b(src0, src2); + vec1 = __msa_aver_u_b(src1, src3); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_uyvy += 64; + src_uyvy_next += 64; + dst_u += 16; + dst_v += 16; + } +} + +void UYVYToUV422Row_MSA(const uint8* src_uyvy, + uint8* dst_u, + uint8* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_uyvy += 64; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v16i8 zero = {0}; + v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); + v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); + v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec2 = 
(v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0); + reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1); + reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2); + reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3); + reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0); + reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1); + reg0 *= const_0x19; + reg1 *= const_0x19; + reg2 *= const_0x81; + reg3 *= const_0x81; + reg4 *= const_0x42; + reg5 *= const_0x42; + reg0 += reg2; + reg1 += reg3; + reg0 += reg4; + reg1 += reg5; + reg0 += const_0x1080; + reg1 += const_0x1080; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void ARGBToUVRow_MSA(const uint8* src_argb0, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width) { + int x; + const uint8* src_argb0_next = src_argb0 + src_stride_argb; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; + v16u8 dst0, dst1; + v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48); + src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64); + src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80); + src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96); + src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); + vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); + vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); + vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); + vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2); + reg0 = __msa_hadd_u_h(vec8, vec8); + reg1 = __msa_hadd_u_h(vec9, vec9); + reg2 = __msa_hadd_u_h(vec4, vec4); + reg3 = __msa_hadd_u_h(vec5, vec5); + reg4 = __msa_hadd_u_h(vec0, vec0); + reg5 = __msa_hadd_u_h(vec1, vec1); + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48); + src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64); + src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80); + src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96); + src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = 
+void ARGBToUVRow_MSA(const uint8* src_argb0,
+                     int src_stride_argb,
+                     uint8* dst_u,
+                     uint8* dst_v,
+                     int width) {
+  int x;
+  const uint8* src_argb0_next = src_argb0 + src_stride_argb;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+  v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
+  v16u8 dst0, dst1;
+  v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
+  v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
+  v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
+  v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
+  v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+  for (x = 0; x < width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
+    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
+    src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64);
+    src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80);
+    src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96);
+    src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
+    vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
+    vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
+    vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
+    vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+    vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
+    vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+    vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
+    reg0 = __msa_hadd_u_h(vec8, vec8);
+    reg1 = __msa_hadd_u_h(vec9, vec9);
+    reg2 = __msa_hadd_u_h(vec4, vec4);
+    reg3 = __msa_hadd_u_h(vec5, vec5);
+    reg4 = __msa_hadd_u_h(vec0, vec0);
+    reg5 = __msa_hadd_u_h(vec1, vec1);
+    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0);
+    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16);
+    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32);
+    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48);
+    src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64);
+    src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80);
+    src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96);
+    src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
+    vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
+    vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
+    vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
+    vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+    vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
+    vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+    vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
+    reg0 += __msa_hadd_u_h(vec8, vec8);
+    reg1 += __msa_hadd_u_h(vec9, vec9);
+    reg2 += __msa_hadd_u_h(vec4, vec4);
+    reg3 += __msa_hadd_u_h(vec5, vec5);
+    reg4 += __msa_hadd_u_h(vec0, vec0);
+    reg5 += __msa_hadd_u_h(vec1, vec1);
+    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2);
+    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2);
+    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2);
+    reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2);
+    reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2);
+    reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2);
+    reg6 = reg0 * const_0x70;
+    reg7 = reg1 * const_0x70;
+    reg8 = reg2 * const_0x4A;
+    reg9 = reg3 * const_0x4A;
+    reg6 += const_0x8080;
+    reg7 += const_0x8080;
+    reg8 += reg4 * const_0x26;
+    reg9 += reg5 * const_0x26;
+    reg0 *= const_0x12;
+    reg1 *= const_0x12;
+    reg2 *= const_0x5E;
+    reg3 *= const_0x5E;
+    reg4 *= const_0x70;
+    reg5 *= const_0x70;
+    reg2 += reg0;
+    reg3 += reg1;
+    reg4 += const_0x8080;
+    reg5 += const_0x8080;
+    reg6 -= reg8;
+    reg7 -= reg9;
+    reg4 -= reg2;
+    reg5 -= reg3;
+    reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8);
+    reg7 = (v8u16)__msa_srai_h((v8i16)reg7, 8);
+    reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8);
+    reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_argb0 += 128;
+    src_argb0_next += 128;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
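For reference, not part of the patch: ARGBToUVRow first box-averages each 2x2 block (the hadd_u_h pair sums plus the second row, then >> 2) and applies the BT.601 chroma weights 112/74/38 for U and 112/94/18 for V, biased by 0x8080 = (128 << 8) + 128. A scalar sketch on one averaged pixel (name hypothetical):

/* U = (112*B - 74*G - 38*R + 0x8080) >> 8
   V = (112*R - 94*G - 18*B + 0x8080) >> 8 */
static void RGBToUV_sketch(int r, int g, int b, uint8* u, uint8* v) {
  *u = (uint8)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
  *v = (uint8)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}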
+void ARGBToRGB24Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
+  v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20};
+  v16i8 shuffler1 = {5, 6, 8, 9, 10, 12, 13, 14,
+                     16, 17, 18, 20, 21, 22, 24, 25};
+  v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20,
+                     21, 22, 24, 25, 26, 28, 29, 30};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
+    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
+    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst_rgb, 16);
+    ST_UB(dst2, (dst_rgb + 32));
+    src_argb += 64;
+    dst_rgb += 48;
+  }
+}
+
+void ARGBToRAWRow_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
+  v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22};
+  v16i8 shuffler1 = {5, 4, 10, 9, 8, 14, 13, 12,
+                     18, 17, 16, 22, 21, 20, 26, 25};
+  v16i8 shuffler2 = {8, 14, 13, 12, 18, 17, 16, 22,
+                     21, 20, 26, 25, 24, 30, 29, 28};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
+    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
+    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst_rgb, 16);
+    ST_UB(dst2, (dst_rgb + 32));
+    src_argb += 64;
+    dst_rgb += 48;
+  }
+}
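For reference, not part of the patch: the two kernels above differ only in their shuffle tables. Both drop every fourth (alpha) byte; RGB24 keeps libyuv's little-endian B,G,R memory order while RAW emits R,G,B. Per pixel, as a scalar sketch (name hypothetical):

/* One ARGB pixel (bytes B,G,R,A in memory) to 3-byte RGB24 and RAW. */
static void ARGBTo24_sketch(const uint8* argb, uint8* rgb24, uint8* raw) {
  rgb24[0] = argb[0];  /* B */
  rgb24[1] = argb[1];  /* G */
  rgb24[2] = argb[2];  /* R */
  raw[0] = argb[2];    /* R */
  raw[1] = argb[1];    /* G */
  raw[2] = argb[0];    /* B */
}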
+void ARGBToRGB565Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  v16u8 src0, src1, dst0;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
+    vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3);
+    vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5);
+    vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3);
+    vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3);
+    vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5);
+    vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
+    vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
+    vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1);
+    vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
+    vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2);
+    vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2);
+    vec0 = __msa_binsli_b(vec0, vec1, 2);
+    vec1 = __msa_binsli_b(vec2, vec3, 4);
+    vec4 = __msa_binsli_b(vec4, vec5, 2);
+    vec5 = __msa_binsli_b(vec6, vec7, 4);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
+    vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4);
+    dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0);
+    ST_UB(dst0, dst_rgb);
+    src_argb += 32;
+    dst_rgb += 16;
+  }
+}
+
+void ARGBToARGB1555Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  v16u8 src0, src1, dst0;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
+    vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2);
+    vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3);
+    vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
+    vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
+    vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1);
+    vec5 = (v16u8)__msa_srai_b((v16i8)src1, 3);
+    vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2);
+    vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3);
+    vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
+    vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1);
+    vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1);
+    vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2);
+    vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2);
+    vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3);
+    vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3);
+    vec0 = __msa_binsli_b(vec0, vec1, 2);
+    vec5 = __msa_binsli_b(vec5, vec6, 2);
+    vec1 = __msa_binsli_b(vec2, vec3, 5);
+    vec6 = __msa_binsli_b(vec7, vec8, 5);
+    vec1 = __msa_binsli_b(vec1, vec4, 0);
+    vec6 = __msa_binsli_b(vec6, vec9, 0);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
+    vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5);
+    dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0);
+    ST_UB(dst0, dst_rgb);
+    src_argb += 32;
+    dst_rgb += 16;
+  }
+}
+
+void ARGBToARGB4444Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  v16u8 src0, src1;
+  v16u8 vec0, vec1;
+  v16u8 dst0;
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4);
+    vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4);
+    src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1);
+    src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1);
+    vec0 = __msa_binsli_b(vec0, src0, 3);
+    vec1 = __msa_binsli_b(vec1, src1, 3);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_rgb);
+    src_argb += 32;
+    dst_rgb += 16;
+  }
+}
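For reference, not part of the patch: the shift/sldi/binsli_b sequences above implement plain bit-field packing (binsli_b inserts the top bits of one operand into another). Target 16-bit layouts, high bit to low: RGB565 = rrrrrggggggbbbbb, ARGB1555 = arrrrrgggggbbbbb, ARGB4444 = aaaarrrrggggbbbb. A scalar sketch of the RGB565 case (name hypothetical; uint16 is libyuv's typedef):

static uint16 PackRGB565_sketch(uint8 b, uint8 g, uint8 r) {
  return (uint16)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}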
+void ARGBToUV444Row_MSA(const uint8* src_argb,
+                        uint8* dst_u,
+                        uint8* dst_v,
+                        int32 width) {
+  int32 x;
+  v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 vec8, vec9, vec10, vec11;
+  v8u16 const_112 = (v8u16)__msa_ldi_h(112);
+  v8u16 const_74 = (v8u16)__msa_ldi_h(74);
+  v8u16 const_38 = (v8u16)__msa_ldi_h(38);
+  v8u16 const_94 = (v8u16)__msa_ldi_h(94);
+  v8u16 const_18 = (v8u16)__msa_ldi_h(18);
+  v8u16 const_32896 = (v8u16)__msa_fill_h(32896);
+  v16i8 zero = {0};
+
+  for (x = width; x > 0; x -= 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
+    reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    src0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+    src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
+    src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0);
+    vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1);
+    vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2);
+    vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2);
+    vec10 = vec0 * const_18;
+    vec11 = vec1 * const_18;
+    vec8 = vec2 * const_94;
+    vec9 = vec3 * const_94;
+    vec6 = vec4 * const_112;
+    vec7 = vec5 * const_112;
+    vec0 *= const_112;
+    vec1 *= const_112;
+    vec2 *= const_74;
+    vec3 *= const_74;
+    vec4 *= const_38;
+    vec5 *= const_38;
+    vec8 += vec10;
+    vec9 += vec11;
+    vec6 += const_32896;
+    vec7 += const_32896;
+    vec0 += const_32896;
+    vec1 += const_32896;
+    vec2 += vec4;
+    vec3 += vec5;
+    vec0 -= vec2;
+    vec1 -= vec3;
+    vec6 -= vec8;
+    vec7 -= vec9;
+    vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
+    vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
+    vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8);
+    vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_argb += 64;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
+void ARGBMultiplyRow_MSA(const uint8* src_argb0,
+                         const uint8* src_argb1,
+                         uint8* dst_argb,
+                         int width) {
+  int x;
+  v16u8 src0, src1, dst0;
+  v8u16 vec0, vec1, vec2, vec3;
+  v4u32 reg0, reg1, reg2, reg3;
+  v8i16 zero = {0};
+
+  for (x = 0; x < width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
+    reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
+    reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
+    reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
+    reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
+    reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
+    reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
+    reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
+    reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
+    reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16);
+    reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16);
+    reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16);
+    reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_argb);
+    src_argb0 += 16;
+    src_argb1 += 16;
+    dst_argb += 16;
+  }
+}
+
+void ARGBAddRow_MSA(const uint8* src_argb0,
+                    const uint8* src_argb1,
+                    uint8* dst_argb,
+                    int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
+    dst0 = __msa_adds_u_b(src0, src2);
+    dst1 = __msa_adds_u_b(src1, src3);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb0 += 32;
+    src_argb1 += 32;
+    dst_argb += 32;
+  }
+}
+
+void ARGBSubtractRow_MSA(const uint8* src_argb0,
+                         const uint8* src_argb1,
+                         uint8* dst_argb,
+                         int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
+    dst0 = __msa_subs_u_b(src0, src2);
+    dst1 = __msa_subs_u_b(src1, src3);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb0 += 32;
+    src_argb1 += 32;
+    dst_argb += 32;
+  }
+}
+
+void ARGBAttenuateRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) {
+  int x;
+  v16u8 src0, src1, dst0, dst1;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+  v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v8i16 zero = {0};
+  v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1);
+    vec4 = (v8u16)__msa_fill_h(vec0[3]);
+    vec5 = (v8u16)__msa_fill_h(vec0[7]);
+    vec6 = (v8u16)__msa_fill_h(vec1[3]);
+    vec7 = (v8u16)__msa_fill_h(vec1[7]);
+    vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
+    vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
+    vec6 = (v8u16)__msa_fill_h(vec2[3]);
+    vec7 = (v8u16)__msa_fill_h(vec2[7]);
+    vec8 = (v8u16)__msa_fill_h(vec3[3]);
+    vec9 = (v8u16)__msa_fill_h(vec3[7]);
+    vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
+    vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
+    reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4);
+    reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4);
+    reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5);
+    reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5);
+    reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6);
+    reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6);
+    reg6 = (v4u32)__msa_ilvr_h(zero,
(v8i16)vec7); + reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7); + reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); + reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); + reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); + reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); + reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2); + reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2); + reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3); + reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3); + reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); + reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); + reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); + reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); + reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24); + reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24); + reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24); + reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); + vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst0 = __msa_bmnz_v(dst0, src0, mask); + dst1 = __msa_bmnz_v(dst1, src1, mask); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBToRGB565DitherRow_MSA(const uint8* src_argb, + uint8* dst_rgb, + uint32 dither4, + int width) { + int x; + v16u8 src0, src1, dst0, vec0, vec1; + v8i16 vec_d0; + v8i16 reg0, reg1, reg2; + v16i8 zero = {0}; + v8i16 max = __msa_ldi_h(0xFF); + + vec_d0 = (v8i16)__msa_fill_w(dither4); + vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0); + reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1); + reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0); + reg0 += vec_d0; + reg1 += vec_d0; + reg2 += vec_d0; + reg0 = __msa_maxi_s_h((v8i16)reg0, 0); + reg1 = __msa_maxi_s_h((v8i16)reg1, 0); + reg2 = __msa_maxi_s_h((v8i16)reg2, 0); + reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0); + reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1); + reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2); + reg0 = __msa_srai_h(reg0, 3); + reg2 = __msa_srai_h(reg2, 3); + reg1 = __msa_srai_h(reg1, 2); + reg2 = __msa_slli_h(reg2, 11); + reg1 = __msa_slli_h(reg1, 5); + reg0 |= reg1; + dst0 = (v16u8)(reg0 | reg2); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBShuffleRow_MSA(const uint8* src_argb, + uint8* dst_argb, + const uint8* shuffler, + int width) { + int x; + v16u8 src0, src1, dst0, dst1; + v16i8 vec0; + v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; + int32 val = LW((int32*)shuffler); + + vec0 = (v16i8)__msa_fill_w(val); + shuffler_vec += vec0; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); + dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBShadeRow_MSA(const uint8* src_argb, + uint8* dst_argb, + int width, + uint32 value) { + int x; + v16u8 src0, dst0; + v8u16 vec0, vec1; + v4u32 reg0, reg1, reg2, 
reg3, rgba_scale; + v8i16 zero = {0}; + + rgba_scale[0] = value; + rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale); + rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale); + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); + reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); + reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); + reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); + reg0 *= rgba_scale; + reg1 *= rgba_scale; + reg2 *= rgba_scale; + reg3 *= rgba_scale; + reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); + reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); + reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); + reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_argb); + src_argb += 16; + dst_argb += 16; + } +} + +void ARGBGrayRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) { + int x; + v16u8 src0, src1, vec0, vec1, dst0, dst1; + v8u16 reg0; + v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26); + v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); + vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); + vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); + reg0 = __msa_dotp_u_h(vec0, const_0x4B0F); + reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26); + reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7); + vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0); + vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBSepiaRow_MSA(uint8* dst_argb, int width) { + int x; + v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5; + v8u16 reg0, reg1, reg2; + v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411); + v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23); + v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816); + v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D); + v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218); + v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32); + v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16); + vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); + vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); + vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1); + reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411); + reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816); + reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218); + reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23); + reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D); + reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32); + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7); + reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF); + reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF); + vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0); + vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1); + 
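/* (editorial note, not part of the patch) At this point in ARGBSepiaRow
   reg0..reg2 hold the sepia B/G/R sums, roughly
   B' = (17*B + 68*G + 35*R) >> 7, G' = (22*B + 88*G + 45*R) >> 7 and
   R' = (24*B + 98*G + 50*R) >> 7 as in the C reference, with G'/R'
   clamped to 255 via min_u_h; the pckev_b/ilvr_b steps that follow
   re-interleave them with the original alpha. Reading the packed
   constants this way (e.g. 0x4411 = bytes 0x44, 0x11) is our
   interpretation, not documented in the source. */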
vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2); + vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); + vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4); + ST_UB2(dst0, dst1, dst_argb, 16); + dst_argb += 32; + } +} + +void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444, + uint8* dst_argb, + int width) { + int x; + v16u8 src0, src1; + v8u16 vec0, vec1, vec2, vec3; + v16u8 dst0, dst1, dst2, dst3; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 16); + vec0 = (v8u16)__msa_andi_b(src0, 0x0F); + vec1 = (v8u16)__msa_andi_b(src1, 0x0F); + vec2 = (v8u16)__msa_andi_b(src0, 0xF0); + vec3 = (v8u16)__msa_andi_b(src1, 0xF0); + vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4); + vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4); + vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4); + vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); + dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_argb4444 += 32; + dst_argb += 64; + } +} + +void ARGB1555ToARGBRow_MSA(const uint8* src_argb1555, + uint8* dst_argb, + int width) { + int x; + v8u16 src0, src1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6; + v16u8 dst0, dst1, dst2, dst3; + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_h((v8u16*)src_argb1555, 0); + src1 = (v8u16)__msa_ld_h((v8u16*)src_argb1555, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); + vec2 = src0 & const_0x1F; + vec3 = src1 & const_0x1F; + src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); + reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3); + reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3); + reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3); + reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2); + reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2); + reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2); + reg3 = -reg3; + reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4); + reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4); + reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5); + reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5); + dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1); + dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_argb1555 += 32; + dst_argb += 64; + } +} + +void RGB565ToARGBRow_MSA(const uint8* src_rgb565, uint8* dst_argb, int width) { + int x; + v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3; + v16u8 alpha 
= (v16u8)__msa_ldi_b(ALPHA_VAL); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); + v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_h((v8u16*)src_rgb565, 0); + src1 = (v8u16)__msa_ld_h((v8u16*)src_rgb565, 16); + vec0 = src0 & const_0x1F; + vec1 = src0 & const_0x7E0; + vec2 = src0 & const_0xF800; + vec3 = src1 & const_0x1F; + vec4 = src1 & const_0x7E0; + vec5 = src1 & const_0xF800; + reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); + reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); + reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8); + reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); + reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); + reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); + reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); + reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); + reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); + reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); + reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); + reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); + res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0); + res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1); + res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3); + res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2); + dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_rgb565 += 32; + dst_argb += 64; + } +} + +void RGB24ToARGBRow_MSA(const uint8* src_rgb24, uint8* dst_argb, int width) { + int x; + v16u8 src0, src1, src2; + v16u8 vec0, vec1, vec2; + v16u8 dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 32); + vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); + vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); + vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); + dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0); + dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1); + dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_rgb24 += 48; + dst_argb += 64; + } +} + +void RAWToARGBRow_MSA(const uint8* src_raw, uint8* dst_argb, int width) { + int x; + v16u8 src0, src1, src2; + v16u8 vec0, vec1, vec2; + v16u8 dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_raw, 32); + vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); + vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); + vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); + dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0); + dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1); + dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2); + ST_UB4(dst0, dst1, 
dst2, dst3, dst_argb, 16); + src_raw += 48; + dst_argb += 64; + } +} + +void ARGB1555ToYRow_MSA(const uint8* src_argb1555, uint8* dst_y, int width) { + int x; + v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v16u8 dst0; + v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); + v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); + v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((v8i16*)src_argb1555, 0); + src1 = (v8u16)__msa_ld_b((v8i16*)src_argb1555, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + vec2 = src0 & const_0x1F; + vec3 = src1 & const_0x1F; + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); + reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3); + reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2); + reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2); + reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3); + reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); + reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2); + reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2); + reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3); + reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3); + reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2); + reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2); + reg0 *= const_0x19; + reg1 *= const_0x19; + reg2 *= const_0x81; + reg3 *= const_0x81; + reg4 *= const_0x42; + reg5 *= const_0x42; + reg0 += reg2; + reg1 += reg3; + reg0 += reg4; + reg1 += reg5; + reg0 += const_0x1080; + reg1 += const_0x1080; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + ST_UB(dst0, dst_y); + src_argb1555 += 32; + dst_y += 16; + } +} + +void RGB565ToYRow_MSA(const uint8* src_rgb565, uint8* dst_y, int width) { + int x; + v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v4u32 res0, res1, res2, res3; + v16u8 dst0; + v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019); + v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042); + v8i16 const_0x1080 = __msa_fill_h(0x1080); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); + v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((v8i16*)src_rgb565, 0); + src1 = (v8u16)__msa_ld_b((v8i16*)src_rgb565, 16); + vec0 = src0 & const_0x1F; + vec1 = src0 & const_0x7E0; + vec2 = src0 & const_0xF800; + vec3 = src1 & const_0x1F; + vec4 = src1 & const_0x7E0; + vec5 = src1 & const_0xF800; + reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); + reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); + reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8); + reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); + reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); + reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); + reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); + reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); + reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); + reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); + reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); + reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); + vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, 
(v8i16)reg0); + vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3); + vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3); + vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2); + vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2); + vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5); + vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5); + res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019); + res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019); + res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019); + res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019); + res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042); + res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042); + res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042); + res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042); + res0 = (v4u32)__msa_srai_w((v4i32)res0, 8); + res1 = (v4u32)__msa_srai_w((v4i32)res1, 8); + res2 = (v4u32)__msa_srai_w((v4i32)res2, 8); + res3 = (v4u32)__msa_srai_w((v4i32)res3, 8); + vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0); + vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_y); + src_rgb565 += 32; + dst_y += 16; + } +} + +void RGB24ToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { + int x; + v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; + v8u16 vec0, vec1, vec2, vec3; + v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119); + v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; + v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, + 18, 19, 20, 21, 21, 22, 23, 24}; + v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; + v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); + reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); + reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); + reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); + vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); + vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8119); + vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119); + vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42); + vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42); + vec0 += const_0x1080; + vec1 += const_0x1080; + vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_y); + src_argb0 += 48; + dst_y += 16; + } +} + +void RAWToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { + int x; + v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; + v8u16 vec0, vec1, vec2, vec3; + v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142); + v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; + v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, + 18, 19, 20, 21, 21, 22, 23, 24}; + v16i8 mask2 = {8, 9, 10, 11, 
11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; + v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); + reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); + reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); + reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); + vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); + vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142); + vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142); + vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19); + vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19); + vec0 += const_0x1080; + vec1 += const_0x1080; + vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_y); + src_argb0 += 48; + dst_y += 16; + } +} + +void ARGB1555ToUVRow_MSA(const uint8* src_argb1555, + int src_stride_argb1555, + uint8* dst_u, + uint8* dst_v, + int width) { + int x; + const uint16* s = (const uint16*)src_argb1555; + const uint16* t = (const uint16*)(src_argb1555 + src_stride_argb1555); + int64_t res0, res1; + v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((v8i16*)s, 0); + src1 = (v8u16)__msa_ld_b((v8i16*)s, 16); + src2 = (v8u16)__msa_ld_b((v8i16*)t, 0); + src3 = (v8u16)__msa_ld_b((v8i16*)t, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + vec0 += src2 & const_0x1F; + vec1 += src3 & const_0x1F; + vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); + src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); + vec2 = src0 & const_0x1F; + vec3 = src1 & const_0x1F; + vec2 += src2 & const_0x1F; + vec3 += src3 & const_0x1F; + vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); + src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + vec4 += src2 & const_0x1F; + vec5 += src3 & const_0x1F; + vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1); + vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); + vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1); + vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); + vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1); + vec2 |= 
(v8u16)__msa_srai_h((v8i16)vec4, 6); + reg0 = vec6 * const_0x70; + reg1 = vec0 * const_0x4A; + reg2 = vec2 * const_0x70; + reg3 = vec0 * const_0x5E; + reg0 += const_0x8080; + reg1 += vec2 * const_0x26; + reg2 += const_0x8080; + reg3 += vec6 * const_0x12; + reg0 -= reg1; + reg2 -= reg3; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + s += 16; + t += 16; + dst_u += 8; + dst_v += 8; + } +} + +void RGB565ToUVRow_MSA(const uint8* src_rgb565, + int src_stride_rgb565, + uint8* dst_u, + uint8* dst_v, + int width) { + int x; + const uint16* s = (const uint16*)src_rgb565; + const uint16* t = (const uint16*)(src_rgb565 + src_stride_rgb565); + int64_t res0, res1; + v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); + v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((v8i16*)s, 0); + src1 = (v8u16)__msa_ld_b((v8i16*)s, 16); + src2 = (v8u16)__msa_ld_b((v8i16*)t, 0); + src3 = (v8u16)__msa_ld_b((v8i16*)t, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + vec0 += src2 & const_0x1F; + vec1 += src3 & const_0x1F; + vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); + src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); + vec2 = src0 & const_0x3F; + vec3 = src1 & const_0x3F; + vec2 += src2 & const_0x3F; + vec3 += src3 & const_0x3F; + vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 6); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 6); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 6); + src3 = (v8u16)__msa_srai_h((v8i16)src3, 6); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + vec4 += src2 & const_0x1F; + vec5 += src3 & const_0x1F; + vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1); + vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); + vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1); + vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); + reg0 = vec3 * const_0x70; + reg1 = vec1 * const_0x4A; + reg2 = vec4 * const_0x70; + reg3 = vec1 * const_0x5E; + reg0 += const_32896; + reg1 += vec4 * const_0x26; + reg2 += const_32896; + reg3 += vec3 * const_0x12; + reg0 -= reg1; + reg2 -= reg3; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + s += 16; + t += 16; + dst_u += 8; + dst_v += 8; + } +} + +void RGB24ToUVRow_MSA(const uint8* src_rgb0, + int src_stride_rgb, + uint8* dst_u, + uint8* dst_v, + int width) { + int x; + const uint8* s = src_rgb0; + const uint8* t = 
src_rgb0 + src_stride_rgb; + int64 res0, res1; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 reg0, reg1, reg2, reg3; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + inp0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + inp1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + inp2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + inp3 = (v16u8)__msa_ld_b((v16i8*)t, 0); + inp4 = (v16u8)__msa_ld_b((v16i8*)t, 16); + inp5 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); + src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); + src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); + src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); + src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); + src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); + src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); + src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); + src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); + src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); + src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); + src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); + src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); + src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); + vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); + vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); + vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); + vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); + vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); + reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); + reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); + reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); + reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); + reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); + reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); + reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); + reg0 = __msa_srai_h((v8i16)reg0, 2); + reg1 = __msa_srai_h((v8i16)reg1, 2); + reg2 = __msa_srai_h((v8i16)reg2, 2); + reg3 = __msa_srai_h((v8i16)reg3, 2); + vec4 = (v8u16)__msa_pckev_h(reg1, reg0); + vec5 = (v8u16)__msa_pckev_h(reg3, reg2); + vec6 = (v8u16)__msa_pckod_h(reg1, reg0); + vec7 = (v8u16)__msa_pckod_h(reg3, reg2); + vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); + vec1 = 
(v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); + vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); + vec3 = vec0 * const_0x70; + vec4 = vec1 * const_0x4A; + vec5 = vec2 * const_0x26; + vec2 *= const_0x70; + vec1 *= const_0x5E; + vec0 *= const_0x12; + reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); + reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); + reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); + reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); + reg0 += reg1; + reg2 += reg3; + reg0 = __msa_srai_h(reg0, 8); + reg2 = __msa_srai_h(reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + t += 48; + s += 48; + dst_u += 8; + dst_v += 8; + } +} + +void RAWToUVRow_MSA(const uint8* src_rgb0, + int src_stride_rgb, + uint8* dst_u, + uint8* dst_v, + int width) { + int x; + const uint8* s = src_rgb0; + const uint8* t = src_rgb0 + src_stride_rgb; + int64 res0, res1; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 reg0, reg1, reg2, reg3; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + inp0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + inp1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + inp2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + inp3 = (v16u8)__msa_ld_b((v16i8*)t, 0); + inp4 = (v16u8)__msa_ld_b((v16i8*)t, 16); + inp5 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); + src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); + src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); + src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); + src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); + src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); + src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); + src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); + src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); + src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); + src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); + src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); + src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); + src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); + vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); + vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); + vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); + vec6 = 
(v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); + vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); + reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); + reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); + reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); + reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); + reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); + reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); + reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); + reg0 = __msa_srai_h(reg0, 2); + reg1 = __msa_srai_h(reg1, 2); + reg2 = __msa_srai_h(reg2, 2); + reg3 = __msa_srai_h(reg3, 2); + vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); + vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); + vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); + vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); + vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); + vec3 = vec0 * const_0x70; + vec4 = vec1 * const_0x4A; + vec5 = vec2 * const_0x26; + vec2 *= const_0x70; + vec1 *= const_0x5E; + vec0 *= const_0x12; + reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); + reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); + reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); + reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); + reg0 += reg1; + reg2 += reg3; + reg0 = __msa_srai_h(reg0, 8); + reg2 = __msa_srai_h(reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + t += 48; + s += 48; + dst_u += 8; + dst_v += 8; + } +} + +void NV12ToARGBRow_MSA(const uint8* src_y, + const uint8* src_uv, + uint8* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint64 val0, val1; + v16u8 src0, src1, res0, res1, dst0, dst1; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 zero = {0}; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + val0 = LD(src_y); + val1 = LD(src_uv); + src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); + src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); + res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); + ST_UB2(dst0, dst1, rgb_buf, 16); + src_y += 8; + src_uv += 8; + rgb_buf += 32; + } +} + +void NV12ToRGB565Row_MSA(const uint8* src_y, + const uint8* src_uv, + uint8* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint64 val0, val1; + v16u8 src0, src1, dst0; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 zero = {0}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, 
(v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + val0 = LD(src_y); + val1 = LD(src_uv); + src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); + src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + vec0 = vec0 >> 3; + vec1 = (vec1 >> 2) << 5; + vec2 = (vec2 >> 3) << 11; + dst0 = (v16u8)(vec0 | vec1 | vec2); + ST_UB(dst0, rgb_buf); + src_y += 8; + src_uv += 8; + rgb_buf += 16; + } +} + +void NV21ToARGBRow_MSA(const uint8* src_y, + const uint8* src_vu, + uint8* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint64 val0, val1; + v16u8 src0, src1, res0, res1, dst0, dst1; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v16u8 zero = {0}; + v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + val0 = LD(src_y); + val1 = LD(src_vu); + src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); + src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); + src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); + res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); + ST_UB2(dst0, dst1, rgb_buf, 16); + src_y += 8; + src_vu += 8; + rgb_buf += 32; + } +} + +void SobelRow_MSA(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width) { + int x; + v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3; + v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16}; + v16i8 const_0x4 = __msa_ldi_b(0x4); + v16i8 mask1 = mask0 + const_0x4; + v16i8 mask2 = mask1 + const_0x4; + v16i8 mask3 = mask2 + const_0x4; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0); + vec0 = __msa_adds_u_b(src0, src1); + dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0); + dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0); + dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0); + dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_sobelx += 16; + src_sobely += 16; + dst_argb += 64; + } +} + +void SobelToPlaneRow_MSA(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_y, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 16); + dst0 = __msa_adds_u_b(src0, src2); + dst1 = __msa_adds_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_y, 16); + src_sobelx += 32; + src_sobely += 32; + dst_y += 32; + } +} + +void SobelXYRow_MSA(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width) { + int x; + v16u8 src0, src1, vec0, vec1, vec2; + 
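/* (editorial note, not part of the patch) SobelXYRow packs, per pixel,
   B = sobel_y, G = saturating sobel_x + sobel_y, R = sobel_x, A = 255,
   matching the C reference SobelXYRow_C; SobelRow instead replicates
   the saturating sum into all three color channels with A = 255, and
   SobelToPlaneRow stores only the sum. */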
v16u8 reg0, reg1, dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0); + vec0 = __msa_adds_u_b(src0, src1); + vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1); + vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1); + reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0); + reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1); + dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1); + dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2); + dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_sobelx += 16; + src_sobely += 16; + dst_argb += 64; + } +} + +void ARGBToYJRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F); + v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26); + v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void BGRAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200); + v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void ABGRToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142); + v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void RGBAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900); + v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void ARGBToUVJRow_MSA(const uint8* src_rgb0, + int 
src_stride_rgb, + uint8* dst_u, + uint8* dst_v, + int width) { + int x; + const uint8* s = src_rgb0; + const uint8* t = src_rgb0 + src_stride_rgb; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, vec2, vec3; + v16u8 dst0, dst1; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30}; + v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F); + v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14); + v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + src0 = __msa_aver_u_b(src0, src4); + src1 = __msa_aver_u_b(src1, src5); + src2 = __msa_aver_u_b(src2, src6); + src3 = __msa_aver_u_b(src3, src7); + src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); + src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2); + src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); + vec0 = __msa_aver_u_b(src4, src6); + vec1 = __msa_aver_u_b(src5, src7); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 64); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 80); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 96); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 112); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 64); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 80); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 96); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 112); + src0 = __msa_aver_u_b(src0, src4); + src1 = __msa_aver_u_b(src1, src5); + src2 = __msa_aver_u_b(src2, src6); + src3 = __msa_aver_u_b(src3, src7); + src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); + src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2); + src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); + vec2 = __msa_aver_u_b(src4, src6); + vec3 = __msa_aver_u_b(src5, src7); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54, + const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0, + dst1); + ST_UB(dst0, dst_v); + ST_UB(dst1, dst_u); + s += 128; + t += 128; + dst_v += 16; + dst_u += 16; + } +} + +void BGRAToUVRow_MSA(const uint8* src_rgb0, + int src_stride_rgb, + uint8* dst_u, + uint8* dst_v, + int width) { + int x; + const uint8* s = src_rgb0; + const uint8* t = src_rgb0 + src_stride_rgb; + v16u8 dst0, dst1, vec0, vec1, vec2, vec3; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29}; + v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); + v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000); + v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + READ_ARGB(s, t, vec0, vec1, vec2, vec3); + ARGBTOUV(vec0, vec1, vec2, 
vec3, const_0x125E, const_0x7000, const_0x264A, + const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, + dst1); + ST_UB(dst0, dst_v); + ST_UB(dst1, dst_u); + s += 128; + t += 128; + dst_v += 16; + dst_u += 16; + } +} + 
+void ABGRToUVRow_MSA(const uint8* src_rgb0, + int src_stride_rgb, + uint8* dst_u, + uint8* dst_v, + int width) { + int x; + const uint8* s = src_rgb0; + const uint8* t = src_rgb0 + src_stride_rgb; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30}; + v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26); + v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070); + v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + READ_ARGB(s, t, src0, src1, src2, src3); + ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E, + const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0, + dst1); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + s += 128; + t += 128; + dst_u += 16; + dst_v += 16; + } +} + 
+void RGBAToUVRow_MSA(const uint8* src_rgb0, + int src_stride_rgb, + uint8* dst_u, + uint8* dst_v, + int width) { + int x; + const uint8* s = src_rgb0; + const uint8* t = src_rgb0 + src_stride_rgb; + v16u8 dst0, dst1, vec0, vec1, vec2, vec3; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29}; + v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A); + v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000); + v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + READ_ARGB(s, t, vec0, vec1, vec2, vec3); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x264A, const_0x7000, const_0x125E, + const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, + dst1); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + s += 128; + t += 128; + dst_u += 16; + dst_v += 16; + } +} + 
+void I444ToARGBRow_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0, dst1; + v8u16 vec0, vec1, vec2; + v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8i16 zero = {0}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + + for (x = 0; x < width; x += 8) { + READI444(src_y, src_u, src_v, src0, src1, src2); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); + reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); + reg0 *= vec_yg; + reg1 *= vec_yg; + reg0 = __msa_srai_w(reg0, 16); + reg1 = __msa_srai_w(reg1, 16); + reg4 = reg0 + vec_br; + reg5 = reg1 + vec_br; + reg2 = reg0 + vec_bg; + reg3 = reg1 + vec_bg; + reg0 += vec_bb; + reg1 += vec_bb; + vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, 
(v16i8)src1); + vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2); + reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); + reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); + reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); + reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); + reg0 -= reg6 * vec_ub; + reg1 -= reg7 * vec_ub; + reg2 -= reg6 * vec_ug; + reg3 -= reg7 * vec_ug; + reg4 -= reg8 * vec_vr; + reg5 -= reg9 * vec_vr; + reg2 -= reg8 * vec_vg; + reg3 -= reg9 * vec_vg; + reg0 = __msa_srai_w(reg0, 6); + reg1 = __msa_srai_w(reg1, 6); + reg2 = __msa_srai_w(reg2, 6); + reg3 = __msa_srai_w(reg3, 6); + reg4 = __msa_srai_w(reg4, 6); + reg5 = __msa_srai_w(reg5, 6); + CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); + vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2); + dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0); + dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0); + ST_UB2(dst0, dst1, rgb_buf, 16); + src_y += 8; + src_u += 8; + src_v += 8; + rgb_buf += 32; + } +} + +void I400ToARGBRow_MSA(const uint8* src_y, uint8* rgb_buf, int width) { + int x; + v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3; + v8i16 vec0, vec1; + v4i32 reg0, reg1, reg2, reg3; + v4i32 vec_yg = __msa_fill_w(0x4A35); + v8i16 vec_ygb = __msa_fill_h(0xFB78); + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8i16 max = __msa_ldi_h(0xFF); + v8i16 zero = {0}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_y, 0); + vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + reg0 = (v4i32)__msa_ilvr_h(zero, vec0); + reg1 = (v4i32)__msa_ilvl_h(zero, vec0); + reg2 = (v4i32)__msa_ilvr_h(zero, vec1); + reg3 = (v4i32)__msa_ilvl_h(zero, vec1); + reg0 *= vec_yg; + reg1 *= vec_yg; + reg2 *= vec_yg; + reg3 *= vec_yg; + reg0 = __msa_srai_w(reg0, 16); + reg1 = __msa_srai_w(reg1, 16); + reg2 = __msa_srai_w(reg2, 16); + reg3 = __msa_srai_w(reg3, 16); + vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec0 += vec_ygb; + vec1 += vec_ygb; + vec0 = __msa_srai_h(vec0, 6); + vec1 = __msa_srai_h(vec1, 6); + vec0 = __msa_maxi_s_h(vec0, 0); + vec1 = __msa_maxi_s_h(vec1, 0); + vec0 = __msa_min_s_h(max, vec0); + vec1 = __msa_min_s_h(max, vec1); + res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0); + res2 = (v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0); + res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0); + res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1); + dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2); + dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2); + ST_UB4(dst0, dst1, dst2, dst3, rgb_buf, 16); + src_y += 16; + rgb_buf += 64; + } +} + +void J400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width) { + int x; + v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_y, 0); + vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + vec2 = 
(v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0); + vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); + dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_y += 16; + dst_argb += 64; + } +} + +void YUY2ToARGBRow_MSA(const uint8* src_yuy2, + uint8* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_yuy2, 0); + src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); + src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); + YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(vec0, vec1, vec2, alpha, rgb_buf); + src_yuy2 += 16; + rgb_buf += 32; + } +} + +void UYVYToARGBRow_MSA(const uint8* src_uyvy, + uint8* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_uyvy, 0); + src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); + src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); + YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(vec0, vec1, vec2, alpha, rgb_buf); + src_uyvy += 16; + rgb_buf += 32; + } +} + +void InterpolateRow_MSA(uint8* dst_ptr, + const uint8* src_ptr, + ptrdiff_t src_stride, + int width, + int32 source_y_fraction) { + int32 y1_fraction = source_y_fraction; + int32 y0_fraction = 256 - y1_fraction; + uint16 y_fractions; + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, y_frac; + + if (0 == y1_fraction) { + memcpy(dst_ptr, src_ptr, width); + return; + } + + if (128 == y1_fraction) { + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); + dst0 = __msa_aver_u_b(src0, src2); + dst1 = __msa_aver_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_ptr, 16); + s += 32; + t += 32; + dst_ptr += 32; + } + return; + } + + y_fractions = (uint16)(y0_fraction + (y1_fraction << 8)); + y_frac = (v8u16)__msa_fill_h(y_fractions); + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = 
(v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac); + vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac); + vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac); + vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac); + vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8); + vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8); + vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + ST_UB2(dst0, dst1, dst_ptr, 16); + s += 32; + t += 32; + dst_ptr += 32; + } +} + +void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int width) { + int x; + v16u8 dst0 = (v16u8)__msa_fill_w(v32); + + for (x = 0; x < width; x += 4) { + ST_UB(dst0, dst_argb); + dst_argb += 16; + } +} + +void RAWToRGB24Row_MSA(const uint8* src_raw, uint8* dst_rgb24, int width) { + int x; + v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; + v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17}; + v16i8 shuffler1 = {8, 7, 12, 11, 10, 15, 14, 13, + 18, 17, 16, 21, 20, 19, 24, 23}; + v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25, + 24, 23, 28, 27, 26, 31, 30, 29}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_raw, 32); + src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8); + src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1); + ST_UB2(dst0, dst1, dst_rgb24, 16); + ST_UB(dst2, (dst_rgb24 + 32)); + src_raw += 48; + dst_rgb24 += 48; + } +} + +void MergeUVRow_MSA(const uint8* src_u, + const uint8* src_v, + uint8* dst_uv, + int width) { + int x; + v16u8 src0, src1, dst0, dst1; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_u, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_v, 0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0); + ST_UB2(dst0, dst1, dst_uv, 16); + src_u += 16; + src_v += 16; + dst_uv += 32; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/files/source/row_neon.cc b/files/source/row_neon.cc index 909df060..bed14e07 100644 --- a/files/source/row_neon.cc +++ b/files/source/row_neon.cc @@ -10,6 +10,8 @@ #include "libyuv/row.h" +#include <stdio.h> + #ifdef __cplusplus namespace libyuv { extern "C" { @@ -20,29 +22,18 @@ extern "C" { !defined(__aarch64__) // Read 8 Y, 4 U and 4 V from 422 -#define READYUV422 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ +#define READYUV422 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ MEMACCESS(1) \ "vld1.32 {d2[0]}, [%1]! \n" \ MEMACCESS(2) \ "vld1.32 {d2[1]}, [%2]! \n" -// Read 8 Y, 2 U and 2 V from 422 -#define READYUV411 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - MEMACCESS(1) \ - "vld1.16 {d2[0]}, [%1]! \n" \ - MEMACCESS(2) \ - "vld1.16 {d2[1]}, [%2]! \n" \ - "vmov.u8 d3, d2 \n" \ - "vzip.u8 d2, d3 \n" - // Read 8 Y, 8 U and 8 V from 444 -#define READYUV444 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! 
\n" \ +#define READYUV444 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ MEMACCESS(1) \ "vld1.8 {d2}, [%1]! \n" \ MEMACCESS(2) \ @@ -51,15 +42,15 @@ extern "C" { "vrshrn.u16 d2, q1, #1 \n" // Read 8 Y, and set 4 U and 4 V to 128 -#define READYUV400 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - "vmov.u8 d2, #128 \n" +#define READYUV400 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + "vmov.u8 d2, #128 \n" // Read 8 Y and 4 UV from NV12 -#define READNV12 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ +#define READNV12 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ MEMACCESS(1) \ "vld1.8 {d2}, [%1]! \n" \ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ @@ -67,9 +58,9 @@ extern "C" { "vtrn.u32 d2, d3 \n" // Read 8 Y and 4 VU from NV21 -#define READNV21 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ +#define READNV21 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ MEMACCESS(1) \ "vld1.8 {d2}, [%1]! \n" \ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ @@ -77,25 +68,25 @@ extern "C" { "vtrn.u32 d2, d3 \n" // Read 8 YUY2 -#define READYUY2 \ - MEMACCESS(0) \ - "vld2.8 {d0, d2}, [%0]! \n" \ - "vmov.u8 d3, d2 \n" \ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" +#define READYUY2 \ + MEMACCESS(0) \ + "vld2.8 {d0, d2}, [%0]! \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" // Read 8 UYVY -#define READUYVY \ - MEMACCESS(0) \ - "vld2.8 {d2, d3}, [%0]! \n" \ - "vmov.u8 d0, d3 \n" \ - "vmov.u8 d3, d2 \n" \ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" - -#define YUVTORGB_SETUP \ - MEMACCESS([kUVToRB]) \ - "vld1.8 {d24}, [%[kUVToRB]] \n" \ +#define READUYVY \ + MEMACCESS(0) \ + "vld2.8 {d2, d3}, [%0]! \n" \ + "vmov.u8 d0, d3 \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +#define YUVTORGB_SETUP \ + MEMACCESS([kUVToRB]) \ + "vld1.8 {d24}, [%[kUVToRB]] \n" \ MEMACCESS([kUVToG]) \ "vld1.8 {d25}, [%[kUVToG]] \n" \ MEMACCESS([kUVBiasBGR]) \ @@ -107,32 +98,32 @@ extern "C" { MEMACCESS([kYToRgb]) \ "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n" -#define YUVTORGB \ - "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */\ - "vmull.u8 q9, d2, d25 \n" /* u/v G component */\ - "vmovl.u8 q0, d0 \n" /* Y */\ - "vmovl.s16 q10, d1 \n" \ - "vmovl.s16 q0, d0 \n" \ - "vmul.s32 q10, q10, q15 \n" \ - "vmul.s32 q0, q0, q15 \n" \ - "vqshrun.s32 d0, q0, #16 \n" \ - "vqshrun.s32 d1, q10, #16 \n" /* Y */\ - "vadd.s16 d18, d19 \n" \ - "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */\ - "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */\ - "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/\ - "vaddw.u16 q1, q1, d16 \n" \ - "vaddw.u16 q10, q10, d17 \n" \ - "vaddw.u16 q3, q3, d18 \n" \ - "vqadd.s16 q8, q0, q13 \n" /* B */ \ - "vqadd.s16 q9, q0, q14 \n" /* R */ \ - "vqadd.s16 q0, q0, q4 \n" /* G */ \ - "vqadd.s16 q8, q8, q1 \n" /* B */ \ - "vqadd.s16 q9, q9, q10 \n" /* R */ \ - "vqsub.s16 q0, q0, q3 \n" /* G */ \ - "vqshrun.s16 d20, q8, #6 \n" /* B */ \ - "vqshrun.s16 d22, q9, #6 \n" /* R */ \ - "vqshrun.s16 d21, q0, #6 \n" /* G */ +#define YUVTORGB \ + "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */ \ + "vmull.u8 q9, d2, d25 \n" /* u/v G component */ \ + "vmovl.u8 q0, d0 \n" /* Y */ \ + "vmovl.s16 q10, d1 \n" \ + "vmovl.s16 q0, d0 \n" \ + "vmul.s32 q10, q10, q15 \n" \ + "vmul.s32 q0, q0, q15 \n" \ + "vqshrun.s32 d0, q0, #16 \n" \ + "vqshrun.s32 d1, q10, #16 \n" /* Y */ \ + "vadd.s16 d18, d19 \n" \ + "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */ \ + "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */ \ + "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/ \ + 
"vaddw.u16 q1, q1, d16 \n" \ + "vaddw.u16 q10, q10, d17 \n" \ + "vaddw.u16 q3, q3, d18 \n" \ + "vqadd.s16 q8, q0, q13 \n" /* B */ \ + "vqadd.s16 q9, q0, q14 \n" /* R */ \ + "vqadd.s16 q0, q0, q4 \n" /* G */ \ + "vqadd.s16 q8, q8, q1 \n" /* B */ \ + "vqadd.s16 q9, q9, q10 \n" /* R */ \ + "vqsub.s16 q0, q0, q3 \n" /* G */ \ + "vqshrun.s16 d20, q8, #6 \n" /* B */ \ + "vqshrun.s16 d22, q9, #6 \n" /* R */ \ + "vqshrun.s16 d21, q0, #6 \n" /* G */ void I444ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, @@ -227,36 +218,6 @@ void I422AlphaToARGBRow_NEON(const uint8* src_y, ); } -void I411ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READYUV411 - YUVTORGB - "subs %4, %4, #8 \n" - MEMACCESS(3) - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - void I422ToRGBARow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -316,12 +277,12 @@ void I422ToRGB24Row_NEON(const uint8* src_y, ); } -#define ARGBTORGB565 \ - "vshll.u8 q0, d22, #8 \n" /* R */ \ - "vshll.u8 q8, d21, #8 \n" /* G */ \ - "vshll.u8 q9, d20, #8 \n" /* B */ \ - "vsri.16 q0, q8, #5 \n" /* RG */ \ - "vsri.16 q0, q9, #11 \n" /* RGB */ +#define ARGBTORGB565 \ + "vshll.u8 q0, d22, #8 \n" /* R */ \ + "vshll.u8 q8, d21, #8 \n" /* G */ \ + "vshll.u8 q9, d20, #8 \n" /* B */ \ + "vsri.16 q0, q8, #5 \n" /* RG */ \ + "vsri.16 q0, q9, #11 \n" /* RGB */ void I422ToRGB565Row_NEON(const uint8* src_y, const uint8* src_u, @@ -353,14 +314,14 @@ void I422ToRGB565Row_NEON(const uint8* src_y, ); } -#define ARGBTOARGB1555 \ - "vshll.u8 q0, d23, #8 \n" /* A */ \ - "vshll.u8 q8, d22, #8 \n" /* R */ \ - "vshll.u8 q9, d21, #8 \n" /* G */ \ - "vshll.u8 q10, d20, #8 \n" /* B */ \ - "vsri.16 q0, q8, #1 \n" /* AR */ \ - "vsri.16 q0, q9, #6 \n" /* ARG */ \ - "vsri.16 q0, q10, #11 \n" /* ARGB */ +#define ARGBTOARGB1555 \ + "vshll.u8 q0, d23, #8 \n" /* A */ \ + "vshll.u8 q8, d22, #8 \n" /* R */ \ + "vshll.u8 q9, d21, #8 \n" /* G */ \ + "vshll.u8 q10, d20, #8 \n" /* B */ \ + "vsri.16 q0, q8, #1 \n" /* AR */ \ + "vsri.16 q0, q9, #6 \n" /* ARG */ \ + "vsri.16 q0, q10, #11 \n" /* ARGB */ void I422ToARGB1555Row_NEON(const uint8* src_y, const uint8* src_u, @@ -393,14 +354,14 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, ); } -#define ARGBTOARGB4444 \ - "vshr.u8 d20, d20, #4 \n" /* B */ \ - "vbic.32 d21, d21, d4 \n" /* G */ \ - "vshr.u8 d22, d22, #4 \n" /* R */ \ - "vbic.32 d23, d23, d4 \n" /* A */ \ - "vorr d0, d20, d21 \n" /* BG */ \ - "vorr d1, d22, d23 \n" /* RA */ \ - "vzip.u8 d0, d1 \n" /* BGRA */ +#define ARGBTOARGB4444 \ + "vshr.u8 d20, d20, #4 \n" /* B */ \ + "vbic.32 d21, d21, d4 \n" /* G */ \ + "vshr.u8 d22, d22, #4 \n" /* R */ \ + "vbic.32 d23, d23, d4 \n" /* A */ \ + "vorr d0, d20, d21 \n" /* BG */ \ + "vorr d1, d22, d23 \n" /* RA */ \ + "vzip.u8 d0, d1 \n" /* BGRA */ void I422ToARGB4444Row_NEON(const uint8* src_y, const uint8* src_u, @@ -434,9 +395,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, ); } -void I400ToARGBRow_NEON(const uint8* src_y, - uint8* dst_argb, - int 
width) { +void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( YUVTORGB_SETUP "vmov.u8 d23, #255 \n" @@ -459,9 +418,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, ); } -void J400ToARGBRow_NEON(const uint8* src_y, - uint8* dst_argb, - int width) { +void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( "vmov.u8 d23, #255 \n" "1: \n" @@ -618,7 +575,9 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, } // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. -void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_NEON(const uint8* src_uv, + uint8* dst_u, + uint8* dst_v, int width) { asm volatile ( "1: \n" @@ -640,7 +599,9 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, } // Reads 16 U's and V's and writes out 16 pairs of UV. -void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_NEON(const uint8* src_u, + const uint8* src_v, + uint8* dst_uv, int width) { asm volatile ( "1: \n" @@ -737,7 +698,9 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { ); } -void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void MirrorUVRow_NEON(const uint8* src_uv, + uint8* dst_u, + uint8* dst_v, int width) { asm volatile ( // Start at end of source row. @@ -844,17 +807,17 @@ void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { ); } -#define RGB565TOARGB \ - "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ - "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ - "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ - "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ - "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ - "vorr.u8 d0, d0, d4 \n" /* B */ \ - "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ - "vorr.u8 d2, d1, d5 \n" /* R */ \ - "vorr.u8 d1, d4, d6 \n" /* G */ +#define RGB565TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ + "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ + "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { asm volatile ( @@ -875,34 +838,35 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { ); } -#define ARGB1555TOARGB \ - "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ - "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ - "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ - "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ - "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ - "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ - "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ - "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ - "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ - "vorr.u8 q1, q1, q3 \n" /* R,A */ \ - "vorr.u8 q0, q0, q2 \n" /* B,G */ \ +#define ARGB1555TOARGB \ + "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ + "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ + "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ + "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ + "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ + "vneg.s8 d7, d7 \n" /* A AAAAAAAA 
upper 8 */ \ + "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ + "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ + "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ + "vorr.u8 q1, q1, q3 \n" /* R,A */ \ + "vorr.u8 q0, q0, q2 \n" /* B,G */ // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. -#define RGB555TOARGB \ - "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ - "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ - "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ - "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ - "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ - "vorr.u8 d0, d0, d4 \n" /* B */ \ - "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ - "vorr.u8 d2, d1, d5 \n" /* R */ \ - "vorr.u8 d1, d4, d6 \n" /* G */ - -void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, +#define RGB555TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ + "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ + "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, + uint8* dst_argb, int width) { asm volatile ( "vmov.u8 d3, #255 \n" // Alpha @@ -922,17 +886,18 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, ); } -#define ARGB4444TOARGB \ - "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ - "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ - "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ - "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ - "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ - "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ - "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ - "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ +#define ARGB4444TOARGB \ + "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ + "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ + "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ + "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ + "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ + "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ + "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ + "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ -void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, +void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, + uint8* dst_argb, int width) { asm volatile ( "vmov.u8 d3, #255 \n" // Alpha @@ -1021,7 +986,9 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { ); } -void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, +void YUY2ToUV422Row_NEON(const uint8* src_yuy2, + uint8* dst_u, + uint8* dst_v, int width) { asm volatile ( "1: \n" @@ -1042,7 +1009,9 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, ); } -void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, +void UYVYToUV422Row_NEON(const uint8* src_uyvy, + uint8* dst_u, + uint8* dst_v, int width) { asm volatile ( "1: \n" @@ -1063,8 +1032,11 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, ); } -void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +void YUY2ToUVRow_NEON(const uint8* src_yuy2, + int stride_yuy2, + uint8* dst_u, + 
uint8* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // stride + src_yuy2 "1: \n" @@ -1090,8 +1062,11 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, ); } -void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +void UYVYToUVRow_NEON(const uint8* src_uyvy, + int stride_uyvy, + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // stride + src_uyvy "1: \n" @@ -1118,8 +1093,10 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { +void ARGBShuffleRow_NEON(const uint8* src_argb, + uint8* dst_argb, + const uint8* shuffler, + int width) { asm volatile ( MEMACCESS(3) "vld1.8 {q2}, [%3] \n" // shuffler @@ -1143,7 +1120,8 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, void I422ToYUY2Row_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* dst_yuy2, int width) { + uint8* dst_yuy2, + int width) { asm volatile ( "1: \n" MEMACCESS(0) @@ -1169,7 +1147,8 @@ void I422ToYUY2Row_NEON(const uint8* src_y, void I422ToUYVYRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* dst_uyvy, int width) { + uint8* dst_uyvy, + int width) { asm volatile ( "1: \n" MEMACCESS(0) @@ -1210,8 +1189,10 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { ); } -void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) { +void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, + uint8* dst_rgb, + const uint32 dither4, + int width) { asm volatile ( "vdup.32 d2, %2 \n" // dither4 "1: \n" @@ -1233,7 +1214,8 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, ); } -void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, +void ARGBToARGB1555Row_NEON(const uint8* src_argb, + uint8* dst_argb1555, int width) { asm volatile ( "1: \n" @@ -1252,7 +1234,8 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, ); } -void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, +void ARGBToARGB4444Row_NEON(const uint8* src_argb, + uint8* dst_argb4444, int width) { asm volatile ( "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. @@ -1341,7 +1324,9 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { } // 8x1 pixels. -void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, +void ARGBToUV444Row_NEON(const uint8* src_argb, + uint8* dst_u, + uint8* dst_v, int width) { asm volatile ( "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient @@ -1381,85 +1366,31 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ); } -// 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. -void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width) { - asm volatile ( - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. 
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(0) - "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels. - MEMACCESS(0) - "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels. - "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts. - - "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts. - "vpadd.u16 d1, d8, d9 \n" // B - "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts. - "vpadd.u16 d3, d10, d11 \n" // G - "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts. - "vpadd.u16 d5, d12, d13 \n" // R - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %3, %3, #32 \n" // 32 processed per loop. - "vmul.s16 q8, q0, q10 \n" // B - "vmls.s16 q8, q1, q11 \n" // G - "vmls.s16 q8, q2, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q2, q10 \n" // R - "vmls.s16 q9, q1, q14 \n" // G - "vmls.s16 q9, q0, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. - MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -#define RGBTOUV(QB, QG, QR) \ - "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ - "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ - "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ - "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ - "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ - "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ - "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ - "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ - "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ - "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ +#define RGBTOUV(QB, QG, QR) \ + "vmul.s16 q8, " #QB \ + ", q10 \n" /* B */ \ + "vmls.s16 q8, " #QG \ + ", q11 \n" /* G */ \ + "vmls.s16 q8, " #QR \ + ", q12 \n" /* R */ \ + "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ + "vmul.s16 q9, " #QR \ + ", q10 \n" /* R */ \ + "vmls.s16 q9, " #QG \ + ", q14 \n" /* G */ \ + "vmls.s16 q9, " #QB \ + ", q13 \n" /* B */ \ + "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ + "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ + "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. -void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +void ARGBToUVRow_NEON(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1507,8 +1438,11 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, } // TODO(fbarchard): Subsample match C code. 
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +void ARGBToUVJRow_NEON(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient @@ -1555,8 +1489,11 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, ); } -void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width) { +void BGRAToUVRow_NEON(const uint8* src_bgra, + int src_stride_bgra, + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_bgra "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1603,8 +1540,11 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, ); } -void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width) { +void ABGRToUVRow_NEON(const uint8* src_abgr, + int src_stride_abgr, + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_abgr "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1651,8 +1591,11 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, ); } -void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width) { +void RGBAToUVRow_NEON(const uint8* src_rgba, + int src_stride_rgba, + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_rgba "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1699,8 +1642,11 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, ); } -void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width) { +void RGB24ToUVRow_NEON(const uint8* src_rgb24, + int src_stride_rgb24, + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_rgb24 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1747,8 +1693,11 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, ); } -void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width) { +void RAWToUVRow_NEON(const uint8* src_raw, + int src_stride_raw, + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_raw "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1796,8 +1745,11 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width) { +void RGB565ToUVRow_NEON(const uint8* src_rgb565, + int src_stride_rgb565, + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1865,8 +1817,11 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width) { +void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, + int src_stride_argb1555, + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1934,8 +1889,11 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width) { +void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, + int src_stride_argb4444, + uint8* dst_u, + uint8* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -2215,8 +2173,10 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { // Bilinear filter 16x2 -> 16x1 
void InterpolateRow_NEON(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { + const uint8* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { int y1_fraction = source_y_fraction; asm volatile ( "cmp %4, #0 \n" @@ -2280,8 +2240,10 @@ void InterpolateRow_NEON, } // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr 
-void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBBlendRow_NEON(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { asm volatile ( "subs %3, #8 \n" "blt 89f \n" @@ -2371,8 +2333,11 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { // Quantize 8 ARGB pixels (32 bytes). // dst = (dst * scale >> 16) * interval_size + interval_offset; 
-void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width) { +void ARGBQuantizeRow_NEON(uint8* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { asm volatile ( "vdup.u16 q8, %2 \n" "vshr.u16 q8, q8, #1 \n" // scale >>= 1 @@ -2414,7 +2379,9 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, // Shade 8 pixels at a time by specified value. // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8. // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. 
-void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, +void ARGBShadeRow_NEON(const uint8* src_argb, + uint8* dst_argb, + int width, uint32 value) { asm volatile ( "vdup.u32 q0, %3 \n" // duplicate scale value. @@ -2523,8 +2490,10 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { // Transform 8 ARGB pixels (32 bytes) with color matrix. // TODO(fbarchard): Was same as Sepia except matrix is provided. This function // needs to saturate. Consider doing a non-saturating version. 
-void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) { +void ARGBColorMatrixRow_NEON(const uint8* src_argb, + uint8* dst_argb, + const int8* matrix_argb, + int width) { asm volatile ( MEMACCESS(3) "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. @@ -2584,8 +2553,10 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, } // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. 
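ARGBMultiplyRow_NEON below pairs vmull.u8 with a rounding narrowing shift (vrshrn.u16 #8), so each channel comes out as roughly dst = (s0 * s1 + 128) >> 8. A one-line scalar sketch of that per-channel step (illustrative helper, not a libyuv entry point):

#include <stdint.h>

/* Sketch: widen, multiply, then narrow with rounding, as vmull.u8 followed
   by vrshrn.u16 #8 does per lane. */
static uint8_t MulChannel_C(uint8_t s0, uint8_t s1) {
  return (uint8_t)(((uint32_t)s0 * (uint32_t)s1 + 128) >> 8);
}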
-void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBMultiplyRow_NEON(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { asm volatile ( // 8 pixel loop. "1: \n" @@ -2616,8 +2587,10 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, } // Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBAddRow_NEON(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { asm volatile ( // 8 pixel loop. "1: \n" @@ -2642,8 +2615,10 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, } // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBSubtractRow_NEON(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { asm volatile ( // 8 pixel loop. "1: \n" @@ -2672,8 +2647,10 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // R = Sobel // G = Sobel // B = Sobel -void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { +void SobelRow_NEON(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width) { asm volatile ( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. @@ -2699,8 +2676,10 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, } // Adds Sobel X and Sobel Y and stores Sobel into plane. -void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width) { +void SobelToPlaneRow_NEON(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_y, + int width) { asm volatile ( // 16 pixel loop. "1: \n" @@ -2727,8 +2706,10 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, // R = Sobel X // G = Sobel // B = Sobel Y -void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { +void SobelXYRow_NEON(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width) { asm volatile ( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. @@ -2755,8 +2736,11 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, // -1 0 1 // -2 0 2 // -1 0 1 -void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width) { +void SobelXRow_NEON(const uint8* src_y0, + const uint8* src_y1, + const uint8* src_y2, + uint8* dst_sobelx, + int width) { asm volatile ( "1: \n" MEMACCESS(0) @@ -2798,8 +2782,10 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, // -1 -2 -1 // 0 0 0 // 1 2 1 -void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { +void SobelYRow_NEON(const uint8* src_y0, + const uint8* src_y1, + uint8* dst_sobely, + int width) { asm volatile ( "1: \n" MEMACCESS(0) @@ -2835,7 +2821,63 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, : "cc", "memory", "q0", "q1" // Clobber List ); } -#endif // defined(__ARM_NEON__) && !defined(__aarch64__) + +void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { + asm volatile ( + "vdup.32 q0, %3 \n" + + "1: \n" + MEMACCESS(0) + "vld1.8 {q1}, [%0]! 
\n" // load 8 shorts + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, q0 \n" // adjust exponent + "vmul.f32 q3, q3, q0 \n" + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d3, q3, #13 \n" + MEMACCESS(1) + "vst1.8 {q1}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(1.9259299444e-34f) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3" + ); +} + +// TODO(fbarchard): multiply by element. +void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) { + asm volatile ( + "vdup.32 q0, %3 \n" + + "1: \n" + MEMACCESS(0) + "vld1.8 {q1}, [%0]! \n" // load 8 shorts + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, q0 \n" // adjust exponent + "vmul.f32 q3, q3, q0 \n" + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d3, q3, #13 \n" + MEMACCESS(1) + "vst1.8 {q1}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(scale * 1.9259299444e-34f) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3" + ); +} + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__).. #ifdef __cplusplus } // extern "C" diff --git a/files/source/row_neon64.cc b/files/source/row_neon64.cc index 6375d4f5..ebd685e4 100644 --- a/files/source/row_neon64.cc +++ b/files/source/row_neon64.cc @@ -19,28 +19,18 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) // Read 8 Y, 4 U and 4 V from 422 -#define READYUV422 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ +#define READYUV422 \ + MEMACCESS(0) \ + "ld1 {v0.8b}, [%0], #8 \n" \ MEMACCESS(1) \ "ld1 {v1.s}[0], [%1], #4 \n" \ MEMACCESS(2) \ "ld1 {v1.s}[1], [%2], #4 \n" -// Read 8 Y, 2 U and 2 V from 422 -#define READYUV411 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - MEMACCESS(1) \ - "ld1 {v2.h}[0], [%1], #2 \n" \ - MEMACCESS(2) \ - "ld1 {v2.h}[1], [%2], #2 \n" \ - "zip1 v1.8b, v2.8b, v2.8b \n" - // Read 8 Y, 8 U and 8 V from 444 -#define READYUV444 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ +#define READYUV444 \ + MEMACCESS(0) \ + "ld1 {v0.8b}, [%0], #8 \n" \ MEMACCESS(1) \ "ld1 {v1.d}[0], [%1], #8 \n" \ MEMACCESS(2) \ @@ -49,15 +39,15 @@ extern "C" { "rshrn v1.8b, v1.8h, #1 \n" // Read 8 Y, and set 4 U and 4 V to 128 -#define READYUV400 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - "movi v1.8b , #128 \n" +#define READYUV400 \ + MEMACCESS(0) \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "movi v1.8b , #128 \n" // Read 8 Y and 4 UV from NV12 -#define READNV12 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ +#define READNV12 \ + MEMACCESS(0) \ + "ld1 {v0.8b}, [%0], #8 \n" \ MEMACCESS(1) \ "ld1 {v2.8b}, [%1], #8 \n" \ "uzp1 v1.8b, v2.8b, v2.8b \n" \ @@ -65,9 +55,9 @@ extern "C" { "ins v1.s[1], v3.s[0] \n" // Read 8 Y and 4 VU from NV21 -#define READNV21 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ +#define READNV21 \ + MEMACCESS(0) \ + "ld1 {v0.8b}, [%0], #8 \n" \ MEMACCESS(1) \ "ld1 {v2.8b}, [%1], #8 \n" \ "uzp1 v3.8b, v2.8b, v2.8b \n" \ @@ -75,57 +65,65 @@ extern "C" { "ins v1.s[1], v3.s[0] \n" // Read 8 YUY2 -#define READYUY2 \ - MEMACCESS(0) \ - "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \ - "uzp2 v3.8b, v1.8b, v1.8b \n" \ - "uzp1 v1.8b, v1.8b, v1.8b \n" \ - "ins v1.s[1], v3.s[0] \n" +#define READYUY2 \ + MEMACCESS(0) \ + "ld2 {v0.8b, v1.8b}, [%0], 
#16 \n" \ + "uzp2 v3.8b, v1.8b, v1.8b \n" \ + "uzp1 v1.8b, v1.8b, v1.8b \n" \ + "ins v1.s[1], v3.s[0] \n" // Read 8 UYVY -#define READUYVY \ - MEMACCESS(0) \ - "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \ - "orr v0.8b, v3.8b, v3.8b \n" \ - "uzp1 v1.8b, v2.8b, v2.8b \n" \ - "uzp2 v3.8b, v2.8b, v2.8b \n" \ - "ins v1.s[1], v3.s[0] \n" - -#define YUVTORGB_SETUP \ - "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \ - "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \ - "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \ - "ld1r {v31.4s}, [%[kYToRgb]] \n" \ - "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \ - "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n" - -#define YUVTORGB(vR, vG, vB) \ - "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ - "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ - "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ - "ushll v0.4s, v0.4h, #0 \n" \ - "mul v3.4s, v3.4s, v31.4s \n" \ - "mul v0.4s, v0.4s, v31.4s \n" \ - "sqshrun v0.4h, v0.4s, #16 \n" \ - "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \ - "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \ - "mov v2.d[0], v1.d[1] \n" /* Extract V */ \ - "uxtl v2.8h, v2.8b \n" \ - "uxtl v1.8h, v1.8b \n" /* Extract U */ \ - "mul v3.8h, v1.8h, v27.8h \n" \ - "mul v5.8h, v1.8h, v29.8h \n" \ - "mul v6.8h, v2.8h, v30.8h \n" \ - "mul v7.8h, v2.8h, v28.8h \n" \ - "sqadd v6.8h, v6.8h, v5.8h \n" \ - "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \ - "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \ - "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \ - "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \ - "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \ - "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \ - "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \ - "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \ - "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \ +#define READUYVY \ + MEMACCESS(0) \ + "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \ + "orr v0.8b, v3.8b, v3.8b \n" \ + "uzp1 v1.8b, v2.8b, v2.8b \n" \ + "uzp2 v3.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" + +#define YUVTORGB_SETUP \ + "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \ + "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \ + "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \ + "ld1r {v31.4s}, [%[kYToRgb]] \n" \ + "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \ + "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n" + +#define YUVTORGB(vR, vG, vB) \ + "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ + "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ + "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ + "ushll v0.4s, v0.4h, #0 \n" \ + "mul v3.4s, v3.4s, v31.4s \n" \ + "mul v0.4s, v0.4s, v31.4s \n" \ + "sqshrun v0.4h, v0.4s, #16 \n" \ + "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \ + "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \ + "mov v2.d[0], v1.d[1] \n" /* Extract V */ \ + "uxtl v2.8h, v2.8b \n" \ + "uxtl v1.8h, v1.8b \n" /* Extract U */ \ + "mul v3.8h, v1.8h, v27.8h \n" \ + "mul v5.8h, v1.8h, v29.8h \n" \ + "mul v6.8h, v2.8h, v30.8h \n" \ + "mul v7.8h, v2.8h, v28.8h \n" \ + "sqadd v6.8h, v6.8h, v5.8h \n" \ + "sqadd " #vB \ + ".8h, v24.8h, v0.8h \n" /* B */ \ + "sqadd " #vG \ + ".8h, v25.8h, v0.8h \n" /* G */ \ + "sqadd " #vR \ + ".8h, v26.8h, v0.8h \n" /* R */ \ + "sqadd " #vB ".8h, " #vB \ + ".8h, v3.8h \n" /* B */ \ + "sqsub " #vG ".8h, " #vG \ + ".8h, v6.8h \n" /* G */ \ + "sqadd " #vR ".8h, " #vR \ + ".8h, v7.8h \n" /* R */ \ + "sqshrun " #vB ".8b, " #vB \ + ".8h, #6 \n" /* B */ \ + "sqshrun " #vG ".8b, " #vG \ + ".8h, #6 \n" /* G */ \ + "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ void I444ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, @@ -220,36 +218,6 @@ void I422AlphaToARGBRow_NEON(const uint8* src_y, ); 
} -void I411ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" /* A */ - "1: \n" - READYUV411 - YUVTORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" - MEMACCESS(3) - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); -} - void I422ToRGBARow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -309,12 +277,12 @@ void I422ToRGB24Row_NEON(const uint8* src_y, ); } -#define ARGBTORGB565 \ - "shll v0.8h, v22.8b, #8 \n" /* R */ \ - "shll v21.8h, v21.8b, #8 \n" /* G */ \ - "shll v20.8h, v20.8b, #8 \n" /* B */ \ - "sri v0.8h, v21.8h, #5 \n" /* RG */ \ - "sri v0.8h, v20.8h, #11 \n" /* RGB */ +#define ARGBTORGB565 \ + "shll v0.8h, v22.8b, #8 \n" /* R */ \ + "shll v21.8h, v21.8b, #8 \n" /* G */ \ + "shll v20.8h, v20.8b, #8 \n" /* B */ \ + "sri v0.8h, v21.8h, #5 \n" /* RG */ \ + "sri v0.8h, v20.8h, #11 \n" /* RGB */ void I422ToRGB565Row_NEON(const uint8* src_y, const uint8* src_u, @@ -346,14 +314,14 @@ void I422ToRGB565Row_NEON(const uint8* src_y, ); } -#define ARGBTOARGB1555 \ - "shll v0.8h, v23.8b, #8 \n" /* A */ \ - "shll v22.8h, v22.8b, #8 \n" /* R */ \ - "shll v21.8h, v21.8b, #8 \n" /* G */ \ - "shll v20.8h, v20.8b, #8 \n" /* B */ \ - "sri v0.8h, v22.8h, #1 \n" /* AR */ \ - "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ - "sri v0.8h, v20.8h, #11 \n" /* ARGB */ +#define ARGBTOARGB1555 \ + "shll v0.8h, v23.8b, #8 \n" /* A */ \ + "shll v22.8h, v22.8b, #8 \n" /* R */ \ + "shll v21.8h, v21.8b, #8 \n" /* G */ \ + "shll v20.8h, v20.8b, #8 \n" /* B */ \ + "sri v0.8h, v22.8h, #1 \n" /* AR */ \ + "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ + "sri v0.8h, v20.8h, #11 \n" /* ARGB */ void I422ToARGB1555Row_NEON(const uint8* src_y, const uint8* src_u, @@ -386,15 +354,15 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, ); } -#define ARGBTOARGB4444 \ - /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ - "ushr v20.8b, v20.8b, #4 \n" /* B */ \ - "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ - "ushr v22.8b, v22.8b, #4 \n" /* R */ \ - "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ - "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ - "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ - "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ +#define ARGBTOARGB4444 \ + /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ + "ushr v20.8b, v20.8b, #4 \n" /* B */ \ + "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ + "ushr v22.8b, v22.8b, #4 \n" /* R */ \ + "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ + "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ + "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ + "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ void I422ToARGB4444Row_NEON(const uint8* src_y, const uint8* src_u, @@ -428,9 +396,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, ); } -void I400ToARGBRow_NEON(const uint8* src_y, - uint8* dst_argb, - int width) { +void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( YUVTORGB_SETUP "movi v23.8b, #255 \n" @@ -453,9 +419,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, ); } 
-void J400ToARGBRow_NEON(const uint8* src_y, - uint8* dst_argb, - int width) { +void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( "movi v23.8b, #255 \n" "1: \n" @@ -612,7 +576,9 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, } // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. -void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_NEON(const uint8* src_uv, + uint8* dst_u, + uint8* dst_v, int width) { asm volatile ( "1: \n" @@ -634,7 +600,9 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, } // Reads 16 U's and V's and writes out 16 pairs of UV. -void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_NEON(const uint8* src_u, + const uint8* src_v, + uint8* dst_uv, int width) { asm volatile ( "1: \n" @@ -728,7 +696,9 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { ); } -void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void MirrorUVRow_NEON(const uint8* src_uv, + uint8* dst_u, + uint8* dst_v, int width) { asm volatile ( // Start at end of source row. @@ -834,18 +804,18 @@ void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { ); } -#define RGB565TOARGB \ - "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ - "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ - "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ - "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ - "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ - "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ - "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ - "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ - "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ - "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ - "dup v2.2D, v0.D[1] \n" /* R */ +#define RGB565TOARGB \ + "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ + "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ + "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ + "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ + "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ + "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ + "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ + "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ + "dup v2.2D, v0.D[1] \n" /* R */ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { asm volatile ( @@ -866,44 +836,45 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { ); } -#define ARGB1555TOARGB \ - "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ - "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ - "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ - \ - "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ - "xtn2 v3.16b, v2.8h \n" \ - \ - "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ - "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ - \ - "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ - "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ - "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ - \ - "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ - "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ - "dup v1.2D, v0.D[1] \n" \ - "dup v3.2D, v2.D[1] \n" +#define ARGB1555TOARGB \ + "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ + "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ + "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ + \ + "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ + "xtn2 v3.16b, v2.8h \n" \ + \ + 
"xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ + \ + "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ + "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ + \ + "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ + "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ + "dup v1.2D, v0.D[1] \n" \ + "dup v3.2D, v2.D[1] \n" // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. -#define RGB555TOARGB \ - "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ - "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ - "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \ - \ - "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ - "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ - \ - "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ - "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ - "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ - \ - "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ - "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ - "dup v1.2D, v0.D[1] \n" /* G */ \ - -void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, +#define RGB555TOARGB \ + "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ + "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ + "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \ + \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ + \ + "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ + "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ + \ + "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ + "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ + "dup v1.2D, v0.D[1] \n" /* G */ + +void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, + uint8* dst_argb, int width) { asm volatile ( "movi v3.8b, #255 \n" // Alpha @@ -923,19 +894,20 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, ); } -#define ARGB4444TOARGB \ - "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ - "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ - "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ - "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ - "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ - "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ - "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ - "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ - "dup v0.2D, v2.D[1] \n" \ - "dup v1.2D, v3.D[1] \n" +#define ARGB4444TOARGB \ + "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ + "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ + "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ + "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ + "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ + "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ + "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ + "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ + "dup v0.2D, v2.D[1] \n" \ + "dup v1.2D, v3.D[1] \n" -void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, +void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, + uint8* dst_argb, int width) { asm volatile ( "1: \n" @@ -1024,7 +996,9 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { ); } -void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, +void YUY2ToUV422Row_NEON(const uint8* src_yuy2, + uint8* dst_u, + uint8* dst_v, int width) { asm volatile ( "1: \n" @@ -1045,7 +1019,9 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, ); } -void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, 
uint8* dst_v, +void UYVYToUV422Row_NEON(const uint8* src_uyvy, + uint8* dst_u, + uint8* dst_v, int width) { asm volatile ( "1: \n" @@ -1066,8 +1042,11 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, ); } -void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +void YUY2ToUVRow_NEON(const uint8* src_yuy2, + int stride_yuy2, + uint8* dst_u, + uint8* dst_v, + int width) { const uint8* src_yuy2b = src_yuy2 + stride_yuy2; asm volatile ( "1: \n" @@ -1094,8 +1073,11 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, ); } -void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +void UYVYToUVRow_NEON(const uint8* src_uyvy, + int stride_uyvy, + uint8* dst_u, + uint8* dst_v, + int width) { const uint8* src_uyvyb = src_uyvy + stride_uyvy; asm volatile ( "1: \n" @@ -1123,8 +1105,10 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { +void ARGBShuffleRow_NEON(const uint8* src_argb, + uint8* dst_argb, + const uint8* shuffler, + int width) { asm volatile ( MEMACCESS(3) "ld1 {v2.16b}, [%3] \n" // shuffler @@ -1147,7 +1131,8 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, void I422ToYUY2Row_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* dst_yuy2, int width) { + uint8* dst_yuy2, + int width) { asm volatile ( "1: \n" MEMACCESS(0) @@ -1174,7 +1159,8 @@ void I422ToYUY2Row_NEON(const uint8* src_y, void I422ToUYVYRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* dst_uyvy, int width) { + uint8* dst_uyvy, + int width) { asm volatile ( "1: \n" MEMACCESS(0) @@ -1216,8 +1202,10 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { ); } -void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) { +void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, + uint8* dst_rgb, + const uint32 dither4, + int width) { asm volatile ( "dup v1.4s, %w2 \n" // dither4 "1: \n" @@ -1239,7 +1227,8 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, ); } -void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, +void ARGBToARGB1555Row_NEON(const uint8* src_argb, + uint8* dst_argb1555, int width) { asm volatile ( "1: \n" @@ -1258,7 +1247,8 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, ); } -void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, +void ARGBToARGB4444Row_NEON(const uint8* src_argb, + uint8* dst_argb4444, int width) { asm volatile ( "movi v4.16b, #0x0f \n" // bits to clear with vbic. @@ -1346,7 +1336,9 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { } // 8x1 pixels. 
-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, +void ARGBToUV444Row_NEON(const uint8* src_argb, + uint8* dst_u, + uint8* dst_v, int width) { asm volatile ( "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient @@ -1387,83 +1379,41 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ); } -#define RGBTOUV_SETUP_REG \ - "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ - "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ - "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ - "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ - "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ - "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ - -// 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. -void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width) { - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - MEMACCESS(0) - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(0) - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16. - "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts. - "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts. - "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w3, %w3, #32 \n" // 32 processed per loop. - "mul v3.8h, v0.8h, v20.8h \n" // B - "mls v3.8h, v1.8h, v21.8h \n" // G - "mls v3.8h, v2.8h, v22.8h \n" // R - "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned - "mul v4.8h, v2.8h, v20.8h \n" // R - "mls v4.8h, v1.8h, v24.8h \n" // G - "mls v4.8h, v0.8h, v23.8h \n" // B - "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. - MEMACCESS(2) - "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} +#define RGBTOUV_SETUP_REG \ + "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ + "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ + "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ + "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ + "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ + "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
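For reference, a scalar C sketch of the fixed-point math the RGBTOUV macro below performs in each lane (an illustration, not libyuv's C fallback). Here b, g and r stand for the 16-bit 2x-scaled row averages the NEON code feeds in, which is why the multipliers are the usual 112/74/38 and 112/94/18 weights halved, as the RGBTOUV_SETUP_REG comments note; clamp_u8 is a hypothetical helper standing in for uqshrn's unsigned saturating narrow.

  static inline uint8 clamp_u8(int x) {  // hypothetical helper, models uqshrn
    return (uint8)(x < 0 ? 0 : x > 255 ? 255 : x);
  }
  static inline void RgbSumsToUV(int b, int g, int r, uint8* u, uint8* v) {
    *u = clamp_u8((56 * b - 37 * g - 19 * r + 0x8080) >> 8);  // v20/v21/v22 + v25
    *v = clamp_u8((56 * r - 47 * g - 9 * b + 0x8080) >> 8);   // v20/v24/v23 + v25
  }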
-#define RGBTOUV(QB, QG, QR) \ - "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ - "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ - "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ - "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ - "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ - "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ - "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ - "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ - "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ - "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ +#define RGBTOUV(QB, QG, QR) \ + "mul v3.8h, " #QB \ + ",v20.8h \n" /* B */ \ + "mul v4.8h, " #QR \ + ",v20.8h \n" /* R */ \ + "mls v3.8h, " #QG \ + ",v21.8h \n" /* G */ \ + "mls v4.8h, " #QG \ + ",v24.8h \n" /* G */ \ + "mls v3.8h, " #QR \ + ",v22.8h \n" /* R */ \ + "mls v4.8h, " #QB \ + ",v23.8h \n" /* B */ \ + "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ + "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ + "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ + "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. // TODO(fbarchard): consider ptrdiff_t for all strides. -void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +void ARGBToUVRow_NEON(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width) { const uint8* src_argb_1 = src_argb + src_stride_argb; asm volatile ( RGBTOUV_SETUP_REG @@ -1503,8 +1453,11 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, } // TODO(fbarchard): Subsample match C code. -void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +void ARGBToUVJRow_NEON(const uint8* src_argb, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width) { const uint8* src_argb_1 = src_argb + src_stride_argb; asm volatile ( "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 @@ -1547,8 +1500,11 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, ); } -void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width) { +void BGRAToUVRow_NEON(const uint8* src_bgra, + int src_stride_bgra, + uint8* dst_u, + uint8* dst_v, + int width) { const uint8* src_bgra_1 = src_bgra + src_stride_bgra; asm volatile ( RGBTOUV_SETUP_REG @@ -1586,8 +1542,11 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, ); } -void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width) { +void ABGRToUVRow_NEON(const uint8* src_abgr, + int src_stride_abgr, + uint8* dst_u, + uint8* dst_v, + int width) { const uint8* src_abgr_1 = src_abgr + src_stride_abgr; asm volatile ( RGBTOUV_SETUP_REG @@ -1625,8 +1584,11 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, ); } -void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width) { +void RGBAToUVRow_NEON(const uint8* src_rgba, + int src_stride_rgba, + uint8* dst_u, + uint8* dst_v, + int width) { const uint8* src_rgba_1 = src_rgba + src_stride_rgba; asm volatile ( RGBTOUV_SETUP_REG @@ -1664,8 +1626,11 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, ); } -void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width) { +void RGB24ToUVRow_NEON(const uint8* src_rgb24, + int src_stride_rgb24, + uint8* dst_u, + uint8* dst_v, + int width) { const uint8* 
src_rgb24_1 = src_rgb24 + src_stride_rgb24; asm volatile ( RGBTOUV_SETUP_REG @@ -1703,8 +1668,11 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, ); } -void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width) { +void RAWToUVRow_NEON(const uint8* src_raw, + int src_stride_raw, + uint8* dst_u, + uint8* dst_v, + int width) { const uint8* src_raw_1 = src_raw + src_stride_raw; asm volatile ( RGBTOUV_SETUP_REG @@ -1743,8 +1711,11 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width) { +void RGB565ToUVRow_NEON(const uint8* src_rgb565, + int src_stride_rgb565, + uint8* dst_u, + uint8* dst_v, + int width) { const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; asm volatile ( "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 @@ -1817,8 +1788,11 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width) { +void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, + int src_stride_argb1555, + uint8* dst_u, + uint8* dst_v, + int width) { const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; asm volatile ( RGBTOUV_SETUP_REG @@ -1886,8 +1860,11 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width) { +void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, + int src_stride_argb4444, + uint8* dst_u, + uint8* dst_v, + int width) { const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; asm volatile ( RGBTOUV_SETUP_REG @@ -2169,8 +2146,10 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { // Bilinear filter 16x2 -> 16x1 void InterpolateRow_NEON(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { + const uint8* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; const uint8* src_ptr1 = src_ptr + src_stride; @@ -2235,8 +2214,10 @@ void InterpolateRow_NEON(uint8* dst_ptr, } // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBBlendRow_NEON(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { asm volatile ( "subs %w3, %w3, #8 \n" "b.lt 89f \n" @@ -2331,8 +2312,11 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { // Quantize 8 ARGB pixels (32 bytes). // dst = (dst * scale >> 16) * interval_size + interval_offset; -void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width) { +void ARGBQuantizeRow_NEON(uint8* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { asm volatile ( "dup v4.8h, %w2 \n" "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 @@ -2374,7 +2358,9 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, // Shade 8 pixels at a time by specified value. 
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. -void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, +void ARGBShadeRow_NEON(const uint8* src_argb, + uint8* dst_argb, + int width, uint32 value) { asm volatile ( "dup v0.4s, %w3 \n" // duplicate scale value. @@ -2484,8 +2470,10 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { // Tranform 8 ARGB pixels (32 bytes) with color matrix. // TODO(fbarchard): Was same as Sepia except matrix is provided. This function // needs to saturate. Consider doing a non-saturating version. -void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) { +void ARGBColorMatrixRow_NEON(const uint8* src_argb, + uint8* dst_argb, + const int8* matrix_argb, + int width) { asm volatile ( MEMACCESS(3) "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. @@ -2546,8 +2534,10 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBMultiplyRow_NEON(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { asm volatile ( // 8 pixel loop. "1: \n" @@ -2578,8 +2568,10 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, } // Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBAddRow_NEON(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { asm volatile ( // 8 pixel loop. "1: \n" @@ -2606,8 +2598,10 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, } // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBSubtractRow_NEON(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { asm volatile ( // 8 pixel loop. "1: \n" @@ -2638,8 +2632,10 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // R = Sobel // G = Sobel // B = Sobel -void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { +void SobelRow_NEON(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width) { asm volatile ( "movi v3.8b, #255 \n" // alpha // 8 pixel loop. @@ -2665,8 +2661,10 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, } // Adds Sobel X and Sobel Y and stores Sobel into plane. -void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width) { +void SobelToPlaneRow_NEON(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_y, + int width) { asm volatile ( // 16 pixel loop. "1: \n" @@ -2693,8 +2691,10 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, // R = Sobel X // G = Sobel // B = Sobel Y -void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { +void SobelXYRow_NEON(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width) { asm volatile ( "movi v3.8b, #255 \n" // alpha // 8 pixel loop. 
@@ -2721,8 +2721,11 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, // -1 0 1 // -2 0 2 // -1 0 1 -void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width) { +void SobelXRow_NEON(const uint8* src_y0, + const uint8* src_y1, + const uint8* src_y2, + uint8* dst_sobelx, + int width) { asm volatile ( "1: \n" MEMACCESS(0) @@ -2764,8 +2767,10 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, // -1 -2 -1 // 0 0 0 // 1 2 1 -void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { +void SobelYRow_NEON(const uint8* src_y0, + const uint8* src_y1, + uint8* dst_sobely, + int width) { asm volatile ( "1: \n" MEMACCESS(0) @@ -2801,6 +2806,56 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } + +// Caveat - rounds float to half float whereas scaling version truncates. +void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fcvtn v1.4h, v2.4s \n" // 8 half floats + "fcvtn2 v1.8h, v3.4s \n" + MEMACCESS(1) + "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3" + ); +} + +void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent + "fmul v3.4s, v3.4s, %3.s[0] \n" + "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat + "uqshrn2 v1.8h, v3.4s, #13 \n" + MEMACCESS(1) + "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale * 1.9259299444e-34f) // %3 + : "cc", "memory", "v1", "v2", "v3" + ); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/files/source/row_win.cc b/files/source/row_win.cc index 2a3da896..202f2b8d 100644 --- a/files/source/row_win.cc +++ b/files/source/row_win.cc @@ -28,61 +28,60 @@ extern "C" { #if defined(_M_X64) // Read 4 UV from 422, upsample to 8 UV. -#define READYUV422 \ - xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \ - xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ - u_buf += 4; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; +#define READYUV422 \ + xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \ + xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + u_buf += 4; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + y_buf += 8; // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. 
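A scalar view of the 4:2:2 upsample that READYUV422 above performs (illustrative only; uv4 and uv8 are hypothetical buffers of interleaved U,V byte pairs): each chroma sample covers two luma pixels, so _mm_unpacklo_epi16(xmm0, xmm0) duplicates every 16-bit U,V pair, turning 4 pairs into 8.

  // uv4: 4 interleaved U,V pairs as uint16; uv8: 8 pairs out.
  for (int i = 0; i < 4; ++i) {
    uv8[2 * i + 0] = uv4[i];  // pair duplicated, as unpacklo_epi16(x, x) does
    uv8[2 * i + 1] = uv4[i];
  }

READYUVA422 below is the same load plus 8 alpha bytes pulled from a_buf.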
-#define READYUVA422 \ - xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \ - xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ - u_buf += 4; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; \ - xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ - a_buf += 8; +#define READYUVA422 \ + xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \ + xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + u_buf += 4; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + y_buf += 8; \ + xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ + a_buf += 8; // Convert 8 pixels: 8 UV and 8 Y. -#define YUVTORGB(yuvconstants) \ - xmm1 = _mm_loadu_si128(&xmm0); \ - xmm2 = _mm_loadu_si128(&xmm0); \ - xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \ - xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \ - xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \ - xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \ - xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \ - xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \ - xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ - xmm0 = _mm_adds_epi16(xmm0, xmm4); \ - xmm1 = _mm_adds_epi16(xmm1, xmm4); \ - xmm2 = _mm_adds_epi16(xmm2, xmm4); \ - xmm0 = _mm_srai_epi16(xmm0, 6); \ - xmm1 = _mm_srai_epi16(xmm1, 6); \ - xmm2 = _mm_srai_epi16(xmm2, 6); \ - xmm0 = _mm_packus_epi16(xmm0, xmm0); \ - xmm1 = _mm_packus_epi16(xmm1, xmm1); \ - xmm2 = _mm_packus_epi16(xmm2, xmm2); +#define YUVTORGB(yuvconstants) \ + xmm1 = _mm_loadu_si128(&xmm0); \ + xmm2 = _mm_loadu_si128(&xmm0); \ + xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \ + xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \ + xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \ + xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \ + xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \ + xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \ + xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ + xmm0 = _mm_adds_epi16(xmm0, xmm4); \ + xmm1 = _mm_adds_epi16(xmm1, xmm4); \ + xmm2 = _mm_adds_epi16(xmm2, xmm4); \ + xmm0 = _mm_srai_epi16(xmm0, 6); \ + xmm1 = _mm_srai_epi16(xmm1, 6); \ + xmm2 = _mm_srai_epi16(xmm2, 6); \ + xmm0 = _mm_packus_epi16(xmm0, xmm0); \ + xmm1 = _mm_packus_epi16(xmm1, xmm1); \ + xmm2 = _mm_packus_epi16(xmm2, xmm2); // Store 8 ARGB values. 
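Reading YUVTORGB(yuvconstants) above as scalar arithmetic (a model, not the exact C path; ub, vb, bias_b and yg stand for the per-channel entries of kUVToB, kUVBiasB and kYToRgb, whose values are not shown in this hunk):

  static inline uint8 Clamp8(int x) {
    return (uint8)(x < 0 ? 0 : x > 255 ? 255 : x);
  }
  // One channel, e.g. blue: maddubs gives u*ub + v*vb, the bias subtract
  // flips its sign, mulhi adds the Y contribution, srai drops 6 fraction
  // bits and packus saturates.
  static inline uint8 YuvPixelB(uint8 y, uint8 u, uint8 v,
                                int ub, int vb, int bias_b, int yg) {
    int y1 = (int)(((uint32)(y * 0x0101) * yg) >> 16);  // y was unpacked as y|y<<8
    return Clamp8((bias_b - (u * ub + v * vb) + y1) >> 6);
  }

The STOREARGB macro below then interleaves the packed B, G, R and A lanes and writes 32 bytes.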
-#define STOREARGB \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ - xmm1 = _mm_loadu_si128(&xmm0); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ - xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ - _mm_storeu_si128((__m128i *)dst_argb, xmm0); \ - _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); \ - dst_argb += 32; - +#define STOREARGB \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ + xmm1 = _mm_loadu_si128(&xmm0); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ + xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ + _mm_storeu_si128((__m128i*)dst_argb, xmm0); \ + _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \ + dst_argb += 32; #if defined(HAS_I422TOARGBROW_SSSE3) void I422ToARGBRow_SSSE3(const uint8* y_buf, @@ -127,175 +126,143 @@ void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, #ifdef HAS_ARGBTOYROW_SSSE3 // Constants for ARGB. -static const vec8 kARGBToY = { - 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 -}; +static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, + 13, 65, 33, 0, 13, 65, 33, 0}; // JPeg full range. -static const vec8 kARGBToYJ = { - 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 -}; +static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, + 15, 75, 38, 0, 15, 75, 38, 0}; -static const vec8 kARGBToU = { - 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 -}; +static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, + 112, -74, -38, 0, 112, -74, -38, 0}; -static const vec8 kARGBToUJ = { - 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 -}; +static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, + 127, -84, -43, 0, 127, -84, -43, 0}; static const vec8 kARGBToV = { - -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, }; -static const vec8 kARGBToVJ = { - -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 -}; +static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, + -20, -107, 127, 0, -20, -107, 127, 0}; // vpshufb for vphaddw + vpackuswb packed to shorts. static const lvec8 kShufARGBToUV_AVX = { - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 -}; + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; // Constants for BGRA. -static const vec8 kBGRAToY = { - 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 -}; +static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, + 0, 33, 65, 13, 0, 33, 65, 13}; -static const vec8 kBGRAToU = { - 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 -}; +static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, + 0, -38, -74, 112, 0, -38, -74, 112}; -static const vec8 kBGRAToV = { - 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 -}; +static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, + 0, 112, -94, -18, 0, 112, -94, -18}; // Constants for ABGR. 
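Each of these per-format tables encodes one pmaddubsw dot product. As a loosely held scalar reading of the ARGB case (a sketch only; the 7-bit shift and the kAddY16 offset happen in the vector bodies, which this hunk does not show):

  // Sketch: ((13*B + 65*G + 33*R) >> 7) + 16, i.e. BT.601 luma weights in
  // 7-bit fixed point plus the 16 video-range offset from kAddY16.
  static inline uint8 ArgbPixelToY(uint8 b, uint8 g, uint8 r) {
    return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
  }

The ABGR and RGBA tables that follow are the same weights permuted to match each byte order.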
-static const vec8 kABGRToY = { - 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 -}; +static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, + 33, 65, 13, 0, 33, 65, 13, 0}; -static const vec8 kABGRToU = { - -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 -}; +static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, + -38, -74, 112, 0, -38, -74, 112, 0}; -static const vec8 kABGRToV = { - 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 -}; +static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, + 112, -94, -18, 0, 112, -94, -18, 0}; // Constants for RGBA. -static const vec8 kRGBAToY = { - 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 -}; +static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, + 0, 13, 65, 33, 0, 13, 65, 33}; -static const vec8 kRGBAToU = { - 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 -}; +static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, + 0, 112, -74, -38, 0, 112, -74, -38}; -static const vec8 kRGBAToV = { - 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 -}; +static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, + 0, -18, -94, 112, 0, -18, -94, 112}; -static const uvec8 kAddY16 = { - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u -}; +static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; // 7 bit fixed point 0.5. -static const vec16 kAddYJ64 = { - 64, 64, 64, 64, 64, 64, 64, 64 -}; +static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; -static const uvec8 kAddUV128 = { - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; +static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; -static const uvec16 kAddUVJ128 = { - 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u -}; +static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, + 0x8080u, 0x8080u, 0x8080u, 0x8080u}; // Shuffle table for converting RGB24 to ARGB. static const uvec8 kShuffleMaskRGB24ToARGB = { - 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u -}; + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; // Shuffle table for converting RAW to ARGB. -static const uvec8 kShuffleMaskRAWToARGB = { - 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u -}; +static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, + 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; // Shuffle table for converting RAW to RGB24. First 8. static const uvec8 kShuffleMaskRAWToRGB24_0 = { - 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting RAW to RGB24. Middle 8. static const uvec8 kShuffleMaskRAWToRGB24_1 = { - 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting RAW to RGB24. Last 8. 
static const uvec8 kShuffleMaskRAWToRGB24_2 = { - 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGB to RGB24. static const uvec8 kShuffleMaskARGBToRGB24 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u -}; + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGB to RAW. static const uvec8 kShuffleMaskARGBToRAW = { - 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u -}; + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 static const uvec8 kShuffleMaskARGBToRGB24_0 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u -}; + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; // YUY2 shuf 16 Y to 32 Y. -static const lvec8 kShuffleYUY2Y = { - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 -}; +static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, + 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, + 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; // YUY2 shuf 8 UV to 16 UV. -static const lvec8 kShuffleYUY2UV = { - 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15, - 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15 -}; +static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, + 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, + 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; // UYVY shuf 16 Y to 32 Y. -static const lvec8 kShuffleUYVYY = { - 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15, - 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 -}; +static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, + 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, + 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; // UYVY shuf 8 UV to 16 UV. -static const lvec8 kShuffleUYVYUV = { - 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14, - 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 -}; +static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, + 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, + 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; // NV21 shuf 8 VU to 16 UV. static const lvec8 kShuffleNV21 = { - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, }; // Duplicates gray value 3 times and fills in alpha opaque. -__declspec(naked) -void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) { +__declspec(naked) void J400ToARGBRow_SSE2(const uint8* src_y, + uint8* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_y - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 convertloop: @@ -318,13 +285,14 @@ void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) { #ifdef HAS_J400TOARGBROW_AVX2 // Duplicates gray value 3 times and fills in alpha opaque. 
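In scalar terms the J400 conversion is one line per pixel (a sketch; y is the gray sample, using libyuv's uint8/uint32 typedefs):

  uint32 argb = (uint32)y | ((uint32)y << 8) | ((uint32)y << 16) | 0xff000000u;

which the AVX2 body below performs many pixels at a time against the 0xff000000 mask built in ymm5.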
-__declspec(naked) -void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) { - __asm { - mov eax, [esp + 4] // src_y - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 +__declspec(naked) void J400ToARGBRow_AVX2(const uint8* src_y, + uint8* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 vpslld ymm5, ymm5, 24 convertloop: @@ -348,13 +316,14 @@ void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) { } #endif // HAS_J400TOARGBROW_AVX2 -__declspec(naked) -void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) { +__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, + uint8* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_rgb24 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_rgb24 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB @@ -364,17 +333,17 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) { movdqu xmm3, [eax + 32] lea eax, [eax + 48] movdqa xmm2, xmm3 - palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} pshufb xmm2, xmm4 por xmm2, xmm5 - palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} pshufb xmm0, xmm4 movdqu [edx + 32], xmm2 por xmm0, xmm5 pshufb xmm1, xmm4 movdqu [edx], xmm0 por xmm1, xmm5 - palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} pshufb xmm3, xmm4 movdqu [edx + 16], xmm1 por xmm3, xmm5 @@ -386,14 +355,14 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) { } } -__declspec(naked) -void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, - int width) { +__declspec(naked) void RAWToARGBRow_SSSE3(const uint8* src_raw, + uint8* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_raw - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB @@ -403,17 +372,17 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, movdqu xmm3, [eax + 32] lea eax, [eax + 48] movdqa xmm2, xmm3 - palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} pshufb xmm2, xmm4 por xmm2, xmm5 - palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} pshufb xmm0, xmm4 movdqu [edx + 32], xmm2 por xmm0, xmm5 pshufb xmm1, xmm4 movdqu [edx], xmm0 por xmm1, xmm5 - palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} pshufb xmm3, xmm4 movdqu [edx + 16], xmm1 por xmm3, xmm5 @@ -425,11 +394,12 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, } } -__declspec(naked) -void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) { +__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8* src_raw, + uint8* dst_rgb24, + int width) { __asm { - mov eax, [esp + 4] // src_raw - mov edx, 
[esp + 8] // dst_rgb24 + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_rgb24 mov ecx, [esp + 12] // width movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0 movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1 @@ -460,9 +430,9 @@ void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) { // v * (256 + 8) // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 // 20 instructions. -__declspec(naked) -void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, - int width) { +__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, + uint8* dst_argb, + int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits movd xmm5, eax @@ -470,33 +440,33 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits movd xmm6, eax pshufd xmm6, xmm6, 0 - pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red psllw xmm3, 11 - pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green + pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green psllw xmm4, 10 psrlw xmm4, 5 - pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha psllw xmm7, 8 - mov eax, [esp + 4] // src_rgb565 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_rgb565 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of bgr565 + movdqu xmm0, [eax] // fetch 8 pixels of bgr565 movdqa xmm1, xmm0 movdqa xmm2, xmm0 - pand xmm1, xmm3 // R in upper 5 bits - psllw xmm2, 11 // B in upper 5 bits - pmulhuw xmm1, xmm5 // * (256 + 8) - pmulhuw xmm2, xmm5 // * (256 + 8) + pand xmm1, xmm3 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits + pmulhuw xmm1, xmm5 // * (256 + 8) + pmulhuw xmm2, xmm5 // * (256 + 8) psllw xmm1, 8 - por xmm1, xmm2 // RB - pand xmm0, xmm4 // G in middle 6 bits - pmulhuw xmm0, xmm6 // << 5 * (256 + 4) - por xmm0, xmm7 // AG + por xmm1, xmm2 // RB + pand xmm0, xmm4 // G in middle 6 bits + pmulhuw xmm0, xmm6 // << 5 * (256 + 4) + por xmm0, xmm7 // AG movdqa xmm2, xmm1 punpcklbw xmm1, xmm0 punpckhbw xmm2, xmm0 @@ -516,9 +486,9 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, // v * 256 + v * 8 // v * (256 + 8) // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 -__declspec(naked) -void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, - int width) { +__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, + uint8* dst_argb, + int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits vmovd xmm5, eax @@ -526,32 +496,32 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits vmovd xmm6, eax vbroadcastss ymm6, xmm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red vpsllw ymm3, ymm3, 11 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green vpsllw ymm4, ymm4, 10 vpsrlw ymm4, ymm4, 5 - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha vpsllw ymm7, ymm7, 8 - mov eax, [esp + 4] // src_rgb565 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_rgb565 + mov edx, [esp + 8] // 
dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 - vpand ymm1, ymm0, ymm3 // R in upper 5 bits - vpsllw ymm2, ymm0, 11 // B in upper 5 bits - vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) - vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) + vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 + vpand ymm1, ymm0, ymm3 // R in upper 5 bits + vpsllw ymm2, ymm0, 11 // B in upper 5 bits + vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) + vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) vpsllw ymm1, ymm1, 8 - vpor ymm1, ymm1, ymm2 // RB - vpand ymm0, ymm0, ymm4 // G in middle 6 bits - vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) - vpor ymm0, ymm0, ymm7 // AG - vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpor ymm1, ymm1, ymm2 // RB + vpand ymm0, ymm0, ymm4 // G in middle 6 bits + vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) + vpor ymm0, ymm0, ymm7 // AG + vpermq ymm0, ymm0, 0xd8 // mutate for unpack vpermq ymm1, ymm1, 0xd8 vpunpckhbw ymm2, ymm1, ymm0 vpunpcklbw ymm1, ymm1, ymm0 @@ -567,9 +537,9 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, #endif // HAS_RGB565TOARGBROW_AVX2 #ifdef HAS_ARGB1555TOARGBROW_AVX2 -__declspec(naked) -void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, - int width) { +__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, + uint8* dst_argb, + int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits vmovd xmm5, eax @@ -577,33 +547,33 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits vmovd xmm6, eax vbroadcastss ymm6, xmm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red vpsllw ymm3, ymm3, 11 - vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha + vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha vpsllw ymm7, ymm7, 8 - mov eax, [esp + 4] // src_argb1555 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 - vpsllw ymm1, ymm0, 1 // R in upper 5 bits - vpsllw ymm2, ymm0, 11 // B in upper 5 bits + vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 + vpsllw ymm1, ymm0, 1 // R in upper 5 bits + vpsllw ymm2, ymm0, 11 // B in upper 5 bits vpand ymm1, ymm1, ymm3 - vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) - vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) + vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) + vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) vpsllw ymm1, ymm1, 8 - vpor ymm1, ymm1, ymm2 // RB - vpsraw ymm2, ymm0, 8 // A - vpand ymm0, ymm0, ymm4 // G in middle 5 bits - vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) + vpor ymm1, ymm1, ymm2 // RB + vpsraw ymm2, ymm0, 8 // A + vpand ymm0, ymm0, ymm4 // G in middle 5 bits + vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) vpand ymm2, ymm2, ymm7 - vpor ymm0, ymm0, ymm2 // AG - vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpor ymm0, ymm0, ymm2 // AG + vpermq ymm0, ymm0, 0xd8 // mutate for unpack vpermq ymm1, ymm1, 0xd8 vpunpckhbw ymm2, ymm1, ymm0 vpunpcklbw ymm1, ymm1, ymm0 @@ -619,29 +589,29 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, #endif // HAS_ARGB1555TOARGBROW_AVX2 #ifdef HAS_ARGB4444TOARGBROW_AVX2 
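The 0x0108 and 0x2080 "repeat bits" multipliers used by the RGB565 and ARGB1555 rows above are the classic bit-replication expansion; a scalar sketch of what pmulhuw computes once a channel sits in the top of its 16-bit lane:

  // 5-bit channel to 8 bits: the top bits are recycled into the low bits.
  static inline uint8 Expand5(uint8 v) { return (uint8)((v << 3) | (v >> 2)); }
  // 6-bit green, multiplier 0x2080: shift by 2 and repeat the top 2 bits.
  static inline uint8 Expand6(uint8 v) { return (uint8)((v << 2) | (v >> 4)); }

ARGB4444ToARGBRow_AVX2 below gets away with the simpler nibble form, v | (v << 4).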
-__declspec(naked) -void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, - int width) { +__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, + uint8* dst_argb, + int width) { __asm { mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f vmovd xmm4, eax vbroadcastss ymm4, xmm4 - vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles - mov eax, [esp + 4] // src_argb4444 - mov edx, [esp + 8] // dst_argb + vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles + mov eax, [esp + 4] // src_argb4444 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 - vpand ymm2, ymm0, ymm5 // mask high nibbles - vpand ymm0, ymm0, ymm4 // mask low nibbles + vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 + vpand ymm2, ymm0, ymm5 // mask high nibbles + vpand ymm0, ymm0, ymm4 // mask low nibbles vpsrlw ymm3, ymm2, 4 vpsllw ymm1, ymm0, 4 vpor ymm2, ymm2, ymm3 vpor ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpermq ymm0, ymm0, 0xd8 // mutate for unpack vpermq ymm2, ymm2, 0xd8 vpunpckhbw ymm1, ymm0, ymm2 vpunpcklbw ymm0, ymm0, ymm2 @@ -657,9 +627,9 @@ void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, #endif // HAS_ARGB4444TOARGBROW_AVX2 // 24 instructions -__declspec(naked) -void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, - int width) { +__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, + uint8* dst_argb, + int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits movd xmm5, eax @@ -667,36 +637,36 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits movd xmm6, eax pshufd xmm6, xmm6, 0 - pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red psllw xmm3, 11 - movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green + movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green psrlw xmm4, 6 - pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha psllw xmm7, 8 - mov eax, [esp + 4] // src_argb1555 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of 1555 + movdqu xmm0, [eax] // fetch 8 pixels of 1555 movdqa xmm1, xmm0 movdqa xmm2, xmm0 - psllw xmm1, 1 // R in upper 5 bits - psllw xmm2, 11 // B in upper 5 bits + psllw xmm1, 1 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits pand xmm1, xmm3 - pmulhuw xmm2, xmm5 // * (256 + 8) - pmulhuw xmm1, xmm5 // * (256 + 8) + pmulhuw xmm2, xmm5 // * (256 + 8) + pmulhuw xmm1, xmm5 // * (256 + 8) psllw xmm1, 8 - por xmm1, xmm2 // RB + por xmm1, xmm2 // RB movdqa xmm2, xmm0 - pand xmm0, xmm4 // G in middle 5 bits - psraw xmm2, 8 // A - pmulhuw xmm0, xmm6 // << 6 * (256 + 8) + pand xmm0, xmm4 // G in middle 5 bits + psraw xmm2, 8 // A + pmulhuw xmm0, xmm6 // << 6 * (256 + 8) pand xmm2, xmm7 - por xmm0, xmm2 // AG + por xmm0, xmm2 // AG movdqa xmm2, xmm1 punpcklbw xmm1, xmm0 punpckhbw xmm2, xmm0 @@ -710,26 +680,26 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, } // 18 instructions. 
-__declspec(naked) -void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, - int width) { +__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, + uint8* dst_argb, + int width) { __asm { mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f movd xmm4, eax pshufd xmm4, xmm4, 0 - movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles + movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles pslld xmm5, 4 - mov eax, [esp + 4] // src_argb4444 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb4444 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 + movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 movdqa xmm2, xmm0 - pand xmm0, xmm4 // mask low nibbles - pand xmm2, xmm5 // mask high nibbles + pand xmm0, xmm4 // mask low nibbles + pand xmm2, xmm5 // mask high nibbles movdqa xmm1, xmm0 movdqa xmm3, xmm2 psllw xmm1, 4 @@ -748,37 +718,38 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, } } -__declspec(naked) -void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8* src_argb, + uint8* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 convertloop: - movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm0, [eax] // fetch 16 pixels of argb movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] movdqu xmm3, [eax + 48] lea eax, [eax + 64] - pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB pshufb xmm1, xmm6 pshufb xmm2, xmm6 pshufb xmm3, xmm6 - movdqa xmm4, xmm1 // 4 bytes from 1 for 0 - psrldq xmm1, 4 // 8 bytes from 1 - pslldq xmm4, 12 // 4 bytes from 1 for 0 - movdqa xmm5, xmm2 // 8 bytes from 2 for 1 - por xmm0, xmm4 // 4 bytes from 1 for 0 - pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 movdqu [edx], xmm0 // store 0 - por xmm1, xmm5 // 8 bytes from 2 for 1 - psrldq xmm2, 8 // 4 bytes from 2 - pslldq xmm3, 4 // 12 bytes from 3 for 2 - por xmm2, xmm3 // 12 bytes from 3 for 2 - movdqu [edx + 16], xmm1 // store 1 - movdqu [edx + 32], xmm2 // store 2 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 lea edx, [edx + 48] sub ecx, 16 jg convertloop @@ -786,37 +757,38 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) { } } -__declspec(naked) -void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8* src_argb, + uint8* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW convertloop: - movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm0, [eax] // fetch 16 pixels of argb movdqu xmm1, [eax + 16] movdqu xmm2, [eax 
+ 32] movdqu xmm3, [eax + 48] lea eax, [eax + 64] - pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB pshufb xmm1, xmm6 pshufb xmm2, xmm6 pshufb xmm3, xmm6 - movdqa xmm4, xmm1 // 4 bytes from 1 for 0 - psrldq xmm1, 4 // 8 bytes from 1 - pslldq xmm4, 12 // 4 bytes from 1 for 0 - movdqa xmm5, xmm2 // 8 bytes from 2 for 1 - por xmm0, xmm4 // 4 bytes from 1 for 0 - pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 movdqu [edx], xmm0 // store 0 - por xmm1, xmm5 // 8 bytes from 2 for 1 - psrldq xmm2, 8 // 4 bytes from 2 - pslldq xmm3, 4 // 12 bytes from 3 for 2 - por xmm2, xmm3 // 12 bytes from 3 for 2 - movdqu [edx + 16], xmm1 // store 1 - movdqu [edx + 32], xmm2 // store 2 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 lea edx, [edx + 48] sub ecx, 16 jg convertloop @@ -824,33 +796,34 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) { } } -__declspec(naked) -void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8* src_argb, + uint8* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width - pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f psrld xmm3, 27 - pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 psrld xmm4, 26 pslld xmm4, 5 - pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 pslld xmm5, 11 convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - pslld xmm0, 8 // R - psrld xmm1, 3 // B - psrld xmm2, 5 // G - psrad xmm0, 16 // R - pand xmm1, xmm3 // B - pand xmm2, xmm4 // G - pand xmm0, xmm5 // R - por xmm1, xmm2 // BG - por xmm0, xmm1 // BGR + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR packssdw xmm0, xmm0 lea eax, [eax + 16] movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 @@ -861,41 +834,42 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { } } -__declspec(naked) -void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) { +__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, + uint8* dst_rgb, + const uint32 dither4, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - movd xmm6, [esp + 12] // dither4 + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + movd xmm6, [esp + 12] // dither4 mov ecx, [esp + 16] // width - punpcklbw xmm6, xmm6 // make dither 16 bytes + punpcklbw xmm6, xmm6 // make dither 16 bytes movdqa xmm7, xmm6 punpcklwd xmm6, xmm6 punpckhwd xmm7, xmm7 - 
pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f psrld xmm3, 27 - pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 psrld xmm4, 26 pslld xmm4, 5 - pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 pslld xmm5, 11 convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - paddusb xmm0, xmm6 // add dither - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - pslld xmm0, 8 // R - psrld xmm1, 3 // B - psrld xmm2, 5 // G - psrad xmm0, 16 // R - pand xmm1, xmm3 // B - pand xmm2, xmm4 // G - pand xmm0, xmm5 // R - por xmm1, xmm2 // BG - por xmm0, xmm1 // BGR + movdqu xmm0, [eax] // fetch 4 pixels of argb + paddusb xmm0, xmm6 // add dither + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR packssdw xmm0, xmm0 lea eax, [eax + 16] movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 @@ -907,39 +881,40 @@ void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, } #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 -__declspec(naked) -void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) { +__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, + uint8* dst_rgb, + const uint32 dither4, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb vbroadcastss xmm6, [esp + 12] // dither4 - mov ecx, [esp + 16] // width - vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes + mov ecx, [esp + 16] // width + vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes vpermq ymm6, ymm6, 0xd8 vpunpcklwd ymm6, ymm6, ymm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f vpsrld ymm3, ymm3, 27 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 vpsrld ymm4, ymm4, 26 vpslld ymm4, ymm4, 5 - vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 + vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpaddusb ymm0, ymm0, ymm6 // add dither - vpsrld ymm2, ymm0, 5 // G - vpsrld ymm1, ymm0, 3 // B - vpsrld ymm0, ymm0, 8 // R - vpand ymm2, ymm2, ymm4 // G - vpand ymm1, ymm1, ymm3 // B - vpand ymm0, ymm0, ymm5 // R - vpor ymm1, ymm1, ymm2 // BG - vpor ymm0, ymm0, ymm1 // BGR + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpaddusb ymm0, ymm0, ymm6 // add dither + vpsrld ymm2, ymm0, 5 // G + vpsrld ymm1, ymm0, 3 // B + vpsrld ymm0, ymm0, 8 // R + vpand ymm2, ymm2, ymm4 // G + vpand ymm1, ymm1, ymm3 // B + vpand ymm0, ymm0, ymm5 // R + vpor ymm1, ymm1, ymm2 // BG + vpor ymm0, ymm0, ymm1 // BGR vpackusdw ymm0, ymm0, ymm0 vpermq ymm0, ymm0, 0xd8 lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of RGB565 + vmovdqu [edx], xmm0 // store 8 pixels of RGB565 lea edx, [edx + 16] sub ecx, 8 jg convertloop @@ -950,37 +925,38 @@ void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, #endif // HAS_ARGBTORGB565DITHERROW_AVX2 // TODO(fbarchard): Improve sign extension/packing. 
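An aside on the two dither rows above: dither4 packs four per-pixel offsets into one 32-bit value, and paddusb/vpaddusb adds them with unsigned saturation before the 5-6-5 pack. A minimal scalar sketch of the same arithmetic (illustrative names, not the shipped C fallback; the 4-pixel dither cycle is an assumption drawn from how dither4 is broadcast):

#include <stdint.h>

static inline uint8_t SaturatedAddU8(int v, int d) {
  int sum = v + d;
  return (uint8_t)(sum > 255 ? 255 : sum);  // mirrors paddusb
}

// Hypothetical scalar equivalent of ARGBToRGB565DitherRow.
void ARGBToRGB565Dither_Sketch(const uint8_t* src_argb, uint8_t* dst_rgb,
                               uint32_t dither4, int width) {
  for (int x = 0; x < width; ++x) {
    int d = (dither4 >> ((x & 3) * 8)) & 0xff;  // one dither byte per pixel
    uint16_t b = SaturatedAddU8(src_argb[0], d) >> 3;  // keep 5 bits of B
    uint16_t g = SaturatedAddU8(src_argb[1], d) >> 2;  // keep 6 bits of G
    uint16_t r = SaturatedAddU8(src_argb[2], d) >> 3;  // keep 5 bits of R
    uint16_t rgb565 = (uint16_t)(b | (g << 5) | (r << 11));
    dst_rgb[0] = (uint8_t)(rgb565 & 0xff);
    dst_rgb[1] = (uint8_t)(rgb565 >> 8);
    src_argb += 4;
    dst_rgb += 2;
  }
}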
-__declspec(naked) -void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8* src_argb, + uint8* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width - pcmpeqb xmm4, xmm4 // generate mask 0x0000001f + pcmpeqb xmm4, xmm4 // generate mask 0x0000001f psrld xmm4, 27 - movdqa xmm5, xmm4 // generate mask 0x000003e0 + movdqa xmm5, xmm4 // generate mask 0x000003e0 pslld xmm5, 5 - movdqa xmm6, xmm4 // generate mask 0x00007c00 + movdqa xmm6, xmm4 // generate mask 0x00007c00 pslld xmm6, 10 - pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 + pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 pslld xmm7, 15 convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - movdqa xmm3, xmm0 // R - psrad xmm0, 16 // A - psrld xmm1, 3 // B - psrld xmm2, 6 // G - psrld xmm3, 9 // R - pand xmm0, xmm7 // A - pand xmm1, xmm4 // B - pand xmm2, xmm5 // G - pand xmm3, xmm6 // R - por xmm0, xmm1 // BA - por xmm2, xmm3 // GR - por xmm0, xmm2 // BGRA + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + movdqa xmm3, xmm0 // R + psrad xmm0, 16 // A + psrld xmm1, 3 // B + psrld xmm2, 6 // G + psrld xmm3, 9 // R + pand xmm0, xmm7 // A + pand xmm1, xmm4 // B + pand xmm2, xmm5 // G + pand xmm3, xmm6 // R + por xmm0, xmm1 // BA + por xmm2, xmm3 // GR + por xmm0, xmm2 // BGRA packssdw xmm0, xmm0 lea eax, [eax + 16] movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 @@ -991,22 +967,23 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { } } -__declspec(naked) -void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8* src_argb, + uint8* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width - pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 + pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 psllw xmm4, 12 - movdqa xmm3, xmm4 // generate mask 0x00f000f0 + movdqa xmm3, xmm4 // generate mask 0x00f000f0 psrlw xmm3, 8 convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqu xmm0, [eax] // fetch 4 pixels of argb movdqa xmm1, xmm0 - pand xmm0, xmm3 // low nibble - pand xmm1, xmm4 // high nibble + pand xmm0, xmm3 // low nibble + pand xmm1, xmm4 // high nibble psrld xmm0, 4 psrld xmm1, 8 por xmm0, xmm1 @@ -1021,33 +998,34 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { } #ifdef HAS_ARGBTORGB565ROW_AVX2 -__declspec(naked) -void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8* src_argb, + uint8* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f vpsrld ymm3, ymm3, 27 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 vpsrld ymm4, ymm4, 26 vpslld ymm4, ymm4, 5 - vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 + vpslld 
ymm5, ymm3, 11 // generate mask 0x0000f800 convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpsrld ymm2, ymm0, 5 // G - vpsrld ymm1, ymm0, 3 // B - vpsrld ymm0, ymm0, 8 // R - vpand ymm2, ymm2, ymm4 // G - vpand ymm1, ymm1, ymm3 // B - vpand ymm0, ymm0, ymm5 // R - vpor ymm1, ymm1, ymm2 // BG - vpor ymm0, ymm0, ymm1 // BGR + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpsrld ymm2, ymm0, 5 // G + vpsrld ymm1, ymm0, 3 // B + vpsrld ymm0, ymm0, 8 // R + vpand ymm2, ymm2, ymm4 // G + vpand ymm1, ymm1, ymm3 // B + vpand ymm0, ymm0, ymm5 // R + vpor ymm1, ymm1, ymm2 // BG + vpor ymm0, ymm0, ymm1 // BGR vpackusdw ymm0, ymm0, ymm0 vpermq ymm0, ymm0, 0xd8 lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of RGB565 + vmovdqu [edx], xmm0 // store 8 pixels of RGB565 lea edx, [edx + 16] sub ecx, 8 jg convertloop @@ -1058,36 +1036,37 @@ void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { #endif // HAS_ARGBTORGB565ROW_AVX2 #ifdef HAS_ARGBTOARGB1555ROW_AVX2 -__declspec(naked) -void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8* src_argb, + uint8* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width vpcmpeqb ymm4, ymm4, ymm4 - vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f - vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 - vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 + vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f + vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 + vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 vpslld ymm7, ymm7, 15 convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpsrld ymm3, ymm0, 9 // R - vpsrld ymm2, ymm0, 6 // G - vpsrld ymm1, ymm0, 3 // B - vpsrad ymm0, ymm0, 16 // A - vpand ymm3, ymm3, ymm6 // R - vpand ymm2, ymm2, ymm5 // G - vpand ymm1, ymm1, ymm4 // B - vpand ymm0, ymm0, ymm7 // A - vpor ymm0, ymm0, ymm1 // BA - vpor ymm2, ymm2, ymm3 // GR - vpor ymm0, ymm0, ymm2 // BGRA + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpsrld ymm3, ymm0, 9 // R + vpsrld ymm2, ymm0, 6 // G + vpsrld ymm1, ymm0, 3 // B + vpsrad ymm0, ymm0, 16 // A + vpand ymm3, ymm3, ymm6 // R + vpand ymm2, ymm2, ymm5 // G + vpand ymm1, ymm1, ymm4 // B + vpand ymm0, ymm0, ymm7 // A + vpor ymm0, ymm0, ymm1 // BA + vpor ymm2, ymm2, ymm3 // GR + vpor ymm0, ymm0, ymm2 // BGRA vpackssdw ymm0, ymm0, ymm0 vpermq ymm0, ymm0, 0xd8 lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555 + vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555 lea edx, [edx + 16] sub ecx, 8 jg convertloop @@ -1098,27 +1077,28 @@ void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { #endif // HAS_ARGBTOARGB1555ROW_AVX2 #ifdef HAS_ARGBTOARGB4444ROW_AVX2 -__declspec(naked) -void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8* src_argb, + uint8* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 vpsllw ymm4, ymm4, 12 - vpsrlw 
ymm3, ymm4, 8 // generate mask 0x00f000f0 + vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpand ymm1, ymm0, ymm4 // high nibble - vpand ymm0, ymm0, ymm3 // low nibble + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpand ymm1, ymm0, ymm4 // high nibble + vpand ymm0, ymm0, ymm3 // low nibble vpsrld ymm1, ymm1, 8 vpsrld ymm0, ymm0, 4 vpor ymm0, ymm0, ymm1 vpackuswb ymm0, ymm0, ymm0 vpermq ymm0, ymm0, 0xd8 lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 + vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 lea edx, [edx + 16] sub ecx, 8 jg convertloop @@ -1129,12 +1109,13 @@ void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { #endif // HAS_ARGBTOARGB4444ROW_AVX2 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. -__declspec(naked) -void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void ARGBToYRow_SSSE3(const uint8* src_argb, + uint8* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kARGBToY movdqa xmm5, xmmword ptr kAddY16 @@ -1164,12 +1145,13 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. -__declspec(naked) -void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void ARGBToYJRow_SSSE3(const uint8* src_argb, + uint8* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kARGBToYJ movdqa xmm5, xmmword ptr kAddYJ64 @@ -1200,17 +1182,16 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { #ifdef HAS_ARGBTOYROW_AVX2 // vpermd for vphaddw + vpackuswb vpermd. -static const lvec32 kPermdARGBToY_AVX = { - 0, 4, 1, 5, 2, 6, 3, 7 -}; +static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; // Convert 32 ARGB pixels (128 bytes) to 32 Y values. -__declspec(naked) -void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void ARGBToYRow_AVX2(const uint8* src_argb, + uint8* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ vbroadcastf128 ymm4, xmmword ptr kARGBToY vbroadcastf128 ymm5, xmmword ptr kAddY16 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX @@ -1244,12 +1225,13 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { #ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
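ARGBToYRow above and the YJ variant that follows both reduce 4 bytes of ARGB to one luma byte with pmaddubsw against a coefficient vector. As a rough scalar model (BT.601 studio-swing weights are assumed here for illustration; the SIMD path folds the offset and rounding into kAddY16/kAddYJ64 and a 7-bit shift):

#include <stdint.h>

// Hypothetical scalar model of ARGBToYRow: 0x1080 is (16 << 8) + 128,
// i.e. the +16 luma offset plus one half for rounding.
void ARGBToY_Sketch(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  for (int x = 0; x < width; ++x) {
    int b = src_argb[0], g = src_argb[1], r = src_argb[2];  // alpha ignored
    dst_y[x] = (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
    src_argb += 4;
  }
}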
-__declspec(naked) -void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void ARGBToYJRow_AVX2(const uint8* src_argb, + uint8* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ vbroadcastf128 ymm4, xmmword ptr kARGBToYJ vbroadcastf128 ymm5, xmmword ptr kAddYJ64 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX @@ -1283,12 +1265,13 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { } #endif // HAS_ARGBTOYJROW_AVX2 -__declspec(naked) -void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void BGRAToYRow_SSSE3(const uint8* src_argb, + uint8* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kBGRAToY movdqa xmm5, xmmword ptr kAddY16 @@ -1316,12 +1299,13 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { } } -__declspec(naked) -void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void ABGRToYRow_SSSE3(const uint8* src_argb, + uint8* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kABGRToY movdqa xmm5, xmmword ptr kAddY16 @@ -1349,12 +1333,13 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { } } -__declspec(naked) -void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void RGBAToYRow_SSSE3(const uint8* src_argb, + uint8* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kRGBAToY movdqa xmm5, xmmword ptr kAddY16 @@ -1382,24 +1367,26 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { } } -__declspec(naked) -void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width movdqa xmm5, xmmword ptr kAddUV128 movdqa xmm6, xmmword ptr kARGBToV movdqa xmm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ + /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 @@ -1437,11 +1424,11 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, psraw xmm0, 8 psraw xmm1, 8 packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned + paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - 
movhps qword ptr [edx + edi], xmm0 // V + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 jg convertloop @@ -1452,24 +1439,26 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } } -__declspec(naked) -void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width movdqa xmm5, xmmword ptr kAddUVJ128 movdqa xmm6, xmmword ptr kARGBToVJ movdqa xmm7, xmmword ptr kARGBToUJ - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ + /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 @@ -1511,8 +1500,8 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, packsswb xmm0, xmm1 // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 jg convertloop @@ -1524,24 +1513,26 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } #ifdef HAS_ARGBTOUVROW_AVX2 -__declspec(naked) -void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width vbroadcastf128 ymm5, xmmword ptr kAddUV128 vbroadcastf128 ymm6, xmmword ptr kARGBToV vbroadcastf128 ymm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 32x2 argb pixels to 16x1 */ + /* step 1 - subsample 32x2 argb pixels to 16x1 */ vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] vmovdqu ymm2, [eax + 64] @@ -1575,8 +1566,8 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, vpaddb ymm0, ymm0, ymm5 // -> unsigned // step 3 - store 16 U and 16 V values - vextractf128 [edx], ymm0, 0 // U - vextractf128 [edx + edi], ymm0, 1 // V + vextractf128 [edx], ymm0, 0 // U + vextractf128 [edx + edi], ymm0, 1 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -1590,24 +1581,26 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, #endif // HAS_ARGBTOUVROW_AVX2 #ifdef HAS_ARGBTOUVJROW_AVX2 -__declspec(naked) -void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // 
src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width vbroadcastf128 ymm5, xmmword ptr kAddUV128 vbroadcastf128 ymm6, xmmword ptr kARGBToV vbroadcastf128 ymm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 32x2 argb pixels to 16x1 */ + /* step 1 - subsample 32x2 argb pixels to 16x1 */ vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] vmovdqu ymm2, [eax + 64] @@ -1642,8 +1635,8 @@ void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw // step 3 - store 16 U and 16 V values - vextractf128 [edx], ymm0, 0 // U - vextractf128 [edx + edi], ymm0, 1 // V + vextractf128 [edx], ymm0, 0 // U + vextractf128 [edx + edi], ymm0, 1 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -1656,23 +1649,24 @@ void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, } #endif // HAS_ARGBTOUVJROW_AVX2 -__declspec(naked) -void ARGBToUV444Row_SSSE3(const uint8* src_argb0, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8* src_argb0, + uint8* dst_u, + uint8* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_argb - mov edx, [esp + 4 + 8] // dst_u + mov eax, [esp + 4 + 4] // src_argb + mov edx, [esp + 4 + 8] // dst_u mov edi, [esp + 4 + 12] // dst_v mov ecx, [esp + 4 + 16] // width movdqa xmm5, xmmword ptr kAddUV128 movdqa xmm6, xmmword ptr kARGBToV movdqa xmm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* convert to U and V */ - movdqu xmm0, [eax] // U + /* convert to U and V */ + movdqu xmm0, [eax] // U movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] movdqu xmm3, [eax + 48] @@ -1688,7 +1682,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, paddb xmm0, xmm5 movdqu [edx], xmm0 - movdqu xmm0, [eax] // V + movdqu xmm0, [eax] // V movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] movdqu xmm3, [eax + 48] @@ -1713,24 +1707,26 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, } } -__declspec(naked) -void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width movdqa xmm5, xmmword ptr kAddUV128 movdqa xmm6, xmmword ptr kBGRAToV movdqa xmm7, xmmword ptr kBGRAToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ + /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 @@ -1768,11 +1764,11 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, psraw xmm0, 8 psraw xmm1, 8 packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned + paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 jg convertloop @@ -1783,24 +1779,26 @@ void 
BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } } -__declspec(naked) -void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width movdqa xmm5, xmmword ptr kAddUV128 movdqa xmm6, xmmword ptr kABGRToV movdqa xmm7, xmmword ptr kABGRToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ + /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 @@ -1838,11 +1836,11 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, psraw xmm0, 8 psraw xmm1, 8 packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned + paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 jg convertloop @@ -1853,24 +1851,26 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } } -__declspec(naked) -void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0, + int src_stride_argb, + uint8* dst_u, + uint8* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width movdqa xmm5, xmmword ptr kAddUV128 movdqa xmm6, xmmword ptr kRGBAToV movdqa xmm7, xmmword ptr kRGBAToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ + /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 @@ -1908,11 +1908,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, psraw xmm0, 8 psraw xmm1, 8 packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned + paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 jg convertloop @@ -1925,109 +1925,95 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, #endif // HAS_ARGBTOYROW_SSSE3 // Read 16 UV from 444 -#define READYUV444_AVX2 __asm { \ - __asm vmovdqu xmm0, [esi] /* U */ \ - __asm vmovdqu xmm1, [esi + edi] /* V */ \ +#define READYUV444_AVX2 \ + __asm { \ + __asm vmovdqu xmm0, [esi] /* U */ \ + __asm vmovdqu xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 16] \ __asm vpermq ymm0, ymm0, 0xd8 \ __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ 
__asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Read 8 UV from 422, upsample to 16 UV. -#define READYUV422_AVX2 __asm { \ - __asm vmovq xmm0, qword ptr [esi] /* U */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ +#define READYUV422_AVX2 \ + __asm { \ + __asm vmovq xmm0, qword ptr [esi] /* U */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. -#define READYUVA422_AVX2 __asm { \ - __asm vmovq xmm0, qword ptr [esi] /* U */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ +#define READYUVA422_AVX2 \ + __asm { \ + __asm vmovq xmm0, qword ptr [esi] /* U */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16] \ - __asm vmovdqu xmm5, [ebp] /* A */ \ + __asm vmovdqu xmm5, [ebp] /* A */ \ __asm vpermq ymm5, ymm5, 0xd8 \ - __asm lea ebp, [ebp + 16] \ - } - -// Read 4 UV from 411, upsample to 16 UV. -#define READYUV411_AVX2 __asm { \ - __asm vmovd xmm0, dword ptr [esi] /* U */ \ - __asm vmovd xmm1, dword ptr [esi + edi] /* V */ \ - __asm lea esi, [esi + 4] \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - } + __asm lea ebp, [ebp + 16]} // Read 8 UV from NV12, upsample to 16 UV. -#define READNV12_AVX2 __asm { \ - __asm vmovdqu xmm0, [esi] /* UV */ \ +#define READNV12_AVX2 \ + __asm { \ + __asm vmovdqu xmm0, [esi] /* UV */ \ __asm lea esi, [esi + 16] \ __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Read 8 UV from NV21, upsample to 16 UV. -#define READNV21_AVX2 __asm { \ - __asm vmovdqu xmm0, [esi] /* UV */ \ +#define READNV21_AVX2 \ + __asm { \ + __asm vmovdqu xmm0, [esi] /* UV */ \ __asm lea esi, [esi + 16] \ __asm vpermq ymm0, ymm0, 0xd8 \ __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. 
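YUY2 packs two pixels into four bytes, Y0 U Y1 V; the macro below splits that with two pshufb masks, one gathering the Y bytes and one repeating each U,V pair so it covers both pixels. In scalar terms (a sketch with illustrative names):

#include <stdint.h>

void YUY2Split_Sketch(const uint8_t* yuy2, uint8_t* y, uint8_t* uv,
                      int pairs) {
  for (int i = 0; i < pairs; ++i) {  // one pair = 2 pixels = 4 source bytes
    y[0] = yuy2[0];                  // Y0
    y[1] = yuy2[2];                  // Y1
    uv[0] = uv[2] = yuy2[1];         // U, duplicated for both pixels
    uv[1] = uv[3] = yuy2[3];         // V, duplicated for both pixels
    yuy2 += 4;
    y += 2;
    uv += 4;
  }
}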
-#define READYUY2_AVX2 __asm { \ - __asm vmovdqu ymm4, [eax] /* YUY2 */ \ +#define READYUY2_AVX2 \ + __asm { \ + __asm vmovdqu ymm4, [eax] /* YUY2 */ \ __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \ - __asm vmovdqu ymm0, [eax] /* UV */ \ + __asm vmovdqu ymm0, [eax] /* UV */ \ __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \ - __asm lea eax, [eax + 32] \ - } + __asm lea eax, [eax + 32]} // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. -#define READUYVY_AVX2 __asm { \ - __asm vmovdqu ymm4, [eax] /* UYVY */ \ +#define READUYVY_AVX2 \ + __asm { \ + __asm vmovdqu ymm4, [eax] /* UYVY */ \ __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \ - __asm vmovdqu ymm0, [eax] /* UV */ \ + __asm vmovdqu ymm0, [eax] /* UV */ \ __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \ - __asm lea eax, [eax + 32] \ - } + __asm lea eax, [eax + 32]} // Convert 16 pixels: 16 UV and 16 Y. -#define YUVTORGB_AVX2(YuvConstants) __asm { \ +#define YUVTORGB_AVX2(YuvConstants) \ + __asm { \ __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\ __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\ __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\ @@ -2036,68 +2022,67 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \ __asm vpsubw ymm1, ymm3, ymm1 \ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \ - __asm vpsubw ymm0, ymm3, ymm0 \ - /* Step 2: Find Y contribution to 16 R,G,B values */ \ + __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \ __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \ - __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \ - __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \ - __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \ + __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \ + __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \ + __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \ __asm vpsraw ymm0, ymm0, 6 \ __asm vpsraw ymm1, ymm1, 6 \ __asm vpsraw ymm2, ymm2, 6 \ - __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \ - __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \ - __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \ + __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \ + __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \ + __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \ } // Store 16 ARGB values. -#define STOREARGB_AVX2 __asm { \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ +#define STOREARGB_AVX2 \ + __asm { \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ + __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ __asm vpermq ymm2, ymm2, 0xd8 \ - __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ - __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ + __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ + __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ __asm vmovdqu 0[edx], ymm1 \ __asm vmovdqu 32[edx], ymm0 \ - __asm lea edx, [edx + 64] \ - } + __asm lea edx, [edx + 64]} // Store 16 RGBA values. 
-#define STORERGBA_AVX2 __asm { \ - __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ +#define STORERGBA_AVX2 \ + __asm { \ + __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ + __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ __asm vpermq ymm2, ymm2, 0xd8 \ - __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ - __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ + __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ + __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ __asm vmovdqu [edx], ymm0 \ __asm vmovdqu [edx + 32], ymm1 \ - __asm lea edx, [edx + 64] \ - } + __asm lea edx, [edx + 64]} #ifdef HAS_I422TOARGBROW_AVX2 // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) -void I422ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToARGBRow_AVX2( + const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READYUV422_AVX2 @@ -2119,21 +2104,21 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_I422ALPHATOARGBROW_AVX2 // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. -__declspec(naked) -void I422AlphaToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422AlphaToARGBRow_AVX2( + const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U + mov eax, [esp + 16 + 4] // Y + mov esi, [esp + 16 + 8] // U mov edi, [esp + 16 + 12] // V mov ebp, [esp + 16 + 16] // A mov edx, [esp + 16 + 20] // argb @@ -2162,25 +2147,25 @@ void I422AlphaToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_I444TOARGBROW_AVX2 // 16 pixels // 16 UV values with 16 Y producing 16 ARGB (64 bytes). 
-__declspec(naked) -void I444ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I444ToARGBRow_AVX2( + const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READYUV444_AVX2 YUVTORGB_AVX2(ebx) @@ -2198,64 +2183,24 @@ void I444ToARGBRow_AVX2(const uint8* y_buf, } #endif // HAS_I444TOARGBROW_AVX2 -#ifdef HAS_I411TOARGBROW_AVX2 -// 16 pixels -// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) -void I411ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // abgr - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READYUV411_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I411TOARGBROW_AVX2 - #ifdef HAS_NV12TOARGBROW_AVX2 // 16 pixels. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) -void NV12ToARGBRow_AVX2(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void NV12ToARGBRow_AVX2( + const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // UV + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // UV mov edx, [esp + 8 + 12] // argb mov ebx, [esp + 8 + 16] // yuvconstants mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READNV12_AVX2 @@ -2276,21 +2221,21 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_NV21TOARGBROW_AVX2 // 16 pixels. // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
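NV12, handled just above, carries a full-resolution Y plane followed by one half-resolution plane of interleaved U,V bytes; NV21, next, is identical except the bytes arrive V first, which READNV21_AVX2 fixes with a pshufb through kShuffleNV21. A sketch of the plane geometry (assumes a tightly packed buffer with even dimensions):

#include <stdint.h>
#include <stddef.h>

struct NV12View {
  const uint8_t* y;   // width x height luma samples
  const uint8_t* uv;  // (width/2) x (height/2) interleaved U,V pairs
};

NV12View MakeNV12View(const uint8_t* buf, int width, int height) {
  NV12View view;
  view.y = buf;
  view.uv = buf + (size_t)width * height;  // UV plane follows the Y plane
  return view;                             // NV21: same layout, V,U order
}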
-__declspec(naked) -void NV21ToARGBRow_AVX2(const uint8* y_buf, - const uint8* vu_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void NV21ToARGBRow_AVX2( + const uint8* y_buf, + const uint8* vu_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // VU + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // VU mov edx, [esp + 8 + 12] // argb mov ebx, [esp + 8 + 16] // yuvconstants mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READNV21_AVX2 @@ -2311,18 +2256,18 @@ void NV21ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_YUY2TOARGBROW_AVX2 // 16 pixels. // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). -__declspec(naked) -void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void YUY2ToARGBRow_AVX2( + const uint8* src_yuy2, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push ebx - mov eax, [esp + 4 + 4] // yuy2 - mov edx, [esp + 4 + 8] // argb + mov eax, [esp + 4 + 4] // yuy2 + mov edx, [esp + 4 + 8] // argb mov ebx, [esp + 4 + 12] // yuvconstants mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READYUY2_AVX2 @@ -2342,18 +2287,18 @@ void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, #ifdef HAS_UYVYTOARGBROW_AVX2 // 16 pixels. // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). -__declspec(naked) -void UYVYToARGBRow_AVX2(const uint8* src_uyvy, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void UYVYToARGBRow_AVX2( + const uint8* src_uyvy, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push ebx - mov eax, [esp + 4 + 4] // uyvy - mov edx, [esp + 4 + 8] // argb + mov eax, [esp + 4 + 4] // uyvy + mov edx, [esp + 4 + 8] // argb mov ebx, [esp + 4 + 12] // yuvconstants mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READUYVY_AVX2 @@ -2373,25 +2318,25 @@ void UYVYToARGBRow_AVX2(const uint8* src_uyvy, #ifdef HAS_I422TORGBAROW_AVX2 // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). 
-__declspec(naked) -void I422ToRGBARow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToRGBARow_AVX2( + const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // abgr mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READYUV422_AVX2 @@ -2415,100 +2360,83 @@ void I422ToRGBARow_AVX2(const uint8* y_buf, // Allows a conversion with half size scaling. // Read 8 UV from 444. -#define READYUV444 __asm { \ +#define READYUV444 \ + __asm { \ __asm movq xmm0, qword ptr [esi] /* U */ \ __asm movq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - } + __asm lea eax, [eax + 8]} // Read 4 UV from 422, upsample to 8 UV. -#define READYUV422 __asm { \ - __asm movd xmm0, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ +#define READYUV422 \ + __asm { \ + __asm movd xmm0, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 4] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - } + __asm lea eax, [eax + 8]} // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 __asm { \ - __asm movd xmm0, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ +#define READYUVA422 \ + __asm { \ + __asm movd xmm0, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 4] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] /* Y */ \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm movq xmm4, qword ptr [eax] /* Y */ \ __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8] \ - __asm movq xmm5, qword ptr [ebp] /* A */ \ - __asm lea ebp, [ebp + 8] \ - } - -// Read 2 UV from 411, upsample to 8 UV. -// drmemory fails with memory fault if pinsrw used. libyuv bug: 525 -// __asm pinsrw xmm0, [esi], 0 /* U */ -// __asm pinsrw xmm1, [esi + edi], 0 /* V */ -#define READYUV411_EBX __asm { \ - __asm movzx ebx, word ptr [esi] /* U */ \ - __asm movd xmm0, ebx \ - __asm movzx ebx, word ptr [esi + edi] /* V */ \ - __asm movd xmm1, ebx \ - __asm lea esi, [esi + 2] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ - __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - } + __asm movq xmm5, qword ptr [ebp] /* A */ \ + __asm lea ebp, [ebp + 8]} // Read 4 UV from NV12, upsample to 8 UV. 
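READYUV422 above and READNV12 below widen half-resolution chroma the same way: after punpcklbw interleaves U with V, punpcklwd duplicates each 16-bit U,V pair so it covers two adjacent pixels. (The removed READYUV411_EBX did this twice over, one pair per four pixels, which is why its deletion retires the I411 paths.) In scalar terms:

#include <stdint.h>

// Sketch of the 4:2:2 to 4:4:4 horizontal upsample done by the read macros.
void UpsampleUV422_Sketch(const uint8_t* u, const uint8_t* v,
                          uint8_t* out_uv, int pairs) {
  for (int i = 0; i < pairs; ++i) {
    out_uv[0] = out_uv[2] = u[i];  // punpcklbw: U next to V...
    out_uv[1] = out_uv[3] = v[i];  // ...punpcklwd: pair repeated for 2 pixels
    out_uv += 4;
  }
}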
-#define READNV12 __asm { \ +#define READNV12 \ + __asm { \ __asm movq xmm0, qword ptr [esi] /* UV */ \ __asm lea esi, [esi + 8] \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - } + __asm lea eax, [eax + 8]} // Read 4 VU from NV21, upsample to 8 UV. -#define READNV21 __asm { \ +#define READNV21 \ + __asm { \ __asm movq xmm0, qword ptr [esi] /* UV */ \ __asm lea esi, [esi + 8] \ __asm pshufb xmm0, xmmword ptr kShuffleNV21 \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - } + __asm lea eax, [eax + 8]} // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV. -#define READYUY2 __asm { \ - __asm movdqu xmm4, [eax] /* YUY2 */ \ +#define READYUY2 \ + __asm { \ + __asm movdqu xmm4, [eax] /* YUY2 */ \ __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \ - __asm movdqu xmm0, [eax] /* UV */ \ + __asm movdqu xmm0, [eax] /* UV */ \ __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV. -#define READUYVY __asm { \ - __asm movdqu xmm4, [eax] /* UYVY */ \ +#define READUYVY \ + __asm { \ + __asm movdqu xmm4, [eax] /* UYVY */ \ __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \ - __asm movdqu xmm0, [eax] /* UV */ \ + __asm movdqu xmm0, [eax] /* UV */ \ __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Convert 8 pixels: 8 UV and 8 Y. -#define YUVTORGB(YuvConstants) __asm { \ +#define YUVTORGB(YuvConstants) \ + __asm { \ __asm movdqa xmm1, xmm0 \ __asm movdqa xmm2, xmm0 \ __asm movdqa xmm3, xmm0 \ @@ -2522,129 +2450,125 @@ void I422ToRGBARow_AVX2(const uint8* y_buf, __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \ __asm psubw xmm2, xmm3 \ __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \ - __asm paddsw xmm0, xmm4 /* B += Y */ \ - __asm paddsw xmm1, xmm4 /* G += Y */ \ - __asm paddsw xmm2, xmm4 /* R += Y */ \ + __asm paddsw xmm0, xmm4 /* B += Y */ \ + __asm paddsw xmm1, xmm4 /* G += Y */ \ + __asm paddsw xmm2, xmm4 /* R += Y */ \ __asm psraw xmm0, 6 \ __asm psraw xmm1, 6 \ __asm psraw xmm2, 6 \ - __asm packuswb xmm0, xmm0 /* B */ \ - __asm packuswb xmm1, xmm1 /* G */ \ - __asm packuswb xmm2, xmm2 /* R */ \ + __asm packuswb xmm0, xmm0 /* B */ \ + __asm packuswb xmm1, xmm1 /* G */ \ + __asm packuswb xmm2, xmm2 /* R */ \ } // Store 8 ARGB values. -#define STOREARGB __asm { \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm5 /* RA */ \ +#define STOREARGB \ + __asm { \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm5 /* RA */ \ __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ + __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ __asm movdqu 0[edx], xmm0 \ __asm movdqu 16[edx], xmm1 \ - __asm lea edx, [edx + 32] \ - } + __asm lea edx, [edx + 32]} // Store 8 BGRA values. 
-#define STOREBGRA __asm { \ - __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ - __asm punpcklbw xmm1, xmm0 /* GB */ \ - __asm punpcklbw xmm5, xmm2 /* AR */ \ +#define STOREBGRA \ + __asm { \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm0 /* GB */ \ + __asm punpcklbw xmm5, xmm2 /* AR */ \ __asm movdqa xmm0, xmm5 \ - __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ - __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ + __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ __asm movdqu 0[edx], xmm5 \ __asm movdqu 16[edx], xmm0 \ - __asm lea edx, [edx + 32] \ - } + __asm lea edx, [edx + 32]} // Store 8 RGBA values. -#define STORERGBA __asm { \ - __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ - __asm punpcklbw xmm1, xmm2 /* GR */ \ - __asm punpcklbw xmm5, xmm0 /* AB */ \ +#define STORERGBA \ + __asm { \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm2 /* GR */ \ + __asm punpcklbw xmm5, xmm0 /* AB */ \ __asm movdqa xmm0, xmm5 \ - __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ - __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ + __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ __asm movdqu 0[edx], xmm5 \ __asm movdqu 16[edx], xmm0 \ - __asm lea edx, [edx + 32] \ - } + __asm lea edx, [edx + 32]} // Store 8 RGB24 values. -#define STORERGB24 __asm { \ - /* Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ +#define STORERGB24 \ + __asm {/* Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ - /* RRGB -> RGB24 */ \ - __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ - __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ - __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ - __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ - __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ - __asm lea edx, [edx + 24] \ - } + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \ + __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ + __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ + __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ + __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ + __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ + __asm lea edx, [edx + 24]} // Store 8 RGB565 values. 
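STORERGB24 above does the alpha-dropping pack with two pshufb masks plus a palignr to stitch the 12-byte partial results; STORERGB565, next, reuses the same weave and then packs 5-6-5 exactly as in the earlier SSE2 row. The RGB24 step in scalar form (sketch):

#include <stdint.h>

void ARGBToRGB24_Sketch(const uint8_t* src_argb, uint8_t* dst_rgb24,
                        int width) {
  for (int x = 0; x < width; ++x) {
    dst_rgb24[0] = src_argb[0];  // B
    dst_rgb24[1] = src_argb[1];  // G
    dst_rgb24[2] = src_argb[2];  // R; src_argb[3] (alpha) is dropped
    src_argb += 4;
    dst_rgb24 += 3;
  }
}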
-#define STORERGB565 __asm { \ - /* Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ +#define STORERGB565 \ + __asm {/* Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ - /* RRGB -> RGB565 */ \ - __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ - __asm movdqa xmm2, xmm0 /* G */ \ - __asm pslld xmm0, 8 /* R */ \ - __asm psrld xmm3, 3 /* B */ \ - __asm psrld xmm2, 5 /* G */ \ - __asm psrad xmm0, 16 /* R */ \ - __asm pand xmm3, xmm5 /* B */ \ - __asm pand xmm2, xmm6 /* G */ \ - __asm pand xmm0, xmm7 /* R */ \ - __asm por xmm3, xmm2 /* BG */ \ - __asm por xmm0, xmm3 /* BGR */ \ - __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ - __asm movdqa xmm2, xmm1 /* G */ \ - __asm pslld xmm1, 8 /* R */ \ - __asm psrld xmm3, 3 /* B */ \ - __asm psrld xmm2, 5 /* G */ \ - __asm psrad xmm1, 16 /* R */ \ - __asm pand xmm3, xmm5 /* B */ \ - __asm pand xmm2, xmm6 /* G */ \ - __asm pand xmm1, xmm7 /* R */ \ - __asm por xmm3, xmm2 /* BG */ \ - __asm por xmm1, xmm3 /* BGR */ \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \ + __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ + __asm movdqa xmm2, xmm0 /* G */ \ + __asm pslld xmm0, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm0, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm0, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm0, xmm3 /* BGR */ \ + __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ + __asm movdqa xmm2, xmm1 /* G */ \ + __asm pslld xmm1, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm1, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm1, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm1, xmm3 /* BGR */ \ __asm packssdw xmm0, xmm1 \ - __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ - __asm lea edx, [edx + 16] \ - } + __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ + __asm lea edx, [edx + 16]} // 8 pixels. // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) -void I444ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I444ToARGBRow_SSSE3( + const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READYUV444 @@ -2663,19 +2587,19 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). 
-__declspec(naked) -void I422ToRGB24Row_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToRGB24Row_SSSE3( + const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants @@ -2701,30 +2625,30 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf, // 8 pixels // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). -__declspec(naked) -void I422ToRGB565Row_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb565_buf, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToRGB565Row_SSSE3( + const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb565_buf, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - pcmpeqb xmm5, xmm5 // generate mask 0x0000001f + pcmpeqb xmm5, xmm5 // generate mask 0x0000001f psrld xmm5, 27 - pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 + pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 psrld xmm6, 26 pslld xmm6, 5 - pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 + pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 pslld xmm7, 11 convertloop: @@ -2744,25 +2668,25 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) -void I422ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToARGBRow_SSSE3( + const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READYUV422 @@ -2781,21 +2705,21 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB. 
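Note: the I422 kernels above and below all share one sampling pattern: each U/V pair serves two horizontally adjacent Y samples (the "4 UV values upsampled to 8 UV" in the comments). A scalar sketch of that pattern; the YuvPixel helper here uses the common BT.601 studio-swing integer constants as an assumption, where the real kernels are parameterized on the YuvConstants argument instead, and both helper names are hypothetical:

#include <stdint.h>

static uint8_t Clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// Approximate YUV -> RGB; the real code reads these factors from
// the YuvConstants tables.
static void YuvPixel(uint8_t y, uint8_t u, uint8_t v,
                     uint8_t* b, uint8_t* g, uint8_t* r) {
  int c = (int)y - 16, d = (int)u - 128, e = (int)v - 128;
  *b = Clamp255((298 * c + 516 * d + 128) >> 8);
  *g = Clamp255((298 * c - 100 * d - 208 * e + 128) >> 8);
  *r = Clamp255((298 * c + 409 * e + 128) >> 8);
}

// 4:2:2 sampling: u_buf/v_buf advance at half the rate of y_buf.
static void I422ToARGBRow_Sketch(const uint8_t* y_buf,
                                 const uint8_t* u_buf,
                                 const uint8_t* v_buf,
                                 uint8_t* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    YuvPixel(y_buf[x], u_buf[x >> 1], v_buf[x >> 1],
             dst_argb + 0, dst_argb + 1, dst_argb + 2);
    dst_argb[3] = 255;  // opaque alpha, as pcmpeqb xmm5 provides above
    dst_argb += 4;
  }
}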
-__declspec(naked) -void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422AlphaToARGBRow_SSSE3( + const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U + mov eax, [esp + 16 + 4] // Y + mov esi, [esp + 16 + 8] // U mov edi, [esp + 16 + 12] // V mov ebp, [esp + 16 + 16] // A mov edx, [esp + 16 + 20] // argb @@ -2820,62 +2744,22 @@ void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, } // 8 pixels. -// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -// Similar to I420 but duplicate UV once more. -__declspec(naked) -void I411ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U - mov edi, [esp + 16 + 12] // V - mov edx, [esp + 16 + 16] // abgr - mov ebp, [esp + 16 + 20] // yuvconstants - mov ecx, [esp + 16 + 24] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READYUV411_EBX - YUVTORGB(ebp) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebp - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) -void NV12ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void NV12ToARGBRow_SSSE3( + const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // UV + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // UV mov edx, [esp + 8 + 12] // argb mov ebx, [esp + 8 + 16] // yuvconstants mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READNV12 @@ -2893,21 +2777,21 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) -void NV21ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* vu_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void NV21ToARGBRow_SSSE3( + const uint8* y_buf, + const uint8* vu_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // VU + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // VU mov edx, [esp + 8 + 12] // argb mov ebx, [esp + 8 + 16] // yuvconstants mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READNV21 @@ -2925,18 +2809,18 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes). 
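Note: the NV12 and NV21 rows above differ only in chroma byte order; the interleaved half-resolution plane is U,V,U,V... for NV12 and V,U,V,U... for NV21 (the "UV" vs "VU" register comments). A scalar sketch reusing the hypothetical YuvPixel helper from the earlier note:

#include <stdint.h>

static void YuvPixel(uint8_t y, uint8_t u, uint8_t v,
                     uint8_t* b, uint8_t* g, uint8_t* r);  // as above

// NV12: uv_buf holds U then V for each pair of pixels; for NV21,
// swap the two indices below.
static void NV12ToARGBRow_Sketch(const uint8_t* y_buf,
                                 const uint8_t* uv_buf,
                                 uint8_t* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8_t u = uv_buf[(x & ~1) + 0];
    uint8_t v = uv_buf[(x & ~1) + 1];
    YuvPixel(y_buf[x], u, v, dst_argb + 0, dst_argb + 1, dst_argb + 2);
    dst_argb[3] = 255;
    dst_argb += 4;
  }
}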
-__declspec(naked) -void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void YUY2ToARGBRow_SSSE3( + const uint8* src_yuy2, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push ebx - mov eax, [esp + 4 + 4] // yuy2 - mov edx, [esp + 4 + 8] // argb + mov eax, [esp + 4 + 4] // yuy2 + mov edx, [esp + 4 + 8] // argb mov ebx, [esp + 4 + 12] // yuvconstants mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READYUY2 @@ -2953,18 +2837,18 @@ void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, // 8 pixels. // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes). -__declspec(naked) -void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void UYVYToARGBRow_SSSE3( + const uint8* src_uyvy, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push ebx - mov eax, [esp + 4 + 4] // uyvy - mov edx, [esp + 4 + 8] // argb + mov eax, [esp + 4 + 4] // uyvy + mov edx, [esp + 4 + 8] // argb mov ebx, [esp + 4 + 12] // yuvconstants mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READUYVY @@ -2979,19 +2863,19 @@ void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, } } -__declspec(naked) -void I422ToRGBARow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToRGBARow_SSSE3( + const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants @@ -3016,39 +2900,38 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf, #ifdef HAS_I400TOARGBROW_SSE2 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). -__declspec(naked) -void I400ToARGBRow_SSE2(const uint8* y_buf, - uint8* rgb_buf, - int width) { +__declspec(naked) void I400ToARGBRow_SSE2(const uint8* y_buf, + uint8* rgb_buf, + int width) { __asm { - mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) + mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) movd xmm2, eax pshufd xmm2, xmm2,0 - mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) + mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) movd xmm3, eax pshufd xmm3, xmm3, 0 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 pslld xmm4, 24 - mov eax, [esp + 4] // Y - mov edx, [esp + 8] // rgb - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // Y + mov edx, [esp + 8] // rgb + mov ecx, [esp + 12] // width convertloop: - // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 + // Step 1: Scale Y contribution to 8 G values. 
G = (y - 16) * 1.164 movq xmm0, qword ptr [eax] lea eax, [eax + 8] - punpcklbw xmm0, xmm0 // Y.Y + punpcklbw xmm0, xmm0 // Y.Y pmulhuw xmm0, xmm2 psubusw xmm0, xmm3 psrlw xmm0, 6 - packuswb xmm0, xmm0 // G + packuswb xmm0, xmm0 // G // Step 2: Weave into ARGB - punpcklbw xmm0, xmm0 // GG + punpcklbw xmm0, xmm0 // GG movdqa xmm1, xmm0 - punpcklwd xmm0, xmm0 // BGRA first 4 pixels - punpckhwd xmm1, xmm1 // BGRA next 4 pixels + punpcklwd xmm0, xmm0 // BGRA first 4 pixels + punpckhwd xmm1, xmm1 // BGRA next 4 pixels por xmm0, xmm4 por xmm1, xmm4 movdqu [edx], xmm0 @@ -3064,41 +2947,40 @@ void I400ToARGBRow_SSE2(const uint8* y_buf, #ifdef HAS_I400TOARGBROW_AVX2 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). // note: vpunpcklbw mutates and vpackuswb unmutates. -__declspec(naked) -void I400ToARGBRow_AVX2(const uint8* y_buf, - uint8* rgb_buf, - int width) { +__declspec(naked) void I400ToARGBRow_AVX2(const uint8* y_buf, + uint8* rgb_buf, + int width) { __asm { - mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) + mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) vmovd xmm2, eax vbroadcastss ymm2, xmm2 - mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) + mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) vmovd xmm3, eax vbroadcastss ymm3, xmm3 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 vpslld ymm4, ymm4, 24 - mov eax, [esp + 4] // Y - mov edx, [esp + 8] // rgb - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // Y + mov edx, [esp + 8] // rgb + mov ecx, [esp + 12] // width convertloop: - // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 + // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 vmovdqu xmm0, [eax] lea eax, [eax + 16] - vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates - vpunpcklbw ymm0, ymm0, ymm0 // Y.Y + vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates + vpunpcklbw ymm0, ymm0, ymm0 // Y.Y vpmulhuw ymm0, ymm0, ymm2 vpsubusw ymm0, ymm0, ymm3 vpsrlw ymm0, ymm0, 6 - vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 + vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 // TODO(fbarchard): Weave alpha with unpack. // Step 2: Weave into ARGB - vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates + vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates vpermq ymm1, ymm1, 0xd8 - vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels - vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels + vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels + vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels vpor ymm0, ymm0, ymm4 vpor ymm1, ymm1, ymm4 vmovdqu [edx], ymm0 @@ -3114,16 +2996,16 @@ void I400ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_MIRRORROW_SSSE3 // Shuffle table for reversing the bytes. -static const uvec8 kShuffleMirror = { - 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u -}; +static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; // TODO(fbarchard): Replace lea with -16 offset.
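Note: the two magic constants in the I400 kernels above pin the math down exactly: with the Y byte duplicated into a 16-bit word (y * 0x101), the pmulhuw by 0x4a35, saturating subtract of 0x0488 and shift right by 6 evaluate G = (y - 16) * 1.164 in fixed point. A scalar restatement (hypothetical helper name):

#include <stdint.h>

// Mirrors the SSE2/AVX2 steps: punpcklbw y,y; pmulhuw 0x4a35;
// psubusw 0x0488; psrlw 6; packuswb.
static uint8_t I400ToGray(uint8_t y) {
  uint32_t yy = (uint32_t)y * 0x0101u;   // duplicate byte into a word
  uint32_t g = (yy * 18997u) >> 16;      // * round(1.164 * 64 * 256) >> 16
  g = (g > 1160u) ? g - 1160u : 0u;      // subtract round(1.164 * 64 * 16)
  g >>= 6;
  return (uint8_t)(g > 255u ? 255u : g); // packuswb saturates
}
// Each ARGB output pixel is then (255, g, g, g).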
-__declspec(naked) -void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { +__declspec(naked) void MirrorRow_SSSE3(const uint8* src, + uint8* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width movdqa xmm5, xmmword ptr kShuffleMirror @@ -3140,11 +3022,10 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { #endif // HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_AVX2 -__declspec(naked) -void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width vbroadcastf128 ymm5, xmmword ptr kShuffleMirror @@ -3164,17 +3045,17 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { #ifdef HAS_MIRRORUVROW_SSSE3 // Shuffle table for reversing the bytes of UV channels. -static const uvec8 kShuffleMirrorUV = { - 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u -}; +static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, + 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; -__declspec(naked) -void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, - int width) { +__declspec(naked) void MirrorUVRow_SSSE3(const uint8* src, + uint8* dst_u, + uint8* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src - mov edx, [esp + 4 + 8] // dst_u + mov eax, [esp + 4 + 4] // src + mov edx, [esp + 4 + 8] // dst_u mov edi, [esp + 4 + 12] // dst_v mov ecx, [esp + 4 + 16] // width movdqa xmm1, xmmword ptr kShuffleMirrorUV @@ -3198,11 +3079,12 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, #endif // HAS_MIRRORUVROW_SSSE3 #ifdef HAS_ARGBMIRRORROW_SSE2 -__declspec(naked) -void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void ARGBMirrorRow_SSE2(const uint8* src, + uint8* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width lea eax, [eax - 16 + ecx * 4] // last 4 pixels. @@ -3221,15 +3103,14 @@ void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBMIRRORROW_AVX2 // Shuffle table for reversing the bytes. 
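Note: MirrorRow above is a pshufb byte reversal, 16 (SSSE3) or 32 (AVX2) bytes per iteration, and MirrorUVRow does the same while de-interleaving U from V. The scalar meaning of the kShuffleMirror table is simply (hypothetical helper name):

#include <stdint.h>

static void MirrorRow_Sketch(const uint8_t* src, uint8_t* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}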
-static const ulvec32 kARGBShuffleMirror_AVX2 = { - 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u -}; +static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; -__declspec(naked) -void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void ARGBMirrorRow_AVX2(const uint8* src, + uint8* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2 @@ -3246,16 +3127,17 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { #endif // HAS_ARGBMIRRORROW_AVX2 #ifdef HAS_SPLITUVROW_SSE2 -__declspec(naked) -void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width) { +__declspec(naked) void SplitUVRow_SSE2(const uint8* src_uv, + uint8* dst_u, + uint8* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -3265,10 +3147,10 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, lea eax, [eax + 32] movdqa xmm2, xmm0 movdqa xmm3, xmm1 - pand xmm0, xmm5 // even bytes + pand xmm0, xmm5 // even bytes pand xmm1, xmm5 packuswb xmm0, xmm1 - psrlw xmm2, 8 // odd bytes + psrlw xmm2, 8 // odd bytes psrlw xmm3, 8 packuswb xmm2, xmm3 movdqu [edx], xmm0 @@ -3285,16 +3167,17 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, #endif // HAS_SPLITUVROW_SSE2 #ifdef HAS_SPLITUVROW_AVX2 -__declspec(naked) -void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width) { +__declspec(naked) void SplitUVRow_AVX2(const uint8* src_uv, + uint8* dst_u, + uint8* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3302,9 +3185,9 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpsrlw ymm2, ymm0, 8 // odd bytes + vpsrlw ymm2, ymm0, 8 // odd bytes vpsrlw ymm3, ymm1, 8 - vpand ymm0, ymm0, ymm5 // even bytes + vpand ymm0, ymm0, ymm5 // even bytes vpand ymm1, ymm1, ymm5 vpackuswb ymm0, ymm0, ymm1 vpackuswb ymm2, ymm2, ymm3 @@ -3324,24 +3207,25 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, #endif // HAS_SPLITUVROW_AVX2 #ifdef HAS_MERGEUVROW_SSE2 -__declspec(naked) -void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) { +__declspec(naked) void MergeUVRow_SSE2(const uint8* src_u, + const uint8* src_v, + uint8* dst_uv, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_u - mov edx, [esp + 4 + 8] // src_v - mov edi, [esp + 4 + 12] // dst_uv - mov ecx, [esp + 4 + 16] // width + mov eax, [esp + 4 + 4] // src_u + mov edx, [esp + 4 + 8] // src_v + mov edi, [esp + 4 + 12] // dst_uv + mov ecx, [esp + 
4 + 16] // width sub edx, eax convertloop: - movdqu xmm0, [eax] // read 16 U's + movdqu xmm0, [eax] // read 16 U's movdqu xmm1, [eax + edx] // and 16 V's lea eax, [eax + 16] movdqa xmm2, xmm0 - punpcklbw xmm0, xmm1 // first 8 UV pairs - punpckhbw xmm2, xmm1 // next 8 UV pairs + punpcklbw xmm0, xmm1 // first 8 UV pairs + punpckhbw xmm2, xmm1 // next 8 UV pairs movdqu [edi], xmm0 movdqu [edi + 16], xmm2 lea edi, [edi + 32] @@ -3355,24 +3239,25 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, #endif // HAS_MERGEUVROW_SSE2 #ifdef HAS_MERGEUVROW_AVX2 -__declspec(naked) -void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) { +__declspec(naked) void MergeUVRow_AVX2(const uint8* src_u, + const uint8* src_v, + uint8* dst_uv, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_u - mov edx, [esp + 4 + 8] // src_v - mov edi, [esp + 4 + 12] // dst_uv - mov ecx, [esp + 4 + 16] // width + mov eax, [esp + 4 + 4] // src_u + mov edx, [esp + 4 + 8] // src_v + mov edi, [esp + 4 + 12] // dst_uv + mov ecx, [esp + 4 + 16] // width sub edx, eax convertloop: - vmovdqu ymm0, [eax] // read 32 U's - vmovdqu ymm1, [eax + edx] // and 32 V's + vmovdqu ymm0, [eax] // read 32 U's + vmovdqu ymm1, [eax + edx] // and 32 V's lea eax, [eax + 32] - vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 - vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3 - vextractf128 [edi], ymm2, 0 // bytes 0..15 + vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 + vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3 + vextractf128 [edi], ymm2, 0 // bytes 0..15 vextractf128 [edi + 16], ymm0, 0 // bytes 16..31 vextractf128 [edi + 32], ymm2, 1 // bytes 32..47 vextractf128 [edi + 48], ymm0, 1 // bytes 48..63 @@ -3389,11 +3274,10 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, #ifdef HAS_COPYROW_SSE2 // CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time. -__declspec(naked) -void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { +__declspec(naked) void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst mov ecx, [esp + 12] // count test eax, 15 jne convertloopu @@ -3427,11 +3311,10 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { #ifdef HAS_COPYROW_AVX // CopyRow copies 'count' bytes using a 32 byte load/store, 64 bytes at a time. -__declspec(naked) -void CopyRow_AVX(const uint8* src, uint8* dst, int count) { +__declspec(naked) void CopyRow_AVX(const uint8* src, uint8* dst, int count) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst mov ecx, [esp + 12] // count convertloop: @@ -3451,13 +3334,12 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) { #endif // HAS_COPYROW_AVX // Multiple of 1.
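Note: MergeUVRow above is the inverse of SplitUVRow; punpcklbw/punpckhbw interleave a row of U bytes and a row of V bytes into UV pairs. Scalar form (hypothetical helper name):

#include <stdint.h>

static void MergeUVRow_Sketch(const uint8_t* src_u, const uint8_t* src_v,
                              uint8_t* dst_uv, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = src_u[x];  // punpcklbw: U goes to even bytes
    dst_uv[2 * x + 1] = src_v[x];  // V goes to odd bytes
  }
}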
-__declspec(naked) -void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { +__declspec(naked) void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { __asm { mov eax, esi mov edx, edi - mov esi, [esp + 4] // src - mov edi, [esp + 8] // dst + mov esi, [esp + 4] // src + mov edi, [esp + 8] // dst mov ecx, [esp + 12] // count rep movsb mov edi, edx @@ -3468,15 +3350,16 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { #ifdef HAS_ARGBCOPYALPHAROW_SSE2 // width in pixels -__declspec(naked) -void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8* src, + uint8* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst mov ecx, [esp + 12] // count - pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 pslld xmm0, 24 - pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff psrld xmm1, 8 convertloop: @@ -3504,14 +3387,15 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBCOPYALPHAROW_AVX2 // width in pixels -__declspec(naked) -void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8* src, + uint8* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst mov ecx, [esp + 12] // count vpcmpeqb ymm0, ymm0, ymm0 - vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff convertloop: vmovdqu ymm1, [eax] @@ -3533,11 +3417,12 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 // width in pixels -__declspec(naked) -void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { +__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, + uint8* dst_a, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_a + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_a mov ecx, [esp + 12] // width extractloop: @@ -3558,17 +3443,54 @@ void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { } #endif // HAS_ARGBEXTRACTALPHAROW_SSE2 +#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 +// width in pixels +__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, + uint8* dst_a, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_a + mov ecx, [esp + 12] // width + vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX + + extractloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpsrld ymm0, ymm0, 24 + vpsrld ymm1, ymm1, 24 + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + lea eax, [eax + 128] + vpackssdw ymm0, ymm0, ymm1 // mutates + vpsrld ymm2, ymm2, 24 + vpsrld ymm3, ymm3, 24 + vpackssdw ymm2, ymm2, ymm3 // mutates + vpackuswb ymm0, ymm0, ymm2 // mutates + vpermd ymm0, ymm4, ymm0 // unmutate + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg extractloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBEXTRACTALPHAROW_AVX2 + #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 // width in pixels -__declspec(naked) -void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8* src, + uint8* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst + mov eax, [esp 
+ 4] // src + mov edx, [esp + 8] // dst mov ecx, [esp + 12] // count - pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 pslld xmm0, 24 - pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff psrld xmm1, 8 convertloop: @@ -3598,14 +3520,15 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 // width in pixels -__declspec(naked) -void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8* src, + uint8* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst mov ecx, [esp + 12] // count vpcmpeqb ymm0, ymm0, ymm0 - vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff convertloop: vpmovzxbd ymm1, qword ptr [eax] @@ -3630,14 +3553,13 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { #ifdef HAS_SETROW_X86 // Write 'count' bytes using an 8 bit value repeated. // Count should be multiple of 4. -__declspec(naked) -void SetRow_X86(uint8* dst, uint8 v8, int count) { +__declspec(naked) void SetRow_X86(uint8* dst, uint8 v8, int count) { __asm { - movzx eax, byte ptr [esp + 8] // v8 + movzx eax, byte ptr [esp + 8] // v8 mov edx, 0x01010101 // Duplicate byte to all bytes. - mul edx // overwrites edx with upper part of result. + mul edx // overwrites edx with upper part of result. mov edx, edi - mov edi, [esp + 4] // dst + mov edi, [esp + 4] // dst mov ecx, [esp + 12] // count shr ecx, 2 rep stosd @@ -3647,12 +3569,11 @@ void SetRow_X86(uint8* dst, uint8 v8, int count) { } // Write 'count' bytes using an 8 bit value repeated. -__declspec(naked) -void SetRow_ERMS(uint8* dst, uint8 v8, int count) { +__declspec(naked) void SetRow_ERMS(uint8* dst, uint8 v8, int count) { __asm { mov edx, edi - mov edi, [esp + 4] // dst - mov eax, [esp + 8] // v8 + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v8 mov ecx, [esp + 12] // count rep stosb mov edi, edx @@ -3661,12 +3582,11 @@ void SetRow_ERMS(uint8* dst, uint8 v8, int count) { } // Write 'count' 32 bit values. -__declspec(naked) -void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { +__declspec(naked) void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { __asm { mov edx, edi - mov edi, [esp + 4] // dst - mov eax, [esp + 8] // v32 + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v32 mov ecx, [esp + 12] // count rep stosd mov edi, edx @@ -3676,12 +3596,13 @@ void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { #endif // HAS_SETROW_X86 #ifdef HAS_YUY2TOYROW_AVX2 -__declspec(naked) -void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { +__declspec(naked) void YUY2ToYRow_AVX2(const uint8* src_yuy2, + uint8* dst_y, + int width) { __asm { - mov eax, [esp + 4] // src_yuy2 - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 @@ -3689,9 +3610,9 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // even bytes are Y + vpand ymm0, ymm0, ymm5 // even bytes are Y vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. 
vpermq ymm0, ymm0, 0xd8 vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -3702,18 +3623,20 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { } } -__declspec(naked) -void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void YUY2ToUVRow_AVX2(const uint8* src_yuy2, + int stride_yuy2, + uint8* dst_u, + uint8* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3723,18 +3646,18 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, vpavgb ymm0, ymm0, [eax + esi] vpavgb ymm1, ymm1, [eax + esi + 32] lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V + vpsrlw ymm0, ymm0, 8 // V vpackuswb ymm1, ymm1, ymm1 // mutates. vpackuswb ymm0, ymm0, ymm0 // mutates. vpermq ymm1, ymm1, 0xd8 vpermq ymm0, ymm0, 0xd8 vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V + vextractf128 [edx + edi], ymm0, 0 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -3746,16 +3669,17 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, } } -__declspec(naked) -void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, + uint8* dst_u, + uint8* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3763,18 +3687,18 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V + vpsrlw ymm0, ymm0, 8 // V vpackuswb ymm1, ymm1, ymm1 // mutates. vpackuswb ymm0, ymm0, ymm0 // mutates. 
vpermq ymm1, ymm1, 0xd8 vpermq ymm0, ymm0, 0xd8 vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V + vextractf128 [edx + edi], ymm0, 0 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -3785,21 +3709,21 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, } } -__declspec(naked) -void UYVYToYRow_AVX2(const uint8* src_uyvy, - uint8* dst_y, int width) { +__declspec(naked) void UYVYToYRow_AVX2(const uint8* src_uyvy, + uint8* dst_y, + int width) { __asm { - mov eax, [esp + 4] // src_uyvy - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // odd bytes are Y + vpsrlw ymm0, ymm0, 8 // odd bytes are Y vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -3810,18 +3734,20 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy, } } -__declspec(naked) -void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void UYVYToUVRow_AVX2(const uint8* src_uyvy, + int stride_uyvy, + uint8* dst_u, + uint8* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3831,18 +3757,18 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, vpavgb ymm0, ymm0, [eax + esi] vpavgb ymm1, ymm1, [eax + esi + 32] lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V + vpsrlw ymm0, ymm0, 8 // V vpackuswb ymm1, ymm1, ymm1 // mutates. vpackuswb ymm0, ymm0, ymm0 // mutates. 
vpermq ymm1, ymm1, 0xd8 vpermq ymm0, ymm0, 0xd8 vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V + vextractf128 [edx + edi], ymm0, 0 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -3854,16 +3780,17 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, } } -__declspec(naked) -void UYVYToUV422Row_AVX2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void UYVYToUV422Row_AVX2(const uint8* src_uyvy, + uint8* dst_u, + uint8* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3871,18 +3798,18 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy, vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V + vpsrlw ymm0, ymm0, 8 // V vpackuswb ymm1, ymm1, ymm1 // mutates. vpackuswb ymm0, ymm0, ymm0 // mutates. vpermq ymm1, ymm1, 0xd8 vpermq ymm0, ymm0, 0xd8 vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V + vextractf128 [edx + edi], ymm0, 0 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -3895,21 +3822,21 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy, #endif // HAS_YUY2TOYROW_AVX2 #ifdef HAS_YUY2TOYROW_SSE2 -__declspec(naked) -void YUY2ToYRow_SSE2(const uint8* src_yuy2, - uint8* dst_y, int width) { - __asm { - mov eax, [esp + 4] // src_yuy2 - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff +__declspec(naked) void YUY2ToYRow_SSE2(const uint8* src_yuy2, + uint8* dst_y, + int width) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - pand xmm0, xmm5 // even bytes are Y + pand xmm0, xmm5 // even bytes are Y pand xmm1, xmm5 packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -3920,18 +3847,20 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, } } -__declspec(naked) -void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void YUY2ToUVRow_SSE2(const uint8* src_yuy2, + int stride_yuy2, + uint8* dst_u, + uint8* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -3943,13 +3872,13 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, lea eax, [eax + 32] pavgb xmm0, xmm2 pavgb xmm1, xmm3 - psrlw xmm0, 8 // YUYV -> UVUV + psrlw 
xmm0, 8 // YUYV -> UVUV psrlw xmm1, 8 packuswb xmm0, xmm1 movdqa xmm1, xmm0 pand xmm0, xmm5 // U packuswb xmm0, xmm0 - psrlw xmm1, 8 // V + psrlw xmm1, 8 // V packuswb xmm1, xmm1 movq qword ptr [edx], xmm0 movq qword ptr [edx + edi], xmm1 @@ -3963,16 +3892,17 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, } } -__declspec(naked) -void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, + uint8* dst_u, + uint8* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -3980,13 +3910,13 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm0, 8 // YUYV -> UVUV psrlw xmm1, 8 packuswb xmm0, xmm1 movdqa xmm1, xmm0 pand xmm0, xmm5 // U packuswb xmm0, xmm0 - psrlw xmm1, 8 // V + psrlw xmm1, 8 // V packuswb xmm1, xmm1 movq qword ptr [edx], xmm0 movq qword ptr [edx + edi], xmm1 @@ -3999,19 +3929,19 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, } } -__declspec(naked) -void UYVYToYRow_SSE2(const uint8* src_uyvy, - uint8* dst_y, int width) { +__declspec(naked) void UYVYToYRow_SSE2(const uint8* src_uyvy, + uint8* dst_y, + int width) { __asm { - mov eax, [esp + 4] // src_uyvy - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - psrlw xmm0, 8 // odd bytes are Y + psrlw xmm0, 8 // odd bytes are Y psrlw xmm1, 8 packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -4022,18 +3952,20 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, } } -__declspec(naked) -void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void UYVYToUVRow_SSE2(const uint8* src_uyvy, + int stride_uyvy, + uint8* dst_u, + uint8* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -4045,13 +3977,13 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, lea eax, [eax + 32] pavgb xmm0, xmm2 pavgb xmm1, xmm3 - pand xmm0, xmm5 // UYVY -> UVUV + pand xmm0, xmm5 // UYVY -> UVUV pand xmm1, xmm5 packuswb xmm0, xmm1 movdqa xmm1, xmm0 pand xmm0, xmm5 // U packuswb xmm0, xmm0 - psrlw xmm1, 8 // V + psrlw xmm1, 8 // V packuswb xmm1, xmm1 movq qword ptr [edx], xmm0 movq qword ptr [edx + edi], xmm1 @@ -4065,16 +3997,17 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, } } -__declspec(naked) -void UYVYToUV422Row_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void 
UYVYToUV422Row_SSE2(const uint8* src_uyvy, + uint8* dst_u, + uint8* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -4082,13 +4015,13 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - pand xmm0, xmm5 // UYVY -> UVUV + pand xmm0, xmm5 // UYVY -> UVUV pand xmm1, xmm5 packuswb xmm0, xmm1 movdqa xmm1, xmm0 pand xmm0, xmm5 // U packuswb xmm0, xmm0 - psrlw xmm1, 8 // V + psrlw xmm1, 8 // V packuswb xmm1, xmm1 movq qword ptr [edx], xmm0 movq qword ptr [edx + edi], xmm1 @@ -4108,13 +4041,15 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, // =((A2*C2)+(B2*(255-C2))+255)/256 // signed version of math // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -__declspec(naked) -void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) { +__declspec(naked) void BlendPlaneRow_SSSE3(const uint8* src0, + const uint8* src1, + const uint8* alpha, + uint8* dst, + int width) { __asm { push esi push edi - pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 psllw xmm5, 8 mov eax, 0x80808080 // 128 for biasing image to signed. movd xmm6, eax @@ -4123,8 +4058,8 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, mov eax, 0x807f807f // 32768 + 127 for unbias and round. movd xmm7, eax pshufd xmm7, xmm7, 0x00 - mov eax, [esp + 8 + 4] // src0 - mov edx, [esp + 8 + 8] // src1 + mov eax, [esp + 8 + 4] // src0 + mov edx, [esp + 8 + 8] // src1 mov esi, [esp + 8 + 12] // alpha mov edi, [esp + 8 + 16] // dst mov ecx, [esp + 8 + 20] // width @@ -4134,15 +4069,15 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, // 8 pixel loop. convertloop8: - movq xmm0, qword ptr [esi] // alpha + movq xmm0, qword ptr [esi] // alpha punpcklbw xmm0, xmm0 - pxor xmm0, xmm5 // a, 255-a + pxor xmm0, xmm5 // a, 255-a movq xmm1, qword ptr [eax + esi] // src0 movq xmm2, qword ptr [edx + esi] // src1 punpcklbw xmm1, xmm2 - psubb xmm1, xmm6 // bias src0/1 - 128 + psubb xmm1, xmm6 // bias src0/1 - 128 pmaddubsw xmm0, xmm1 - paddw xmm0, xmm7 // unbias result - 32768 and round. + paddw xmm0, xmm7 // unbias result - 32768 and round. psrlw xmm0, 8 packuswb xmm0, xmm0 movq qword ptr [edi + esi], xmm0 @@ -4163,13 +4098,15 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, // =((A2*C2)+(B2*(255-C2))+255)/256 // signed version of math // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -__declspec(naked) -void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) { +__declspec(naked) void BlendPlaneRow_AVX2(const uint8* src0, + const uint8* src1, + const uint8* alpha, + uint8* dst, + int width) { __asm { push esi push edi - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 vpsllw ymm5, ymm5, 8 mov eax, 0x80808080 // 128 for biasing image to signed. vmovd xmm6, eax @@ -4177,8 +4114,8 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, mov eax, 0x807f807f // 32768 + 127 for unbias and round. 
vmovd xmm7, eax vbroadcastss ymm7, xmm7 - mov eax, [esp + 8 + 4] // src0 - mov edx, [esp + 8 + 8] // src1 + mov eax, [esp + 8 + 4] // src0 + mov edx, [esp + 8 + 8] // src1 mov esi, [esp + 8 + 12] // alpha mov edi, [esp + 8 + 16] // dst mov ecx, [esp + 8 + 20] // width @@ -4188,21 +4125,21 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, // 32 pixel loop. convertloop32: - vmovdqu ymm0, [esi] // alpha - vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31 - vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23 - vpxor ymm3, ymm3, ymm5 // a, 255-a - vpxor ymm0, ymm0, ymm5 // a, 255-a + vmovdqu ymm0, [esi] // alpha + vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31 + vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23 + vpxor ymm3, ymm3, ymm5 // a, 255-a + vpxor ymm0, ymm0, ymm5 // a, 255-a vmovdqu ymm1, [eax + esi] // src0 vmovdqu ymm2, [edx + esi] // src1 vpunpckhbw ymm4, ymm1, ymm2 vpunpcklbw ymm1, ymm1, ymm2 - vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128 - vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 + vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128 + vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 vpmaddubsw ymm3, ymm3, ymm4 vpmaddubsw ymm0, ymm0, ymm1 - vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round. - vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. + vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round. + vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. vpsrlw ymm3, ymm3, 8 vpsrlw ymm0, ymm0, 8 vpackuswb ymm0, ymm0, ymm3 @@ -4221,52 +4158,51 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, #ifdef HAS_ARGBBLENDROW_SSSE3 // Shuffle table for isolating alpha. -static const uvec8 kShuffleAlpha = { - 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, - 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 -}; +static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; // Blend 8 pixels at a time. -__declspec(naked) -void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm7, xmm7 // generate constant 0x0001 + pcmpeqb xmm7, xmm7 // generate constant 0x0001 psrlw xmm7, 15 - pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff + pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff psrlw xmm6, 8 - pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 psllw xmm5, 8 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 pslld xmm4, 24 sub ecx, 4 - jl convertloop4b // less than 4 pixels? + jl convertloop4b // less than 4 pixels? // 4 pixel loop. 
convertloop4: - movdqu xmm3, [eax] // src argb + movdqu xmm3, [eax] // src argb lea eax, [eax + 16] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movdqu xmm2, [esi] // _r_b - pshufb xmm3, xmmword ptr kShuffleAlpha // alpha - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movdqu xmm1, [esi] // _a_g + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movdqu xmm2, [esi] // _r_b + pshufb xmm3, xmmword ptr kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movdqu xmm1, [esi] // _a_g lea esi, [esi + 16] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -4278,24 +4214,24 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, // 1 pixel loop. convertloop1: - movd xmm3, [eax] // src argb + movd xmm3, [eax] // src argb lea eax, [eax + 4] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movd xmm2, [esi] // _r_b - pshufb xmm3, xmmword ptr kShuffleAlpha // alpha - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movd xmm1, [esi] // _a_g + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + pshufb xmm3, xmmword ptr kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g lea esi, [esi + 4] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb movd [edx], xmm0 lea edx, [edx + 4] sub ecx, 1 @@ -4311,41 +4247,42 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha. 
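Note, before the attenuate kernels: a scalar restatement of the blend math above. BlendPlaneRow implements the quoted identity dst = (src0*a + src1*(255-a) + 255) / 256 (in its signed, pmaddubsw-friendly form), while ARGBBlendRow, per its own comments, computes dst = src0 + src1*(256 - a)/256 with the destination alpha forced to 255. The helper name is hypothetical:

#include <stdint.h>

// Unsigned form of the BlendPlaneRow formula quoted above.
static uint8_t BlendPlanePixel(uint8_t src0, uint8_t src1, uint8_t a) {
  return (uint8_t)((src0 * a + src1 * (255 - a) + 255) >> 8);
}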
static const uvec8 kShuffleAlpha0 = { - 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, + 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, }; static const uvec8 kShuffleAlpha1 = { - 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, - 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, + 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, }; -__declspec(naked) -void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { +__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8* src_argb, + uint8* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_argb0 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width - pcmpeqb xmm3, xmm3 // generate mask 0xff000000 + pcmpeqb xmm3, xmm3 // generate mask 0xff000000 pslld xmm3, 24 movdqa xmm4, xmmword ptr kShuffleAlpha0 movdqa xmm5, xmmword ptr kShuffleAlpha1 convertloop: - movdqu xmm0, [eax] // read 4 pixels - pshufb xmm0, xmm4 // isolate first 2 alphas - movdqu xmm1, [eax] // read 4 pixels - punpcklbw xmm1, xmm1 // first 2 pixel rgbs - pmulhuw xmm0, xmm1 // rgb * a - movdqu xmm1, [eax] // read 4 pixels - pshufb xmm1, xmm5 // isolate next 2 alphas - movdqu xmm2, [eax] // read 4 pixels - punpckhbw xmm2, xmm2 // next 2 pixel rgbs - pmulhuw xmm1, xmm2 // rgb * a - movdqu xmm2, [eax] // mask original alpha + movdqu xmm0, [eax] // read 4 pixels + pshufb xmm0, xmm4 // isolate first 2 alphas + movdqu xmm1, [eax] // read 4 pixels + punpcklbw xmm1, xmm1 // first 2 pixel rgbs + pmulhuw xmm0, xmm1 // rgb * a + movdqu xmm1, [eax] // read 4 pixels + pshufb xmm1, xmm5 // isolate next 2 alphas + movdqu xmm2, [eax] // read 4 pixels + punpckhbw xmm2, xmm2 // next 2 pixel rgbs + pmulhuw xmm1, xmm2 // rgb * a + movdqu xmm2, [eax] // mask original alpha lea eax, [eax + 16] pand xmm2, xmm3 psrlw xmm0, 8 psrlw xmm1, 8 packuswb xmm0, xmm1 - por xmm0, xmm2 // copy original alpha + por xmm0, xmm2 // copy original alpha movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -4358,22 +4295,23 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { #ifdef HAS_ARGBATTENUATEROW_AVX2 // Shuffle table duplicating alpha. -static const uvec8 kShuffleAlpha_AVX2 = { - 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u -}; -__declspec(naked) -void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { - __asm { - mov eax, [esp + 4] // src_argb0 - mov edx, [esp + 8] // dst_argb +static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, + 128u, 128u, 14u, 15u, 14u, 15u, + 14u, 15u, 128u, 128u}; +__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8* src_argb, + uint8* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2 - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 vpslld ymm5, ymm5, 24 convertloop: - vmovdqu ymm6, [eax] // read 8 pixels. + vmovdqu ymm6, [eax] // read 8 pixels. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. vpshufb ymm2, ymm0, ymm4 // low 4 alphas @@ -4398,40 +4336,40 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { #ifdef HAS_ARGBUNATTENUATEROW_SSE2 // Unattenuate 4 pixels at a time. 
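Note: ARGBAttenuateRow above multiplies each color channel by its own alpha ("rgb * a"), and ARGBUnattenuateRow below multiplies by a fixed-point reciprocal looked up in fixed_invtbl8 ("rgb * ia"). Approximate scalar forms, assuming the usual >> 8 rounding (the real fixed-point code rounds slightly differently); helper names are hypothetical:

#include <stdint.h>

static uint8_t Attenuate(uint8_t v, uint8_t a) {
  return (uint8_t)((v * a) >> 8);  // close to pmulhuw on doubled bytes
}

static uint8_t Unattenuate(uint8_t v, uint8_t a) {
  unsigned int u;
  if (a == 0) return 0;
  u = ((unsigned int)v * 255u) / a;  // fixed_invtbl8 replaces this divide
  return (uint8_t)(u > 255u ? 255u : u);
}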
-__declspec(naked) -void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, - int width) { +__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8* src_argb, + uint8* dst_argb, + int width) { __asm { push ebx push esi push edi - mov eax, [esp + 12 + 4] // src_argb - mov edx, [esp + 12 + 8] // dst_argb + mov eax, [esp + 12 + 4] // src_argb + mov edx, [esp + 12 + 8] // dst_argb mov ecx, [esp + 12 + 12] // width lea ebx, fixed_invtbl8 convertloop: - movdqu xmm0, [eax] // read 4 pixels + movdqu xmm0, [eax] // read 4 pixels movzx esi, byte ptr [eax + 3] // first alpha movzx edi, byte ptr [eax + 7] // second alpha - punpcklbw xmm0, xmm0 // first 2 + punpcklbw xmm0, xmm0 // first 2 movd xmm2, dword ptr [ebx + esi * 4] movd xmm3, dword ptr [ebx + edi * 4] - pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a - pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words + pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a + pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words movlhps xmm2, xmm3 - pmulhuw xmm0, xmm2 // rgb * a + pmulhuw xmm0, xmm2 // rgb * a - movdqu xmm1, [eax] // read 4 pixels + movdqu xmm1, [eax] // read 4 pixels movzx esi, byte ptr [eax + 11] // third alpha movzx edi, byte ptr [eax + 15] // fourth alpha - punpckhbw xmm1, xmm1 // next 2 + punpckhbw xmm1, xmm1 // next 2 movd xmm2, dword ptr [ebx + esi * 4] movd xmm3, dword ptr [ebx + edi * 4] - pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words - pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words + pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words + pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words movlhps xmm2, xmm3 - pmulhuw xmm1, xmm2 // rgb * a + pmulhuw xmm1, xmm2 // rgb * a lea eax, [eax + 16] packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -4450,25 +4388,24 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBUNATTENUATEROW_AVX2 // Shuffle table duplicating alpha. static const uvec8 kUnattenShuffleAlpha_AVX2 = { - 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u -}; + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. // USE_GATHER is not on by default, due to being a slow instruction. #ifdef USE_GATHER -__declspec(naked) -void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, - int width) { +__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb, + uint8* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_argb0 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2 convertloop: - vmovdqu ymm6, [eax] // read 8 pixels. + vmovdqu ymm6, [eax] // read 8 pixels. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. - vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. + vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared.
1, a @@ -4488,50 +4425,50 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, ret } } -#else // USE_GATHER -__declspec(naked) -void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, - int width) { +#else // USE_GATHER +__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb, + uint8* dst_argb, + int width) { __asm { push ebx push esi push edi - mov eax, [esp + 12 + 4] // src_argb - mov edx, [esp + 12 + 8] // dst_argb + mov eax, [esp + 12 + 4] // src_argb + mov edx, [esp + 12 + 8] // dst_argb mov ecx, [esp + 12 + 12] // width sub edx, eax lea ebx, fixed_invtbl8 vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2 convertloop: - // replace VPGATHER - movzx esi, byte ptr [eax + 3] // alpha0 - movzx edi, byte ptr [eax + 7] // alpha1 + // replace VPGATHER + movzx esi, byte ptr [eax + 3] // alpha0 + movzx edi, byte ptr [eax + 7] // alpha1 vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0] vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1] - movzx esi, byte ptr [eax + 11] // alpha2 - movzx edi, byte ptr [eax + 15] // alpha3 - vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] + movzx esi, byte ptr [eax + 11] // alpha2 + movzx edi, byte ptr [eax + 15] // alpha3 + vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2] vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3] - movzx esi, byte ptr [eax + 19] // alpha4 - movzx edi, byte ptr [eax + 23] // alpha5 - vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] + movzx esi, byte ptr [eax + 19] // alpha4 + movzx edi, byte ptr [eax + 23] // alpha5 + vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4] vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5] - movzx esi, byte ptr [eax + 27] // alpha6 - movzx edi, byte ptr [eax + 31] // alpha7 - vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] + movzx esi, byte ptr [eax + 27] // alpha6 + movzx edi, byte ptr [eax + 31] // alpha7 + vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6] vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7] - vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] - vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] - vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] - vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] + vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] + vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] + vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] + vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] // end of VPGATHER - vmovdqu ymm6, [eax] // read 8 pixels. + vmovdqu ymm6, [eax] // read 8 pixels. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a @@ -4540,7 +4477,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas vpmulhuw ymm0, ymm0, ymm2 // rgb * ia vpmulhuw ymm1, ymm1, ymm3 // rgb * ia - vpackuswb ymm0, ymm0, ymm1 // unmutated. + vpackuswb ymm0, ymm0, ymm1 // unmutated. vmovdqu [eax + edx], ymm0 lea eax, [eax + 32] sub ecx, 8 @@ -4558,12 +4495,13 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBGRAYROW_SSSE3 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. 
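ARGBGrayRow below computes a full-range luma and writes it to B, G and R while keeping the source alpha; the GG/GA/GGGA weave at the end of the loop just reassembles those bytes into pixels. Assuming kARGBToYJ holds the JPEG BT.601 weights scaled to sum to 128 (15, 75, 38 by my reading; treat the exact values as an assumption), a scalar model is:

static inline void GrayPixel(const uint8* src_argb, uint8* dst_argb) {
  // Memory order is B, G, R, A; 64 is the +0.5 rounding term (kAddYJ64).
  int y = (src_argb[0] * 15 + src_argb[1] * 75 + src_argb[2] * 38 + 64) >> 7;
  dst_argb[0] = dst_argb[1] = dst_argb[2] = (uint8)y;
  dst_argb[3] = src_argb[3];  // alpha passes through
}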
-__declspec(naked) -void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { +__declspec(naked) void ARGBGrayRow_SSSE3(const uint8* src_argb, + uint8* dst_argb, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kARGBToYJ movdqa xmm5, xmmword ptr kAddYJ64 @@ -4575,20 +4513,20 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { phaddw xmm0, xmm1 paddw xmm0, xmm5 // Add .5 for rounding. psrlw xmm0, 7 - packuswb xmm0, xmm0 // 8 G bytes + packuswb xmm0, xmm0 // 8 G bytes movdqu xmm2, [eax] // A movdqu xmm3, [eax + 16] lea eax, [eax + 32] psrld xmm2, 24 psrld xmm3, 24 packuswb xmm2, xmm3 - packuswb xmm2, xmm2 // 8 A bytes - movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA - punpcklbw xmm0, xmm0 // 8 GG words - punpcklbw xmm3, xmm2 // 8 GA words + packuswb xmm2, xmm2 // 8 A bytes + movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA + punpcklbw xmm0, xmm0 // 8 GG words + punpcklbw xmm3, xmm2 // 8 GA words movdqa xmm1, xmm0 - punpcklwd xmm0, xmm3 // GGGA first 4 - punpckhwd xmm1, xmm3 // GGGA next 4 + punpcklwd xmm0, xmm3 // GGGA first 4 + punpckhwd xmm1, xmm3 // GGGA next 4 movdqu [edx], xmm0 movdqu [edx + 16], xmm1 lea edx, [edx + 32] @@ -4604,24 +4542,20 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 // Constant for ARGB color to sepia tone. -static const vec8 kARGBToSepiaB = { - 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 -}; +static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, + 17, 68, 35, 0, 17, 68, 35, 0}; -static const vec8 kARGBToSepiaG = { - 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 -}; +static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, + 22, 88, 45, 0, 22, 88, 45, 0}; -static const vec8 kARGBToSepiaR = { - 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 -}; +static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, + 24, 98, 50, 0, 24, 98, 50, 0}; // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
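The three constant vectors above encode the sepia matrix in B,G,R,A memory order; ARGBSepiaRow below applies them in place, with packuswb providing the clamp. Scalar equivalent (illustrative helpers, libyuv uint8 typedef):

static inline uint8 Clamp255(int v) {
  return (uint8)(v > 255 ? 255 : v);  // packuswb saturation
}

static inline void SepiaPixel(uint8* p) {
  int b = p[0], g = p[1], r = p[2];  // p[3] (alpha) is left untouched
  p[0] = Clamp255((b * 17 + g * 68 + r * 35) >> 7);  // kARGBToSepiaB
  p[1] = Clamp255((b * 22 + g * 88 + r * 45) >> 7);  // kARGBToSepiaG
  p[2] = Clamp255((b * 24 + g * 98 + r * 50) >> 7);  // kARGBToSepiaR
}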
-__declspec(naked) -void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { +__declspec(naked) void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { __asm { - mov eax, [esp + 4] /* dst_argb */ - mov ecx, [esp + 8] /* width */ + mov eax, [esp + 4] /* dst_argb */ + mov ecx, [esp + 8] /* width */ movdqa xmm2, xmmword ptr kARGBToSepiaB movdqa xmm3, xmmword ptr kARGBToSepiaG movdqa xmm4, xmmword ptr kARGBToSepiaR @@ -4633,32 +4567,32 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { pmaddubsw xmm6, xmm2 phaddw xmm0, xmm6 psrlw xmm0, 7 - packuswb xmm0, xmm0 // 8 B values + packuswb xmm0, xmm0 // 8 B values movdqu xmm5, [eax] // G movdqu xmm1, [eax + 16] pmaddubsw xmm5, xmm3 pmaddubsw xmm1, xmm3 phaddw xmm5, xmm1 psrlw xmm5, 7 - packuswb xmm5, xmm5 // 8 G values - punpcklbw xmm0, xmm5 // 8 BG values + packuswb xmm5, xmm5 // 8 G values + punpcklbw xmm0, xmm5 // 8 BG values movdqu xmm5, [eax] // R movdqu xmm1, [eax + 16] pmaddubsw xmm5, xmm4 pmaddubsw xmm1, xmm4 phaddw xmm5, xmm1 psrlw xmm5, 7 - packuswb xmm5, xmm5 // 8 R values + packuswb xmm5, xmm5 // 8 R values movdqu xmm6, [eax] // A movdqu xmm1, [eax + 16] psrld xmm6, 24 psrld xmm1, 24 packuswb xmm6, xmm1 - packuswb xmm6, xmm6 // 8 A values - punpcklbw xmm5, xmm6 // 8 RA values - movdqa xmm1, xmm0 // Weave BG, RA together - punpcklwd xmm0, xmm5 // BGRA first 4 - punpckhwd xmm1, xmm5 // BGRA next 4 + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm5, xmm6 // 8 RA values + movdqa xmm1, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm5 // BGRA first 4 + punpckhwd xmm1, xmm5 // BGRA next 4 movdqu [eax], xmm0 movdqu [eax + 16], xmm1 lea eax, [eax + 32] @@ -4674,19 +4608,20 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { // Same as Sepia except matrix is provided. // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. 
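ARGBColorMatrixRow below generalizes the sepia kernel: matrix_argb supplies four rows of four signed 8-bit coefficients, one row per output channel, and the shift is 6 bits instead of 7 (psraw 6, with pmaddubsw/phaddsw saturating the intermediate sums). In scalar form:

static inline void ColorMatrixPixel(const uint8* src, uint8* dst,
                                    const int8* matrix_argb) {
  int i;
  for (i = 0; i < 4; ++i) {  // output B, G, R, A in turn
    int v = (src[0] * matrix_argb[i * 4 + 0] +
             src[1] * matrix_argb[i * 4 + 1] +
             src[2] * matrix_argb[i * 4 + 2] +
             src[3] * matrix_argb[i * 4 + 3]) >> 6;  // arithmetic shift
    dst[i] = (uint8)(v < 0 ? 0 : v > 255 ? 255 : v);
  }
}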
-__declspec(naked) -void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* matrix_argb */ +__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, + uint8* dst_argb, + const int8* matrix_argb, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* matrix_argb */ movdqu xmm5, [ecx] pshufd xmm2, xmm5, 0x00 pshufd xmm3, xmm5, 0x55 pshufd xmm4, xmm5, 0xaa pshufd xmm5, xmm5, 0xff - mov ecx, [esp + 16] /* width */ + mov ecx, [esp + 16] /* width */ convertloop: movdqu xmm0, [eax] // B @@ -4697,31 +4632,31 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, movdqu xmm1, [eax + 16] pmaddubsw xmm6, xmm3 pmaddubsw xmm1, xmm3 - phaddsw xmm0, xmm7 // B - phaddsw xmm6, xmm1 // G - psraw xmm0, 6 // B - psraw xmm6, 6 // G - packuswb xmm0, xmm0 // 8 B values - packuswb xmm6, xmm6 // 8 G values - punpcklbw xmm0, xmm6 // 8 BG values + phaddsw xmm0, xmm7 // B + phaddsw xmm6, xmm1 // G + psraw xmm0, 6 // B + psraw xmm6, 6 // G + packuswb xmm0, xmm0 // 8 B values + packuswb xmm6, xmm6 // 8 G values + punpcklbw xmm0, xmm6 // 8 BG values movdqu xmm1, [eax] // R movdqu xmm7, [eax + 16] pmaddubsw xmm1, xmm4 pmaddubsw xmm7, xmm4 - phaddsw xmm1, xmm7 // R + phaddsw xmm1, xmm7 // R movdqu xmm6, [eax] // A movdqu xmm7, [eax + 16] pmaddubsw xmm6, xmm5 pmaddubsw xmm7, xmm5 - phaddsw xmm6, xmm7 // A - psraw xmm1, 6 // R - psraw xmm6, 6 // A - packuswb xmm1, xmm1 // 8 R values - packuswb xmm6, xmm6 // 8 A values - punpcklbw xmm1, xmm6 // 8 RA values - movdqa xmm6, xmm0 // Weave BG, RA together - punpcklwd xmm0, xmm1 // BGRA first 4 - punpckhwd xmm6, xmm1 // BGRA next 4 + phaddsw xmm6, xmm7 // A + psraw xmm1, 6 // R + psraw xmm6, 6 // A + packuswb xmm1, xmm1 // 8 R values + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm1, xmm6 // 8 RA values + movdqa xmm6, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm1 // BGRA first 4 + punpckhwd xmm6, xmm1 // BGRA next 4 movdqu [edx], xmm0 movdqu [edx + 16], xmm6 lea eax, [eax + 32] @@ -4735,15 +4670,17 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBQUANTIZEROW_SSE2 // Quantize 4 ARGB pixels (16 bytes). 
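ARGBQuantizeRow below posterizes B, G and R in place; the 0xff000000 mask puts the original alpha back. Each channel is bucketed with a 16-bit reciprocal multiply (the caller presumably picks scale near 65536 / interval_size so the multiply acts as the divide) and the bucket index is mapped back to a representative value:

static inline void QuantizePixel(uint8* p, int scale, int interval_size,
                                 int interval_offset) {
  int i;
  for (i = 0; i < 3; ++i) {  // B, G, R; p[3] (alpha) is preserved
    p[i] = (uint8)(((p[i] * scale) >> 16) * interval_size + interval_offset);
  }
}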
-__declspec(naked) -void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width) { - __asm { - mov eax, [esp + 4] /* dst_argb */ - movd xmm2, [esp + 8] /* scale */ - movd xmm3, [esp + 12] /* interval_size */ - movd xmm4, [esp + 16] /* interval_offset */ - mov ecx, [esp + 20] /* width */ +__declspec(naked) void ARGBQuantizeRow_SSE2(uint8* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + __asm { + mov eax, [esp + 4] /* dst_argb */ + movd xmm2, [esp + 8] /* scale */ + movd xmm3, [esp + 12] /* interval_size */ + movd xmm4, [esp + 16] /* interval_offset */ + mov ecx, [esp + 20] /* width */ pshuflw xmm2, xmm2, 040h pshufd xmm2, xmm2, 044h pshuflw xmm3, xmm3, 040h @@ -4756,16 +4693,16 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, convertloop: movdqu xmm0, [eax] // read 4 pixels - punpcklbw xmm0, xmm5 // first 2 pixels - pmulhuw xmm0, xmm2 // pixel * scale >> 16 + punpcklbw xmm0, xmm5 // first 2 pixels + pmulhuw xmm0, xmm2 // pixel * scale >> 16 movdqu xmm1, [eax] // read 4 pixels - punpckhbw xmm1, xmm5 // next 2 pixels + punpckhbw xmm1, xmm5 // next 2 pixels pmulhuw xmm1, xmm2 - pmullw xmm0, xmm3 // * interval_size + pmullw xmm0, xmm3 // * interval_size movdqu xmm7, [eax] // read 4 pixels pmullw xmm1, xmm3 - pand xmm7, xmm6 // mask alpha - paddw xmm0, xmm4 // + interval_size / 2 + pand xmm7, xmm6 // mask alpha + paddw xmm0, xmm4 // + interval_size / 2 paddw xmm1, xmm4 packuswb xmm0, xmm1 por xmm0, xmm7 @@ -4780,25 +4717,26 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, #ifdef HAS_ARGBSHADEROW_SSE2 // Shade 4 pixels at a time by specified value. -__declspec(naked) -void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value) { +__declspec(naked) void ARGBShadeRow_SSE2(const uint8* src_argb, + uint8* dst_argb, + int width, + uint32 value) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width movd xmm2, [esp + 16] // value punpcklbw xmm2, xmm2 punpcklqdq xmm2, xmm2 convertloop: - movdqu xmm0, [eax] // read 4 pixels + movdqu xmm0, [eax] // read 4 pixels lea eax, [eax + 16] movdqa xmm1, xmm0 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - pmulhuw xmm0, xmm2 // argb * value - pmulhuw xmm1, xmm2 // argb * value + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + pmulhuw xmm0, xmm2 // argb * value + pmulhuw xmm1, xmm2 // argb * value psrlw xmm0, 8 psrlw xmm1, 8 packuswb xmm0, xmm1 @@ -4814,28 +4752,29 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, #ifdef HAS_ARGBMULTIPLYROW_SSE2 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. 
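Channel-wise multiply of two ARGB rows needs x * y / 255; ARGBMultiplyRow below approximates it by widening one operand to x * 257 (punpcklbw against itself duplicates each byte, and 0x101 = 257), zero-extending the other, and keeping the high 16 bits of the product. ARGBShadeRow above uses the same trick with a constant value:

static inline uint8 MulPixel(uint8 x, uint8 y) {
  return (uint8)((x * 257 * y) >> 16);  // pmulhuw; close to x * y / 255
}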
-__declspec(naked) -void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width pxor xmm5, xmm5 // constant 0 convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 - movdqu xmm2, [esi] // read 4 pixels from src_argb1 + movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm2, [esi] // read 4 pixels from src_argb1 movdqu xmm1, xmm0 movdqu xmm3, xmm2 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - punpcklbw xmm2, xmm5 // first 2 - punpckhbw xmm3, xmm5 // next 2 - pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 - pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + punpcklbw xmm2, xmm5 // first 2 + punpckhbw xmm3, xmm5 // next 2 + pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 + pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 lea eax, [eax + 16] lea esi, [esi + 16] packuswb xmm0, xmm1 @@ -4853,13 +4792,14 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. // TODO(fbarchard): Port this to posix, neon and other math functions. -__declspec(naked) -void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBAddRow_SSE2(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width @@ -4867,11 +4807,11 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, jl convertloop49 convertloop4: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm0, [eax] // read 4 pixels from src_argb0 lea eax, [eax + 16] - movdqu xmm1, [esi] // read 4 pixels from src_argb1 + movdqu xmm1, [esi] // read 4 pixels from src_argb1 lea esi, [esi + 16] - paddusb xmm0, xmm1 // src_argb0 + src_argb1 + paddusb xmm0, xmm1 // src_argb0 + src_argb1 movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -4882,11 +4822,11 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, jl convertloop19 convertloop1: - movd xmm0, [eax] // read 1 pixels from src_argb0 + movd xmm0, [eax] // read 1 pixels from src_argb0 lea eax, [eax + 4] - movd xmm1, [esi] // read 1 pixels from src_argb1 + movd xmm1, [esi] // read 1 pixels from src_argb1 lea esi, [esi + 4] - paddusb xmm0, xmm1 // src_argb0 + src_argb1 + paddusb xmm0, xmm1 // src_argb0 + src_argb1 movd [edx], xmm0 lea edx, [edx + 4] sub ecx, 1 @@ -4901,22 +4841,23 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. 
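ARGBAddRow above and ARGBSubtractRow below are plain saturating byte arithmetic (paddusb / psubusb) on all four channels, alpha included:

static inline uint8 AddSat(uint8 a, uint8 b) {
  int v = a + b;
  return (uint8)(v > 255 ? 255 : v);  // paddusb
}

static inline uint8 SubSat(uint8 a, uint8 b) {
  return (uint8)(a > b ? (uint8)(a - b) : 0);  // psubusb
}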
-__declspec(naked) -void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBSubtractRow_SSE2(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm0, [eax] // read 4 pixels from src_argb0 lea eax, [eax + 16] - movdqu xmm1, [esi] // read 4 pixels from src_argb1 + movdqu xmm1, [esi] // read 4 pixels from src_argb1 lea esi, [esi + 16] - psubusb xmm0, xmm1 // src_argb0 - src_argb1 + psubusb xmm0, xmm1 // src_argb0 - src_argb1 movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -4930,28 +4871,29 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) -void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - vpxor ymm5, ymm5, ymm5 // constant 0 + vpxor ymm5, ymm5, ymm5 // constant 0 convertloop: - vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 lea eax, [eax + 32] - vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 + vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 lea esi, [esi + 32] - vpunpcklbw ymm0, ymm1, ymm1 // low 4 - vpunpckhbw ymm1, ymm1, ymm1 // high 4 - vpunpcklbw ymm2, ymm3, ymm5 // low 4 - vpunpckhbw ymm3, ymm3, ymm5 // high 4 - vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 - vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 + vpunpcklbw ymm0, ymm1, ymm1 // low 4 + vpunpckhbw ymm1, ymm1, ymm1 // high 4 + vpunpcklbw ymm2, ymm3, ymm5 // low 4 + vpunpckhbw ymm3, ymm3, ymm5 // high 4 + vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 + vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 vpackuswb ymm0, ymm0, ymm1 vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -4967,20 +4909,21 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 8 pixels at a time. 
-__declspec(naked) -void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBAddRow_AVX2(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 lea eax, [eax + 32] - vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 + vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 lea esi, [esi + 32] vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -4996,20 +4939,21 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) -void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBSubtractRow_AVX2(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 lea eax, [eax + 32] - vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 + vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 lea esi, [esi + 32] vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -5028,14 +4972,16 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, // -1 0 1 // -2 0 2 // -1 0 1 -__declspec(naked) -void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width) { +__declspec(naked) void SobelXRow_SSE2(const uint8* src_y0, + const uint8* src_y1, + const uint8* src_y2, + uint8* dst_sobelx, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_y0 - mov esi, [esp + 8 + 8] // src_y1 + mov eax, [esp + 8 + 4] // src_y0 + mov esi, [esp + 8 + 8] // src_y1 mov edi, [esp + 8 + 12] // src_y2 mov edx, [esp + 8 + 16] // dst_sobelx mov ecx, [esp + 8 + 20] // width @@ -5045,17 +4991,17 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, pxor xmm5, xmm5 // constant 0 convertloop: - movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] - movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] punpcklbw xmm0, xmm5 punpcklbw xmm1, xmm5 psubw xmm0, xmm1 - movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] punpcklbw xmm1, xmm5 punpcklbw xmm2, xmm5 psubw xmm1, xmm2 - movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] + movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] punpcklbw xmm2, xmm5 punpcklbw xmm3, xmm5 @@ -5063,7 +5009,7 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, paddw xmm0, xmm2 paddw xmm0, xmm1 paddw xmm0, xmm1 
- pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw psubw xmm1, xmm0 pmaxsw xmm0, xmm1 packuswb xmm0, xmm0 @@ -5084,13 +5030,14 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, // -1 -2 -1 // 0 0 0 // 1 2 1 -__declspec(naked) -void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { +__declspec(naked) void SobelYRow_SSE2(const uint8* src_y0, + const uint8* src_y1, + uint8* dst_sobely, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_y0 - mov esi, [esp + 4 + 8] // src_y1 + mov eax, [esp + 4 + 4] // src_y0 + mov esi, [esp + 4 + 8] // src_y1 mov edx, [esp + 4 + 12] // dst_sobely mov ecx, [esp + 4 + 16] // width sub esi, eax @@ -5098,17 +5045,17 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, pxor xmm5, xmm5 // constant 0 convertloop: - movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] - movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] punpcklbw xmm0, xmm5 punpcklbw xmm1, xmm5 psubw xmm0, xmm1 - movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] + movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] punpcklbw xmm1, xmm5 punpcklbw xmm2, xmm5 psubw xmm1, xmm2 - movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] punpcklbw xmm2, xmm5 punpcklbw xmm3, xmm5 @@ -5116,7 +5063,7 @@ paddw xmm0, xmm2 paddw xmm0, xmm1 paddw xmm0, xmm1 - pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
psubw xmm1, xmm0 pmaxsw xmm0, xmm1 packuswb xmm0, xmm0 @@ -5137,36 +5084,37 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, // R = Sobel // G = Sobel // B = Sobel -__declspec(naked) -void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { +__declspec(naked) void SobelRow_SSE2(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax - pcmpeqb xmm5, xmm5 // alpha 255 - pslld xmm5, 24 // 0xff000000 + pcmpeqb xmm5, xmm5 // alpha 255 + pslld xmm5, 24 // 0xff000000 convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] - paddusb xmm0, xmm1 // sobel = sobelx + sobely - movdqa xmm2, xmm0 // GG - punpcklbw xmm2, xmm0 // First 8 - punpckhbw xmm0, xmm0 // Next 8 - movdqa xmm1, xmm2 // GGGG - punpcklwd xmm1, xmm2 // First 4 - punpckhwd xmm2, xmm2 // Next 4 - por xmm1, xmm5 // GGGA + paddusb xmm0, xmm1 // sobel = sobelx + sobely + movdqa xmm2, xmm0 // GG + punpcklbw xmm2, xmm0 // First 8 + punpckhbw xmm0, xmm0 // Next 8 + movdqa xmm1, xmm2 // GGGG + punpcklwd xmm1, xmm2 // First 4 + punpckhwd xmm2, xmm2 // Next 4 + por xmm1, xmm5 // GGGA por xmm2, xmm5 - movdqa xmm3, xmm0 // GGGG - punpcklwd xmm3, xmm0 // Next 4 - punpckhwd xmm0, xmm0 // Last 4 - por xmm3, xmm5 // GGGA + movdqa xmm3, xmm0 // GGGG + punpcklwd xmm3, xmm0 // Next 4 + punpckhwd xmm0, xmm0 // Last 4 + por xmm3, xmm5 // GGGA por xmm0, xmm5 movdqu [edx], xmm1 movdqu [edx + 16], xmm2 @@ -5184,22 +5132,23 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, #ifdef HAS_SOBELTOPLANEROW_SSE2 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
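The SobelX/SobelY rows above already emit absolute gradient magnitudes, so the kernels from here on differ only in packing: SobelToPlaneRow stores the combined magnitude as a plane, SobelRow replicates it into gray ARGB, and SobelXYRow keeps X and Y in separate channels. The shared combine step plus the ARGB packing, in scalar form:

static inline void SobelPixel(uint8 sobelx, uint8 sobely, uint8* dst_argb) {
  int s = sobelx + sobely;  // paddusb: sobel = sobelx + sobely
  uint8 g = (uint8)(s > 255 ? 255 : s);
  dst_argb[0] = g;    // B
  dst_argb[1] = g;    // G
  dst_argb[2] = g;    // R
  dst_argb[3] = 255;  // alpha forced opaque (the 0xff000000 mask)
}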
-__declspec(naked) -void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width) { +__declspec(naked) void SobelToPlaneRow_SSE2(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_y, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] - paddusb xmm0, xmm1 // sobel = sobelx + sobely + paddusb xmm0, xmm1 // sobel = sobelx + sobely movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 @@ -5217,36 +5166,37 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // R = Sobel X // G = Sobel // B = Sobel Y -__declspec(naked) -void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { +__declspec(naked) void SobelXYRow_SSE2(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax - pcmpeqb xmm5, xmm5 // alpha 255 + pcmpeqb xmm5, xmm5 // alpha 255 convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] movdqa xmm2, xmm0 - paddusb xmm2, xmm1 // sobel = sobelx + sobely - movdqa xmm3, xmm0 // XA + paddusb xmm2, xmm1 // sobel = sobelx + sobely + movdqa xmm3, xmm0 // XA punpcklbw xmm3, xmm5 punpckhbw xmm0, xmm5 - movdqa xmm4, xmm1 // YS + movdqa xmm4, xmm1 // YS punpcklbw xmm4, xmm2 punpckhbw xmm1, xmm2 - movdqa xmm6, xmm4 // YSXA - punpcklwd xmm6, xmm3 // First 4 - punpckhwd xmm4, xmm3 // Next 4 - movdqa xmm7, xmm1 // YSXA - punpcklwd xmm7, xmm0 // Next 4 - punpckhwd xmm1, xmm0 // Last 4 + movdqa xmm6, xmm4 // YSXA + punpcklwd xmm6, xmm3 // First 4 + punpckhwd xmm4, xmm3 // Next 4 + movdqa xmm7, xmm1 // YSXA + punpcklwd xmm7, xmm0 // Next 4 + punpckhwd xmm1, xmm0 // Last 4 movdqu [edx], xmm6 movdqu [edx + 16], xmm4 movdqu [edx + 32], xmm7 @@ -5275,8 +5225,11 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // count is number of averaged pixels to produce. // Does 4 pixels at a time. // This function requires alignment on accumulation buffer pointers. -void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, - int width, int area, uint8* dst, +void CumulativeSumToAverageRow_SSE2(const int32* topleft, + const int32* botleft, + int width, + int area, + uint8* dst, int count) { __asm { mov eax, topleft // eax topleft @@ -5294,18 +5247,18 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, cmp area, 128 // 128 pixels will not overflow 15 bits. 
ja l4 - pshufd xmm5, xmm5, 0 // area - pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 + pshufd xmm5, xmm5, 0 // area + pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 psrld xmm6, 16 cvtdq2ps xmm6, xmm6 - addps xmm5, xmm6 // (65536.0 + area - 1) - mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area - cvtps2dq xmm5, xmm5 // 0.16 fixed point - packssdw xmm5, xmm5 // 16 bit shorts + addps xmm5, xmm6 // (65536.0 + area - 1) + mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area + cvtps2dq xmm5, xmm5 // 0.16 fixed point + packssdw xmm5, xmm5 // 16 bit shorts // 4 pixel loop small blocks. s4: - // top left + // top left movdqu xmm0, [eax] movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] @@ -5347,7 +5300,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, // 4 pixel loop l4: - // top left + // top left movdqu xmm0, [eax] movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] @@ -5373,7 +5326,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, paddd xmm3, [esi + edx * 4 + 48] lea esi, [esi + 64] - cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area + cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area cvtdq2ps xmm1, xmm1 mulps xmm0, xmm4 mulps xmm1, xmm4 @@ -5422,8 +5375,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 // Creates a table of cumulative sums where each value is a sum of all values // above and to the left of the value. -void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width) { +void ComputeCumulativeSumRow_SSE2(const uint8* row, + int32* cumsum, + const int32* previous_cumsum, + int width) { __asm { mov eax, row mov edx, cumsum @@ -5505,10 +5460,11 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, #ifdef HAS_ARGBAFFINEROW_SSE2 // Copy ARGB pixels from source image with slope to a row of destination. -__declspec(naked) -LIBYUV_API -void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width) { +__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb, + int src_argb_stride, + uint8* dst_argb, + const float* uv_dudv, + int width) { __asm { push esi push edi @@ -5519,7 +5475,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, movq xmm2, qword ptr [ecx] // uv movq xmm7, qword ptr [ecx + 8] // dudv mov ecx, [esp + 28] // width - shl esi, 16 // 4, stride + shl esi, 16 // 4, stride add esi, 4 movd xmm5, esi sub ecx, 4 @@ -5528,37 +5484,37 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, // setup for 4 pixel loop pshufd xmm7, xmm7, 0x44 // dup dudv pshufd xmm5, xmm5, 0 // dup 4, stride - movdqa xmm0, xmm2 // x0, y0, x1, y1 + movdqa xmm0, xmm2 // x0, y0, x1, y1 addps xmm0, xmm7 movlhps xmm2, xmm0 movdqa xmm4, xmm7 - addps xmm4, xmm4 // dudv *= 2 - movdqa xmm3, xmm2 // x2, y2, x3, y3 + addps xmm4, xmm4 // dudv *= 2 + movdqa xmm3, xmm2 // x2, y2, x3, y3 addps xmm3, xmm4 - addps xmm4, xmm4 // dudv *= 4 + addps xmm4, xmm4 // dudv *= 4 // 4 pixel loop l4: - cvttps2dq xmm0, xmm2 // x, y float to int first 2 - cvttps2dq xmm1, xmm3 // x, y float to int next 2 - packssdw xmm0, xmm1 // x, y as 8 shorts - pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. + cvttps2dq xmm0, xmm2 // x, y float to int first 2 + cvttps2dq xmm1, xmm3 // x, y float to int next 2 + packssdw xmm0, xmm1 // x, y as 8 shorts + pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. 
movd esi, xmm0 pshufd xmm0, xmm0, 0x39 // shift right movd edi, xmm0 pshufd xmm0, xmm0, 0x39 // shift right movd xmm1, [eax + esi] // read pixel 0 movd xmm6, [eax + edi] // read pixel 1 - punpckldq xmm1, xmm6 // combine pixel 0 and 1 - addps xmm2, xmm4 // x, y += dx, dy first 2 + punpckldq xmm1, xmm6 // combine pixel 0 and 1 + addps xmm2, xmm4 // x, y += dx, dy first 2 movq qword ptr [edx], xmm1 movd esi, xmm0 pshufd xmm0, xmm0, 0x39 // shift right movd edi, xmm0 movd xmm6, [eax + esi] // read pixel 2 movd xmm0, [eax + edi] // read pixel 3 - punpckldq xmm6, xmm0 // combine pixel 2 and 3 - addps xmm3, xmm4 // x, y += dx, dy next 2 + punpckldq xmm6, xmm0 // combine pixel 2 and 3 + addps xmm3, xmm4 // x, y += dx, dy next 2 movq qword ptr 8[edx], xmm6 lea edx, [edx + 16] sub ecx, 4 @@ -5570,10 +5526,10 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, // 1 pixel loop l1: - cvttps2dq xmm0, xmm2 // x, y float to int - packssdw xmm0, xmm0 // x, y as shorts - pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride - addps xmm2, xmm7 // x, y += dx, dy + cvttps2dq xmm0, xmm2 // x, y float to int + packssdw xmm0, xmm0 // x, y as shorts + pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride + addps xmm2, xmm7 // x, y += dx, dy movd esi, xmm0 movd xmm0, [eax + esi] // copy a pixel movd [edx], xmm0 @@ -5590,15 +5546,16 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, #ifdef HAS_INTERPOLATEROW_AVX2 // Bilinear filter 32x2 -> 32x1 -__declspec(naked) -void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +__declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr, + const uint8* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) @@ -5607,7 +5564,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, je xloop100 // 0 / 256. Blend 100 / 0. sub edi, esi cmp eax, 128 - je xloop50 // 128 /256 is 0.50. Blend 50 / 50. + je xloop50 // 128 /256 is 0.50. Blend 50 / 50. vmovd xmm0, eax // high fraction 0..255 neg eax @@ -5634,14 +5591,14 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, vpaddw ymm0, ymm0, ymm4 vpsrlw ymm1, ymm1, 8 vpsrlw ymm0, ymm0, 8 - vpackuswb ymm0, ymm0, ymm1 // unmutates + vpackuswb ymm0, ymm0, ymm1 // unmutates vmovdqu [esi + edi], ymm0 lea esi, [esi + 32] sub ecx, 32 jg xloop jmp xloop99 - // Blend 50 / 50. + // Blend 50 / 50. xloop50: vmovdqu ymm0, [esi] vpavgb ymm0, ymm0, [esi + edx] @@ -5651,7 +5608,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, jg xloop50 jmp xloop99 - // Blend 100 / 0 - Copy row unchanged. + // Blend 100 / 0 - Copy row unchanged. xloop100: rep movsb @@ -5666,16 +5623,17 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, // Bilinear filter 16x2 -> 16x1 // TODO(fbarchard): Consider allowing 256 using memcpy. 
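Both InterpolateRow variants blend two rows with an 8-bit fraction and special-case the endpoints: fraction 0 becomes a rep movsb copy and 128 becomes a pavgb average. Ignoring the rounding bias the SIMD code folds in, the per-byte math is:

static inline uint8 InterpolatePixel(uint8 s0, uint8 s1, int f) {
  // f is source_y_fraction, 0..255; s0 is the top row, s1 one stride below.
  return (uint8)((s0 * (256 - f) + s1 * f) >> 8);
}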
-__declspec(naked) -void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +__declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr, + const uint8* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) @@ -5684,7 +5642,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, cmp eax, 0 je xloop100 // 0 /256. Blend 100 / 0. cmp eax, 128 - je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. + je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. movd xmm0, eax // high fraction 0..255 neg eax @@ -5703,7 +5661,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, movdqu xmm1, xmm0 punpcklbw xmm0, xmm2 punpckhbw xmm1, xmm2 - psubb xmm0, xmm4 // bias image by -128 + psubb xmm0, xmm4 // bias image by -128 psubb xmm1, xmm4 movdqa xmm2, xmm5 movdqa xmm3, xmm5 @@ -5747,15 +5705,16 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -__declspec(naked) -void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // shuffler +__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8* src_argb, + uint8* dst_argb, + const uint8* shuffler, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler movdqu xmm5, [ecx] - mov ecx, [esp + 16] // width + mov ecx, [esp + 16] // width wloop: movdqu xmm0, [eax] @@ -5773,15 +5732,16 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, } #ifdef HAS_ARGBSHUFFLEROW_AVX2 -__declspec(naked) -void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { +__declspec(naked) void ARGBShuffleRow_AVX2(const uint8* src_argb, + uint8* dst_argb, + const uint8* shuffler, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // shuffler - vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. - mov ecx, [esp + 16] // width + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler + vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. 
+ mov ecx, [esp + 16] // width wloop: vmovdqu ymm0, [eax] @@ -5801,19 +5761,20 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, } #endif // HAS_ARGBSHUFFLEROW_AVX2 -__declspec(naked) -void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { +__declspec(naked) void ARGBShuffleRow_SSE2(const uint8* src_argb, + uint8* dst_argb, + const uint8* shuffler, + int width) { __asm { push ebx push esi - mov eax, [esp + 8 + 4] // src_argb - mov edx, [esp + 8 + 8] // dst_argb - mov esi, [esp + 8 + 12] // shuffler - mov ecx, [esp + 8 + 16] // width + mov eax, [esp + 8 + 4] // src_argb + mov edx, [esp + 8 + 8] // dst_argb + mov esi, [esp + 8 + 12] // shuffler + mov ecx, [esp + 8 + 16] // width pxor xmm5, xmm5 - mov ebx, [esi] // shuffler + mov ebx, [esi] // shuffler cmp ebx, 0x03000102 je shuf_3012 cmp ebx, 0x00010203 je shuf_0123 cmp ebx, 0x00030201 je shuf_0321 cmp ebx, 0x02010003 je shuf_2103 - // TODO(fbarchard): Use one source pointer and 3 offsets. + // TODO(fbarchard): Use one source pointer and 3 offsets. shuf_any1: movzx ebx, byte ptr [esi] movzx ebx, byte ptr [eax + ebx] @@ -5849,7 +5810,7 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, movdqa xmm1, xmm0 punpcklbw xmm0, xmm5 punpckhbw xmm1, xmm5 - pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB + pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB pshuflw xmm0, xmm0, 01Bh pshufhw xmm1, xmm1, 01Bh pshuflw xmm1, xmm1, 01Bh @@ -5866,7 +5827,7 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, movdqa xmm1, xmm0 punpcklbw xmm0, xmm5 punpckhbw xmm1, xmm5 - pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB + pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB pshuflw xmm0, xmm0, 039h pshufhw xmm1, xmm1, 039h pshuflw xmm1, xmm1, 039h @@ -5883,7 +5844,7 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, movdqa xmm1, xmm0 punpcklbw xmm0, xmm5 punpckhbw xmm1, xmm5 - pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA + pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA pshuflw xmm0, xmm0, 093h pshufhw xmm1, xmm1, 093h pshuflw xmm1, xmm1, 093h @@ -5900,7 +5861,7 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, movdqa xmm1, xmm0 punpcklbw xmm0, xmm5 punpckhbw xmm1, xmm5 - pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB + pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB pshuflw xmm0, xmm0, 0C6h pshufhw xmm1, xmm1, 0C6h pshuflw xmm1, xmm1, 0C6h @@ -5923,30 +5884,30 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, // UYVY - Macro-pixel = 2 image pixels // U0Y0V0Y1 -__declspec(naked) -void I422ToYUY2Row_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { +__declspec(naked) void I422ToYUY2Row_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u - mov edx, [esp + 8 + 12] // src_v - mov edi, [esp + 8 + 16] // dst_frame - mov ecx, [esp + 8 + 20] // width + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width sub edx, esi convertloop: - movq xmm2, qword ptr [esi] // U - movq xmm3, qword ptr [esi + edx] // V
lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV - movdqu xmm0, [eax] // Y + punpcklbw xmm2, xmm3 // UV + movdqu xmm0, [eax] // Y lea eax, [eax + 16] movdqa xmm1, xmm0 - punpcklbw xmm0, xmm2 // YUYV + punpcklbw xmm0, xmm2 // YUYV punpckhbw xmm1, xmm2 movdqu [edi], xmm0 movdqu [edi + 16], xmm1 @@ -5960,30 +5921,30 @@ void I422ToYUY2Row_SSE2(const uint8* src_y, } } -__declspec(naked) -void I422ToUYVYRow_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { +__declspec(naked) void I422ToUYVYRow_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u - mov edx, [esp + 8 + 12] // src_v - mov edi, [esp + 8 + 16] // dst_frame - mov ecx, [esp + 8 + 20] // width + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width sub edx, esi convertloop: - movq xmm2, qword ptr [esi] // U - movq xmm3, qword ptr [esi + edx] // V + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV - movdqu xmm0, [eax] // Y + punpcklbw xmm2, xmm3 // UV + movdqu xmm0, [eax] // Y movdqa xmm1, xmm2 lea eax, [eax + 16] - punpcklbw xmm1, xmm0 // UYVY + punpcklbw xmm1, xmm0 // UYVY punpckhbw xmm2, xmm0 movdqu [edi], xmm1 movdqu [edi + 16], xmm2 @@ -5998,22 +5959,22 @@ void I422ToUYVYRow_SSE2(const uint8* src_y, } #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 -__declspec(naked) -void ARGBPolynomialRow_SSE2(const uint8* src_argb, - uint8* dst_argb, const float* poly, - int width) { +__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb, + uint8* dst_argb, + const float* poly, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] /* src_argb */ - mov edx, [esp + 4 + 8] /* dst_argb */ - mov esi, [esp + 4 + 12] /* poly */ - mov ecx, [esp + 4 + 16] /* width */ + mov eax, [esp + 4 + 4] /* src_argb */ + mov edx, [esp + 4 + 8] /* dst_argb */ + mov esi, [esp + 4 + 12] /* poly */ + mov ecx, [esp + 4 + 16] /* width */ pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. // 2 pixel loop. convertloop: -// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel -// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel + // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel + // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel movq xmm0, qword ptr [eax] // BGRABGRA lea eax, [eax + 8] punpcklbw xmm0, xmm3 @@ -6057,25 +6018,25 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb, #endif // HAS_ARGBPOLYNOMIALROW_SSE2 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 -__declspec(naked) -void ARGBPolynomialRow_AVX2(const uint8* src_argb, - uint8* dst_argb, const float* poly, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* poly */ - vbroadcastf128 ymm4, [ecx] // C0 +__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8* src_argb, + uint8* dst_argb, + const float* poly, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* poly */ + vbroadcastf128 ymm4, [ecx] // C0 vbroadcastf128 ymm5, [ecx + 16] // C1 vbroadcastf128 ymm6, [ecx + 32] // C2 vbroadcastf128 ymm7, [ecx + 48] // C3 - mov ecx, [esp + 16] /* width */ + mov ecx, [esp + 16] /* width */ // 2 pixel loop.
convertloop: vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels lea eax, [eax + 8] - vcvtdq2ps ymm0, ymm0 // X 8 floats + vcvtdq2ps ymm0, ymm0 // X 8 floats vmulps ymm2, ymm0, ymm0 // X * X vmulps ymm3, ymm0, ymm7 // C3 * X vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X @@ -6095,16 +6056,125 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, } #endif // HAS_ARGBPOLYNOMIALROW_AVX2 +#ifdef HAS_HALFFLOATROW_SSE2 +static float kExpBias = 1.9259299444e-34f; +__declspec(naked) void HalfFloatRow_SSE2(const uint16* src, + uint16* dst, + float scale, + int width) { + __asm { + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ + movd xmm4, dword ptr [esp + 12] /* scale */ + mov ecx, [esp + 16] /* width */ + mulss xmm4, kExpBias + pshufd xmm4, xmm4, 0 + pxor xmm5, xmm5 + sub edx, eax + + // 8 pixel loop. + convertloop: + movdqu xmm2, xmmword ptr [eax] // 8 shorts + add eax, 16 + movdqa xmm3, xmm2 + punpcklwd xmm2, xmm5 + cvtdq2ps xmm2, xmm2 // convert 8 ints to floats + punpckhwd xmm3, xmm5 + cvtdq2ps xmm3, xmm3 + mulps xmm2, xmm4 + mulps xmm3, xmm4 + psrld xmm2, 13 + psrld xmm3, 13 + packssdw xmm2, xmm3 + movdqu [eax + edx - 16], xmm2 + sub ecx, 8 + jg convertloop + ret + } +} +#endif // HAS_HALFFLOATROW_SSE2 + +#ifdef HAS_HALFFLOATROW_AVX2 +__declspec(naked) void HalfFloatRow_AVX2(const uint16* src, + uint16* dst, + float scale, + int width) { + __asm { + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ + movd xmm4, dword ptr [esp + 12] /* scale */ + mov ecx, [esp + 16] /* width */ + + vmulss xmm4, xmm4, kExpBias + vbroadcastss ymm4, xmm4 + vpxor ymm5, ymm5, ymm5 + sub edx, eax + + // 16 pixel loop. + convertloop: + vmovdqu ymm2, [eax] // 16 shorts + add eax, 32 + vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints + vpunpcklwd ymm2, ymm2, ymm5 + vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats + vcvtdq2ps ymm2, ymm2 + vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range. + vmulps ymm2, ymm2, ymm4 + vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate + vpsrld ymm2, ymm2, 13 + vpackssdw ymm2, ymm2, ymm3 + vmovdqu [eax + edx - 32], ymm2 + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_HALFFLOATROW_AVX2 + +#ifdef HAS_HALFFLOATROW_F16C +__declspec(naked) void HalfFloatRow_F16C(const uint16* src, + uint16* dst, + float scale, + int width) { + __asm { + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ + vbroadcastss ymm4, [esp + 12] /* scale */ + mov ecx, [esp + 16] /* width */ + sub edx, eax + + // 16 pixel loop. + convertloop: + vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints + vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts + add eax, 32 + vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats + vcvtdq2ps ymm3, ymm3 + vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1 + vmulps ymm3, ymm3, ymm4 + vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate + vcvtps2ph xmm3, ymm3, 3 + vmovdqu [eax + edx + 32], xmm2 + vmovdqu [eax + edx + 32 + 16], xmm3 + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_HALFFLOATROW_F16C + #ifdef HAS_ARGBCOLORTABLEROW_X86 // Tranform ARGB pixels with color table. 
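Before the color-table kernels, a note on the HalfFloatRow functions above, which are new in this revision: the SSE2/AVX2 paths never run a real float16 conversion. kExpBias is 2^-112, so once (v * scale) has been multiplied by it, the 32-bit float's biased exponent already equals the half-float's, and a plain right shift by 13 drops the extra mantissa bits (truncating, as the comments say); the F16C path gets the same result from vcvtps2ph. A scalar model of the bit trick (illustrative name; uses memcpy from string.h for the type-pun):

static inline uint16 HalfFloatFromShort(uint16 v, float scale) {
  float f = (float)v * scale * 1.9259299444e-34f;  // * 2^-112 (kExpBias)
  uint32 bits;
  memcpy(&bits, &f, 4);
  return (uint16)(bits >> 13);  // exponent and top 10 mantissa bits line up
}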
-__declspec(naked) -void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, - int width) { +__declspec(naked) void ARGBColorTableRow_X86(uint8* dst_argb, + const uint8* table_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ - mov ecx, [esp + 4 + 12] /* width */ + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ // 1 pixel loop. convertloop: @@ -6131,13 +6201,14 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, #ifdef HAS_RGBCOLORTABLEROW_X86 // Tranform RGB pixels with color table. -__declspec(naked) -void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { +__declspec(naked) void RGBColorTableRow_X86(uint8* dst_argb, + const uint8* table_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ - mov ecx, [esp + 4 + 12] /* width */ + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ // 1 pixel loop. convertloop: @@ -6162,27 +6233,28 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 // Tranform RGB pixels with luma table. -__declspec(naked) -void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - int width, - const uint8* luma, uint32 lumacoeff) { +__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, + uint8* dst_argb, + int width, + const uint8* luma, + uint32 lumacoeff) { __asm { push esi push edi - mov eax, [esp + 8 + 4] /* src_argb */ - mov edi, [esp + 8 + 8] /* dst_argb */ - mov ecx, [esp + 8 + 12] /* width */ + mov eax, [esp + 8 + 4] /* src_argb */ + mov edi, [esp + 8 + 8] /* dst_argb */ + mov ecx, [esp + 8 + 12] /* width */ movd xmm2, dword ptr [esp + 8 + 16] // luma table movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff pshufd xmm2, xmm2, 0 pshufd xmm3, xmm3, 0 - pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 + pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 psllw xmm4, 8 pxor xmm5, xmm5 // 4 pixel loop. convertloop: - movdqu xmm0, xmmword ptr [eax] // generate luma ptr + movdqu xmm0, xmmword ptr [eax] // generate luma ptr pmaddubsw xmm0, xmm3 phaddw xmm0, xmm0 pand xmm0, xmm4 // mask out low bits diff --git a/files/source/scale.cc b/files/source/scale.cc index 36e3fe52..a5c7f7ad 100644 --- a/files/source/scale.cc +++ b/files/source/scale.cc @@ -33,17 +33,24 @@ static __inline int Abs(int v) { // This is an optimized version for scaling down a plane to 1/2 of // its original size. -static void ScalePlaneDown2(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +static void ScalePlaneDown2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8* src_ptr, + uint8* dst_ptr, enum FilterMode filtering) { int y; void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) = - filtering == kFilterNone ? ScaleRowDown2_C : - (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C); + filtering == kFilterNone ? ScaleRowDown2_C : (filtering == kFilterLinear + ? ScaleRowDown2Linear_C + : ScaleRowDown2Box_C); int row_stride = src_stride << 1; + (void)src_width; + (void)src_height; if (!filtering) { src_ptr += src_stride; // Point to odd rows. 
src_stride = 0; @@ -51,37 +58,47 @@ static void ScalePlaneDown2(int src_width, int src_height, #if defined(HAS_SCALEROWDOWN2_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON : - (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON : - ScaleRowDown2Box_Any_NEON); + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_NEON + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON + : ScaleRowDown2Box_Any_NEON); if (IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON : - (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON : - ScaleRowDown2Box_NEON); + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_NEON + : ScaleRowDown2Box_NEON); } } #endif #if defined(HAS_SCALEROWDOWN2_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 : - ScaleRowDown2Box_Any_SSSE3); + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_SSSE3 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 + : ScaleRowDown2Box_Any_SSSE3); if (IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 : - ScaleRowDown2Box_SSSE3); + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_SSSE3 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 + : ScaleRowDown2Box_SSSE3); } } #endif #if defined(HAS_SCALEROWDOWN2_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 : - ScaleRowDown2Box_Any_AVX2); + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_AVX2 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 + : ScaleRowDown2Box_Any_AVX2); if (IS_ALIGNED(dst_width, 32)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 : - ScaleRowDown2Box_AVX2); + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_AVX2 + : ScaleRowDown2Box_AVX2); } } #endif @@ -89,8 +106,22 @@ static void ScalePlaneDown2(int src_width, int src_height, if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - ScaleRowDown2 = filtering ? - ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2; + ScaleRowDown2 = filtering ? ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2; + } +#endif +#if defined(HAS_SCALEROWDOWN2_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_MSA + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA + : ScaleRowDown2Box_Any_MSA); + if (IS_ALIGNED(dst_width, 32)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA + : (filtering == kFilterLinear + ? 
ScaleRowDown2Linear_MSA + : ScaleRowDown2Box_MSA); + } } #endif @@ -105,18 +136,25 @@ static void ScalePlaneDown2(int src_width, int src_height, } } -static void ScalePlaneDown2_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +static void ScalePlaneDown2_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16* src_ptr, + uint16* dst_ptr, enum FilterMode filtering) { int y; void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride, uint16* dst_ptr, int dst_width) = - filtering == kFilterNone ? ScaleRowDown2_16_C : - (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C : - ScaleRowDown2Box_16_C); + filtering == kFilterNone + ? ScaleRowDown2_16_C + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C + : ScaleRowDown2Box_16_C); int row_stride = src_stride << 1; + (void)src_width; + (void)src_height; if (!filtering) { src_ptr += src_stride; // Point to odd rows. src_stride = 0; @@ -124,23 +162,25 @@ static void ScalePlaneDown2_16(int src_width, int src_height, #if defined(HAS_SCALEROWDOWN2_16_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON : - ScaleRowDown2_16_NEON; + ScaleRowDown2 = + filtering ? ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON; } #endif #if defined(HAS_SCALEROWDOWN2_16_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 : - ScaleRowDown2Box_16_SSE2); + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_16_SSE2 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 + : ScaleRowDown2Box_16_SSE2); } #endif #if defined(HAS_SCALEROWDOWN2_16_DSPR2) if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - ScaleRowDown2 = filtering ? - ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2; + ScaleRowDown2 = + filtering ? ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2; } #endif @@ -159,24 +199,30 @@ static void ScalePlaneDown2_16(int src_width, int src_height, // This is an optimized version for scaling down a plane to 1/4 of // its original size. -static void ScalePlaneDown4(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +static void ScalePlaneDown4(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8* src_ptr, + uint8* dst_ptr, enum FilterMode filtering) { int y; void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) = filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C; int row_stride = src_stride << 2; + (void)src_width; + (void)src_height; if (!filtering) { src_ptr += src_stride * 2; // Point to row 2. src_stride = 0; } #if defined(HAS_SCALEROWDOWN4_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowDown4 = filtering ? - ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON; + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON; if (IS_ALIGNED(dst_width, 8)) { ScaleRowDown4 = filtering ? 
ScaleRowDown4Box_NEON : ScaleRowDown4_NEON; } @@ -184,8 +230,8 @@ static void ScalePlaneDown4(int src_width, int src_height, #endif #if defined(HAS_SCALEROWDOWN4_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowDown4 = filtering ? - ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3; + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3; if (IS_ALIGNED(dst_width, 8)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3; } @@ -193,8 +239,8 @@ static void ScalePlaneDown4(int src_width, int src_height, #endif #if defined(HAS_SCALEROWDOWN4_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowDown4 = filtering ? - ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2; + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2; if (IS_ALIGNED(dst_width, 16)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2; } @@ -204,8 +250,16 @@ static void ScalePlaneDown4(int src_width, int src_height, if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - ScaleRowDown4 = filtering ? - ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2; + ScaleRowDown4 = filtering ? ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2; + } +#endif +#if defined(HAS_SCALEROWDOWN4_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA; + } } #endif @@ -219,38 +273,44 @@ static void ScalePlaneDown4(int src_width, int src_height, } } -static void ScalePlaneDown4_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +static void ScalePlaneDown4_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16* src_ptr, + uint16* dst_ptr, enum FilterMode filtering) { int y; void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride, uint16* dst_ptr, int dst_width) = filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C; int row_stride = src_stride << 2; + (void)src_width; + (void)src_height; if (!filtering) { src_ptr += src_stride * 2; // Point to row 2. src_stride = 0; } #if defined(HAS_SCALEROWDOWN4_16_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON : - ScaleRowDown4_16_NEON; + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_16_NEON : ScaleRowDown4_16_NEON; } #endif #if defined(HAS_SCALEROWDOWN4_16_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 : - ScaleRowDown4_16_SSE2; + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2; } #endif #if defined(HAS_SCALEROWDOWN4_16_DSPR2) if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - ScaleRowDown4 = filtering ? - ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2; + ScaleRowDown4 = + filtering ? 
ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2; } #endif @@ -265,11 +325,14 @@ static void ScalePlaneDown4_16(int src_width, int src_height, } // Scale plane down, 3/4 - -static void ScalePlaneDown34(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +static void ScalePlaneDown34(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8* src_ptr, + uint8* dst_ptr, enum FilterMode filtering) { int y; void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride, @@ -277,6 +340,8 @@ static void ScalePlaneDown34(int src_width, int src_height, void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + (void)src_width; + (void)src_height; assert(dst_width % 3 == 0); if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_C; @@ -346,8 +411,7 @@ static void ScalePlaneDown34(int src_width, int src_height, ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride; dst_ptr += dst_stride; - ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, - dst_ptr, dst_width); + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 2; dst_ptr += dst_stride; } @@ -363,10 +427,14 @@ static void ScalePlaneDown34(int src_width, int src_height, } } -static void ScalePlaneDown34_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +static void ScalePlaneDown34_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16* src_ptr, + uint16* dst_ptr, enum FilterMode filtering) { int y; void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride, @@ -374,6 +442,8 @@ static void ScalePlaneDown34_16(int src_width, int src_height, void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride, uint16* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + (void)src_width; + (void)src_height; assert(dst_width % 3 == 0); if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_16_C; @@ -425,8 +495,7 @@ static void ScalePlaneDown34_16(int src_width, int src_height, ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride; dst_ptr += dst_stride; - ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, - dst_ptr, dst_width); + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 2; dst_ptr += dst_stride; } @@ -442,7 +511,6 @@ static void ScalePlaneDown34_16(int src_width, int src_height, } } - // Scale plane, 3/8 // This is an optimized version for scaling down a plane to 3/8 // of its original size. 
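// Why the (void)src_width / (void)src_height casts added throughout these
// scalers: the ScalePlaneDown* functions share one signature, so several of
// them receive dimensions they never read. Casting an argument to void marks
// it as intentionally unused, which lets the build stay clean under
// -Wunused-parameter instead of suppressing that warning globally. A minimal
// illustration of the idiom (hypothetical function, not part of libyuv):

static void HalveRow(int src_width, const uint8* src, uint8* dst,
                     int dst_width) {
  int i;
  (void)src_width;  // implied by dst_width; kept for a uniform signature
  for (i = 0; i < dst_width; ++i) {
    dst[i] = src[i * 2];
  }
}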
@@ -458,10 +526,14 @@ static void ScalePlaneDown34_16(int src_width, int src_height, // ggghhhii // Boxes are 3x3, 2x3, 3x2 and 2x2 -static void ScalePlaneDown38(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +static void ScalePlaneDown38(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8* src_ptr, + uint8* dst_ptr, enum FilterMode filtering) { int y; void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride, @@ -470,6 +542,8 @@ static void ScalePlaneDown38(int src_width, int src_height, uint8* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; assert(dst_width % 3 == 0); + (void)src_width; + (void)src_height; if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_C; ScaleRowDown38_2 = ScaleRowDown38_C; @@ -530,6 +604,26 @@ static void ScalePlaneDown38(int src_width, int src_height, } } #endif +#if defined(HAS_SCALEROWDOWN38_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_Any_MSA; + ScaleRowDown38_2 = ScaleRowDown38_Any_MSA; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_MSA; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_MSA; + } + if (dst_width % 12 == 0) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_MSA; + ScaleRowDown38_2 = ScaleRowDown38_MSA; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_MSA; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_MSA; + } + } + } +#endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); @@ -554,10 +648,14 @@ static void ScalePlaneDown38(int src_width, int src_height, } } -static void ScalePlaneDown38_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +static void ScalePlaneDown38_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16* src_ptr, + uint16* dst_ptr, enum FilterMode filtering) { int y; void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride, @@ -565,6 +663,8 @@ static void ScalePlaneDown38_16(int src_width, int src_height, void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride, uint16* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; + (void)src_width; + (void)src_height; assert(dst_width % 3 == 0); if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_16_C; @@ -654,8 +754,12 @@ static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) { return sum; } -static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx, - const uint16* src_ptr, uint8* dst_ptr) { +static void ScaleAddCols2_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16* src_ptr, + uint8* dst_ptr) { int i; int scaletbl[2]; int minboxwidth = dx >> 16; @@ -666,13 +770,18 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx, int ix = x >> 16; x += dx; boxwidth = MIN1((x >> 16) - ix); - *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * - scaletbl[boxwidth - minboxwidth] >> 16; + *dst_ptr++ = + SumPixels(boxwidth, src_ptr + ix) * scaletbl[boxwidth - minboxwidth] >> + 16; } } -static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx, - const uint32* src_ptr, uint16* dst_ptr) { +static void ScaleAddCols2_16_C(int dst_width, + int boxheight, + int x, + int dx, + const uint32* src_ptr, + uint16* dst_ptr) { int i; int scaletbl[2]; int minboxwidth = dx >> 16; @@ -684,12 +793,17 @@ static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx, x += dx; boxwidth = MIN1((x >> 16) - ix); *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) * - scaletbl[boxwidth - minboxwidth] >> 16; + scaletbl[boxwidth - minboxwidth] >> + 16; } } -static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int, - const uint16* src_ptr, uint8* dst_ptr) { +static void ScaleAddCols0_C(int dst_width, + int boxheight, + int x, + int, + const uint16* src_ptr, + uint8* dst_ptr) { int scaleval = 65536 / boxheight; int i; src_ptr += (x >> 16); @@ -698,8 +812,12 @@ static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int, } } -static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, - const uint16* src_ptr, uint8* dst_ptr) { +static void ScaleAddCols1_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16* src_ptr, + uint8* dst_ptr) { int boxwidth = MIN1(dx >> 16); int scaleval = 65536 / (boxwidth * boxheight); int i; @@ -710,8 +828,12 @@ static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, } } -static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx, - const uint32* src_ptr, uint16* dst_ptr) { +static void ScaleAddCols1_16_C(int dst_width, + int boxheight, + int x, + int dx, + const uint32* src_ptr, + uint16* dst_ptr) { int boxwidth = MIN1(dx >> 16); int scaleval = 65536 / (boxwidth * boxheight); int i; @@ -728,10 +850,14 @@ static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx, // one pixel of destination using fixed point (16.16) to step // through source, sampling a box of pixel with simple // averaging. -static void ScalePlaneBox(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr) { +static void ScalePlaneBox(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8* src_ptr, + uint8* dst_ptr) { int j, k; // Initial source x/y coordinate and step values as 16.16 fixed point. 
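// In this 16.16 format 65536 represents 1.0, so dx is the source step per
// destination pixel. Worked numbers for the variable box widths that
// ScaleAddCols2_C handles above, scaling 7 columns down to 2:
//   dx = (7 << 16) / 2 = 229376, minboxwidth = dx >> 16 = 3
//   output 0: x runs 0 -> 229376, box covers source columns [0, 3), width 3
//   output 1: x runs 229376 -> 458752, box covers columns [3, 7), width 4
// scaletbl[] therefore caches 65536 / (3 * boxheight) and
// 65536 / (4 * boxheight), so each output pixel costs one multiply and shift.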
int x = 0; @@ -739,16 +865,16 @@ static void ScalePlaneBox(int src_width, int src_height, int dx = 0; int dy = 0; const int max_y = (src_height << 16); - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, + &dx, &dy); src_width = Abs(src_width); { // Allocate a row buffer of uint16. align_buffer_64(row16, src_width * 2); void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, - const uint16* src_ptr, uint8* dst_ptr) = - (dx & 0xffff) ? ScaleAddCols2_C: - ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C); + const uint16* src_ptr, uint8* dst_ptr) = + (dx & 0xffff) ? ScaleAddCols2_C + : ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C); void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) = ScaleAddRow_C; #if defined(HAS_SCALEADDROW_SSE2) @@ -775,6 +901,22 @@ static void ScalePlaneBox(int src_width, int src_height, } } #endif +#if defined(HAS_SCALEADDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleAddRow = ScaleAddRow_Any_MSA; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_MSA; + } + } +#endif +#if defined(HAS_SCALEADDROW_DSPR2) + if (TestCpuFlag(kCpuHasDSPR2)) { + ScaleAddRow = ScaleAddRow_Any_DSPR2; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_DSPR2; + } + } +#endif for (j = 0; j < dst_height; ++j) { int boxheight; @@ -787,7 +929,7 @@ static void ScalePlaneBox(int src_width, int src_height, boxheight = MIN1((y >> 16) - iy); memset(row16, 0, src_width * 2); for (k = 0; k < boxheight; ++k) { - ScaleAddRow(src, (uint16 *)(row16), src_width); + ScaleAddRow(src, (uint16*)(row16), src_width); src += src_stride; } ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr); @@ -797,10 +939,14 @@ static void ScalePlaneBox(int src_width, int src_height, } } -static void ScalePlaneBox_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr) { +static void ScalePlaneBox_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16* src_ptr, + uint16* dst_ptr) { int j, k; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -808,15 +954,15 @@ static void ScalePlaneBox_16(int src_width, int src_height, int dx = 0; int dy = 0; const int max_y = (src_height << 16); - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, + &dx, &dy); src_width = Abs(src_width); { // Allocate a row buffer of uint32. align_buffer_64(row32, src_width * 4); void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, - const uint32* src_ptr, uint16* dst_ptr) = - (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C; + const uint32* src_ptr, uint16* dst_ptr) = + (dx & 0xffff) ? 
ScaleAddCols2_16_C : ScaleAddCols1_16_C; void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) = ScaleAddRow_16_C; @@ -837,7 +983,7 @@ static void ScalePlaneBox_16(int src_width, int src_height, boxheight = MIN1((y >> 16) - iy); memset(row32, 0, src_width * 4); for (k = 0; k < boxheight; ++k) { - ScaleAddRow(src, (uint32 *)(row32), src_width); + ScaleAddRow(src, (uint32*)(row32), src_width); src += src_stride; } ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr); @@ -848,10 +994,14 @@ static void ScalePlaneBox_16(int src_width, int src_height, } // Scale plane down with bilinear interpolation. -void ScalePlaneBilinearDown(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +void ScalePlaneBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8* src_ptr, + uint8* dst_ptr, enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -864,14 +1014,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height, const int max_y = (src_height - 1) << 16; int j; - void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) = + void (*ScaleFilterCols)(uint8 * dst_ptr, const uint8* src_ptr, int dst_width, + int x, int dx) = (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C; - void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, - &x, &y, &dx, &dy); + void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); src_width = Abs(src_width); #if defined(HAS_INTERPOLATEROW_SSSE3) @@ -906,7 +1056,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height, } } #endif - +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif #if defined(HAS_SCALEFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { @@ -944,10 +1101,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height, free_aligned_buffer_64(row); } -void ScalePlaneBilinearDown_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +void ScalePlaneBilinearDown_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16* src_ptr, + uint16* dst_ptr, enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -960,14 +1121,14 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height, const int max_y = (src_height - 1) << 16; int j; - void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) = + void (*ScaleFilterCols)(uint16 * dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) = (src_width >= 32768) ? 
ScaleFilterCols64_16_C : ScaleFilterCols_16_C; - void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_16_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, - &x, &y, &dx, &dy); + void (*InterpolateRow)(uint16 * dst_ptr, const uint16* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_16_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); src_width = Abs(src_width); #if defined(HAS_INTERPOLATEROW_16_SSE2) @@ -1011,7 +1172,6 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height, } #endif - #if defined(HAS_SCALEFILTERCOLS_16_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { ScaleFilterCols = ScaleFilterCols_16_SSSE3; @@ -1041,10 +1201,14 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height, } // Scale up down with bilinear interpolation. -void ScalePlaneBilinearUp(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +void ScalePlaneBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8* src_ptr, + uint8* dst_ptr, enum FilterMode filtering) { int j; // Initial source x/y coordinate and step values as 16.16 fixed point. @@ -1053,14 +1217,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height, int dx = 0; int dy = 0; const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; - void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) = + void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleFilterCols)(uint8 * dst_ptr, const uint8* src_ptr, int dst_width, + int x, int dx) = filtering ? ScaleFilterCols_C : ScaleCols_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); src_width = Abs(src_width); #if defined(HAS_INTERPOLATEROW_SSSE3) @@ -1172,10 +1336,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height, } } -void ScalePlaneBilinearUp_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +void ScalePlaneBilinearUp_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16* src_ptr, + uint16* dst_ptr, enum FilterMode filtering) { int j; // Initial source x/y coordinate and step values as 16.16 fixed point. 
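// InterpolateRow is the vertical blend shared by all the bilinear paths
// selected above: each variant fills dst from the row at src_ptr and the row
// one src_stride below it, weighted by source_y_fraction out of 256. A C
// sketch of that contract (rounding details may differ from the shipped
// kernels, which also special-case fractions 0 and 128):

static void InterpolateRow_Sketch(uint8* dst, const uint8* src,
                                  ptrdiff_t stride, int width, int y1_frac) {
  int y0_frac = 256 - y1_frac;
  int i;
  for (i = 0; i < width; ++i) {
    dst[i] = (uint8)((src[i] * y0_frac + src[i + stride] * y1_frac + 128) >> 8);
  }
}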
@@ -1184,14 +1352,14 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, int dx = 0; int dy = 0; const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_16_C; - void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) = + void (*InterpolateRow)(uint16 * dst_ptr, const uint16* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_16_C; + void (*ScaleFilterCols)(uint16 * dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) = filtering ? ScaleFilterCols_16_C : ScaleCols_16_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); src_width = Abs(src_width); #if defined(HAS_INTERPOLATEROW_16_SSE2) @@ -1308,20 +1476,24 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, // of x and dx is the integer part of the source position and // the lower 16 bits are the fixed decimal part. -static void ScalePlaneSimple(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr) { +static void ScalePlaneSimple(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8* src_ptr, + uint8* dst_ptr) { int i; - void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) = ScaleCols_C; + void (*ScaleCols)(uint8 * dst_ptr, const uint8* src_ptr, int dst_width, int x, + int dx) = ScaleCols_C; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; int dx = 0; int dy = 0; - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y, + &dx, &dy); src_width = Abs(src_width); if (src_width * 2 == dst_width && x < 0x8000) { @@ -1340,20 +1512,24 @@ static void ScalePlaneSimple(int src_width, int src_height, } } -static void ScalePlaneSimple_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr) { +static void ScalePlaneSimple_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16* src_ptr, + uint16* dst_ptr) { int i; - void (*ScaleCols)(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) = ScaleCols_16_C; + void (*ScaleCols)(uint16 * dst_ptr, const uint16* src_ptr, int dst_width, + int x, int dx) = ScaleCols_16_C; // Initial source x/y coordinate and step values as 16.16 fixed point. 
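// The horizontal pass in ScalePlaneSimple below is pure point sampling:
// ScaleCols_C copies src[x >> 16] and steps x by dx. The filtered column
// scalers used by the bilinear paths above blend the two neighboring source
// pixels instead, roughly (sketch of one loop iteration; exact rounding is
// per kernel):
//
//   int xi = x >> 16;
//   int xf = x & 0xffff;  /* 16-bit fraction of the source position */
//   int a = src_ptr[xi], b = src_ptr[xi + 1];
//   dst_ptr[j] = (uint8)(a + ((xf * (b - a)) >> 16));
//   x += dx;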
int x = 0; int y = 0; int dx = 0; int dy = 0; - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y, + &dx, &dy); src_width = Abs(src_width); if (src_width * 2 == dst_width && x < 0x8000) { @@ -1366,8 +1542,7 @@ static void ScalePlaneSimple_16(int src_width, int src_height, } for (i = 0; i < dst_height; ++i) { - ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, - dst_width, x, dx); + ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx); dst_ptr += dst_stride; y += dy; } @@ -1377,14 +1552,18 @@ static void ScalePlaneSimple_16(int src_width, int src_height, // This function dispatches to a specialized scaler based on scale factor. LIBYUV_API -void ScalePlane(const uint8* src, int src_stride, - int src_width, int src_height, - uint8* dst, int dst_stride, - int dst_width, int dst_height, +void ScalePlane(const uint8* src, + int src_stride, + int src_width, + int src_height, + uint8* dst, + int dst_stride, + int dst_width, + int dst_height, enum FilterMode filtering) { // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, - dst_width, dst_height, filtering); + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); // Negative height means invert the image. if (src_height < 0) { @@ -1403,46 +1582,42 @@ void ScalePlane(const uint8* src, int src_stride, if (dst_width == src_width && filtering != kFilterBox) { int dy = FixedDiv(src_height, dst_height); // Arbitrary scale vertically, but unscaled horizontally. - ScalePlaneVertical(src_height, - dst_width, dst_height, - src_stride, dst_stride, src, dst, - 0, 0, dy, 1, filtering); + ScalePlaneVertical(src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, 0, 0, dy, 1, filtering); return; } if (dst_width <= Abs(src_width) && dst_height <= src_height) { // Scale down. - if (4 * dst_width == 3 * src_width && - 4 * dst_height == 3 * src_height) { + if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) { // optimized, 3/4 - ScalePlaneDown34(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); + ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); return; } if (2 * dst_width == src_width && 2 * dst_height == src_height) { // optimized, 1/2 - ScalePlaneDown2(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); + ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); return; } // 3/8 rounded up for odd sized chroma height. 
- if (8 * dst_width == 3 * src_width && - dst_height == ((src_height * 3 + 7) / 8)) { + if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { // optimized, 3/8 - ScalePlaneDown38(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); + ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); return; } if (4 * dst_width == src_width && 4 * dst_height == src_height && (filtering == kFilterBox || filtering == kFilterNone)) { // optimized, 1/4 - ScalePlaneDown4(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); + ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); return; } } if (filtering == kFilterBox && dst_height * 2 < src_height) { - ScalePlaneBox(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); + ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); return; } if (filtering && dst_height > src_height) { @@ -1455,19 +1630,23 @@ void ScalePlane(const uint8* src, int src_stride, src_stride, dst_stride, src, dst, filtering); return; } - ScalePlaneSimple(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); } LIBYUV_API -void ScalePlane_16(const uint16* src, int src_stride, - int src_width, int src_height, - uint16* dst, int dst_stride, - int dst_width, int dst_height, - enum FilterMode filtering) { +void ScalePlane_16(const uint16* src, + int src_stride, + int src_width, + int src_height, + uint16* dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering) { // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, - dst_width, dst_height, filtering); + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); // Negative height means invert the image. if (src_height < 0) { @@ -1486,16 +1665,13 @@ void ScalePlane_16(const uint16* src, int src_stride, if (dst_width == src_width) { int dy = FixedDiv(src_height, dst_height); // Arbitrary scale vertically, but unscaled vertically. - ScalePlaneVertical_16(src_height, - dst_width, dst_height, - src_stride, dst_stride, src, dst, - 0, 0, dy, 1, filtering); + ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, 0, 0, dy, 1, filtering); return; } if (dst_width <= Abs(src_width) && dst_height <= src_height) { // Scale down. - if (4 * dst_width == 3 * src_width && - 4 * dst_height == 3 * src_height) { + if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) { // optimized, 3/4 ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); @@ -1508,15 +1684,14 @@ void ScalePlane_16(const uint16* src, int src_stride, return; } // 3/8 rounded up for odd sized chroma height. 
- if (8 * dst_width == 3 * src_width && - dst_height == ((src_height * 3 + 7) / 8)) { + if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { // optimized, 3/8 ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); return; } if (4 * dst_width == src_width && 4 * dst_height == src_height && - filtering != kFilterBilinear) { + filtering != kFilterBilinear) { // optimized, 1/4 ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); @@ -1524,8 +1699,8 @@ void ScalePlane_16(const uint16* src, int src_stride, } } if (filtering == kFilterBox && dst_height * 2 < src_height) { - ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); + ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); return; } if (filtering && dst_height > src_height) { @@ -1538,101 +1713,121 @@ void ScalePlane_16(const uint16* src, int src_stride, src_stride, dst_stride, src, dst, filtering); return; } - ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); + ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); } // Scale an I420 image. // This function in turn calls a scaling function for each plane. LIBYUV_API -int I420Scale(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - int src_width, int src_height, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int dst_width, int dst_height, +int I420Scale(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + int src_width, + int src_height, + uint8* dst_y, + int dst_stride_y, + uint8* dst_u, + int dst_stride_u, + uint8* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, enum FilterMode filtering) { int src_halfwidth = SUBSAMPLE(src_width, 1, 1); int src_halfheight = SUBSAMPLE(src_height, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || - !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } - ScalePlane(src_y, src_stride_y, src_width, src_height, - dst_y, dst_stride_y, dst_width, dst_height, - filtering); - ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, - dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, - filtering); - ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, - dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, - filtering); + ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, + dst_stride_u, dst_halfwidth, dst_halfheight, filtering); + ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, + dst_stride_v, dst_halfwidth, dst_halfheight, filtering); return 0; } LIBYUV_API -int I420Scale_16(const uint16* src_y, int src_stride_y, - const uint16* src_u, int src_stride_u, - const uint16* src_v, int src_stride_v, - int src_width, int src_height, - uint16* dst_y, int 
dst_stride_y, - uint16* dst_u, int dst_stride_u, - uint16* dst_v, int dst_stride_v, - int dst_width, int dst_height, +int I420Scale_16(const uint16* src_y, + int src_stride_y, + const uint16* src_u, + int src_stride_u, + const uint16* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16* dst_y, + int dst_stride_y, + uint16* dst_u, + int dst_stride_u, + uint16* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, enum FilterMode filtering) { int src_halfwidth = SUBSAMPLE(src_width, 1, 1); int src_halfheight = SUBSAMPLE(src_height, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || - !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } - ScalePlane_16(src_y, src_stride_y, src_width, src_height, - dst_y, dst_stride_y, dst_width, dst_height, - filtering); - ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, - dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, - filtering); - ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, - dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, - filtering); + ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, + dst_stride_u, dst_halfwidth, dst_halfheight, filtering); + ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, + dst_stride_v, dst_halfwidth, dst_halfheight, filtering); return 0; } // Deprecated api LIBYUV_API -int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, - int src_stride_y, int src_stride_u, int src_stride_v, - int src_width, int src_height, - uint8* dst_y, uint8* dst_u, uint8* dst_v, - int dst_stride_y, int dst_stride_u, int dst_stride_v, - int dst_width, int dst_height, +int Scale(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int src_width, + int src_height, + uint8* dst_y, + uint8* dst_u, + uint8* dst_v, + int dst_stride_y, + int dst_stride_u, + int dst_stride_v, + int dst_width, + int dst_height, LIBYUV_BOOL interpolate) { - return I420Scale(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - src_width, src_height, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - dst_width, dst_height, - interpolate ? kFilterBox : kFilterNone); + return I420Scale(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_width, src_height, dst_y, dst_stride_y, + dst_u, dst_stride_u, dst_v, dst_stride_v, dst_width, + dst_height, interpolate ? kFilterBox : kFilterNone); } // Deprecated api LIBYUV_API -int ScaleOffset(const uint8* src, int src_width, int src_height, - uint8* dst, int dst_width, int dst_height, int dst_yoffset, +int ScaleOffset(const uint8* src, + int src_width, + int src_height, + uint8* dst, + int dst_width, + int dst_height, + int dst_yoffset, LIBYUV_BOOL interpolate) { // Chroma requires offset to multiple of 2. 
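// (hence dst_yoffset & ~1 just below, so U and V rows stay paired with Y.)
// The packed I420 layout ScaleOffset assumes, worked for a 640x480 source:
// Y plane at offset 0 (640*480 = 307200 bytes), U at offset 307200
// (320*240 = 76800 bytes), V at offset 307200 + 76800 = 384000. The half
// dimensions come from SUBSAMPLE(v, 1, 1) = (v + 1) >> 1, which rounds odd
// sizes up, e.g. a 641x481 image carries 321x241 chroma planes.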
int dst_yoffset_even = dst_yoffset & ~1; @@ -1643,26 +1838,21 @@ int ScaleOffset(const uint8* src, int src_width, int src_height, int aheight = dst_height - dst_yoffset_even * 2; // actual output height const uint8* src_y = src; const uint8* src_u = src + src_width * src_height; - const uint8* src_v = src + src_width * src_height + - src_halfwidth * src_halfheight; + const uint8* src_v = + src + src_width * src_height + src_halfwidth * src_halfheight; uint8* dst_y = dst + dst_yoffset_even * dst_width; - uint8* dst_u = dst + dst_width * dst_height + - (dst_yoffset_even >> 1) * dst_halfwidth; + uint8* dst_u = + dst + dst_width * dst_height + (dst_yoffset_even >> 1) * dst_halfwidth; uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight + (dst_yoffset_even >> 1) * dst_halfwidth; - if (!src || src_width <= 0 || src_height <= 0 || - !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 || + if (!src || src_width <= 0 || src_height <= 0 || !dst || dst_width <= 0 || + dst_height <= 0 || dst_yoffset_even < 0 || dst_yoffset_even >= dst_height) { return -1; } - return I420Scale(src_y, src_width, - src_u, src_halfwidth, - src_v, src_halfwidth, - src_width, src_height, - dst_y, dst_width, - dst_u, dst_halfwidth, - dst_v, dst_halfwidth, - dst_width, aheight, + return I420Scale(src_y, src_width, src_u, src_halfwidth, src_v, src_halfwidth, + src_width, src_height, dst_y, dst_width, dst_u, + dst_halfwidth, dst_v, dst_halfwidth, dst_width, aheight, interpolate ? kFilterBox : kFilterNone); } diff --git a/files/source/scale_any.cc b/files/source/scale_any.cc index ed76a9e4..d64ba7a9 100644 --- a/files/source/scale_any.cc +++ b/files/source/scale_any.cc @@ -19,16 +19,15 @@ extern "C" { #endif // Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols -#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \ - void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \ - int dst_width, int x, int dx) { \ - int n = dst_width & ~MASK; \ - if (n > 0) { \ - TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ - } \ - TERP_C(dst_ptr + n * BPP, src_ptr, \ - dst_width & MASK, x + n * dx, dx); \ - } +#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \ + void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, \ + int dx) { \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ + } \ + TERP_C(dst_ptr + n * BPP, src_ptr, dst_width & MASK, x + n * dx, dx); \ + } #ifdef HAS_SCALEFILTERCOLS_NEON CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) @@ -37,167 +36,378 @@ CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7) #endif #ifdef HAS_SCALEARGBFILTERCOLS_NEON -CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON, - ScaleARGBFilterCols_C, 4, 3) +CANY(ScaleARGBFilterCols_Any_NEON, + ScaleARGBFilterCols_NEON, + ScaleARGBFilterCols_C, + 4, + 3) #endif #undef CANY // Fixed scale down. 
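// A note on the two remainder idioms in this file: CANY above computes
// n = dst_width & ~MASK, which only works because its MASK + 1 values are
// powers of two, while the SDANY/SDODD macros below use
// dst_width % (MASK + 1) so they can also handle group sizes that are not
// powers of two (e.g. MASK 23 gives 24-pixel groups for the 3/4 kernels,
// MASK 11 gives 12-pixel groups for 3/8). Either way the SIMD kernel covers
// the first n pixels and the C kernel finishes the remainder.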
-#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \ - uint8* dst_ptr, int dst_width) { \ - int r = (int)((unsigned int)dst_width % (MASK + 1)); \ - int n = dst_width - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ - dst_ptr + n * BPP, r); \ - } +#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, \ + int dst_width) { \ + int r = (int)((unsigned int)dst_width % (MASK + 1)); \ + int n = dst_width - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r); \ + } // Fixed scale down for odd source width. Used by I420Blend subsampling. // Since dst_width is (width + 1) / 2, this function scales one less pixel // and copies the last pixel. -#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \ - uint8* dst_ptr, int dst_width) { \ - int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); \ - int n = dst_width - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ - dst_ptr + n * BPP, r); \ - } +#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, \ + int dst_width) { \ + int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); \ + int n = dst_width - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r); \ + } #ifdef HAS_SCALEROWDOWN2_SSSE3 SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15) -SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3, - ScaleRowDown2Linear_C, 2, 1, 15) -SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C, - 2, 1, 15) -SDODD(ScaleRowDown2Box_Odd_SSSE3, ScaleRowDown2Box_SSSE3, - ScaleRowDown2Box_Odd_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_SSSE3, + ScaleRowDown2Linear_SSSE3, + ScaleRowDown2Linear_C, + 2, + 1, + 15) +SDANY(ScaleRowDown2Box_Any_SSSE3, + ScaleRowDown2Box_SSSE3, + ScaleRowDown2Box_C, + 2, + 1, + 15) +SDODD(ScaleRowDown2Box_Odd_SSSE3, + ScaleRowDown2Box_SSSE3, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 15) #endif #ifdef HAS_SCALEROWDOWN2_AVX2 SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) -SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2, - ScaleRowDown2Linear_C, 2, 1, 31) -SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C, - 2, 1, 31) -SDODD(ScaleRowDown2Box_Odd_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_Odd_C, - 2, 1, 31) +SDANY(ScaleRowDown2Linear_Any_AVX2, + ScaleRowDown2Linear_AVX2, + ScaleRowDown2Linear_C, + 2, + 1, + 31) +SDANY(ScaleRowDown2Box_Any_AVX2, + ScaleRowDown2Box_AVX2, + ScaleRowDown2Box_C, + 2, + 1, + 31) +SDODD(ScaleRowDown2Box_Odd_AVX2, + ScaleRowDown2Box_AVX2, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 31) #endif #ifdef HAS_SCALEROWDOWN2_NEON SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15) -SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON, - ScaleRowDown2Linear_C, 2, 1, 15) 
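// For reference, what SDANY expands to for the SSSE3 1/2 kernel above
// (FACTOR = 2, BPP = 1, MASK = 15):

void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                             uint8* dst_ptr, int dst_width) {
  int r = (int)((unsigned int)dst_width % 16);
  int n = dst_width - r;
  if (n > 0) {
    ScaleRowDown2_SSSE3(src_ptr, src_stride, dst_ptr, n);
  }
  ScaleRowDown2_C(src_ptr + (n * 2) * 1, src_stride, dst_ptr + n * 1, r);
}

// So dst_width = 100 runs 96 pixels through SSSE3 and the last 4 through C,
// which is also why the dispatch ladder in scale.cc picks the _Any_ variant
// first and upgrades to the full SIMD kernel only when IS_ALIGNED holds.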
-SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON, - ScaleRowDown2Box_C, 2, 1, 15) -SDODD(ScaleRowDown2Box_Odd_NEON, ScaleRowDown2Box_NEON, - ScaleRowDown2Box_Odd_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_NEON, + ScaleRowDown2Linear_NEON, + ScaleRowDown2Linear_C, + 2, + 1, + 15) +SDANY(ScaleRowDown2Box_Any_NEON, + ScaleRowDown2Box_NEON, + ScaleRowDown2Box_C, + 2, + 1, + 15) +SDODD(ScaleRowDown2Box_Odd_NEON, + ScaleRowDown2Box_NEON, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 15) +#endif +#ifdef HAS_SCALEROWDOWN2_MSA +SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31) +SDANY(ScaleRowDown2Linear_Any_MSA, + ScaleRowDown2Linear_MSA, + ScaleRowDown2Linear_C, + 2, + 1, + 31) +SDANY(ScaleRowDown2Box_Any_MSA, + ScaleRowDown2Box_MSA, + ScaleRowDown2Box_C, + 2, + 1, + 31) #endif #ifdef HAS_SCALEROWDOWN4_SSSE3 SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7) -SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C, - 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_SSSE3, + ScaleRowDown4Box_SSSE3, + ScaleRowDown4Box_C, + 4, + 1, + 7) #endif #ifdef HAS_SCALEROWDOWN4_AVX2 SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15) -SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C, - 4, 1, 15) +SDANY(ScaleRowDown4Box_Any_AVX2, + ScaleRowDown4Box_AVX2, + ScaleRowDown4Box_C, + 4, + 1, + 15) #endif #ifdef HAS_SCALEROWDOWN4_NEON SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7) -SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C, - 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_NEON, + ScaleRowDown4Box_NEON, + ScaleRowDown4Box_C, + 4, + 1, + 7) +#endif +#ifdef HAS_SCALEROWDOWN4_MSA +SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15) +SDANY(ScaleRowDown4Box_Any_MSA, + ScaleRowDown4Box_MSA, + ScaleRowDown4Box_C, + 4, + 1, + 15) #endif #ifdef HAS_SCALEROWDOWN34_SSSE3 -SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3, - ScaleRowDown34_C, 4 / 3, 1, 23) -SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3, - ScaleRowDown34_0_Box_C, 4 / 3, 1, 23) -SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3, - ScaleRowDown34_1_Box_C, 4 / 3, 1, 23) +SDANY(ScaleRowDown34_Any_SSSE3, + ScaleRowDown34_SSSE3, + ScaleRowDown34_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_0_Box_Any_SSSE3, + ScaleRowDown34_0_Box_SSSE3, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_1_Box_Any_SSSE3, + ScaleRowDown34_1_Box_SSSE3, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 23) #endif #ifdef HAS_SCALEROWDOWN34_NEON -SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON, - ScaleRowDown34_C, 4 / 3, 1, 23) -SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON, - ScaleRowDown34_0_Box_C, 4 / 3, 1, 23) -SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON, - ScaleRowDown34_1_Box_C, 4 / 3, 1, 23) +SDANY(ScaleRowDown34_Any_NEON, + ScaleRowDown34_NEON, + ScaleRowDown34_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_0_Box_Any_NEON, + ScaleRowDown34_0_Box_NEON, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_1_Box_Any_NEON, + ScaleRowDown34_1_Box_NEON, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 23) #endif #ifdef HAS_SCALEROWDOWN38_SSSE3 -SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3, - ScaleRowDown38_C, 8 / 3, 1, 11) -SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3, - ScaleRowDown38_3_Box_C, 8 / 3, 1, 5) -SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3, - 
ScaleRowDown38_2_Box_C, 8 / 3, 1, 5) +SDANY(ScaleRowDown38_Any_SSSE3, + ScaleRowDown38_SSSE3, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_SSSE3, + ScaleRowDown38_3_Box_SSSE3, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 5) +SDANY(ScaleRowDown38_2_Box_Any_SSSE3, + ScaleRowDown38_2_Box_SSSE3, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 5) #endif #ifdef HAS_SCALEROWDOWN38_NEON -SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON, - ScaleRowDown38_C, 8 / 3, 1, 11) -SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON, - ScaleRowDown38_3_Box_C, 8 / 3, 1, 11) -SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON, - ScaleRowDown38_2_Box_C, 8 / 3, 1, 11) +SDANY(ScaleRowDown38_Any_NEON, + ScaleRowDown38_NEON, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_NEON, + ScaleRowDown38_3_Box_NEON, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_2_Box_Any_NEON, + ScaleRowDown38_2_Box_NEON, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 11) +#endif +#ifdef HAS_SCALEROWDOWN38_MSA +SDANY(ScaleRowDown38_Any_MSA, + ScaleRowDown38_MSA, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_MSA, + ScaleRowDown38_3_Box_MSA, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_2_Box_Any_MSA, + ScaleRowDown38_2_Box_MSA, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 11) #endif #ifdef HAS_SCALEARGBROWDOWN2_SSE2 -SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2, - ScaleARGBRowDown2_C, 2, 4, 3) -SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2, - ScaleARGBRowDown2Linear_C, 2, 4, 3) -SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2, - ScaleARGBRowDown2Box_C, 2, 4, 3) +SDANY(ScaleARGBRowDown2_Any_SSE2, + ScaleARGBRowDown2_SSE2, + ScaleARGBRowDown2_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Linear_Any_SSE2, + ScaleARGBRowDown2Linear_SSE2, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Box_Any_SSE2, + ScaleARGBRowDown2Box_SSE2, + ScaleARGBRowDown2Box_C, + 2, + 4, + 3) #endif #ifdef HAS_SCALEARGBROWDOWN2_NEON -SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON, - ScaleARGBRowDown2_C, 2, 4, 7) -SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON, - ScaleARGBRowDown2Linear_C, 2, 4, 7) -SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON, - ScaleARGBRowDown2Box_C, 2, 4, 7) +SDANY(ScaleARGBRowDown2_Any_NEON, + ScaleARGBRowDown2_NEON, + ScaleARGBRowDown2_C, + 2, + 4, + 7) +SDANY(ScaleARGBRowDown2Linear_Any_NEON, + ScaleARGBRowDown2Linear_NEON, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 7) +SDANY(ScaleARGBRowDown2Box_Any_NEON, + ScaleARGBRowDown2Box_NEON, + ScaleARGBRowDown2Box_C, + 2, + 4, + 7) +#endif +#ifdef HAS_SCALEARGBROWDOWN2_MSA +SDANY(ScaleARGBRowDown2_Any_MSA, + ScaleARGBRowDown2_MSA, + ScaleARGBRowDown2_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Linear_Any_MSA, + ScaleARGBRowDown2Linear_MSA, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Box_Any_MSA, + ScaleARGBRowDown2Box_MSA, + ScaleARGBRowDown2Box_C, + 2, + 4, + 3) #endif #undef SDANY // Scale down by even scale factor. 
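// SDAANY below follows the same split, with one extra input: src_stepx is
// the per-output pixel step through the source, so the C tail starts at
// src_ptr + (n * src_stepx) * BPP rather than n * FACTOR * BPP. For ARGB
// (BPP = 4) with dst_width = 10, MASK = 3 and src_stepx = 2, the SIMD kernel
// covers 8 outputs and the C kernel the last 2, starting 8 * 2 * 4 = 64
// bytes into the source row.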
-#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \ - uint8* dst_ptr, int dst_width) { \ - int r = (int)((unsigned int)dst_width % (MASK + 1)); \ - int n = dst_width - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, \ - src_stepx, dst_ptr + n * BPP, r); \ - } +#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \ + uint8* dst_ptr, int dst_width) { \ + int r = (int)((unsigned int)dst_width % (MASK + 1)); \ + int n = dst_width - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \ + dst_ptr + n * BPP, r); \ + } #ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2 -SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2, - ScaleARGBRowDownEven_C, 4, 3) -SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2, - ScaleARGBRowDownEvenBox_C, 4, 3) +SDAANY(ScaleARGBRowDownEven_Any_SSE2, + ScaleARGBRowDownEven_SSE2, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, + ScaleARGBRowDownEvenBox_SSE2, + ScaleARGBRowDownEvenBox_C, + 4, + 3) #endif #ifdef HAS_SCALEARGBROWDOWNEVEN_NEON -SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON, - ScaleARGBRowDownEven_C, 4, 3) -SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON, - ScaleARGBRowDownEvenBox_C, 4, 3) +SDAANY(ScaleARGBRowDownEven_Any_NEON, + ScaleARGBRowDownEven_NEON, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, + ScaleARGBRowDownEvenBox_NEON, + ScaleARGBRowDownEvenBox_C, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBROWDOWNEVEN_MSA +SDAANY(ScaleARGBRowDownEven_Any_MSA, + ScaleARGBRowDownEven_MSA, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_MSA, + ScaleARGBRowDownEvenBox_MSA, + ScaleARGBRowDownEvenBox_C, + 4, + 3) #endif // Add rows box filter scale down. 
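// The ScaleAddRow kernels wrapped by SAANY below simply accumulate one
// source row into a uint16 sum row; ScalePlaneBox zeroes the row buffer,
// adds boxheight rows, then collapses columns. In C the kernel amounts to:

static void ScaleAddRow_Sketch(const uint8* src_ptr, uint16* dst_ptr,
                               int src_width) {
  int i;
  for (i = 0; i < src_width; ++i) {
    dst_ptr[i] += src_ptr[i];
  }
}

// A uint16 accumulator holds up to 257 all-white rows (255 * 257 == 65535)
// before it would overflow, which in principle bounds the box heights the
// 8-bit path can sum; the 16-bit scaler uses uint32 row sums for the same
// reason.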
-#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ - void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \ - int n = src_width & ~MASK; \ - if (n > 0) { \ - SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ - } \ - SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ - } +#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ + void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \ + int n = src_width & ~MASK; \ + if (n > 0) { \ + SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ + } \ + SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ + } #ifdef HAS_SCALEADDROW_SSE2 SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15) @@ -208,14 +418,15 @@ SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31) #ifdef HAS_SCALEADDROW_NEON SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15) #endif +#ifdef HAS_SCALEADDROW_MSA +SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15) +#endif +#ifdef HAS_SCALEADDROW_DSPR2 +SAANY(ScaleAddRow_Any_DSPR2, ScaleAddRow_DSPR2, ScaleAddRow_C, 15) +#endif #undef SAANY #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif - - - - - diff --git a/files/source/scale_argb.cc b/files/source/scale_argb.cc index 17f51ae9..1ea28f0d 100644 --- a/files/source/scale_argb.cc +++ b/files/source/scale_argb.cc @@ -30,20 +30,31 @@ static __inline int Abs(int v) { // ScaleARGB ARGB, 1/2 // This is an optimized version for scaling down a ARGB to 1/2 of // its original size. -static void ScaleARGBDown2(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy, +static void ScaleARGBDown2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8* src_argb, + uint8* dst_argb, + int x, + int dx, + int y, + int dy, enum FilterMode filtering) { int j; int row_stride = src_stride * (dy >> 16); void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride, uint8* dst_argb, int dst_width) = - filtering == kFilterNone ? ScaleARGBRowDown2_C : - (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C : - ScaleARGBRowDown2Box_C); - assert(dx == 65536 * 2); // Test scale factor of 2. + filtering == kFilterNone + ? ScaleARGBRowDown2_C + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C + : ScaleARGBRowDown2Box_C); + (void)src_width; + (void)src_height; + (void)dx; + assert(dx == 65536 * 2); // Test scale factor of 2. assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. // Advance to odd row, even column. if (filtering == kFilterBilinear) { @@ -54,25 +65,49 @@ static void ScaleARGBDown2(int src_width, int src_height, #if defined(HAS_SCALEARGBROWDOWN2_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 : - (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 : - ScaleARGBRowDown2Box_Any_SSE2); + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_Any_SSE2 + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 + : ScaleARGBRowDown2Box_Any_SSE2); if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 : - (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 : - ScaleARGBRowDown2Box_SSE2); + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_SSE2 + : (filtering == kFilterLinear ? 
ScaleARGBRowDown2Linear_SSE2 + : ScaleARGBRowDown2Box_SSE2); } } #endif #if defined(HAS_SCALEARGBROWDOWN2_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON : - (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON : - ScaleARGBRowDown2Box_Any_NEON); + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_Any_NEON + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON + : ScaleARGBRowDown2Box_Any_NEON); if (IS_ALIGNED(dst_width, 8)) { - ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON : - (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON : - ScaleARGBRowDown2Box_NEON); + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_NEON + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON + : ScaleARGBRowDown2Box_NEON); + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWN2_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_Any_MSA + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MSA + : ScaleARGBRowDown2Box_Any_MSA); + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_MSA + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MSA + : ScaleARGBRowDown2Box_MSA); } } #endif @@ -90,21 +125,32 @@ static void ScaleARGBDown2(int src_width, int src_height, // ScaleARGB ARGB, 1/4 // This is an optimized version for scaling down a ARGB to 1/4 of // its original size. -static void ScaleARGBDown4Box(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy) { +static void ScaleARGBDown4Box(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8* src_argb, + uint8* dst_argb, + int x, + int dx, + int y, + int dy) { int j; // Allocate 2 rows of ARGB. const int kRowSize = (dst_width * 2 * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); int row_stride = src_stride * (dy >> 16); void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C; + uint8* dst_argb, int dst_width) = + ScaleARGBRowDown2Box_C; // Advance to odd row, even column. src_argb += (y >> 16) * src_stride + (x >> 16) * 4; - assert(dx == 65536 * 4); // Test scale factor of 4. + (void)src_width; + (void)src_height; + (void)dx; + assert(dx == 65536 * 4); // Test scale factor of 4. assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. #if defined(HAS_SCALEARGBROWDOWN2_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { @@ -125,8 +171,8 @@ static void ScaleARGBDown4Box(int src_width, int src_height, for (j = 0; j < dst_height; ++j) { ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2); - ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, - row + kRowSize, dst_width * 2); + ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize, + dst_width * 2); ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width); src_argb += row_stride; dst_argb += dst_stride; @@ -137,11 +183,18 @@ static void ScaleARGBDown4Box(int src_width, int src_height, // ScaleARGB ARGB Even // This is an optimized version for scaling down a ARGB to even // multiple of its original size. 
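// Two details in the ARGB downscalers above: the asserts read naturally in
// 16.16 fixed point (65536 is 1.0, so dx == 65536 * 2 demands an exact 2x
// horizontal reduction and dx == 65536 * 4 an exact 4x), and the (void)dx
// casts keep dx "used" even in NDEBUG builds, where assert() compiles away
// and would otherwise re-trigger the warning this change re-enables.
// ScaleARGBDown4Box sizes its temporary rows as
//   kRowSize = (dst_width * 2 * 4 + 31) & ~31
// i.e. a double-width ARGB row rounded up to a 32-byte multiple, presumably
// for aligned SIMD stores: dst_width = 100 gives 800 bytes (already a
// multiple of 32), dst_width = 101 gives 808, rounded up to 832.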
-static void ScaleARGBDownEven(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy, +static void ScaleARGBDownEven(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8* src_argb, + uint8* dst_argb, + int x, + int dx, + int y, + int dy, enum FilterMode filtering) { int j; int col_step = dx >> 16; @@ -149,26 +202,38 @@ static void ScaleARGBDownEven(int src_width, int src_height, void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride, int src_step, uint8* dst_argb, int dst_width) = filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C; + (void)src_width; + (void)src_height; assert(IS_ALIGNED(src_width, 2)); assert(IS_ALIGNED(src_height, 2)); src_argb += (y >> 16) * src_stride + (x >> 16) * 4; #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 : - ScaleARGBRowDownEven_Any_SSE2; + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 + : ScaleARGBRowDownEven_Any_SSE2; if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 : - ScaleARGBRowDownEven_SSE2; + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven_SSE2; } } #endif #if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON : - ScaleARGBRowDownEven_Any_NEON; + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON + : ScaleARGBRowDownEven_Any_NEON; if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON : - ScaleARGBRowDownEven_NEON; + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA + : ScaleARGBRowDownEven_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA; } } #endif @@ -184,25 +249,32 @@ static void ScaleARGBDownEven(int src_width, int src_height, } // Scale ARGB down with bilinear interpolation. -static void ScaleARGBBilinearDown(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy, +static void ScaleARGBBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8* src_argb, + uint8* dst_argb, + int x, + int dx, + int y, + int dy, enum FilterMode filtering) { int j; - void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; - void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) = + void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C; int64 xlast = x + (int64)(dst_width - 1) * dx; int64 xl = (dx >= 0) ? 
x : xlast; int64 xr = (dx >= 0) ? xlast : x; int clip_src_width; - xl = (xl >> 16) & ~3; // Left edge aligned. - xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. + xl = (xl >> 16) & ~3; // Left edge aligned. + xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. if (xr > src_width) { xr = src_width; @@ -235,14 +307,22 @@ static void ScaleARGBBilinearDown(int src_width, int src_height, } #endif #if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) { + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) && + IS_ALIGNED(src_stride, 4)) { InterpolateRow = InterpolateRow_Any_DSPR2; if (IS_ALIGNED(clip_src_width, 4)) { InterpolateRow = InterpolateRow_DSPR2; } } #endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; @@ -286,18 +366,25 @@ static void ScaleARGBBilinearDown(int src_width, int src_height, } // Scale ARGB up with bilinear interpolation. -static void ScaleARGBBilinearUp(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy, +static void ScaleARGBBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8* src_argb, + uint8* dst_argb, + int x, + int dx, + int y, + int dy, enum FilterMode filtering) { int j; - void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; - void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) = + void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; const int max_y = (src_height - 1) << 16; #if defined(HAS_INTERPOLATEROW_SSSE3) @@ -325,14 +412,22 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, } #endif #if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_argb, 4) && + IS_ALIGNED(dst_stride, 4)) { InterpolateRow = InterpolateRow_DSPR2; } #endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif if (src_width >= 32768) { - ScaleARGBFilterCols = filtering ? - ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + ScaleARGBFilterCols = + filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C; } #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { @@ -423,8 +518,10 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, #ifdef YUVSCALEUP // Scale YUV to ARGB up with bilinear interpolation. 
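Worth unpacking from the ScaleARGBBilinearDown hunk above: before doing any interpolation, the function works out how much of the source row it will actually read. The first and last 16.16 sample positions give xl and xr (dx can be negative for mirrored scaling), the left edge is rounded down to a 4-pixel boundary, and the right edge is taken one past the last bilinear tap, rounded up to 4 pixels and clamped to src_width. The same arithmetic restated as a standalone C helper:

#include <stdint.h>

/* Width of the source span touched when sampling dst_width pixels
   starting at 16.16 position x with step dx; mirrors the xl/xr
   computation in ScaleARGBBilinearDown. */
int ClipSrcWidth(int src_width, int dst_width, int x, int dx) {
  int64_t xlast = x + (int64_t)(dst_width - 1) * dx;
  int64_t xl = (dx >= 0) ? x : xlast;  /* leftmost sample position */
  int64_t xr = (dx >= 0) ? xlast : x;  /* rightmost sample position */
  xl = (xl >> 16) & ~3;                /* align left edge down to 4 pixels */
  xr = (xr >> 16) + 1;                 /* bilinear reads pixel xi and xi + 1 */
  xr = (xr + 1 + 3) & ~3;              /* one beyond, rounded up to 4 pixels */
  if (xr > src_width) {
    xr = src_width;
  }
  return (int)(xr - xl);
}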
-static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, - int dst_width, int dst_height, +static void ScaleYUVToARGBBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, int src_stride_y, int src_stride_u, int src_stride_v, @@ -433,14 +530,15 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, const uint8* src_u, const uint8* src_v, uint8* dst_argb, - int x, int dx, int y, int dy, + int x, + int dx, + int y, + int dy, enum FilterMode filtering) { int j; - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) = I422ToARGBRow_C; + void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, + const uint8* v_buf, uint8* rgb_buf, int width) = + I422ToARGBRow_C; #if defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I422ToARGBRow = I422ToARGBRow_Any_SSSE3; @@ -474,10 +572,18 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, I422ToARGBRow = I422ToARGBRow_DSPR2; } #endif +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } + } +#endif - void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; + void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -503,18 +609,26 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, } #endif #if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_argb, 4) && + IS_ALIGNED(dst_stride_argb, 4)) { InterpolateRow = InterpolateRow_DSPR2; } #endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif - void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) = + void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; if (src_width >= 32768) { - ScaleARGBFilterCols = filtering ? - ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + ScaleARGBFilterCols = + filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C; } #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { @@ -635,15 +749,23 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, // of x and dx is the integer part of the source position and // the lower 16 bits are the fixed decimal part. 
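The comment above is the key to every loop in these files: positions and steps are 16.16 fixed point, so the source pixel index is simply x >> 16 and advancing by the scale factor is one integer add. A scalar sketch of point-sampling one ARGB row this way (ScaleARGBCols_C in scale_common.cc, further down in this diff, is the same loop unrolled by two; uint32_t stands in for libyuv's packed 4-byte ARGB pixel):

#include <stdint.h>

void PointSampleRow(uint32_t* dst, const uint32_t* src,
                    int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16]; /* integer part selects the source pixel */
    x += dx;               /* step by the fixed-point scale factor */
  }
}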
-static void ScaleARGBSimple(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy) { +static void ScaleARGBSimple(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8* src_argb, + uint8* dst_argb, + int x, + int dx, + int y, + int dy) { int j; - void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) = + void (*ScaleARGBCols)(uint8 * dst_argb, const uint8* src_argb, int dst_width, + int x, int dx) = (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C; + (void)src_height; #if defined(HAS_SCALEARGBCOLS_SSE2) if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBCols = ScaleARGBCols_SSE2; @@ -667,8 +789,8 @@ static void ScaleARGBSimple(int src_width, int src_height, } for (j = 0; j < dst_height; ++j) { - ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, - dst_width, x, dx); + ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x, + dx); dst_argb += dst_stride; y += dy; } @@ -677,11 +799,18 @@ static void ScaleARGBSimple(int src_width, int src_height, // ScaleARGB a ARGB. // This function in turn calls a scaling function // suitable for handling the desired resolutions. -static void ScaleARGB(const uint8* src, int src_stride, - int src_width, int src_height, - uint8* dst, int dst_stride, - int dst_width, int dst_height, - int clip_x, int clip_y, int clip_width, int clip_height, +static void ScaleARGB(const uint8* src, + int src_stride, + int src_width, + int src_height, + uint8* dst, + int dst_stride, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -690,8 +819,7 @@ static void ScaleARGB(const uint8* src, int src_stride, int dy = 0; // ARGB does not support box filter yet, but allow the user to pass it. // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, - dst_width, dst_height, + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, filtering); // Negative src_height means invert the image. @@ -700,17 +828,17 @@ static void ScaleARGB(const uint8* src, int src_stride, src = src + (src_height - 1) * src_stride; src_stride = -src_stride; } - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); src_width = Abs(src_width); if (clip_x) { - int64 clipf = (int64)(clip_x) * dx; + int64 clipf = (int64)(clip_x)*dx; x += (clipf & 0xffff); src += (clipf >> 16) * 4; dst += clip_x * 4; } if (clip_y) { - int64 clipf = (int64)(clip_y) * dy; + int64 clipf = (int64)(clip_y)*dy; y += (clipf & 0xffff); src += (clipf >> 16) * src_stride; dst += clip_y * dst_stride; @@ -725,24 +853,20 @@ static void ScaleARGB(const uint8* src, int src_stride, if (!(dx & 0x10000) && !(dy & 0x10000)) { if (dx == 0x20000) { // Optimized 1/2 downsample. - ScaleARGBDown2(src_width, src_height, - clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, dx, y, dy, filtering); + ScaleARGBDown2(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); return; } if (dx == 0x40000 && filtering == kFilterBox) { // Optimized 1/4 box downsample. 
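An aside on the tests just above: dx is essentially (src_width << 16) / dst_width (FixedDiv in scale_common.cc, modulo the centering adjustments in ScaleSlope), so 0x20000 is an exact 1/2 downscale, 0x40000 an exact 1/4, and once the fraction bits are known to be zero, bit 16 separates even integer factors from odd ones. A quick check of that encoding, assuming the plain-division form of FixedDiv:

#include <stdio.h>

int main(void) {
  int src_width = 1280, dst_width = 320; /* 4x downscale */
  int dx = (int)((65536LL * src_width) / dst_width);
  printf("dx = 0x%x\n", dx);             /* 0x40000, i.e. 4.0 in 16.16 */
  if ((dx & 0xffff) == 0 && !(dx & 0x10000)) {
    printf("even integer factor %dx\n", dx >> 16); /* dedicated path */
  }
  if (dx == 0x40000) {
    printf("exact 1/4: box filter takes the Down4Box path below\n");
  }
  return 0;
}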
- ScaleARGBDown4Box(src_width, src_height, - clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, dx, y, dy); + ScaleARGBDown4Box(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy); return; } - ScaleARGBDownEven(src_width, src_height, - clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, dx, y, dy, filtering); + ScaleARGBDownEven(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); return; } // Optimized odd scale down. ie 3, 5, 7, 9x. @@ -759,96 +883,105 @@ static void ScaleARGB(const uint8* src, int src_stride, } if (dx == 0x10000 && (x & 0xffff) == 0) { // Arbitrary scale vertically, but unscaled horizontally. - ScalePlaneVertical(src_height, - clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, y, dy, 4, filtering); + ScalePlaneVertical(src_height, clip_width, clip_height, src_stride, + dst_stride, src, dst, x, y, dy, 4, filtering); return; } if (filtering && dy < 65536) { - ScaleARGBBilinearUp(src_width, src_height, - clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, dx, y, dy, filtering); + ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); return; } if (filtering) { - ScaleARGBBilinearDown(src_width, src_height, - clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, dx, y, dy, filtering); + ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); return; } - ScaleARGBSimple(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, dx, y, dy); + ScaleARGBSimple(src_width, src_height, clip_width, clip_height, src_stride, + dst_stride, src, dst, x, dx, y, dy); } LIBYUV_API -int ARGBScaleClip(const uint8* src_argb, int src_stride_argb, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, - int dst_width, int dst_height, - int clip_x, int clip_y, int clip_width, int clip_height, +int ARGBScaleClip(const uint8* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, enum FilterMode filtering) { - if (!src_argb || src_width == 0 || src_height == 0 || - !dst_argb || dst_width <= 0 || dst_height <= 0 || - clip_x < 0 || clip_y < 0 || + if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb || + dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 || + clip_width > 32768 || clip_height > 32768 || (clip_x + clip_width) > dst_width || (clip_y + clip_height) > dst_height) { return -1; } - ScaleARGB(src_argb, src_stride_argb, src_width, src_height, - dst_argb, dst_stride_argb, dst_width, dst_height, - clip_x, clip_y, clip_width, clip_height, filtering); + ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width, + clip_height, filtering); return 0; } // Scale an ARGB image. 
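For callers, the two exported entry points differ only in the clip rectangle: ARGBScaleClip (above) validates that the rectangle lies inside the destination and renders just that region, while ARGBScale (below) passes the whole destination as the clip. A hypothetical usage sketch, with made-up sizes and caller-allocated ARGB buffers; strides are bytes per row, i.e. width * 4 for packed ARGB:

#include "libyuv/scale_argb.h" /* ARGBScale, ARGBScaleClip, FilterMode */

int ScaleWholeThenRightHalf(const uint8* src, uint8* dst) {
  /* 640x480 -> 320x240, full destination. */
  int r = ARGBScale(src, 640 * 4, 640, 480,
                    dst, 320 * 4, 320, 240, kFilterBilinear);
  if (r != 0) {
    return r;
  }
  /* Redraw only the right half: clip rect (160, 0) .. (320, 240)
     must pass the bounds checks in ARGBScaleClip above. */
  return ARGBScaleClip(src, 640 * 4, 640, 480,
                       dst, 320 * 4, 320, 240,
                       160, 0, 160, 240, kFilterBilinear);
}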
LIBYUV_API -int ARGBScale(const uint8* src_argb, int src_stride_argb, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, - int dst_width, int dst_height, +int ARGBScale(const uint8* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, enum FilterMode filtering) { - if (!src_argb || src_width == 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || - !dst_argb || dst_width <= 0 || dst_height <= 0) { + if (!src_argb || src_width == 0 || src_height == 0 || src_width > 32768 || + src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) { return -1; } - ScaleARGB(src_argb, src_stride_argb, src_width, src_height, - dst_argb, dst_stride_argb, dst_width, dst_height, - 0, 0, dst_width, dst_height, filtering); + ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, dst_height, + filtering); return 0; } // Scale with YUV conversion to ARGB and clipping. LIBYUV_API -int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, +int YUVToARGBScaleClip(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, uint32 src_fourcc, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, + int src_width, + int src_height, + uint8* dst_argb, + int dst_stride_argb, uint32 dst_fourcc, - int dst_width, int dst_height, - int clip_x, int clip_y, int clip_width, int clip_height, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, enum FilterMode filtering) { uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4); int r; - I420ToARGB(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - argb_buffer, src_width * 4, - src_width, src_height); - - r = ARGBScaleClip(argb_buffer, src_width * 4, - src_width, src_height, - dst_argb, dst_stride_argb, - dst_width, dst_height, - clip_x, clip_y, clip_width, clip_height, - filtering); + (void)src_fourcc; // TODO(fbarchard): implement and/or assert. 
+ (void)dst_fourcc; + I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + argb_buffer, src_width * 4, src_width, src_height); + + r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, clip_x, clip_y, + clip_width, clip_height, filtering); free(argb_buffer); return r; } diff --git a/files/source/scale_common.cc b/files/source/scale_common.cc index 3507aa4d..1bef39df 100644 --- a/files/source/scale_common.cc +++ b/files/source/scale_common.cc @@ -28,9 +28,12 @@ static __inline int Abs(int v) { } // CPU agnostic row functions -void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown2_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src_ptr[1]; dst[1] = src_ptr[3]; @@ -42,9 +45,12 @@ void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { +void ScaleRowDown2_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst, + int dst_width) { int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src_ptr[1]; dst[1] = src_ptr[3]; @@ -56,10 +62,13 @@ void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown2Linear_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { const uint8* s = src_ptr; int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (s[0] + s[1] + 1) >> 1; dst[1] = (s[2] + s[3] + 1) >> 1; @@ -71,10 +80,13 @@ void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { +void ScaleRowDown2Linear_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst, + int dst_width) { const uint16* s = src_ptr; int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (s[0] + s[1] + 1) >> 1; dst[1] = (s[2] + s[3] + 1) >> 1; @@ -86,8 +98,10 @@ void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown2Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { const uint8* s = src_ptr; const uint8* t = src_ptr + src_stride; int x; @@ -103,8 +117,10 @@ void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { const uint8* s = src_ptr; const uint8* t = src_ptr + src_stride; int x; @@ -125,8 +141,10 @@ void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride, dst[0] = (s[0] + t[0] + 1) >> 1; } -void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { +void ScaleRowDown2Box_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst, + int dst_width) { const uint16* s = src_ptr; const uint16* t = src_ptr + src_stride; int x; @@ -142,9 +160,12 @@ void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } } -void 
ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown4_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src_ptr[2]; dst[1] = src_ptr[6]; @@ -156,9 +177,12 @@ void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { +void ScaleRowDown4_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst, + int dst_width) { int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src_ptr[2]; dst[1] = src_ptr[6]; @@ -170,81 +194,88 @@ void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown4Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { intptr_t stride = src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride + 3] + - src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + - src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + - src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + - 8) >> 4; + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + - src_ptr[stride + 4] + src_ptr[stride + 5] + - src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] + - src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + - src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + - 8) >> 4; + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + + src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + + src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + + src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + + src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + + src_ptr[stride * 3 + 7] + 8) >> + 4; dst += 2; src_ptr += 8; } if (dst_width & 1) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride + 3] + - src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + - src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + - src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + - 8) >> 4; + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; } } -void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { +void ScaleRowDown4Box_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst, + int dst_width) { intptr_t stride = src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (src_ptr[0] + 
src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride + 3] + - src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + - src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + - src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + - 8) >> 4; + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + - src_ptr[stride + 4] + src_ptr[stride + 5] + - src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] + - src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + - src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + - 8) >> 4; + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + + src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + + src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + + src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + + src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + + src_ptr[stride * 3 + 7] + 8) >> + 4; dst += 2; src_ptr += 8; } if (dst_width & 1) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride + 3] + - src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + - src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + - src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + - 8) >> 4; + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; } } -void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown34_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { int x; + (void)src_stride; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { dst[0] = src_ptr[0]; @@ -255,9 +286,12 @@ void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { +void ScaleRowDown34_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst, + int dst_width) { int x; + (void)src_stride; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { dst[0] = src_ptr[0]; @@ -269,8 +303,10 @@ void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } // Filter rows 0 and 1 together, 3 : 1 -void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { +void ScaleRowDown34_0_Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* d, + int dst_width) { const uint8* s = src_ptr; const uint8* t = src_ptr + src_stride; int x; @@ -291,8 +327,10 @@ void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* d, int dst_width) { +void ScaleRowDown34_0_Box_16_C(const 
uint16* src_ptr, + ptrdiff_t src_stride, + uint16* d, + int dst_width) { const uint16* s = src_ptr; const uint16* t = src_ptr + src_stride; int x; @@ -314,8 +352,10 @@ void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } // Filter rows 1 and 2 together, 1 : 1 -void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { +void ScaleRowDown34_1_Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* d, + int dst_width) { const uint8* s = src_ptr; const uint8* t = src_ptr + src_stride; int x; @@ -336,8 +376,10 @@ void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* d, int dst_width) { +void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* d, + int dst_width) { const uint16* s = src_ptr; const uint16* t = src_ptr + src_stride; int x; @@ -359,8 +401,11 @@ void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } // Scales a single row of pixels using point sampling. -void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +void ScaleCols_C(uint8* dst_ptr, + const uint8* src_ptr, + int dst_width, + int x, + int dx) { int j; for (j = 0; j < dst_width - 1; j += 2) { dst_ptr[0] = src_ptr[x >> 16]; @@ -374,8 +419,11 @@ void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, } } -void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) { +void ScaleCols_16_C(uint16* dst_ptr, + const uint16* src_ptr, + int dst_width, + int x, + int dx) { int j; for (j = 0; j < dst_width - 1; j += 2) { dst_ptr[0] = src_ptr[x >> 16]; @@ -390,9 +438,14 @@ void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr, } // Scales a single row of pixels up by 2x using point sampling. -void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +void ScaleColsUp2_C(uint8* dst_ptr, + const uint8* src_ptr, + int dst_width, + int x, + int dx) { int j; + (void)x; + (void)dx; for (j = 0; j < dst_width - 1; j += 2) { dst_ptr[1] = dst_ptr[0] = src_ptr[0]; src_ptr += 1; @@ -403,9 +456,14 @@ void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, } } -void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) { +void ScaleColsUp2_16_C(uint16* dst_ptr, + const uint16* src_ptr, + int dst_width, + int x, + int dx) { int j; + (void)x; + (void)dx; for (j = 0; j < dst_width - 1; j += 2) { dst_ptr[1] = dst_ptr[0] = src_ptr[0]; src_ptr += 1; @@ -418,16 +476,19 @@ void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr, // (1-f)a + fb can be replaced with a + f(b-a) #if defined(__arm__) || defined(__aarch64__) -#define BLENDER(a, b, f) (uint8)((int)(a) + \ - ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) +#define BLENDER(a, b, f) \ + (uint8)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) #else -// inteluses 7 bit math with rounding. -#define BLENDER(a, b, f) (uint8)((int)(a) + \ - (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7)) +// Intel uses 7 bit math with rounding. 
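Both BLENDER variants, the ARM form reformatted above and the Intel form defined next, compute the same rounded linear blend a + f * (b - a); they differ only in fraction precision, the full 16 bits on ARM versus the top 7 bits of f on Intel, presumably to match what the x86 SIMD row code can do. A standalone check with the two macros rewritten as functions:

#include <stdint.h>
#include <stdio.h>

static uint8_t BlendArm(int a, int b, int f16) {   /* f16 in [0, 65536) */
  return (uint8_t)(a + (((f16 * (b - a)) + 0x8000) >> 16));
}
static uint8_t BlendIntel(int a, int b, int f16) { /* keeps top 7 bits of f */
  return (uint8_t)(a + ((((f16 >> 9) * (b - a)) + 0x40) >> 7));
}

int main(void) {
  /* Halfway (f = 0.5) between 10 and 200: both round to 105. */
  printf("%d %d\n", BlendArm(10, 200, 32768), BlendIntel(10, 200, 32768));
  return 0;
}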
+#define BLENDER(a, b, f) \ + (uint8)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7)) #endif -void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +void ScaleFilterCols_C(uint8* dst_ptr, + const uint8* src_ptr, + int dst_width, + int x, + int dx) { int j; for (j = 0; j < dst_width - 1; j += 2) { int xi = x >> 16; @@ -450,8 +511,11 @@ void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, } } -void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x32, int dx) { +void ScaleFilterCols64_C(uint8* dst_ptr, + const uint8* src_ptr, + int dst_width, + int x32, + int dx) { int64 x = (int64)(x32); int j; for (j = 0; j < dst_width - 1; j += 2) { @@ -477,11 +541,14 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, #undef BLENDER // Same as 8 bit arm blender but return is cast to uint16 -#define BLENDER(a, b, f) (uint16)((int)(a) + \ - ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) +#define BLENDER(a, b, f) \ + (uint16)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) -void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) { +void ScaleFilterCols_16_C(uint16* dst_ptr, + const uint16* src_ptr, + int dst_width, + int x, + int dx) { int j; for (j = 0; j < dst_width - 1; j += 2) { int xi = x >> 16; @@ -504,8 +571,11 @@ void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, } } -void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x32, int dx) { +void ScaleFilterCols64_16_C(uint16* dst_ptr, + const uint16* src_ptr, + int dst_width, + int x32, + int dx) { int64 x = (int64)(x32); int j; for (j = 0; j < dst_width - 1; j += 2) { @@ -530,9 +600,12 @@ void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, } #undef BLENDER -void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown38_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { int x; + (void)src_stride; assert(dst_width % 3 == 0); for (x = 0; x < dst_width; x += 3) { dst[0] = src_ptr[0]; @@ -543,9 +616,12 @@ void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { +void ScaleRowDown38_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst, + int dst_width) { int x; + (void)src_stride; assert(dst_width % 3 == 0); for (x = 0; x < dst_width; x += 3) { dst[0] = src_ptr[0]; @@ -559,25 +635,29 @@ void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride, // 8x3 -> 3x1 void ScaleRowDown38_3_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { + uint8* dst_ptr, + int dst_width) { intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * - (65536 / 9) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[stride + 3] + src_ptr[stride + 4] + - src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * - (65536 / 9) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 6] + 
src_ptr[stride * 2 + 7]) * - (65536 / 6) >> 16; + dst_ptr[0] = + (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> + 16; + dst_ptr[1] = + (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> + 16; src_ptr += 8; dst_ptr += 3; } @@ -585,66 +665,80 @@ void ScaleRowDown38_3_Box_C(const uint8* src_ptr, void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width) { + uint16* dst_ptr, + int dst_width) { intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * - (65536 / 9) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[stride + 3] + src_ptr[stride + 4] + - src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * - (65536 / 9) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * - (65536 / 6) >> 16; + dst_ptr[0] = + (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> + 16; + dst_ptr[1] = + (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> + 16; src_ptr += 8; dst_ptr += 3; } } // 8x2 -> 3x1 -void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown38_2_Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2]) * (65536 / 6) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[stride + 3] + src_ptr[stride + 4] + - src_ptr[stride + 5]) * (65536 / 6) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[stride + 6] + src_ptr[stride + 7]) * - (65536 / 4) >> 16; + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2]) * + (65536 / 6) >> + 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5]) * + (65536 / 6) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> + 16; src_ptr += 8; dst_ptr += 3; } } -void ScaleRowDown38_2_Box_16_C(const 
uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width) { +void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst_ptr, + int dst_width) { intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2]) * (65536 / 6) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[stride + 3] + src_ptr[stride + 4] + - src_ptr[stride + 5]) * (65536 / 6) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[stride + 6] + src_ptr[stride + 7]) * - (65536 / 4) >> 16; + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2]) * + (65536 / 6) >> + 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5]) * + (65536 / 6) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> + 16; src_ptr += 8; dst_ptr += 3; } @@ -680,11 +774,12 @@ void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) { void ScaleARGBRowDown2_C(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { + uint8* dst_argb, + int dst_width) { const uint32* src = (const uint32*)(src_argb); uint32* dst = (uint32*)(dst_argb); - int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src[1]; dst[1] = src[3]; @@ -698,8 +793,10 @@ void ScaleARGBRowDown2_C(const uint8* src_argb, void ScaleARGBRowDown2Linear_C(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { + uint8* dst_argb, + int dst_width) { int x; + (void)src_stride; for (x = 0; x < dst_width; ++x) { dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1; dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1; @@ -710,29 +807,37 @@ void ScaleARGBRowDown2Linear_C(const uint8* src_argb, } } -void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { +void ScaleARGBRowDown2Box_C(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, + int dst_width) { int x; for (x = 0; x < dst_width; ++x) { - dst_argb[0] = (src_argb[0] + src_argb[4] + - src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; - dst_argb[1] = (src_argb[1] + src_argb[5] + - src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; - dst_argb[2] = (src_argb[2] + src_argb[6] + - src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; - dst_argb[3] = (src_argb[3] + src_argb[7] + - src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; + dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] + + src_argb[src_stride + 4] + 2) >> + 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] + + src_argb[src_stride + 5] + 2) >> + 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] + + src_argb[src_stride + 6] + 2) >> + 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] + + src_argb[src_stride + 7] + 2) >> + 2; src_argb += 8; dst_argb += 4; } } -void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride, +void ScaleARGBRowDownEven_C(const uint8* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) { + uint8* dst_argb, + int dst_width) { const uint32* src = (const uint32*)(src_argb); uint32* dst = (uint32*)(dst_argb); - + 
(void)src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src[0]; @@ -748,25 +853,33 @@ void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride, void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) { + uint8* dst_argb, + int dst_width) { int x; for (x = 0; x < dst_width; ++x) { - dst_argb[0] = (src_argb[0] + src_argb[4] + - src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; - dst_argb[1] = (src_argb[1] + src_argb[5] + - src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; - dst_argb[2] = (src_argb[2] + src_argb[6] + - src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; - dst_argb[3] = (src_argb[3] + src_argb[7] + - src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; + dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] + + src_argb[src_stride + 4] + 2) >> + 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] + + src_argb[src_stride + 5] + 2) >> + 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] + + src_argb[src_stride + 6] + 2) >> + 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] + + src_argb[src_stride + 7] + 2) >> + 2; src_argb += src_stepx * 4; dst_argb += 4; } } // Scales a single row of pixels using point sampling. -void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +void ScaleARGBCols_C(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx) { const uint32* src = (const uint32*)(src_argb); uint32* dst = (uint32*)(dst_argb); int j; @@ -782,8 +895,11 @@ void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, } } -void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x32, int dx) { +void ScaleARGBCols64_C(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x32, + int dx) { int64 x = (int64)(x32); const uint32* src = (const uint32*)(src_argb); uint32* dst = (uint32*)(dst_argb); @@ -801,11 +917,16 @@ void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb, } // Scales a single row of pixels up by 2x using point sampling. -void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +void ScaleARGBColsUp2_C(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx) { const uint32* src = (const uint32*)(src_argb); uint32* dst = (uint32*)(dst_argb); int j; + (void)x; + (void)dx; for (j = 0; j < dst_width - 1; j += 2) { dst[1] = dst[0] = src[0]; src += 1; @@ -818,15 +939,18 @@ void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, // TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. 
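Context for the TODO above: for a 7-bit fraction f in [0, 127], XOR with 0x7f complements the low 7 bits, so 0x7f ^ f equals 127 - f, and the two weights in the packed-ARGB blender defined next sum to 127 rather than 128; the 128 - f suggested by bug 607 would make them sum to a full 128. A quick demonstration of the identity:

#include <assert.h>

int main(void) {
  int f;
  for (f = 0; f <= 127; ++f) {
    assert((0x7f ^ f) == 127 - f); /* 7-bit complement */
    assert((0x7f ^ f) + f == 127); /* weights sum to 127, not 128 */
  }
  return 0;
}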
// Mimics SSSE3 blender -#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7 -#define BLENDERC(a, b, f, s) (uint32)( \ - BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) -#define BLENDER(a, b, f) \ - BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \ - BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) +#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7 +#define BLENDERC(a, b, f, s) \ + (uint32)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) +#define BLENDER(a, b, f) \ + BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | BLENDERC(a, b, f, 8) | \ + BLENDERC(a, b, f, 0) -void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +void ScaleARGBFilterCols_C(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx) { const uint32* src = (const uint32*)(src_argb); uint32* dst = (uint32*)(dst_argb); int j; @@ -854,8 +978,11 @@ void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, } } -void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x32, int dx) { +void ScaleARGBFilterCols64_C(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x32, + int dx) { int64 x = (int64)(x32); const uint32* src = (const uint32*)(src_argb); uint32* dst = (uint32*)(dst_argb); @@ -889,16 +1016,22 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, // Scale plane vertically with bilinear interpolation. void ScalePlaneVertical(int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int y, int dy, - int bpp, enum FilterMode filtering) { + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8* src_argb, + uint8* dst_argb, + int x, + int y, + int dy, + int bpp, + enum FilterMode filtering) { // TODO(fbarchard): Allow higher bpp. int dst_width_bytes = dst_width * bpp; - void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; + void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; int j; assert(bpp >= 1 && bpp <= 4); @@ -931,15 +1064,23 @@ void ScalePlaneVertical(int src_height, } #endif #if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) && + IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_argb, 4) && + IS_ALIGNED(dst_stride, 4)) { InterpolateRow = InterpolateRow_Any_DSPR2; if (IS_ALIGNED(dst_width_bytes, 4)) { InterpolateRow = InterpolateRow_DSPR2; } } #endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif for (j = 0; j < dst_height; ++j) { int yi; int yf; @@ -948,23 +1089,29 @@ void ScalePlaneVertical(int src_height, } yi = y >> 16; yf = filtering ? 
((y >> 8) & 255) : 0; - InterpolateRow(dst_argb, src_argb + yi * src_stride, - src_stride, dst_width_bytes, yf); + InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, + dst_width_bytes, yf); dst_argb += dst_stride; y += dy; } } void ScalePlaneVertical_16(int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_argb, uint16* dst_argb, - int x, int y, int dy, - int wpp, enum FilterMode filtering) { + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16* src_argb, + uint16* dst_argb, + int x, + int y, + int dy, + int wpp, + enum FilterMode filtering) { // TODO(fbarchard): Allow higher wpp. int dst_width_words = dst_width * wpp; - void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_16_C; + void (*InterpolateRow)(uint16 * dst_argb, const uint16* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_16_C; const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; int j; assert(wpp >= 1 && wpp <= 2); @@ -1005,9 +1152,9 @@ void ScalePlaneVertical_16(int src_height, } #endif #if defined(HAS_INTERPOLATEROW_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { + if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) && + IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_argb, 4) && + IS_ALIGNED(dst_stride, 4)) { InterpolateRow = InterpolateRow_Any_16_DSPR2; if (IS_ALIGNED(dst_width_bytes, 4)) { InterpolateRow = InterpolateRow_16_DSPR2; @@ -1022,16 +1169,18 @@ void ScalePlaneVertical_16(int src_height, } yi = y >> 16; yf = filtering ? ((y >> 8) & 255) : 0; - InterpolateRow(dst_argb, src_argb + yi * src_stride, - src_stride, dst_width_words, yf); + InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, + dst_width_words, yf); dst_argb += dst_stride; y += dy; } } // Simplify the filtering based on scale factors. -enum FilterMode ScaleFilterReduce(int src_width, int src_height, - int dst_width, int dst_height, +enum FilterMode ScaleFilterReduce(int src_width, + int src_height, + int dst_width, + int dst_height, enum FilterMode filtering) { if (src_width < 0) { src_width = -src_width; @@ -1078,17 +1227,21 @@ int FixedDiv_C(int num, int div) { // Divide num by div and return as 16.16 fixed point result. int FixedDiv1_C(int num, int div) { - return (int)((((int64)(num) << 16) - 0x00010001) / - (div - 1)); + return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1)); } #define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) // Compute slope values for stepping. -void ScaleSlope(int src_width, int src_height, - int dst_width, int dst_height, +void ScaleSlope(int src_width, + int src_height, + int dst_width, + int dst_height, enum FilterMode filtering, - int* x, int* y, int* dx, int* dy) { + int* x, + int* y, + int* dx, + int* dy) { assert(x != NULL); assert(y != NULL); assert(dx != NULL); @@ -1120,7 +1273,7 @@ void ScaleSlope(int src_width, int src_height, *x = 0; } if (dst_height <= src_height) { - *dy = FixedDiv(src_height, dst_height); + *dy = FixedDiv(src_height, dst_height); *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter. 
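To make the centering above concrete: for a downscale, dy is the 16.16 ratio src_height / dst_height and CENTERSTART places the first sample at dy / 2 - 0.5, so each output row samples the middle of the source span it covers. Worked for a 4-row to 2-row scale, again assuming the plain-division form of FixedDiv:

#include <stdio.h>

int main(void) {
  int src_h = 4, dst_h = 2;
  int dy = (int)((65536LL * src_h) / dst_h); /* 0x20000 = 2.0 */
  int y = (dy >> 1) - 32768;                 /* dy/2 - 0.5 = 0x8000 = 0.5 */
  int j;
  for (j = 0; j < dst_h; ++j) {
    printf("dst row %d samples src y = %d.%04d\n", j, y >> 16,
           (int)(((y & 0xffff) * 10000LL) >> 16)); /* 0.5000 then 2.5000 */
    y += dy;
  }
  return 0;
}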
} else if (dst_height > 1) { *dy = FixedDiv1(src_height, dst_height); diff --git a/files/source/scale_mips.cc b/files/source/scale_dspr2.cc index ae953073..ddedcbf4 100644 --- a/files/source/scale_mips.cc +++ b/files/source/scale_dspr2.cc @@ -17,168 +17,167 @@ extern "C" { #endif // This module is for GCC MIPS DSPR2 -#if !defined(LIBYUV_DISABLE_MIPS) && \ - defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) +#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \ + (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32) -void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown2_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - - "srl $t9, %[dst_width], 4 \n" // iterations -> by 16 - "beqz $t9, 2f \n" - " nop \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| - "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| - "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| - "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16| - "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20| - "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24| - "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28| - // TODO(fbarchard): Use odd pixels instead of even. - "precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0| - "precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8| - "precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16| - "precr.qb.ph $t2, $t7, $t6 \n" // |30|28|26|24| - "addiu %[src_ptr], %[src_ptr], 32 \n" - "addiu $t9, $t9, -1 \n" - "sw $t8, 0(%[dst]) \n" - "sw $t0, 4(%[dst]) \n" - "sw $t1, 8(%[dst]) \n" - "sw $t2, 12(%[dst]) \n" - "bgtz $t9, 1b \n" - " addiu %[dst], %[dst], 16 \n" - - "2: \n" - "andi $t9, %[dst_width], 0xf \n" // residue - "beqz $t9, 3f \n" - " nop \n" - - "21: \n" - "lbu $t0, 0(%[src_ptr]) \n" - "addiu %[src_ptr], %[src_ptr], 2 \n" - "addiu $t9, $t9, -1 \n" - "sb $t0, 0(%[dst]) \n" - "bgtz $t9, 21b \n" - " addiu %[dst], %[dst], 1 \n" - - "3: \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst) - : [dst_width] "r" (dst_width) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9" - ); + ".set push \n" + ".set noreorder \n" + + "srl $t9, %[dst_width], 4 \n" // iterations -> by 16 + "beqz $t9, 2f \n" + " nop \n" + + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28| + // TODO(fbarchard): Use odd pixels instead of even. 
+ "precrq.qb.ph $t8, $t1, $t0 \n" // |7|5|3|1| + "precrq.qb.ph $t0, $t3, $t2 \n" // |15|13|11|9| + "precrq.qb.ph $t1, $t5, $t4 \n" // |23|21|19|17| + "precrq.qb.ph $t2, $t7, $t6 \n" // |31|29|27|25| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "addiu $t9, $t9, -1 \n" + "sw $t8, 0(%[dst]) \n" + "sw $t0, 4(%[dst]) \n" + "sw $t1, 8(%[dst]) \n" + "sw $t2, 12(%[dst]) \n" + "bgtz $t9, 1b \n" + " addiu %[dst], %[dst], 16 \n" + + "2: \n" + "andi $t9, %[dst_width], 0xf \n" // residue + "beqz $t9, 3f \n" + " nop \n" + + "21: \n" + "lbu $t0, 1(%[src_ptr]) \n" + "addiu %[src_ptr], %[src_ptr], 2 \n" + "addiu $t9, $t9, -1 \n" + "sb $t0, 0(%[dst]) \n" + "bgtz $t9, 21b \n" + " addiu %[dst], %[dst], 1 \n" + + "3: \n" + ".set pop \n" + : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst) + : [dst_width] "r"(dst_width) + : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9"); } -void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { const uint8* t = src_ptr + src_stride; - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "srl $t9, %[dst_width], 3 \n" // iterations -> step 8 - "bltz $t9, 2f \n" - " nop \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| - "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| - "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| - "lw $t4, 0(%[t]) \n" // |19|18|17|16| - "lw $t5, 4(%[t]) \n" // |23|22|21|20| - "lw $t6, 8(%[t]) \n" // |27|26|25|24| - "lw $t7, 12(%[t]) \n" // |31|30|29|28| - "addiu $t9, $t9, -1 \n" - "srl $t8, $t0, 16 \n" // |X|X|3|2| - "ins $t0, $t4, 16, 16 \n" // |17|16|1|0| - "ins $t4, $t8, 0, 16 \n" // |19|18|3|2| - "raddu.w.qb $t0, $t0 \n" // |17+16+1+0| - "raddu.w.qb $t4, $t4 \n" // |19+18+3+2| - "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2 - "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2 - "srl $t8, $t1, 16 \n" // |X|X|7|6| - "ins $t1, $t5, 16, 16 \n" // |21|20|5|4| - "ins $t5, $t8, 0, 16 \n" // |22|23|7|6| - "raddu.w.qb $t1, $t1 \n" // |21+20+5+4| - "raddu.w.qb $t5, $t5 \n" // |23+22+7+6| - "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2 - "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2 - "srl $t8, $t2, 16 \n" // |X|X|11|10| - "ins $t2, $t6, 16, 16 \n" // |25|24|9|8| - "ins $t6, $t8, 0, 16 \n" // |27|26|11|10| - "raddu.w.qb $t2, $t2 \n" // |25+24+9+8| - "raddu.w.qb $t6, $t6 \n" // |27+26+11+10| - "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2 - "shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2 - "srl $t8, $t3, 16 \n" // |X|X|15|14| - "ins $t3, $t7, 16, 16 \n" // |29|28|13|12| - "ins $t7, $t8, 0, 16 \n" // |31|30|15|14| - "raddu.w.qb $t3, $t3 \n" // |29+28+13+12| - "raddu.w.qb $t7, $t7 \n" // |31+30+15+14| - "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2 - "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2 - "addiu %[src_ptr], %[src_ptr], 16 \n" - "addiu %[t], %[t], 16 \n" - "sb $t0, 0(%[dst]) \n" - "sb $t4, 1(%[dst]) \n" - "sb $t1, 2(%[dst]) \n" - "sb $t5, 3(%[dst]) \n" - "sb $t2, 4(%[dst]) \n" - "sb $t6, 5(%[dst]) \n" - "sb $t3, 6(%[dst]) \n" - "sb $t7, 7(%[dst]) \n" - "bgtz $t9, 1b \n" - " addiu %[dst], %[dst], 8 \n" - - "2: \n" - "andi $t9, %[dst_width], 0x7 \n" // x = residue - "beqz $t9, 3f \n" - " nop \n" - - "21: \n" - "lwr $t1, 0(%[src_ptr]) \n" - "lwl $t1, 3(%[src_ptr]) \n" - "lwr $t2, 0(%[t]) \n" - "lwl $t2, 3(%[t]) \n" - "srl $t8, $t1, 16 \n" - "ins $t1, $t2, 16, 16 \n" - "ins $t2, $t8, 0, 16 \n" - "raddu.w.qb $t1, $t1 \n" - "raddu.w.qb $t2, $t2 \n" - "shra_r.w $t1, $t1, 2 \n" - "shra_r.w $t2, $t2, 2 \n" - "sb $t1, 
0(%[dst]) \n" - "sb $t2, 1(%[dst]) \n" - "addiu %[src_ptr], %[src_ptr], 4 \n" - "addiu $t9, $t9, -2 \n" - "addiu %[t], %[t], 4 \n" - "bgtz $t9, 21b \n" - " addiu %[dst], %[dst], 2 \n" - - "3: \n" - ".set pop \n" - - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst), [t] "+r" (t) - : [dst_width] "r" (dst_width) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9" - ); + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + + "srl $t9, %[dst_width], 3 \n" // iterations -> step 8 + "bltz $t9, 2f \n" + " nop \n" + + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t4, 0(%[t]) \n" // |19|18|17|16| + "lw $t5, 4(%[t]) \n" // |23|22|21|20| + "lw $t6, 8(%[t]) \n" // |27|26|25|24| + "lw $t7, 12(%[t]) \n" // |31|30|29|28| + "addiu $t9, $t9, -1 \n" + "srl $t8, $t0, 16 \n" // |X|X|3|2| + "ins $t0, $t4, 16, 16 \n" // |17|16|1|0| + "ins $t4, $t8, 0, 16 \n" // |19|18|3|2| + "raddu.w.qb $t0, $t0 \n" // |17+16+1+0| + "raddu.w.qb $t4, $t4 \n" // |19+18+3+2| + "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2 + "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2 + "srl $t8, $t1, 16 \n" // |X|X|7|6| + "ins $t1, $t5, 16, 16 \n" // |21|20|5|4| + "ins $t5, $t8, 0, 16 \n" // |22|23|7|6| + "raddu.w.qb $t1, $t1 \n" // |21+20+5+4| + "raddu.w.qb $t5, $t5 \n" // |23+22+7+6| + "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2 + "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2 + "srl $t8, $t2, 16 \n" // |X|X|11|10| + "ins $t2, $t6, 16, 16 \n" // |25|24|9|8| + "ins $t6, $t8, 0, 16 \n" // |27|26|11|10| + "raddu.w.qb $t2, $t2 \n" // |25+24+9+8| + "raddu.w.qb $t6, $t6 \n" // |27+26+11+10| + "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2 + "shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2 + "srl $t8, $t3, 16 \n" // |X|X|15|14| + "ins $t3, $t7, 16, 16 \n" // |29|28|13|12| + "ins $t7, $t8, 0, 16 \n" // |31|30|15|14| + "raddu.w.qb $t3, $t3 \n" // |29+28+13+12| + "raddu.w.qb $t7, $t7 \n" // |31+30+15+14| + "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2 + "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2 + "addiu %[src_ptr], %[src_ptr], 16 \n" + "addiu %[t], %[t], 16 \n" + "sb $t0, 0(%[dst]) \n" + "sb $t4, 1(%[dst]) \n" + "sb $t1, 2(%[dst]) \n" + "sb $t5, 3(%[dst]) \n" + "sb $t2, 4(%[dst]) \n" + "sb $t6, 5(%[dst]) \n" + "sb $t3, 6(%[dst]) \n" + "sb $t7, 7(%[dst]) \n" + "bgtz $t9, 1b \n" + " addiu %[dst], %[dst], 8 \n" + + "2: \n" + "andi $t9, %[dst_width], 0x7 \n" // x = residue + "beqz $t9, 3f \n" + " nop \n" + + "21: \n" + "lwr $t1, 0(%[src_ptr]) \n" + "lwl $t1, 3(%[src_ptr]) \n" + "lwr $t2, 0(%[t]) \n" + "lwl $t2, 3(%[t]) \n" + "srl $t8, $t1, 16 \n" + "ins $t1, $t2, 16, 16 \n" + "ins $t2, $t8, 0, 16 \n" + "raddu.w.qb $t1, $t1 \n" + "raddu.w.qb $t2, $t2 \n" + "shra_r.w $t1, $t1, 2 \n" + "shra_r.w $t2, $t2, 2 \n" + "sb $t1, 0(%[dst]) \n" + "sb $t2, 1(%[dst]) \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "addiu $t9, $t9, -2 \n" + "addiu %[t], %[t], 4 \n" + "bgtz $t9, 21b \n" + " addiu %[dst], %[dst], 2 \n" + + "3: \n" + ".set pop \n" + + : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [t] "+r"(t) + : [dst_width] "r"(dst_width) + : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9"); } -void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - __asm__ __volatile__ ( +void ScaleRowDown4_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { + __asm__ __volatile__( ".set push \n" ".set noreorder \n" @@ -186,7 +185,7 @@ void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t 
src_stride, "beqz $t9, 2f \n" " nop \n" - "1: \n" + "1: \n" "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| @@ -199,8 +198,8 @@ void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8| "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16| "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24| - "precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0| - "precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16| + "precrq.qb.ph $t1, $t2, $t1 \n" // |14|10|6|2| + "precrq.qb.ph $t5, $t6, $t5 \n" // |30|26|22|18| "addiu %[src_ptr], %[src_ptr], 32 \n" "addiu $t9, $t9, -1 \n" "sw $t1, 0(%[dst]) \n" @@ -208,44 +207,43 @@ void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, "bgtz $t9, 1b \n" " addiu %[dst], %[dst], 8 \n" - "2: \n" + "2: \n" "andi $t9, %[dst_width], 7 \n" // residue "beqz $t9, 3f \n" " nop \n" - "21: \n" - "lbu $t1, 0(%[src_ptr]) \n" + "21: \n" + "lbu $t1, 2(%[src_ptr]) \n" "addiu %[src_ptr], %[src_ptr], 4 \n" "addiu $t9, $t9, -1 \n" "sb $t1, 0(%[dst]) \n" "bgtz $t9, 21b \n" " addiu %[dst], %[dst], 1 \n" - "3: \n" + "3: \n" ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst) - : [dst_width] "r" (dst_width) - : "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9" - ); + : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst) + : [dst_width] "r"(dst_width) + : "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9"); } -void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { intptr_t stride = src_stride; const uint8* s1 = src_ptr + stride; const uint8* s2 = s1 + stride; const uint8* s3 = s2 + stride; - __asm__ __volatile__ ( + __asm__ __volatile__( ".set push \n" ".set noreorder \n" "srl $t9, %[dst_width], 1 \n" "andi $t8, %[dst_width], 1 \n" - "1: \n" + "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t1, 0(%[s1]) \n" // |7|6|5|4| "lw $t2, 0(%[s2]) \n" // |11|10|9|8| @@ -299,23 +297,20 @@ void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, "2: \n" ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst), - [s1] "+r" (s1), - [s2] "+r" (s2), - [s3] "+r" (s3) - : [dst_width] "r" (dst_width) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6","t7", "t8", "t9" - ); + : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [s1] "+r"(s1), [s2] "+r"(s2), + [s3] "+r"(s3) + : [dst_width] "r"(dst_width) + : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9"); } -void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - __asm__ __volatile__ ( +void ScaleRowDown34_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { + __asm__ __volatile__( ".set push \n" ".set noreorder \n" - "1: \n" + "1: \n" "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| @@ -347,23 +342,21 @@ void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, "bnez %[dst_width], 1b \n" " addiu %[dst], %[dst], 24 \n" ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst), - [dst_width] "+r" (dst_width) + : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width) : - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6","t7", "t8", "t9" - ); + : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9"); } -void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - 
uint8* d, int dst_width) { - __asm__ __volatile__ ( +void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* d, + int dst_width) { + __asm__ __volatile__( ".set push \n" ".set noreorder \n" "repl.ph $t3, 3 \n" // 0x00030003 - "1: \n" + "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| "rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1| @@ -400,26 +393,24 @@ void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, "sb $t6, 2(%[d]) \n" "bgtz %[dst_width], 1b \n" " addiu %[d], %[d], 3 \n" - "3: \n" + "3: \n" ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [src_stride] "+r" (src_stride), - [d] "+r" (d), - [dst_width] "+r" (dst_width) + : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d), + [dst_width] "+r"(dst_width) : - : "t0", "t1", "t2", "t3", - "t4", "t5", "t6" - ); + : "t0", "t1", "t2", "t3", "t4", "t5", "t6"); } -void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { - __asm__ __volatile__ ( +void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* d, + int dst_width) { + __asm__ __volatile__( ".set push \n" ".set noreorder \n" "repl.ph $t2, 3 \n" // 0x00030003 - "1: \n" + "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| "rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1| @@ -452,25 +443,23 @@ void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, "sb $t6, 2(%[d]) \n" "bgtz %[dst_width], 1b \n" " addiu %[d], %[d], 3 \n" - "3: \n" + "3: \n" ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [src_stride] "+r" (src_stride), - [d] "+r" (d), - [dst_width] "+r" (dst_width) + : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d), + [dst_width] "+r"(dst_width) : - : "t0", "t1", "t2", "t3", - "t4", "t5", "t6" - ); + : "t0", "t1", "t2", "t3", "t4", "t5", "t6"); } -void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - __asm__ __volatile__ ( +void ScaleRowDown38_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { + __asm__ __volatile__( ".set push \n" ".set noreorder \n" - "1: \n" + "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| @@ -501,26 +490,24 @@ void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, "bgez $t8, 1b \n" " addiu %[dst], %[dst], 12 \n" ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst), - [dst_width] "+r" (dst_width) + : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width) : - : "t0", "t1", "t2", "t3", "t4", - "t5", "t6", "t7", "t8" - ); + : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8"); } -void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { intptr_t stride = src_stride; const uint8* t = src_ptr + stride; const int c = 0x2AAA; - __asm__ __volatile__ ( + __asm__ __volatile__( ".set push \n" ".set noreorder \n" - "1: \n" + "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| "lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0| @@ -554,18 +541,16 @@ void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, "bgtz %[dst_width], 1b \n" " sb $t0, -3(%[dst_ptr]) \n" ".set pop \n" - : [src_ptr] 
"+r" (src_ptr), - [dst_ptr] "+r" (dst_ptr), - [t] "+r" (t), - [dst_width] "+r" (dst_width) - : [c] "r" (c) - : "t0", "t1", "t2", "t3", "t4", "t5", "t6" - ); + : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [t] "+r"(t), + [dst_width] "+r"(dst_width) + : [c] "r"(c) + : "t0", "t1", "t2", "t3", "t4", "t5", "t6"); } void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { + uint8* dst_ptr, + int dst_width) { intptr_t stride = src_stride; const uint8* s1 = src_ptr + stride; stride += stride; @@ -573,11 +558,11 @@ void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, const int c1 = 0x1C71; const int c2 = 0x2AAA; - __asm__ __volatile__ ( + __asm__ __volatile__( ".set push \n" ".set noreorder \n" - "1: \n" + "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| "lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0| @@ -624,15 +609,55 @@ void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, "bgtz %[dst_width], 1b \n" " sb $t0, -3(%[dst_ptr]) \n" ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst_ptr] "+r" (dst_ptr), - [s1] "+r" (s1), - [s2] "+r" (s2), - [dst_width] "+r" (dst_width) - : [c1] "r" (c1), [c2] "r" (c2) - : "t0", "t1", "t2", "t3", "t4", - "t5", "t6", "t7", "t8" - ); + : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [s1] "+r"(s1), + [s2] "+r"(s2), [dst_width] "+r"(dst_width) + : [c1] "r"(c1), [c2] "r"(c2) + : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8"); +} + +void ScaleAddRow_DSPR2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { + int x; + for (x = 0; x < ((src_width - 1)); x += 8) { + uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4; + uint32 tmp_t5, tmp_t6, tmp_t7, tmp_t8; + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + "lw %[tmp_t5], 0(%[src_ptr]) \n" + "lw %[tmp_t6], 4(%[src_ptr]) \n" + "lw %[tmp_t1], 0(%[dst_ptr]) \n" + "lw %[tmp_t2], 4(%[dst_ptr]) \n" + "lw %[tmp_t3], 8(%[dst_ptr]) \n" + "lw %[tmp_t4], 12(%[dst_ptr]) \n" + "preceu.ph.qbr %[tmp_t7], %[tmp_t5] \n" + "preceu.ph.qbl %[tmp_t8], %[tmp_t5] \n" + "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t7] \n" + "addu.ph %[tmp_t2], %[tmp_t2], %[tmp_t8] \n" + "preceu.ph.qbr %[tmp_t7], %[tmp_t6] \n" + "preceu.ph.qbl %[tmp_t8], %[tmp_t6] \n" + "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t7] \n" + "addu.ph %[tmp_t4], %[tmp_t4], %[tmp_t8] \n" + "sw %[tmp_t1], 0(%[dst_ptr]) \n" + "sw %[tmp_t2], 4(%[dst_ptr]) \n" + "sw %[tmp_t3], 8(%[dst_ptr]) \n" + "sw %[tmp_t4], 12(%[dst_ptr]) \n" + ".set pop \n" + : + [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), [tmp_t3] "=&r"(tmp_t3), + [tmp_t4] "=&r"(tmp_t4), [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6), + [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [src_ptr] "+r"(src_ptr) + : [dst_ptr] "r"(dst_ptr)); + src_ptr += 8; + dst_ptr += 8; + } + + if ((src_width)&7) { + for (x = 0; x < ((src_width - 1) & 7); x += 1) { + dst_ptr[0] += src_ptr[0]; + src_ptr += 1; + dst_ptr += 1; + } + } } #endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) @@ -641,4 +666,3 @@ void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, } // extern "C" } // namespace libyuv #endif - diff --git a/files/source/scale_gcc.cc b/files/source/scale_gcc.cc index e2f88544..f0ac56fc 100644 --- a/files/source/scale_gcc.cc +++ b/files/source/scale_gcc.cc @@ -21,85 +21,82 @@ extern "C" { (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) // Offsets for source bytes 0 to 9 -static uvec8 kShuf0 = - { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; +static uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, + 128, 128, 128, 
128, 128, 128, 128, 128}; // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -static uvec8 kShuf1 = - { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; +static uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static uvec8 kShuf2 = - { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; +static uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 0 to 10 -static uvec8 kShuf01 = - { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; +static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -static uvec8 kShuf11 = - { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; +static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static uvec8 kShuf21 = - { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; +static uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, + 10, 11, 12, 13, 13, 14, 14, 15}; // Coefficients for source bytes 0 to 10 -static uvec8 kMadd01 = - { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; +static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; // Coefficients for source bytes 10 to 21 -static uvec8 kMadd11 = - { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; +static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; // Coefficients for source bytes 21 to 31 -static uvec8 kMadd21 = - { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; +static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; // Coefficients for source bytes 21 to 31 -static vec16 kRound34 = - { 2, 2, 2, 2, 2, 2, 2, 2 }; +static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; -static uvec8 kShuf38a = - { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; +static uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; -static uvec8 kShuf38b = - { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; +static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, + 6, 8, 11, 14, 128, 128, 128, 128}; // Arrange words 0,3,6 into 0,1,2 -static uvec8 kShufAc = - { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; +static uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; // Arrange words 0,3,6 into 3,4,5 -static uvec8 kShufAc3 = - { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; +static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, + 6, 7, 12, 13, 128, 128, 128, 128}; // Scaling values for boxes of 3x3 and 2x3 -static uvec16 kScaleAc33 = - { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; +static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, + 65536 / 9, 65536 / 6, 0, 0}; // Arrange first value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb0 = - { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; +static uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, + 11, 128, 14, 128, 128, 128, 128, 128}; // Arrange second value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb1 = - { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; +static uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, + 12, 128, 15, 128, 128, 128, 128, 128}; // Arrange third 
value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb2 = - { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; +static uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, + 13, 128, 128, 128, 128, 128, 128, 128}; // Scaling values for boxes of 3x2 and 2x2 -static uvec16 kScaleAb2 = - { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; +static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, + 65536 / 3, 65536 / 2, 0, 0}; // GCC versions of row functions are verbatim conversions from Visual C. // Generated using gcc disassembly on Visual C object file: // objdump -D yuvscaler.obj >yuvscaler.txt -void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown2_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { + (void)src_stride; asm volatile ( LABELALIGN "1: \n" @@ -120,8 +117,11 @@ void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { + (void)src_stride; asm volatile ( "pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" @@ -149,8 +149,10 @@ void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { asm volatile ( "pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" @@ -189,8 +191,11 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, } #ifdef HAS_SCALEROWDOWN2_AVX2 -void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown2_AVX2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { + (void)src_stride; asm volatile ( LABELALIGN "1: \n" @@ -213,8 +218,11 @@ void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { + (void)src_stride; asm volatile ( "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" @@ -244,8 +252,10 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown2Box_AVX2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { asm volatile ( "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" @@ -286,8 +296,11 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, } #endif // HAS_SCALEROWDOWN2_AVX2 -void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown4_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { + (void)src_stride; asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrld $0x18,%%xmm5 \n" @@ -314,8 +327,10 @@ void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int 
dst_width) { +void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { intptr_t stridex3; asm volatile ( "pcmpeqb %%xmm4,%%xmm4 \n" @@ -368,10 +383,12 @@ void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, ); } - #ifdef HAS_SCALEROWDOWN4_AVX2 -void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown4_AVX2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { + (void)src_stride; asm volatile ( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrld $0x18,%%ymm5,%%ymm5 \n" @@ -400,8 +417,10 @@ void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown4Box_AVX2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { asm volatile ( "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" @@ -455,17 +474,20 @@ void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, } #endif // HAS_SCALEROWDOWN4_AVX2 -void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm3 \n" - "movdqa %1,%%xmm4 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kShuf0), // %0 - "m"(kShuf1), // %1 - "m"(kShuf2) // %2 - ); +void ScaleRowDown34_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "movdqa %0,%%xmm3 \n" + "movdqa %1,%%xmm4 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kShuf0), // %0 + "m"(kShuf1), // %1 + "m"(kShuf2) // %2 + ); asm volatile ( LABELALIGN "1: \n" @@ -492,25 +514,26 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile ( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); + uint8* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); asm volatile ( LABELALIGN "1: \n" @@ -557,25 +580,26 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile ( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); + uint8* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // 
kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); asm volatile ( LABELALIGN @@ -624,8 +648,11 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, ); } -void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown38_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { + (void)src_stride; asm volatile ( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" @@ -655,18 +682,19 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "movdqa %3,%%xmm5 \n" - : - : "m"(kShufAb0), // %0 - "m"(kShufAb1), // %1 - "m"(kShufAb2), // %2 - "m"(kScaleAb2) // %3 - ); + uint8* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "movdqa %3,%%xmm5 \n" + : + : "m"(kShufAb0), // %0 + "m"(kShufAb1), // %1 + "m"(kShufAb2), // %2 + "m"(kScaleAb2) // %3 + ); asm volatile ( LABELALIGN "1: \n" @@ -700,17 +728,18 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - : - : "m"(kShufAc), // %0 - "m"(kShufAc3), // %1 - "m"(kScaleAc33) // %2 - ); + uint8* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + : + : "m"(kShufAc), // %0 + "m"(kShufAc3), // %1 + "m"(kScaleAc33) // %2 + ); asm volatile ( LABELALIGN "1: \n" @@ -790,7 +819,6 @@ void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { ); } - #ifdef HAS_SCALEADDROW_AVX2 // Reads 32 bytes and accumulates to 32 shorts at a time. void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { @@ -823,17 +851,19 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { // Constant for making pixels signed to avoid pmaddubsw // saturation. -static uvec8 kFsub80 = - { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; +static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // Constant for making pixels unsigned and adding .5 for rounding. -static uvec16 kFadd40 = - { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 }; +static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, + 0x4040, 0x4040, 0x4040, 0x4040}; // Bilinear column filtering. SSSE3 version. -void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +void ScaleFilterCols_SSSE3(uint8* dst_ptr, + const uint8* src_ptr, + int dst_width, + int x, + int dx) { intptr_t x0, x1, temp_pixel; asm volatile ( "movd %6,%%xmm2 \n" @@ -867,7 +897,7 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "pshufb %%xmm5,%%xmm1 \n" "punpcklwd %%xmm4,%%xmm0 \n" "psubb %8,%%xmm0 \n" // make pixels signed. 
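+ // pmaddubsw multiplies unsigned bytes by signed bytes; biasing the pixels
+ // into signed range (kFsub80) keeps the weighted sum from saturating, and
+ // kFadd40 later restores the bias while adding the rounding term.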
- "pxor %%xmm6,%%xmm1 \n" // 128 -f = (f ^ 127 ) + 1 + "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + 1 "paddusb %%xmm7,%%xmm1 \n" "pmaddubsw %%xmm0,%%xmm1 \n" "pextrw $0x1,%%xmm2,%k3 \n" @@ -925,8 +955,13 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, // Reads 4 pixels, duplicates them and writes 8 pixels. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +void ScaleColsUp2_SSE2(uint8* dst_ptr, + const uint8* src_ptr, + int dst_width, + int x, + int dx) { + (void)x; + (void)dx; asm volatile ( LABELALIGN "1: \n" @@ -950,7 +985,9 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { + uint8* dst_argb, + int dst_width) { + (void)src_stride; asm volatile ( LABELALIGN "1: \n" @@ -971,7 +1008,9 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb, void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { + uint8* dst_argb, + int dst_width) { + (void)src_stride; asm volatile ( LABELALIGN "1: \n" @@ -995,7 +1034,8 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { + uint8* dst_argb, + int dst_width) { asm volatile ( LABELALIGN "1: \n" @@ -1025,10 +1065,14 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, // Reads 4 pixels at a time. // Alignment requirement: dst_argb 16 byte aligned. -void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, uint8* dst_argb, int dst_width) { +void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, + int dst_width) { intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; + (void)src_stride; asm volatile ( "lea " MEMLEA3(0x00,1,4) ",%1 \n" "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" @@ -1059,8 +1103,10 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, // Blends four 2x2 to 4x1. // Alignment requirement: dst_argb 16 byte aligned. void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) { + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, + int dst_width) { intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; intptr_t row1 = (intptr_t)(src_stride); @@ -1102,8 +1148,11 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ); } -void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +void ScaleARGBCols_SSE2(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx) { intptr_t x0, x1; asm volatile ( "movd %5,%%xmm2 \n" @@ -1171,8 +1220,13 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, // Reads 4 pixels, duplicates them and writes 8 pixels. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx) { + (void)x; + (void)dx; asm volatile ( LABELALIGN "1: \n" @@ -1197,26 +1251,29 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, // Shuffle table for arranging 2 pixels into pairs for pmaddubsw static uvec8 kShuffleColARGB = { - 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel - 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel }; // Shuffle table for duplicating 2 fractions into 8 bytes each static uvec8 kShuffleFractions = { - 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, }; // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version -void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx) { intptr_t x0, x1; - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm5 \n" - : - : "m"(kShuffleColARGB), // %0 - "m"(kShuffleFractions) // %1 - ); + asm volatile( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm5 \n" + : + : "m"(kShuffleColARGB), // %0 + "m"(kShuffleFractions) // %1 + ); asm volatile ( "movd %5,%%xmm2 \n" @@ -1283,34 +1340,32 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, // Divide num by div and return as 16.16 fixed point result. int FixedDiv_X86(int num, int div) { - asm volatile ( - "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "idiv %1 \n" - "mov %0, %%eax \n" - : "+a"(num) // %0 - : "c"(div) // %1 - : "memory", "cc", "edx" - ); + asm volatile( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx"); return num; } // Divide num - 1 by div - 1 and return as 16.16 fixed point result. int FixedDiv1_X86(int num, int div) { - asm volatile ( - "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "sub $0x10001,%%eax \n" - "sbb $0x0,%%edx \n" - "sub $0x1,%1 \n" - "idiv %1 \n" - "mov %0, %%eax \n" - : "+a"(num) // %0 - : "c"(div) // %1 - : "memory", "cc", "edx" - ); + asm volatile( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "sub $0x10001,%%eax \n" + "sbb $0x0,%%edx \n" + "sub $0x1,%1 \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx"); return num; } diff --git a/files/source/scale_msa.cc b/files/source/scale_msa.cc new file mode 100644 index 00000000..bfcd10fc --- /dev/null +++ b/files/source/scale_msa.cc @@ -0,0 +1,553 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> + +#include "libyuv/scale_row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + v16u8 src0, src1, dst0; + (void)src_stride; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + ST_UB(dst0, dst_argb); + src_argb += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + v16u8 src0, src1, vec0, vec1, dst0; + (void)src_stride; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); + vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1); + ST_UB(dst0, dst_argb); + src_argb += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + const uint8_t* s = src_argb; + const uint8_t* t = src_argb + src_stride; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; + v8u16 reg0, reg1, reg2, reg3; + v16i8 shuffler = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15}; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); + vec0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); + vec2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src2, (v16i8)src2); + vec3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src3, (v16i8)src3); + reg0 = __msa_hadd_u_h(vec0, vec0); + reg1 = __msa_hadd_u_h(vec1, vec1); + reg2 = __msa_hadd_u_h(vec2, vec2); + reg3 = __msa_hadd_u_h(vec3, vec3); + reg0 += reg2; + reg1 += reg3; + reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2); + reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + ST_UB(dst0, dst_argb); + s += 32; + t += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_argb, + int dst_width) { + int x; + int32_t stepx = src_stepx * 4; + int32_t data0, data1, data2, data3; + (void)src_stride; + + for (x = 0; x < dst_width; x += 4) { + data0 = LW(src_argb); + data1 = LW(src_argb + stepx); + data2 = LW(src_argb + stepx * 2); + data3 = LW(src_argb + stepx * 3); + SW(data0, dst_argb); + SW(data1, dst_argb + 4); + SW(data2, dst_argb + 8); + SW(data3, dst_argb + 12); + src_argb += stepx * 4; + dst_argb += 16; + } +} + +void ScaleARGBRowDownEvenBox_MSA(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, + int dst_width) { + int x; + const uint8* nxt_argb = src_argb + src_stride; + int32_t stepx = src_stepx * 4; + int64_t data0, data1, data2, data3; + v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0}; + v16u8 vec0, vec1, vec2, vec3; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 dst0; + + for (x = 0; x < dst_width; x += 4) { + data0 = 
LD(src_argb); + data1 = LD(src_argb + stepx); + data2 = LD(src_argb + stepx * 2); + data3 = LD(src_argb + stepx * 3); + src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0); + src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1); + src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2); + src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3); + data0 = LD(nxt_argb); + data1 = LD(nxt_argb + stepx); + data2 = LD(nxt_argb + stepx * 2); + data3 = LD(nxt_argb + stepx * 3); + src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0); + src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1); + src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2); + src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3); + vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + reg0 = __msa_hadd_u_h(vec0, vec0); + reg1 = __msa_hadd_u_h(vec1, vec1); + reg2 = __msa_hadd_u_h(vec2, vec2); + reg3 = __msa_hadd_u_h(vec3, vec3); + reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0); + reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1); + reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0); + reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1); + reg4 += reg6; + reg5 += reg7; + reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2); + reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + ST_UB(dst0, dst_argb); + src_argb += stepx * 4; + nxt_argb += stepx * 4; + dst_argb += 16; + } +} + +void ScaleRowDown2_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + (void)src_stride; + + for (x = 0; x < dst_width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst, 16); + src_ptr += 64; + dst += 32; + } +} + +void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1; + (void)src_stride; + + for (x = 0; x < dst_width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + dst0 = __msa_aver_u_b(vec1, vec0); + dst1 = __msa_aver_u_b(vec3, vec2); + ST_UB2(dst0, dst1, dst, 16); + src_ptr += 64; + dst += 32; + } +} + +void ScaleRowDown2Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3; + + for (x = 0; x < dst_width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = 
(v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + vec0 = __msa_hadd_u_h(src0, src0); + vec1 = __msa_hadd_u_h(src1, src1); + vec2 = __msa_hadd_u_h(src2, src2); + vec3 = __msa_hadd_u_h(src3, src3); + vec0 += __msa_hadd_u_h(src4, src4); + vec1 += __msa_hadd_u_h(src5, src5); + vec2 += __msa_hadd_u_h(src6, src6); + vec3 += __msa_hadd_u_h(src7, src7); + vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2); + vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2); + vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2); + vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + ST_UB2(dst0, dst1, dst, 16); + s += 64; + t += 64; + dst += 32; + } +} + +void ScaleRowDown4_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + v16u8 src0, src1, src2, src3, vec0, vec1, dst0; + (void)src_stride; + + for (x = 0; x < dst_width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst); + src_ptr += 64; + dst += 16; + } +} + +void ScaleRowDown4Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + const uint8_t* s = src_ptr; + const uint8_t* t0 = s + src_stride; + const uint8_t* t1 = s + src_stride * 2; + const uint8_t* t2 = s + src_stride * 3; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0; + v8u16 vec0, vec1, vec2, vec3; + v4u32 reg0, reg1, reg2, reg3; + + for (x = 0; x < dst_width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48); + vec0 = __msa_hadd_u_h(src0, src0); + vec1 = __msa_hadd_u_h(src1, src1); + vec2 = __msa_hadd_u_h(src2, src2); + vec3 = __msa_hadd_u_h(src3, src3); + vec0 += __msa_hadd_u_h(src4, src4); + vec1 += __msa_hadd_u_h(src5, src5); + vec2 += __msa_hadd_u_h(src6, src6); + vec3 += __msa_hadd_u_h(src7, src7); + src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48); + vec0 += __msa_hadd_u_h(src0, src0); + vec1 += __msa_hadd_u_h(src1, src1); + vec2 += __msa_hadd_u_h(src2, src2); + vec3 += __msa_hadd_u_h(src3, src3); + vec0 += __msa_hadd_u_h(src4, src4); + vec1 += __msa_hadd_u_h(src5, src5); + vec2 += __msa_hadd_u_h(src6, src6); + vec3 += __msa_hadd_u_h(src7, src7); + reg0 = __msa_hadd_u_w(vec0, vec0); + reg1 = __msa_hadd_u_w(vec1, vec1); + reg2 = __msa_hadd_u_w(vec2, vec2); + reg3 = __msa_hadd_u_w(vec3, vec3); + reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4); + reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4); + reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4); + reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4); + vec0 = 
(v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst); + s += 64; + t0 += 64; + t1 += 64; + t2 += 64; + dst += 16; + } +} + +void ScaleRowDown38_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x, width; + uint64_t dst0; + uint32_t dst1; + v16u8 src0, src1, vec0; + v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0}; + (void)src_stride; + + assert(dst_width % 3 == 0); + width = dst_width / 3; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0); + dst0 = __msa_copy_u_d((v2i64)vec0, 0); + dst1 = __msa_copy_u_w((v4i32)vec0, 2); + SD(dst0, dst); + SW(dst1, dst + 8); + src_ptr += 32; + dst += 12; + } +} + +void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + int x, width; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + uint64_t dst0; + uint32_t dst1; + v16u8 src0, src1, src2, src3, out; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; + v8i16 zero = {0}; + v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; + v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; + v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); + v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000); + + assert((dst_width % 3 == 0) && (dst_width > 0)); + width = dst_width / 3; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0); + vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1); + vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2); + vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + tmp0 = __msa_hadd_u_w(vec4, vec4); + tmp1 = __msa_hadd_u_w(vec5, vec5); + tmp2 = __msa_hadd_u_w(vec6, vec6); + tmp3 = __msa_hadd_u_w(vec7, vec7); + tmp4 = __msa_hadd_u_w(vec0, vec0); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + tmp0 = __msa_hadd_u_w(vec0, vec0); + tmp1 = __msa_hadd_u_w(vec1, vec1); + tmp0 *= const_0x2AAA; + tmp1 *= const_0x2AAA; + tmp4 *= const_0x4000; + tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); + tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); + tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); + out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0); + dst0 = __msa_copy_u_d((v2i64)out, 0); + dst1 = __msa_copy_u_w((v4i32)out, 2); + SD(dst0, dst_ptr); + SW(dst1, dst_ptr + 8); + s += 
32; + t += 32; + dst_ptr += 12; + } +} + +void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + int x, width; + const uint8_t* s = src_ptr; + const uint8_t* t0 = s + src_stride; + const uint8_t* t1 = s + src_stride * 2; + uint64_t dst0; + uint32_t dst1; + v16u8 src0, src1, src2, src3, src4, src5, out; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; + v8u16 zero = {0}; + v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; + v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; + v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71); + v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); + + assert((dst_width % 3 == 0) && (dst_width > 0)); + width = dst_width / 3; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16); + src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4); + vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4); + vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5); + vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); + vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); + vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); + vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0); + vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1); + vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2); + vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + tmp0 = __msa_hadd_u_w(vec4, vec4); + tmp1 = __msa_hadd_u_w(vec5, vec5); + tmp2 = __msa_hadd_u_w(vec6, vec6); + tmp3 = __msa_hadd_u_w(vec7, vec7); + tmp4 = __msa_hadd_u_w(vec0, vec0); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + tmp0 = __msa_hadd_u_w(vec0, vec0); + tmp1 = __msa_hadd_u_w(vec1, vec1); + tmp0 *= const_0x1C71; + tmp1 *= const_0x1C71; + tmp4 *= const_0x2AAA; + tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); + tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); + tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); + out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0); + dst0 = __msa_copy_u_d((v2i64)out, 0); + dst1 = __msa_copy_u_w((v4i32)out, 2); + SD(dst0, dst_ptr); + SW(dst1, dst_ptr + 8); + s += 32; + t0 += 32; + t1 += 32; + dst_ptr += 12; + } +} + +void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { + int x; + v16u8 src0; + v8u16 dst0, dst1; + v16i8 zero = {0}; + + assert(src_width > 0); + + for (x = 0; x < src_width; x += 16) { + src0 = LD_UB(src_ptr); + dst0 
= (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0); + dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16); + dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0); + dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0); + ST_UH2(dst0, dst1, dst_ptr, 8); + src_ptr += 16; + dst_ptr += 16; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/files/source/scale_neon.cc b/files/source/scale_neon.cc index 44b0c808..9b4dce33 100644 --- a/files/source/scale_neon.cc +++ b/files/source/scale_neon.cc @@ -23,8 +23,11 @@ extern "C" { // Provided by Fritz Koenig // Read 32x1 throw away even pixels, and write 16x1. -void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown2_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { + (void)src_stride; asm volatile ( "1: \n" // load even pixels into q0, odd into q1 @@ -43,8 +46,11 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, } // Read 32x1 average down and write 16x1. -void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown2Linear_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { + (void)src_stride; asm volatile ( "1: \n" MEMACCESS(0) @@ -66,8 +72,10 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, } // Read 32x2 average down and write 16x1. -void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown2Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { asm volatile ( // change the stride to row 2 pointer "add %1, %0 \n" @@ -95,8 +103,11 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown4_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { + (void)src_stride; asm volatile ( "1: \n" MEMACCESS(0) @@ -113,12 +124,14 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown4Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { const uint8* src_ptr1 = src_ptr + src_stride; const uint8* src_ptr2 = src_ptr + src_stride * 2; const uint8* src_ptr3 = src_ptr + src_stride * 3; -asm volatile ( + asm volatile ( "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load up 16x4 @@ -155,7 +168,9 @@ asm volatile ( // Point samples 32 pixels to 24 pixels. 
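+ // 3/4 point sampling: of every 4 source pixels, pixels 0, 1 and 3 are kept.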
void ScaleRowDown34_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { + uint8* dst_ptr, + int dst_width) { + (void)src_stride; asm volatile ( "1: \n" MEMACCESS(0) @@ -175,7 +190,8 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { + uint8* dst_ptr, + int dst_width) { asm volatile ( "vmov.u8 d24, #3 \n" "add %3, %0 \n" @@ -234,7 +250,8 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { + uint8* dst_ptr, + int dst_width) { asm volatile ( "vmov.u8 d24, #3 \n" "add %3, %0 \n" @@ -274,21 +291,20 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, } #define HAS_SCALEROWDOWN38_NEON -static uvec8 kShuf38 = - { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; -static uvec8 kShuf38_2 = - { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; -static vec16 kMult38_Div6 = - { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; -static vec16 kMult38_Div9 = - { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; +static uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0}; +static uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12, + 18, 6, 14, 19, 0, 0, 0, 0}; +static vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12}; +static vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18}; // 32 -> 12 void ScaleRowDown38_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { + uint8* dst_ptr, + int dst_width) { + (void)src_stride; asm volatile ( MEMACCESS(3) "vld1.8 {q3}, [%3] \n" @@ -314,7 +330,8 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, // 32x3 -> 12x1 void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { + uint8* dst_ptr, + int dst_width) { const uint8* src_ptr1 = src_ptr + src_stride * 2; asm volatile ( @@ -433,7 +450,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, // 32x2 -> 12x1 void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { + uint8* dst_ptr, + int dst_width) { asm volatile ( MEMACCESS(4) "vld1.16 {q13}, [%4] \n" @@ -530,8 +548,11 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ); } -void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height) { +void ScaleAddRows_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint16* dst_ptr, + int src_width, + int src_height) { const uint8* src_tmp; asm volatile ( "1: \n" @@ -563,6 +584,7 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ); } +// clang-format off // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping #define LOAD2_DATA8_LANE(n) \ @@ -571,13 +593,17 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "add %3, %3, %4 \n" \ MEMACCESS(6) \ "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n" +// clang-format on -// The NEON version mimics this formula: +// The NEON version mimics this formula (from row_common.cc): // #define BLENDER(a, b, f) (uint8)((int)(a) + -// ((int)(f) * ((int)(b) - (int)(a)) >> 16)) +// ((((int)((f)) * ((int)(b) - (int)(a))) + 
0x8000) >> 16)) -void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +void ScaleFilterCols_NEON(uint8* dst_ptr, + const uint8* src_ptr, + int dst_width, + int x, + int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; const uint8* src_tmp = src_ptr; @@ -640,8 +666,10 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, // 16x2 -> 16x1 void ScaleFilterRows_NEON(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { + const uint8* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { asm volatile ( "cmp %4, #0 \n" "beq 100f \n" @@ -737,8 +765,11 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ); } -void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { + (void)src_stride; asm volatile ( "1: \n" // load even pixels into q0, odd into q1 @@ -760,8 +791,11 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { +void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, + int dst_width) { + (void)src_stride; asm volatile ( "1: \n" MEMACCESS(0) @@ -788,8 +822,10 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, ); } -void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { asm volatile ( // change the stride to row 2 pointer "add %1, %1, %0 \n" @@ -829,8 +865,12 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, uint8* dst_argb, int dst_width) { +void ScaleARGBRowDownEven_NEON(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, + int dst_width) { + (void)src_stride; asm volatile ( "mov r12, %3, lsl #2 \n" "1: \n" @@ -856,9 +896,11 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. 
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, +void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) { + uint8* dst_argb, + int dst_width) { asm volatile ( "mov r12, %4, lsl #2 \n" "add %1, %1, %0 \n" @@ -902,17 +944,22 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, ); } +// clang-format off // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD1_DATA32_LANE(dn, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "vld1.32 {"#dn"["#n"]}, [%6] \n" - -void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +#define LOAD1_DATA32_LANE(dn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "vld1.32 {" #dn "[" #n "]}, [%6] \n" +// clang-format on + +void ScaleARGBCols_NEON(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx) { int tmp; const uint8* src_tmp = src_argb; asm volatile ( @@ -944,17 +991,22 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, #undef LOAD1_DATA32_LANE +// clang-format off // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD2_DATA32_LANE(dn1, dn2, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "vld2.32 {"#dn1"["#n"], "#dn2"["#n"]}, [%6] \n" - -void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +#define LOAD2_DATA32_LANE(dn1, dn2, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n" +// clang-format on + +void ScaleARGBFilterCols_NEON(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; const uint8* src_tmp = src_argb; diff --git a/files/source/scale_neon64.cc b/files/source/scale_neon64.cc index ff277f26..a98b9d03 100644 --- a/files/source/scale_neon64.cc +++ b/files/source/scale_neon64.cc @@ -21,8 +21,11 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) // Read 32x1 throw away even pixels, and write 16x1. -void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown2_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { + (void)src_stride; asm volatile ( "1: \n" // load even pixels into v0, odd into v1 @@ -41,8 +44,11 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, } // Read 32x1 average down and write 16x1. -void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown2Linear_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { + (void)src_stride; asm volatile ( "1: \n" MEMACCESS(0) @@ -64,8 +70,10 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, } // Read 32x2 average down and write 16x1. 
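For reference, the 2x2 box average that the aarch64 ScaleRowDown2Box_NEON kernel below vectorizes reduces to the scalar loop sketched here. This is illustrative only (the _Sketch name is not in libyuv); it mirrors the rounded (sum + 2) / 4 behavior the surrounding comments describe, which the NEON code gets from a rounding narrowing shift.

#include <stddef.h>
typedef unsigned char uint8;

// Scalar sketch of the 2x2 box average: each output pixel is the
// rounded mean of a 2x2 block taken from two adjacent source rows.
static void ScaleRowDown2Box_Sketch(const uint8* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint8* dst, int dst_width) {
  const uint8* s = src_ptr;               // row 0
  const uint8* t = src_ptr + src_stride;  // row 1
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);  // (sum + 2) / 4
    s += 2;
    t += 2;
  }
}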
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown2Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { asm volatile ( // change the stride to row 2 pointer "add %1, %1, %0 \n" @@ -93,8 +101,11 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown4_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { + (void)src_stride; asm volatile ( "1: \n" MEMACCESS(0) @@ -111,12 +122,14 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown4Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { const uint8* src_ptr1 = src_ptr + src_stride; const uint8* src_ptr2 = src_ptr + src_stride * 2; const uint8* src_ptr3 = src_ptr + src_stride * 3; -asm volatile ( + asm volatile ( "1: \n" MEMACCESS(0) "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 @@ -152,15 +165,17 @@ asm volatile ( // Point samples 32 pixels to 24 pixels. void ScaleRowDown34_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { + uint8* dst_ptr, + int dst_width) { + (void)src_stride; asm volatile ( "1: \n" MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "subs %w2, %w2, #24 \n" "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2 MEMACCESS(1) - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -172,15 +187,16 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { + uint8* dst_ptr, + int dst_width) { asm volatile ( "movi v20.8b, #3 \n" "add %3, %3, %0 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 MEMACCESS(3) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 "subs %w2, %w2, #24 \n" // filter src line 0 with src line 1 @@ -232,7 +248,8 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { + uint8* dst_ptr, + int dst_width) { asm volatile ( "movi v20.8b, #3 \n" "add %3, %3, %0 \n" @@ -273,29 +290,28 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, ); } -static uvec8 kShuf38 = - { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; -static uvec8 kShuf38_2 = - { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 }; -static vec16 kMult38_Div6 = - { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; -static vec16 kMult38_Div9 = - { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; +static uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0}; +static uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20, + 34, 6, 22, 35, 0, 0, 0, 0}; +static vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12}; +static 
vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18}; // 32 -> 12 void ScaleRowDown38_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { + uint8* dst_ptr, + int dst_width) { + (void)src_stride; asm volatile ( MEMACCESS(3) "ld1 {v3.16b}, [%3] \n" "1: \n" MEMACCESS(0) - "ld1 {v0.16b,v1.16b}, [%0], #32 \n" + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" "subs %w2, %w2, #12 \n" - "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" + "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" MEMACCESS(1) "st1 {v2.8b}, [%1], #8 \n" MEMACCESS(1) @@ -312,7 +328,8 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, // 32x3 -> 12x1 void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { + uint8* dst_ptr, + int dst_width) { const uint8* src_ptr1 = src_ptr + src_stride * 2; ptrdiff_t tmp_src_stride = src_stride; @@ -441,7 +458,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, // 32x2 -> 12x1 void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { + uint8* dst_ptr, + int dst_width) { // TODO(fbarchard): use src_stride directly for clang 3.5+. ptrdiff_t tmp_src_stride = src_stride; asm volatile ( @@ -545,8 +563,11 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ); } -void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height) { +void ScaleAddRows_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint16* dst_ptr, + int src_width, + int src_height) { const uint8* src_tmp; asm volatile ( "1: \n" @@ -578,23 +599,32 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ); } +// clang-format off // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD2_DATA8_LANE(n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "ld2 {v4.b, v5.b}["#n"], [%6] \n" - -void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +#define LOAD2_DATA8_LANE(n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "ld2 {v4.b, v5.b}[" #n "], [%6] \n" +// clang-format on + +// The NEON version mimics this formula (from row_common.cc): +// #define BLENDER(a, b, f) (uint8)((int)(a) + +// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) + +void ScaleFilterCols_NEON(uint8* dst_ptr, + const uint8* src_ptr, + int dst_width, + int x, + int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; const uint8* src_tmp = src_ptr; - int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. - int64 x64 = (int64) x; - int64 dx64 = (int64) dx; + int64 dst_width64 = (int64)dst_width; // Work around ios 64 bit warning. 
+ int64 x64 = (int64)x; + int64 dx64 = (int64)dx; asm volatile ( "dup v0.4s, %w3 \n" // x "dup v1.4s, %w4 \n" // dx @@ -626,8 +656,8 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, "ushll2 v6.4s, v6.8h, #0 \n" "mul v16.4s, v16.4s, v7.4s \n" "mul v17.4s, v17.4s, v6.4s \n" - "rshrn v6.4h, v16.4s, #16 \n" - "rshrn2 v6.8h, v17.4s, #16 \n" + "rshrn v6.4h, v16.4s, #16 \n" + "rshrn2 v6.8h, v17.4s, #16 \n" "add v4.8h, v4.8h, v6.8h \n" "xtn v4.8b, v4.8h \n" @@ -654,9 +684,11 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, // 16x2 -> 16x1 void ScaleFilterRows_NEON(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { - int y_fraction = 256 - source_y_fraction; + const uint8* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y_fraction = 256 - source_y_fraction; asm volatile ( "cmp %w4, #0 \n" "b.eq 100f \n" @@ -752,8 +784,11 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ); } -void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { + (void)src_stride; asm volatile ( "1: \n" // load even pixels into q0, odd into q1 @@ -775,8 +810,11 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { +void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, + int dst_width) { + (void)src_stride; asm volatile ( "1: \n" MEMACCESS (0) @@ -802,8 +840,10 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, ); } -void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst, + int dst_width) { asm volatile ( // change the stride to row 2 pointer "add %1, %1, %0 \n" @@ -839,8 +879,12 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, uint8* dst_argb, int dst_width) { +void ScaleARGBRowDownEven_NEON(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, + int dst_width) { + (void)src_stride; asm volatile ( "1: \n" MEMACCESS(0) @@ -867,9 +911,11 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, // Alignment requirement: src_argb 4 byte aligned. // TODO(Yang Zhang): Might be worth another optimization pass in future. // It could be upgraded to 8 pixels at a time to start with. 
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, +void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) { + uint8* dst_argb, + int dst_width) { asm volatile ( "add %1, %1, %0 \n" "1: \n" @@ -916,21 +962,26 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, ); } +// clang-format off // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD1_DATA32_LANE(vn, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "ld1 {"#vn".s}["#n"], [%6] \n" - -void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +#define LOAD1_DATA32_LANE(vn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "ld1 {" #vn ".s}[" #n "], [%6] \n" +// clang-format on + +void ScaleARGBCols_NEON(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx) { const uint8* src_tmp = src_argb; - int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. - int64 x64 = (int64) x; - int64 dx64 = (int64) dx; + int64 dst_width64 = (int64)dst_width; // Work around ios 64 bit warning. + int64 x64 = (int64)x; + int64 dx64 = (int64)dx; int64 tmp64; asm volatile ( "1: \n" @@ -961,23 +1012,28 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, #undef LOAD1_DATA32_LANE +// clang-format off // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD2_DATA32_LANE(vn1, vn2, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "ld2 {"#vn1".s, "#vn2".s}["#n"], [%6] \n" - -void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +#define LOAD2_DATA32_LANE(vn1, vn2, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n" +// clang-format on + +void ScaleARGBFilterCols_NEON(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; const uint8* src_tmp = src_argb; - int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. - int64 x64 = (int64) x; - int64 dx64 = (int64) dx; + int64 dst_width64 = (int64)dst_width; // Work around ios 64 bit warning. + int64 x64 = (int64)x; + int64 dx64 = (int64)dx; asm volatile ( "dup v0.4s, %w3 \n" // x "dup v1.4s, %w4 \n" // dx diff --git a/files/source/scale_win.cc b/files/source/scale_win.cc index f1709736..0c5b3a1e 100644 --- a/files/source/scale_win.cc +++ b/files/source/scale_win.cc @@ -20,94 +20,89 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) // Offsets for source bytes 0 to 9 -static uvec8 kShuf0 = - { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; +static uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -static uvec8 kShuf1 = - { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; +static uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
-static uvec8 kShuf2 = - { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; +static uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 0 to 10 -static uvec8 kShuf01 = - { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; +static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -static uvec8 kShuf11 = - { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; +static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static uvec8 kShuf21 = - { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; +static uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, + 10, 11, 12, 13, 13, 14, 14, 15}; // Coefficients for source bytes 0 to 10 -static uvec8 kMadd01 = - { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; +static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; // Coefficients for source bytes 10 to 21 -static uvec8 kMadd11 = - { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; +static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; // Coefficients for source bytes 21 to 31 -static uvec8 kMadd21 = - { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; +static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; // Coefficients for source bytes 21 to 31 -static vec16 kRound34 = - { 2, 2, 2, 2, 2, 2, 2, 2 }; +static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; -static uvec8 kShuf38a = - { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; +static uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; -static uvec8 kShuf38b = - { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; +static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, + 6, 8, 11, 14, 128, 128, 128, 128}; // Arrange words 0,3,6 into 0,1,2 -static uvec8 kShufAc = - { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; +static uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; // Arrange words 0,3,6 into 3,4,5 -static uvec8 kShufAc3 = - { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; +static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, + 6, 7, 12, 13, 128, 128, 128, 128}; // Scaling values for boxes of 3x3 and 2x3 -static uvec16 kScaleAc33 = - { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; +static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, + 65536 / 9, 65536 / 6, 0, 0}; // Arrange first value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb0 = - { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; +static uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, + 11, 128, 14, 128, 128, 128, 128, 128}; // Arrange second value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb1 = - { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; +static uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, + 12, 128, 15, 128, 128, 128, 128, 128}; // Arrange third value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb2 = - { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; +static uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, + 13, 128, 128, 128, 128, 128, 128, 128}; // Scaling values for boxes of 3x2 and 2x2 -static uvec16 kScaleAb2 = - { 65536 / 3, 65536 / 3, 65536 / 2, 
65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; +static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, + 65536 / 3, 65536 / 2, 0, 0}; // Reads 32 pixels, throws half away and writes 16 pixels. -__declspec(naked) -void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - psrlw xmm0, 8 // isolate odd pixels. + psrlw xmm0, 8 // isolate odd pixels. psrlw xmm1, 8 packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -120,27 +115,28 @@ void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 32x1 rectangle to 16x1. -__declspec(naked) -void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width - pcmpeqb xmm4, xmm4 // constant 0x0101 + pcmpeqb xmm4, xmm4 // constant 0x0101 psrlw xmm4, 15 packuswb xmm4, xmm4 - pxor xmm5, xmm5 // constant 0 + pxor xmm5, xmm5 // constant 0 wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm0, xmm4 // horizontal add pmaddubsw xmm1, xmm4 - pavgw xmm0, xmm5 // (x + 1) / 2 + pavgw xmm0, xmm5 // (x + 1) / 2 pavgw xmm1, xmm5 packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -153,20 +149,21 @@ void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 32x2 rectangle to 16x1. 
-__declspec(naked) -void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width - pcmpeqb xmm4, xmm4 // constant 0x0101 + pcmpeqb xmm4, xmm4 // constant 0x0101 psrlw xmm4, 15 packuswb xmm4, xmm4 - pxor xmm5, xmm5 // constant 0 + pxor xmm5, xmm5 // constant 0 wloop: movdqu xmm0, [eax] @@ -174,15 +171,15 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, movdqu xmm2, [eax + esi] movdqu xmm3, [eax + esi + 16] lea eax, [eax + 32] - pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm0, xmm4 // horizontal add pmaddubsw xmm1, xmm4 pmaddubsw xmm2, xmm4 pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // vertical add + paddw xmm0, xmm2 // vertical add paddw xmm1, xmm3 psrlw xmm0, 1 psrlw xmm1, 1 - pavgw xmm0, xmm5 // (x + 1) / 2 + pavgw xmm0, xmm5 // (x + 1) / 2 pavgw xmm1, xmm5 packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -197,23 +194,24 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, #ifdef HAS_SCALEROWDOWN2_AVX2 // Reads 64 pixels, throws half away and writes 32 pixels. -__declspec(naked) -void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2_AVX2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width wloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // isolate odd pixels. + vpsrlw ymm0, ymm0, 8 // isolate odd pixels. vpsrlw ymm1, ymm1, 8 vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], ymm0 lea edx, [edx + 32] sub ecx, 32 @@ -225,30 +223,31 @@ void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 64x1 rectangle to 32x1. 
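The pmaddubsw-against-0x0101 / pavgw-with-zero pairing used by these linear kernels computes a rounded horizontal average of adjacent pixel pairs: the multiply-add sums each pair, and averaging the sum with zero yields (sum + 1) / 2. A scalar sketch of one row (the _Sketch name is hypothetical):

typedef unsigned char uint8;

// Scalar sketch of the horizontal-only (linear) 2x downsample:
// each output pixel is the rounded average of two adjacent inputs.
static void ScaleRowDown2Linear_Sketch(const uint8* src_ptr, uint8* dst,
                                       int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8)((src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1);
  }
}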
-__declspec(naked) -void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width - vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b + vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b vpsrlw ymm4, ymm4, 15 vpackuswb ymm4, ymm4, ymm4 - vpxor ymm5, ymm5, ymm5 // constant 0 + vpxor ymm5, ymm5, ymm5 // constant 0 wloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add vpmaddubsw ymm1, ymm1, ymm4 - vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 + vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 vpavgw ymm1, ymm1, ymm5 vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], ymm0 lea edx, [edx + 32] sub ecx, 32 @@ -262,20 +261,21 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, // For rounding, average = (sum + 2) / 4 // becomes average((sum >> 1), 0) // Blends 64x2 rectangle to 32x1. -__declspec(naked) -void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width - vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b + vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b vpsrlw ymm4, ymm4, 15 vpackuswb ymm4, ymm4, ymm4 - vpxor ymm5, ymm5, ymm5 // constant 0 + vpxor ymm5, ymm5, ymm5 // constant 0 wloop: vmovdqu ymm0, [eax] @@ -283,18 +283,18 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, vmovdqu ymm2, [eax + esi] vmovdqu ymm3, [eax + esi + 32] lea eax, [eax + 64] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add vpmaddubsw ymm1, ymm1, ymm4 vpmaddubsw ymm2, ymm2, ymm4 vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // vertical add + vpaddw ymm0, ymm0, ymm2 // vertical add vpaddw ymm1, ymm1, ymm3 - vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2 + vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2 vpsrlw ymm1, ymm1, 1 - vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 + vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 vpavgw ymm1, ymm1, ymm5 vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], ymm0 lea edx, [edx + 32] sub ecx, 32 @@ -308,15 +308,16 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, #endif // HAS_SCALEROWDOWN2_AVX2 // Point samples 32 pixels to 8 pixels. 
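The 0x00ff0000 mask generated in the /4 point samplers below keeps byte 2 of every 4-byte group, so one pixel out of each four survives. A scalar sketch (hypothetical name), assuming the same choice of surviving pixel as libyuv's C fallback:

typedef unsigned char uint8;

// Scalar sketch of /4 point sampling: keep the pixel at index 2 of
// every group of four and discard the other three.
static void ScaleRowDown4_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                 int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 4 + 2];
  }
}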
-__declspec(naked) -void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown4_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 psrld xmm5, 24 pslld xmm5, 16 @@ -339,50 +340,51 @@ void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 32x4 rectangle to 8x1. -__declspec(naked) -void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_ptr - mov esi, [esp + 8 + 8] // src_stride - mov edx, [esp + 8 + 12] // dst_ptr - mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width lea edi, [esi + esi * 2] // src_stride * 3 - pcmpeqb xmm4, xmm4 // constant 0x0101 + pcmpeqb xmm4, xmm4 // constant 0x0101 psrlw xmm4, 15 movdqa xmm5, xmm4 packuswb xmm4, xmm4 - psllw xmm5, 3 // constant 0x0008 + psllw xmm5, 3 // constant 0x0008 wloop: - movdqu xmm0, [eax] // average rows + movdqu xmm0, [eax] // average rows movdqu xmm1, [eax + 16] movdqu xmm2, [eax + esi] movdqu xmm3, [eax + esi + 16] - pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm0, xmm4 // horizontal add pmaddubsw xmm1, xmm4 pmaddubsw xmm2, xmm4 pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // vertical add rows 0, 1 + paddw xmm0, xmm2 // vertical add rows 0, 1 paddw xmm1, xmm3 movdqu xmm2, [eax + esi * 2] movdqu xmm3, [eax + esi * 2 + 16] pmaddubsw xmm2, xmm4 pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // add row 2 + paddw xmm0, xmm2 // add row 2 paddw xmm1, xmm3 movdqu xmm2, [eax + edi] movdqu xmm3, [eax + edi + 16] lea eax, [eax + 32] pmaddubsw xmm2, xmm4 pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // add row 3 + paddw xmm0, xmm2 // add row 3 paddw xmm1, xmm3 phaddw xmm0, xmm1 - paddw xmm0, xmm5 // + 8 for round - psrlw xmm0, 4 // /16 for average of 4 * 4 + paddw xmm0, xmm5 // + 8 for round + psrlw xmm0, 4 // /16 for average of 4 * 4 packuswb xmm0, xmm0 movq qword ptr [edx], xmm0 lea edx, [edx + 8] @@ -397,15 +399,16 @@ void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, #ifdef HAS_SCALEROWDOWN4_AVX2 // Point samples 64 pixels to 16 pixels. 
-__declspec(naked) -void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown4_AVX2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000 + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000 vpsrld ymm5, ymm5, 24 vpslld ymm5, ymm5, 16 @@ -416,10 +419,10 @@ void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, vpand ymm0, ymm0, ymm5 vpand ymm1, ymm1, ymm5 vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vpsrlw ymm0, ymm0, 8 vpackuswb ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 @@ -431,52 +434,53 @@ void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 64x4 rectangle to 16x1. -__declspec(naked) -void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_ptr - mov esi, [esp + 8 + 8] // src_stride - mov edx, [esp + 8 + 12] // dst_ptr - mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width lea edi, [esi + esi * 2] // src_stride * 3 - vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101 + vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101 vpsrlw ymm4, ymm4, 15 - vpsllw ymm5, ymm4, 3 // constant 0x0008 + vpsllw ymm5, ymm4, 3 // constant 0x0008 vpackuswb ymm4, ymm4, ymm4 wloop: - vmovdqu ymm0, [eax] // average rows + vmovdqu ymm0, [eax] // average rows vmovdqu ymm1, [eax + 32] vmovdqu ymm2, [eax + esi] vmovdqu ymm3, [eax + esi + 32] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add vpmaddubsw ymm1, ymm1, ymm4 vpmaddubsw ymm2, ymm2, ymm4 vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1 + vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1 vpaddw ymm1, ymm1, ymm3 vmovdqu ymm2, [eax + esi * 2] vmovdqu ymm3, [eax + esi * 2 + 32] vpmaddubsw ymm2, ymm2, ymm4 vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // add row 2 + vpaddw ymm0, ymm0, ymm2 // add row 2 vpaddw ymm1, ymm1, ymm3 vmovdqu ymm2, [eax + edi] vmovdqu ymm3, [eax + edi + 32] lea eax, [eax + 64] vpmaddubsw ymm2, ymm2, ymm4 vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // add row 3 + vpaddw ymm0, ymm0, ymm2 // add row 3 vpaddw ymm1, ymm1, ymm3 - vphaddw ymm0, ymm0, ymm1 // mutates - vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw - vpaddw ymm0, ymm0, ymm5 // + 8 for round - vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4 + vphaddw ymm0, ymm0, ymm1 // mutates + vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw + vpaddw ymm0, ymm0, ymm5 // + 8 for round + vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4 vpackuswb ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 
16 @@ -494,14 +498,15 @@ void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. // Then shuffled to do the scaling. -__declspec(naked) -void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown34_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width movdqa xmm3, xmmword ptr kShuf0 movdqa xmm4, xmmword ptr kShuf1 movdqa xmm5, xmmword ptr kShuf2 @@ -541,16 +546,16 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, // xmm7 kRound34 // Note that movdqa+palign may be better than movdqu. -__declspec(naked) -void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width movdqa xmm2, xmmword ptr kShuf01 movdqa xmm3, xmmword ptr kShuf11 movdqa xmm4, xmmword ptr kShuf21 @@ -559,7 +564,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, movdqa xmm7, xmmword ptr kRound34 wloop: - movdqu xmm0, [eax] // pixels 0..7 + movdqu xmm0, [eax] // pixels 0..7 movdqu xmm1, [eax + esi] pavgb xmm0, xmm1 pshufb xmm0, xmm2 @@ -568,7 +573,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm0, [eax + 8] // pixels 8..15 movdqu xmm1, [eax + esi + 8] pavgb xmm0, xmm1 pshufb xmm0, xmm3 @@ -577,7 +582,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx + 8], xmm0 - movdqu xmm0, [eax + 16] // pixels 16..23 + movdqu xmm0, [eax + 16] // pixels 16..23 movdqu xmm1, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm0, xmm1 @@ -598,16 +603,16 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, } // Note that movdqa+palign may be better than movdqu. 
-__declspec(naked) -void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width movdqa xmm2, xmmword ptr kShuf01 movdqa xmm3, xmmword ptr kShuf11 movdqa xmm4, xmmword ptr kShuf21 @@ -616,7 +621,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, movdqa xmm7, xmmword ptr kRound34 wloop: - movdqu xmm0, [eax] // pixels 0..7 + movdqu xmm0, [eax] // pixels 0..7 movdqu xmm1, [eax + esi] pavgb xmm1, xmm0 pavgb xmm0, xmm1 @@ -626,7 +631,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm0, [eax + 8] // pixels 8..15 movdqu xmm1, [eax + esi + 8] pavgb xmm1, xmm0 pavgb xmm0, xmm1 @@ -636,7 +641,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx + 8], xmm0 - movdqu xmm0, [eax + 16] // pixels 16..23 + movdqu xmm0, [eax + 16] // pixels 16..23 movdqu xmm1, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm1, xmm0 @@ -660,26 +665,27 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, // 3/8 point sampler // Scale 32 pixels to 12 -__declspec(naked) -void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown38_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width movdqa xmm4, xmmword ptr kShuf38a movdqa xmm5, xmmword ptr kShuf38b xloop: - movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 - movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 + movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 + movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 lea eax, [eax + 32] pshufb xmm0, xmm4 pshufb xmm1, xmm5 paddusb xmm0, xmm1 - movq qword ptr [edx], xmm0 // write 12 pixels + movq qword ptr [edx], xmm0 // write 12 pixels movhlps xmm1, xmm0 movd [edx + 8], xmm1 lea edx, [edx + 12] @@ -691,23 +697,23 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, } // Scale 16x3 pixels to 6x1 with interpolation -__declspec(naked) -void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width movdqa xmm2, xmmword ptr kShufAc movdqa xmm3, xmmword ptr kShufAc3 movdqa xmm4, xmmword ptr kScaleAc33 pxor xmm5, xmm5 xloop: - movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 + 
movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 movdqu xmm6, [eax + esi] movhlps xmm1, xmm0 movhlps xmm7, xmm6 @@ -725,14 +731,14 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, paddusw xmm0, xmm6 paddusw xmm1, xmm7 - movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 + movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 psrldq xmm0, 2 paddusw xmm6, xmm0 psrldq xmm0, 2 paddusw xmm6, xmm0 pshufb xmm6, xmm2 - movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 + movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 psrldq xmm1, 2 paddusw xmm7, xmm1 psrldq xmm1, 2 @@ -740,10 +746,10 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, pshufb xmm7, xmm3 paddusw xmm6, xmm7 - pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 + pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 packuswb xmm6, xmm6 - movd [edx], xmm6 // write 6 pixels + movd [edx], xmm6 // write 6 pixels psrlq xmm6, 16 movd [edx + 2], xmm6 lea edx, [edx + 6] @@ -756,28 +762,28 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, } // Scale 16x2 pixels to 6x1 with interpolation -__declspec(naked) -void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width movdqa xmm2, xmmword ptr kShufAb0 movdqa xmm3, xmmword ptr kShufAb1 movdqa xmm4, xmmword ptr kShufAb2 movdqa xmm5, xmmword ptr kScaleAb2 xloop: - movdqu xmm0, [eax] // average 2 rows into xmm0 + movdqu xmm0, [eax] // average 2 rows into xmm0 movdqu xmm1, [eax + esi] lea eax, [eax + 16] pavgb xmm0, xmm1 - movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 + movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 pshufb xmm1, xmm2 movdqa xmm6, xmm0 pshufb xmm6, xmm3 @@ -785,10 +791,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, pshufb xmm0, xmm4 paddusw xmm1, xmm0 - pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 + pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 packuswb xmm1, xmm1 - movd [edx], xmm1 // write 6 pixels + movd [edx], xmm1 // write 6 pixels psrlq xmm1, 16 movd [edx + 2], xmm1 lea edx, [edx + 6] @@ -801,26 +807,27 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, } // Reads 16 bytes and accumulates to 16 shorts at a time. 
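ScaleAddRow accumulates one source row into a row of 16-bit sums, so a caller can add several rows and divide once at the end. A scalar sketch (hypothetical name); note the SSE2/AVX2 versions below use saturating adds (paddusw / vpaddusw), which the plain addition here does not model:

typedef unsigned char uint8;
typedef unsigned short uint16;

// Scalar sketch: widen each source byte and accumulate it into the
// 16-bit destination row.
static void ScaleAddRow_Sketch(const uint8* src_ptr, uint16* dst_ptr,
                               int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    dst_ptr[x] = (uint16)(dst_ptr[x] + src_ptr[x]);
  }
}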
-__declspec(naked) -void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { +__declspec(naked) void ScaleAddRow_SSE2(const uint8* src_ptr, + uint16* dst_ptr, + int src_width) { __asm { - mov eax, [esp + 4] // src_ptr - mov edx, [esp + 8] // dst_ptr + mov eax, [esp + 4] // src_ptr + mov edx, [esp + 8] // dst_ptr mov ecx, [esp + 12] // src_width pxor xmm5, xmm5 - // sum rows + // sum rows xloop: - movdqu xmm3, [eax] // read 16 bytes + movdqu xmm3, [eax] // read 16 bytes lea eax, [eax + 16] - movdqu xmm0, [edx] // read 16 words from destination + movdqu xmm0, [edx] // read 16 words from destination movdqu xmm1, [edx + 16] movdqa xmm2, xmm3 punpcklbw xmm2, xmm5 punpckhbw xmm3, xmm5 - paddusw xmm0, xmm2 // sum 16 words + paddusw xmm0, xmm2 // sum 16 words paddusw xmm1, xmm3 - movdqu [edx], xmm0 // write 16 words to destination + movdqu [edx], xmm0 // write 16 words to destination movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 16 @@ -831,24 +838,25 @@ void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { #ifdef HAS_SCALEADDROW_AVX2 // Reads 32 bytes and accumulates to 32 shorts at a time. -__declspec(naked) -void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { +__declspec(naked) void ScaleAddRow_AVX2(const uint8* src_ptr, + uint16* dst_ptr, + int src_width) { __asm { - mov eax, [esp + 4] // src_ptr - mov edx, [esp + 8] // dst_ptr + mov eax, [esp + 4] // src_ptr + mov edx, [esp + 8] // dst_ptr mov ecx, [esp + 12] // src_width vpxor ymm5, ymm5, ymm5 - // sum rows + // sum rows xloop: - vmovdqu ymm3, [eax] // read 32 bytes + vmovdqu ymm3, [eax] // read 32 bytes lea eax, [eax + 32] vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck vpunpcklbw ymm2, ymm3, ymm5 vpunpckhbw ymm3, ymm3, ymm5 - vpaddusw ymm0, ymm2, [edx] // sum 16 words + vpaddusw ymm0, ymm2, [edx] // sum 16 words vpaddusw ymm1, ymm3, [edx + 32] - vmovdqu [edx], ymm0 // write 32 words to destination + vmovdqu [edx], ymm0 // write 32 words to destination vmovdqu [edx + 32], ymm1 lea edx, [edx + 64] sub ecx, 32 @@ -862,86 +870,87 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { // Constant for making pixels signed to avoid pmaddubsw // saturation. -static uvec8 kFsub80 = - { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; +static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // Constant for making pixels unsigned and adding .5 for rounding. -static uvec16 kFadd40 = - { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 }; +static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, + 0x4040, 0x4040, 0x4040, 0x4040}; // Bilinear column filtering. SSSE3 version. -__declspec(naked) -void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +__declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr, + const uint8* src_ptr, + int dst_width, + int x, + int dx) { __asm { push ebx push esi push edi - mov edi, [esp + 12 + 4] // dst_ptr - mov esi, [esp + 12 + 8] // src_ptr - mov ecx, [esp + 12 + 12] // dst_width + mov edi, [esp + 12 + 4] // dst_ptr + mov esi, [esp + 12 + 8] // src_ptr + mov ecx, [esp + 12 + 12] // dst_width movd xmm2, [esp + 12 + 16] // x movd xmm3, [esp + 12 + 20] // dx - mov eax, 0x04040000 // shuffle to line up fractions with pixel. + mov eax, 0x04040000 // shuffle to line up fractions with pixel. 
movd xmm5, eax - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. psrlw xmm6, 9 - pcmpeqb xmm7, xmm7 // generate 0x0001 + pcmpeqb xmm7, xmm7 // generate 0x0001 psrlw xmm7, 15 - pextrw eax, xmm2, 1 // get x0 integer. preroll + pextrw eax, xmm2, 1 // get x0 integer. preroll sub ecx, 2 jl xloop29 - movdqa xmm0, xmm2 // x1 = x0 + dx + movdqa xmm0, xmm2 // x1 = x0 + dx paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll // 2 Pixel loop. xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx movzx ebx, word ptr [esi + eax] // 2 source x0 pixels movd xmm0, ebx - psrlw xmm1, 9 // 7 bit fractions. + psrlw xmm1, 9 // 7 bit fractions. movzx ebx, word ptr [esi + edx] // 2 source x1 pixels movd xmm4, ebx - pshufb xmm1, xmm5 // 0011 + pshufb xmm1, xmm5 // 0011 punpcklwd xmm0, xmm4 psubb xmm0, xmmword ptr kFsub80 // make pixels signed. - pxor xmm1, xmm6 // 0..7f and 7f..0 - paddusb xmm1, xmm7 // +1 so 0..7f and 80..1 - pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. + pxor xmm1, xmm6 // 0..7f and 7f..0 + paddusb xmm1, xmm7 // +1 so 0..7f and 80..1 + pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round. - psrlw xmm1, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm1, xmm1 // 8 bits, 2 pixels. + psrlw xmm1, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm1, xmm1 // 8 bits, 2 pixels. movd ebx, xmm1 mov [edi], bx lea edi, [edi + 2] - sub ecx, 2 // 2 pixels + sub ecx, 2 // 2 pixels jge xloop2 xloop29: add ecx, 2 - 1 jl xloop99 - // 1 pixel remainder + // 1 pixel remainder movzx ebx, word ptr [esi + eax] // 2 source x0 pixels movd xmm0, ebx - psrlw xmm2, 9 // 7 bit fractions. - pshufb xmm2, xmm5 // 0011 + psrlw xmm2, 9 // 7 bit fractions. + pshufb xmm2, xmm5 // 0011 psubb xmm0, xmmword ptr kFsub80 // make pixels signed. - pxor xmm2, xmm6 // 0..7f and 7f..0 - paddusb xmm2, xmm7 // +1 so 0..7f and 80..1 - pmaddubsw xmm2, xmm0 // 16 bit + pxor xmm2, xmm6 // 0..7f and 7f..0 + paddusb xmm2, xmm7 // +1 so 0..7f and 80..1 + pmaddubsw xmm2, xmm0 // 16 bit paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round. - psrlw xmm2, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm2, xmm2 // 8 bits + psrlw xmm2, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm2, xmm2 // 8 bits movd ebx, xmm2 mov [edi], bl @@ -955,13 +964,15 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, } // Reads 16 pixels, duplicates them and writes 32 pixels. 
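ScaleColsUp2 is the special case of an exact 2x horizontal upscale: every source pixel is simply written twice, so the x and dx stepping parameters go unused. A scalar sketch with the stepping parameters omitted (the _Sketch name is illustrative):

typedef unsigned char uint8;

// Scalar sketch of 2x column duplication: each source pixel produces
// two identical destination pixels.
static void ScaleColsUp2_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 2) {
    dst_ptr[x] = dst_ptr[x + 1] = src_ptr[x / 2];
  }
}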
-__declspec(naked) -void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +__declspec(naked) void ScaleColsUp2_SSE2(uint8* dst_ptr, + const uint8* src_ptr, + int dst_width, + int x, + int dx) { __asm { - mov edx, [esp + 4] // dst_ptr - mov eax, [esp + 8] // src_ptr - mov ecx, [esp + 12] // dst_width + mov edx, [esp + 4] // dst_ptr + mov eax, [esp + 8] // src_ptr + mov ecx, [esp + 12] // dst_width wloop: movdqu xmm0, [eax] @@ -980,15 +991,15 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, } // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) -__declspec(naked) -void ScaleARGBRowDown2_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { +__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, + int dst_width) { __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width wloop: movdqu xmm0, [eax] @@ -1005,23 +1016,23 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb, } // Blends 8x1 rectangle to 4x1. -__declspec(naked) -void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { +__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, + int dst_width) { __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] movdqa xmm2, xmm0 - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels pavgb xmm0, xmm2 movdqu [edx], xmm0 lea edx, [edx + 16] @@ -1033,16 +1044,16 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, } // Blends 8x2 rectangle to 4x1. -__declspec(naked) -void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { +__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // dst_width wloop: movdqu xmm0, [eax] @@ -1050,11 +1061,11 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, movdqu xmm2, [eax + esi] movdqu xmm3, [eax + esi + 16] lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows + pavgb xmm0, xmm2 // average rows pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels pavgb xmm0, xmm2 movdqu [edx], xmm0 lea edx, [edx + 16] @@ -1067,18 +1078,19 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, } // Reads 4 pixels at a time. 
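ScaleARGBRowDownEven steps through whole 4-byte ARGB pixels with a pixel stride of src_stepx, which is why the code below scales the step with lea ebx, [ebx * 4]. A scalar sketch (hypothetical name), assuming the 4-byte alignment the real kernels require:

typedef unsigned char uint8;
typedef unsigned int uint32;

// Scalar sketch: copy every src_stepx-th ARGB pixel, treating each
// pixel as a single 32-bit word (assumes 4-byte aligned src_argb).
static void ScaleARGBRowDownEven_Sketch(const uint8* src_argb, int src_stepx,
                                        uint8* dst_argb, int dst_width) {
  const uint32* src = (const uint32*)src_argb;
  uint32* dst = (uint32*)dst_argb;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = *src;
    src += src_stepx;
  }
}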
-__declspec(naked) -void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { +__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, + int dst_width) { __asm { push ebx push edi - mov eax, [esp + 8 + 4] // src_argb - // src_stride ignored - mov ebx, [esp + 8 + 12] // src_stepx - mov edx, [esp + 8 + 16] // dst_argb - mov ecx, [esp + 8 + 20] // dst_width + mov eax, [esp + 8 + 4] // src_argb + // src_stride ignored + mov ebx, [esp + 8 + 12] // src_stepx + mov edx, [esp + 8 + 16] // dst_argb + mov ecx, [esp + 8 + 20] // dst_width lea ebx, [ebx * 4] lea edi, [ebx + ebx * 2] @@ -1103,21 +1115,21 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, } // Blends four 2x2 to 4x1. -__declspec(naked) -void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { +__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, + int dst_width) { __asm { push ebx push esi push edi - mov eax, [esp + 12 + 4] // src_argb - mov esi, [esp + 12 + 8] // src_stride - mov ebx, [esp + 12 + 12] // src_stepx - mov edx, [esp + 12 + 16] // dst_argb - mov ecx, [esp + 12 + 20] // dst_width - lea esi, [eax + esi] // row1 pointer + mov eax, [esp + 12 + 4] // src_argb + mov esi, [esp + 12 + 8] // src_stride + mov ebx, [esp + 12 + 12] // src_stepx + mov edx, [esp + 12 + 16] // dst_argb + mov ecx, [esp + 12 + 20] // dst_width + lea esi, [eax + esi] // row1 pointer lea ebx, [ebx * 4] lea edi, [ebx + ebx * 2] @@ -1132,11 +1144,11 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, movq xmm3, qword ptr [esi + ebx * 2] movhps xmm3, qword ptr [esi + edi] lea esi, [esi + ebx * 4] - pavgb xmm0, xmm2 // average rows + pavgb xmm0, xmm2 // average rows pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels pavgb xmm0, xmm2 movdqu [edx], xmm0 lea edx, [edx + 16] @@ -1151,29 +1163,31 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, } // Column scaling unfiltered. SSE2 version. 
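Throughout these column scalers, x and dx are 16.16 fixed point: the integer source column is x >> 16, and the filtered variants additionally use the next bits as a blend fraction (the "psrlw xmm1, 9 // 7 bit fractions" lines), matching the BLENDER formula quoted earlier for the NEON versions. The unfiltered case reduces to this scalar sketch (hypothetical name):

typedef unsigned char uint8;
typedef unsigned int uint32;

// Scalar sketch of unfiltered ARGB column scaling: x >> 16 selects
// the source pixel, then x advances by dx in 16.16 fixed point.
static void ScaleARGBCols_Sketch(uint8* dst_argb, const uint8* src_argb,
                                 int dst_width, int x, int dx) {
  const uint32* src = (const uint32*)src_argb;
  uint32* dst = (uint32*)dst_argb;
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}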
-__declspec(naked) -void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +__declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx) { __asm { push edi push esi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width movd xmm2, [esp + 8 + 16] // x movd xmm3, [esp + 8 + 20] // dx - pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 - pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 + pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 + pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 paddd xmm2, xmm0 - paddd xmm3, xmm3 // 0, 0, 0, dx * 2 - pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 - paddd xmm2, xmm0 // x3 x2 x1 x0 - paddd xmm3, xmm3 // 0, 0, 0, dx * 4 - pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 + paddd xmm3, xmm3 // 0, 0, 0, dx * 2 + pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 + paddd xmm2, xmm0 // x3 x2 x1 x0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 4 + pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 - pextrw eax, xmm2, 1 // get x0 integer. - pextrw edx, xmm2, 3 // get x1 integer. + pextrw eax, xmm2, 1 // get x0 integer. + pextrw edx, xmm2, 3 // get x1 integer. cmp ecx, 0 jle xloop99 @@ -1184,20 +1198,20 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, xloop4: movd xmm0, [esi + eax * 4] // 1 source x0 pixels movd xmm1, [esi + edx * 4] // 1 source x1 pixels - pextrw eax, xmm2, 5 // get x2 integer. - pextrw edx, xmm2, 7 // get x3 integer. - paddd xmm2, xmm3 // x += dx - punpckldq xmm0, xmm1 // x0 x1 + pextrw eax, xmm2, 5 // get x2 integer. + pextrw edx, xmm2, 7 // get x3 integer. + paddd xmm2, xmm3 // x += dx + punpckldq xmm0, xmm1 // x0 x1 movd xmm1, [esi + eax * 4] // 1 source x2 pixels movd xmm4, [esi + edx * 4] // 1 source x3 pixels - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - punpckldq xmm1, xmm4 // x2 x3 - punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + punpckldq xmm1, xmm4 // x2 x3 + punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 movdqu [edi], xmm0 lea edi, [edi + 16] - sub ecx, 4 // 4 pixels + sub ecx, 4 // 4 pixels jge xloop4 xloop49: @@ -1207,8 +1221,8 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, // 2 Pixels. movd xmm0, [esi + eax * 4] // 1 source x0 pixels movd xmm1, [esi + edx * 4] // 1 source x1 pixels - pextrw eax, xmm2, 5 // get x2 integer. - punpckldq xmm0, xmm1 // x0 x1 + pextrw eax, xmm2, 5 // get x2 integer. 
+ punpckldq xmm0, xmm1 // x0 x1 movq qword ptr [edi], xmm0 lea edi, [edi + 8] @@ -1233,59 +1247,61 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, // Shuffle table for arranging 2 pixels into pairs for pmaddubsw static uvec8 kShuffleColARGB = { - 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel - 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel }; // Shuffle table for duplicating 2 fractions into 8 bytes each static uvec8 kShuffleFractions = { - 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, }; -__declspec(naked) -void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx) { __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width movd xmm2, [esp + 8 + 16] // x movd xmm3, [esp + 8 + 20] // dx movdqa xmm4, xmmword ptr kShuffleColARGB movdqa xmm5, xmmword ptr kShuffleFractions - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. psrlw xmm6, 9 - pextrw eax, xmm2, 1 // get x0 integer. preroll + pextrw eax, xmm2, 1 // get x0 integer. preroll sub ecx, 2 jl xloop29 - movdqa xmm0, xmm2 // x1 = x0 + dx + movdqa xmm0, xmm2 // x1 = x0 + dx paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll // 2 Pixel loop. xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - psrlw xmm1, 9 // 7 bit fractions. + psrlw xmm1, 9 // 7 bit fractions. movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels - pshufb xmm1, xmm5 // 0000000011111111 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm1, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. - packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. + pshufb xmm1, xmm5 // 0000000011111111 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. movq qword ptr [edi], xmm0 lea edi, [edi + 8] - sub ecx, 2 // 2 pixels + sub ecx, 2 // 2 pixels jge xloop2 xloop29: @@ -1293,15 +1309,15 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, add ecx, 2 - 1 jl xloop99 - // 1 pixel remainder - psrlw xmm2, 9 // 7 bit fractions. 
+ // 1 pixel remainder + psrlw xmm2, 9 // 7 bit fractions. movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - pshufb xmm2, xmm5 // 00000000 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm2, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. + pshufb xmm2, xmm5 // 00000000 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm2, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. psrlw xmm0, 7 - packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. + packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. movd [edi], xmm0 xloop99: @@ -1313,13 +1329,15 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, } // Reads 4 pixels, duplicates them and writes 8 pixels. -__declspec(naked) -void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8* dst_argb, + const uint8* src_argb, + int dst_width, + int x, + int dx) { __asm { - mov edx, [esp + 4] // dst_argb - mov eax, [esp + 8] // src_argb - mov ecx, [esp + 12] // dst_width + mov edx, [esp + 4] // dst_argb + mov eax, [esp + 8] // src_argb + mov ecx, [esp + 12] // dst_width wloop: movdqu xmm0, [eax] @@ -1338,12 +1356,11 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, } // Divide num by div and return as 16.16 fixed point result. -__declspec(naked) -int FixedDiv_X86(int num, int div) { +__declspec(naked) int FixedDiv_X86(int num, int div) { __asm { - mov eax, [esp + 4] // num - cdq // extend num to 64 bits - shld edx, eax, 16 // 32.16 + mov eax, [esp + 4] // num + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 shl eax, 16 idiv dword ptr [esp + 8] ret @@ -1351,13 +1368,12 @@ int FixedDiv_X86(int num, int div) { } // Divide num by div and return as 16.16 fixed point result. -__declspec(naked) -int FixedDiv1_X86(int num, int div) { +__declspec(naked) int FixedDiv1_X86(int num, int div) { __asm { - mov eax, [esp + 4] // num - mov ecx, [esp + 8] // denom - cdq // extend num to 64 bits - shld edx, eax, 16 // 32.16 + mov eax, [esp + 4] // num + mov ecx, [esp + 8] // denom + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 shl eax, 16 sub eax, 0x00010001 sbb edx, 0 diff --git a/files/source/video_common.cc b/files/source/video_common.cc index 00fb71e1..3e9c6a29 100644 --- a/files/source/video_common.cc +++ b/files/source/video_common.cc @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - #include "libyuv/video_common.h" #ifdef __cplusplus @@ -24,24 +23,24 @@ struct FourCCAliasEntry { }; static const struct FourCCAliasEntry kFourCCAliases[] = { - {FOURCC_IYUV, FOURCC_I420}, - {FOURCC_YU12, FOURCC_I420}, - {FOURCC_YU16, FOURCC_I422}, - {FOURCC_YU24, FOURCC_I444}, - {FOURCC_YUYV, FOURCC_YUY2}, - {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs - {FOURCC_HDYC, FOURCC_UYVY}, - {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8 - {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. - {FOURCC_DMB1, FOURCC_MJPG}, - {FOURCC_BA81, FOURCC_BGGR}, // deprecated. 
- {FOURCC_RGB3, FOURCC_RAW }, - {FOURCC_BGR3, FOURCC_24BG}, - {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB - {FOURCC_CM24, FOURCC_RAW }, // kCMPixelFormat_24RGB - {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555 - {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565 - {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551 + {FOURCC_IYUV, FOURCC_I420}, + {FOURCC_YU12, FOURCC_I420}, + {FOURCC_YU16, FOURCC_I422}, + {FOURCC_YU24, FOURCC_I444}, + {FOURCC_YUYV, FOURCC_YUY2}, + {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs + {FOURCC_HDYC, FOURCC_UYVY}, + {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8 + {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. + {FOURCC_DMB1, FOURCC_MJPG}, + {FOURCC_BA81, FOURCC_BGGR}, // deprecated. + {FOURCC_RGB3, FOURCC_RAW}, + {FOURCC_BGR3, FOURCC_24BG}, + {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB + {FOURCC_CM24, FOURCC_RAW}, // kCMPixelFormat_24RGB + {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555 + {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565 + {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551 }; // TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB. // {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA @@ -62,4 +61,3 @@ uint32 CanonicalFourCC(uint32 fourcc) { } // extern "C" } // namespace libyuv #endif - diff --git a/files/tools_libyuv/OWNERS b/files/tools_libyuv/OWNERS new file mode 100644 index 00000000..aca046d4 --- /dev/null +++ b/files/tools_libyuv/OWNERS @@ -0,0 +1 @@ +kjellander@chromium.org diff --git a/files/tools_libyuv/autoroller/roll_deps.py b/files/tools_libyuv/autoroller/roll_deps.py new file mode 100755 index 00000000..a9eb307e --- /dev/null +++ b/files/tools_libyuv/autoroller/roll_deps.py @@ -0,0 +1,482 @@ +#!/usr/bin/env python +# Copyright 2017 The LibYuv Project Authors. All rights reserved. +# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. All contributing project authors may +# be found in the AUTHORS file in the root of the source tree. + +# This is a modified copy of the script in +# https://chromium.googlesource.com/external/webrtc/+/master/tools-webrtc/autoroller/roll_deps.py +# customized for libyuv. + + +"""Script to automatically roll dependencies in the libyuv DEPS file.""" + +import argparse +import base64 +import collections +import logging +import os +import re +import subprocess +import sys +import urllib + + +# Skip these dependencies (list without solution name prefix). 
+DONT_AUTOROLL_THESE = [ + 'src/third_party/gflags/src', +] + +LIBYUV_URL = 'https://chromium.googlesource.com/libyuv/libyuv' +CHROMIUM_SRC_URL = 'https://chromium.googlesource.com/chromium/src' +CHROMIUM_COMMIT_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s' +CHROMIUM_LOG_TEMPLATE = CHROMIUM_SRC_URL + '/+log/%s' +CHROMIUM_FILE_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s/%s' + +COMMIT_POSITION_RE = re.compile('^Cr-Commit-Position: .*#([0-9]+).*$') +CLANG_REVISION_RE = re.compile(r'^CLANG_REVISION = \'(\d+)\'$') +ROLL_BRANCH_NAME = 'roll_chromium_revision' + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +CHECKOUT_SRC_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.pardir, + os.pardir)) +CHECKOUT_ROOT_DIR = os.path.realpath(os.path.join(CHECKOUT_SRC_DIR, os.pardir)) + +sys.path.append(os.path.join(CHECKOUT_SRC_DIR, 'build')) +import find_depot_tools +find_depot_tools.add_depot_tools_to_path() +from gclient import GClientKeywords + +CLANG_UPDATE_SCRIPT_URL_PATH = 'tools/clang/scripts/update.py' +CLANG_UPDATE_SCRIPT_LOCAL_PATH = os.path.join(CHECKOUT_SRC_DIR, 'tools', + 'clang', 'scripts', 'update.py') + +DepsEntry = collections.namedtuple('DepsEntry', 'path url revision') +ChangedDep = collections.namedtuple('ChangedDep', + 'path url current_rev new_rev') + +class RollError(Exception): + pass + + +def ParseDepsDict(deps_content): + local_scope = {} + var = GClientKeywords.VarImpl({}, local_scope) + global_scope = { + 'From': GClientKeywords.FromImpl, + 'Var': var.Lookup, + 'deps_os': {}, + } + exec(deps_content, global_scope, local_scope) + return local_scope + + +def ParseLocalDepsFile(filename): + with open(filename, 'rb') as f: + deps_content = f.read() + return ParseDepsDict(deps_content) + + +def ParseRemoteCrDepsFile(revision): + deps_content = ReadRemoteCrFile('DEPS', revision) + return ParseDepsDict(deps_content) + + +def ParseCommitPosition(commit_message): + for line in reversed(commit_message.splitlines()): + m = COMMIT_POSITION_RE.match(line.strip()) + if m: + return m.group(1) + logging.error('Failed to parse commit position id from:\n%s\n', + commit_message) + sys.exit(-1) + + +def _RunCommand(command, working_dir=None, ignore_exit_code=False, + extra_env=None): + """Runs a command and returns the output from that command. + + If the command fails (exit code != 0), the function will exit the process. + + Returns: + A tuple containing the stdout and stderr outputs as strings. + """ + working_dir = working_dir or CHECKOUT_SRC_DIR + logging.debug('CMD: %s CWD: %s', ' '.join(command), working_dir) + env = os.environ.copy() + if extra_env: + assert all(type(value) == str for value in extra_env.values()) + logging.debug('extra env: %s', extra_env) + env.update(extra_env) + p = subprocess.Popen(command, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, env=env, + cwd=working_dir, universal_newlines=True) + std_output = p.stdout.read() + err_output = p.stderr.read() + p.wait() + p.stdout.close() + p.stderr.close() + if not ignore_exit_code and p.returncode != 0: + logging.error('Command failed: %s\n' + 'stdout:\n%s\n' + 'stderr:\n%s\n', ' '.join(command), std_output, err_output) + sys.exit(p.returncode) + return std_output, err_output + + +def _GetBranches(): + """Returns a tuple of active,branches. + + The 'active' is the name of the currently active branch and 'branches' is a + list of all branches. 
+ """ + lines = _RunCommand(['git', 'branch'])[0].split('\n') + branches = [] + active = '' + for line in lines: + if '*' in line: + # The assumption is that the first char will always be the '*'. + active = line[1:].strip() + branches.append(active) + else: + branch = line.strip() + if branch: + branches.append(branch) + return active, branches + + +def _ReadGitilesContent(url): + # Download and decode BASE64 content until + # https://code.google.com/p/gitiles/issues/detail?id=7 is fixed. + base64_content = ReadUrlContent(url + '?format=TEXT') + return base64.b64decode(base64_content[0]) + + +def ReadRemoteCrFile(path_below_src, revision): + """Reads a remote Chromium file of a specific revision. Returns a string.""" + return _ReadGitilesContent(CHROMIUM_FILE_TEMPLATE % (revision, + path_below_src)) + + +def ReadRemoteCrCommit(revision): + """Reads a remote Chromium commit message. Returns a string.""" + return _ReadGitilesContent(CHROMIUM_COMMIT_TEMPLATE % revision) + + +def ReadUrlContent(url): + """Connect to a remote host and read the contents. Returns a list of lines.""" + conn = urllib.urlopen(url) + try: + return conn.readlines() + except IOError as e: + logging.exception('Error connecting to %s. Error: %s', url, e) + raise + finally: + conn.close() + + +def GetMatchingDepsEntries(depsentry_dict, dir_path): + """Gets all deps entries matching the provided path. + + This list may contain more than one DepsEntry object. + Example: dir_path='src/testing' would give results containing both + 'src/testing/gtest' and 'src/testing/gmock' deps entries for Chromium's DEPS. + Example 2: dir_path='src/build' should return 'src/build' but not + 'src/buildtools'. + + Returns: + A list of DepsEntry objects. + """ + result = [] + for path, depsentry in depsentry_dict.iteritems(): + if path == dir_path: + result.append(depsentry) + else: + parts = path.split('/') + if all(part == parts[i] + for i, part in enumerate(dir_path.split('/'))): + result.append(depsentry) + return result + + +def BuildDepsentryDict(deps_dict): + """Builds a dict of paths to DepsEntry objects from a raw parsed deps dict.""" + result = {} + def AddDepsEntries(deps_subdict): + for path, deps_url in deps_subdict.iteritems(): + if not result.has_key(path): + url, revision = deps_url.split('@') if deps_url else (None, None) + result[path] = DepsEntry(path, url, revision) + + AddDepsEntries(deps_dict['deps']) + for deps_os in ['win', 'mac', 'unix', 'android', 'ios', 'unix']: + AddDepsEntries(deps_dict.get('deps_os', {}).get(deps_os, {})) + return result + + +def CalculateChangedDeps(libyuv_deps, new_cr_deps): + """ + Calculate changed deps entries based on entries defined in the libyuv DEPS + file: + - If a shared dependency with the Chromium DEPS file: roll it to the same + revision as Chromium (i.e. entry in the new_cr_deps dict) + - If it's a Chromium sub-directory, roll it to the HEAD revision (notice + this means it may be ahead of the chromium_revision, but generally these + should be close). + - If it's another DEPS entry (not shared with Chromium), roll it to HEAD + unless it's configured to be skipped. + + Returns: + A list of ChangedDep objects representing the changed deps. + """ + result = [] + libyuv_entries = BuildDepsentryDict(libyuv_deps) + new_cr_entries = BuildDepsentryDict(new_cr_deps) + for path, libyuv_deps_entry in libyuv_entries.iteritems(): + if path in DONT_AUTOROLL_THESE: + continue + cr_deps_entry = new_cr_entries.get(path) + if cr_deps_entry: + # Use the revision from Chromium's DEPS file. 
+ new_rev = cr_deps_entry.revision + assert libyuv_deps_entry.url == cr_deps_entry.url, ( + 'Libyuv DEPS entry %s has a different URL (%s) than Chromium (%s).' % + (path, libyuv_deps_entry.url, cr_deps_entry.url)) + else: + # Use the HEAD of the deps repo. + stdout, _ = _RunCommand(['git', 'ls-remote', libyuv_deps_entry.url, + 'HEAD']) + new_rev = stdout.strip().split('\t')[0] + + # Check if an update is necessary. + if libyuv_deps_entry.revision != new_rev: + logging.debug('Roll dependency %s to %s', path, new_rev) + result.append(ChangedDep(path, libyuv_deps_entry.url, + libyuv_deps_entry.revision, new_rev)) + return sorted(result) + + +def CalculateChangedClang(new_cr_rev): + def GetClangRev(lines): + for line in lines: + match = CLANG_REVISION_RE.match(line) + if match: + return match.group(1) + raise RollError('Could not parse Clang revision!') + + with open(CLANG_UPDATE_SCRIPT_LOCAL_PATH, 'rb') as f: + current_lines = f.readlines() + current_rev = GetClangRev(current_lines) + + new_clang_update_py = ReadRemoteCrFile(CLANG_UPDATE_SCRIPT_URL_PATH, + new_cr_rev).splitlines() + new_rev = GetClangRev(new_clang_update_py) + return ChangedDep(CLANG_UPDATE_SCRIPT_LOCAL_PATH, None, current_rev, new_rev) + + +def GenerateCommitMessage(current_cr_rev, new_cr_rev, current_commit_pos, + new_commit_pos, changed_deps_list, clang_change): + current_cr_rev = current_cr_rev[0:10] + new_cr_rev = new_cr_rev[0:10] + rev_interval = '%s..%s' % (current_cr_rev, new_cr_rev) + git_number_interval = '%s:%s' % (current_commit_pos, new_commit_pos) + + commit_msg = ['Roll chromium_revision %s (%s)\n' % (rev_interval, + git_number_interval)] + commit_msg.append('Change log: %s' % (CHROMIUM_LOG_TEMPLATE % rev_interval)) + commit_msg.append('Full diff: %s\n' % (CHROMIUM_COMMIT_TEMPLATE % + rev_interval)) + # TBR field will be empty unless in some custom cases, where some engineers + # are added. + tbr_authors = '' + if changed_deps_list: + commit_msg.append('Changed dependencies:') + + for c in changed_deps_list: + commit_msg.append('* %s: %s/+log/%s..%s' % (c.path, c.url, + c.current_rev[0:10], + c.new_rev[0:10])) + change_url = CHROMIUM_FILE_TEMPLATE % (rev_interval, 'DEPS') + commit_msg.append('DEPS diff: %s\n' % change_url) + else: + commit_msg.append('No dependencies changed.') + + if clang_change.current_rev != clang_change.new_rev: + commit_msg.append('Clang version changed %s:%s' % + (clang_change.current_rev, clang_change.new_rev)) + change_url = CHROMIUM_FILE_TEMPLATE % (rev_interval, + CLANG_UPDATE_SCRIPT_URL_PATH) + commit_msg.append('Details: %s\n' % change_url) + else: + commit_msg.append('No update to Clang.\n') + + commit_msg.append('TBR=%s' % tbr_authors) + commit_msg.append('BUG=None') + return '\n'.join(commit_msg) + + +def UpdateDepsFile(deps_filename, old_cr_revision, new_cr_revision, + changed_deps): + """Update the DEPS file with the new revision.""" + + # Update the chromium_revision variable. + with open(deps_filename, 'rb') as deps_file: + deps_content = deps_file.read() + deps_content = deps_content.replace(old_cr_revision, new_cr_revision) + with open(deps_filename, 'wb') as deps_file: + deps_file.write(deps_content) + + # Update each individual DEPS entry. + for dep in changed_deps: + local_dep_dir = os.path.join(CHECKOUT_ROOT_DIR, dep.path) + if not os.path.isdir(local_dep_dir): + raise RollError( + 'Cannot find local directory %s. 
Either run\n' + 'gclient sync --deps=all\n' + 'or make sure the .gclient file for your solution contains all ' + 'platforms in the target_os list, i.e.\n' + 'target_os = ["android", "unix", "mac", "ios", "win"];\n' + 'Then run "gclient sync" again.' % local_dep_dir) + _, stderr = _RunCommand( + ['roll-dep-svn', '--no-verify-revision', dep.path, dep.new_rev], + working_dir=CHECKOUT_SRC_DIR, ignore_exit_code=True) + if stderr: + logging.warning('roll-dep-svn: %s', stderr) + + +def _IsTreeClean(): + stdout, _ = _RunCommand(['git', 'status', '--porcelain']) + if len(stdout) == 0: + return True + + logging.error('Dirty/unversioned files:\n%s', stdout) + return False + + +def _EnsureUpdatedMasterBranch(dry_run): + current_branch = _RunCommand( + ['git', 'rev-parse', '--abbrev-ref', 'HEAD'])[0].splitlines()[0] + if current_branch != 'master': + logging.error('Please checkout the master branch and re-run this script.') + if not dry_run: + sys.exit(-1) + + logging.info('Updating master branch...') + _RunCommand(['git', 'pull']) + + +def _CreateRollBranch(dry_run): + logging.info('Creating roll branch: %s', ROLL_BRANCH_NAME) + if not dry_run: + _RunCommand(['git', 'checkout', '-b', ROLL_BRANCH_NAME]) + + +def _RemovePreviousRollBranch(dry_run): + active_branch, branches = _GetBranches() + if active_branch == ROLL_BRANCH_NAME: + active_branch = 'master' + if ROLL_BRANCH_NAME in branches: + logging.info('Removing previous roll branch (%s)', ROLL_BRANCH_NAME) + if not dry_run: + _RunCommand(['git', 'checkout', active_branch]) + _RunCommand(['git', 'branch', '-D', ROLL_BRANCH_NAME]) + + +def _LocalCommit(commit_msg, dry_run): + logging.info('Committing changes locally.') + if not dry_run: + _RunCommand(['git', 'add', '--update', '.']) + _RunCommand(['git', 'commit', '-m', commit_msg]) + + +def _UploadCL(dry_run, rietveld_email=None): + logging.info('Uploading CL...') + if not dry_run: + cmd = ['git', 'cl', 'upload', '-f'] + if rietveld_email: + cmd.append('--email=%s' % rietveld_email) + _RunCommand(cmd, extra_env={'EDITOR': 'true'}) + + +def _SendToCQ(dry_run, skip_cq): + logging.info('Sending the CL to the CQ...') + if not dry_run and not skip_cq: + _RunCommand(['git', 'cl', 'set_commit']) + logging.info('Sent the CL to the CQ.') + + +def main(): + p = argparse.ArgumentParser() + p.add_argument('--clean', action='store_true', default=False, + help='Removes any previous local roll branch.') + p.add_argument('-r', '--revision', + help=('Chromium Git revision to roll to. 
Defaults to the ' 'Chromium HEAD revision if omitted.')) + p.add_argument('-u', '--rietveld-email', + help=('E-mail address to use for creating the CL at Rietveld. ' + 'If omitted, a previously cached one will be used or an ' + 'error will be thrown during upload.')) + p.add_argument('--dry-run', action='store_true', default=False, + help=('Calculate changes and modify DEPS, but don\'t create ' 'any local branch, commit, upload CL or send any ' 'tryjobs.')) + p.add_argument('-i', '--ignore-unclean-workdir', action='store_true', + default=False, + help=('Ignore if the current branch is not master or if there ' 'are uncommitted changes (default: %(default)s).')) + p.add_argument('--skip-cq', action='store_true', default=False, + help='Skip sending the CL to the CQ (default: %(default)s)') + p.add_argument('-v', '--verbose', action='store_true', default=False, + help='Be extra verbose in printing of log messages.') + opts = p.parse_args() + + if opts.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + if not opts.ignore_unclean_workdir and not _IsTreeClean(): + logging.error('Please clean your local checkout first.') + return 1 + + if opts.clean: + _RemovePreviousRollBranch(opts.dry_run) + + if not opts.ignore_unclean_workdir: + _EnsureUpdatedMasterBranch(opts.dry_run) + + new_cr_rev = opts.revision + if not new_cr_rev: + stdout, _ = _RunCommand(['git', 'ls-remote', CHROMIUM_SRC_URL, 'HEAD']) + head_rev = stdout.strip().split('\t')[0] + logging.info('No revision specified. Using HEAD: %s', head_rev) + new_cr_rev = head_rev + + deps_filename = os.path.join(CHECKOUT_SRC_DIR, 'DEPS') + libyuv_deps = ParseLocalDepsFile(deps_filename) + current_cr_rev = libyuv_deps['vars']['chromium_revision'] + + current_commit_pos = ParseCommitPosition(ReadRemoteCrCommit(current_cr_rev)) + new_commit_pos = ParseCommitPosition(ReadRemoteCrCommit(new_cr_rev)) + + new_cr_deps = ParseRemoteCrDepsFile(new_cr_rev) + changed_deps = CalculateChangedDeps(libyuv_deps, new_cr_deps) + clang_change = CalculateChangedClang(new_cr_rev) + commit_msg = GenerateCommitMessage(current_cr_rev, new_cr_rev, + current_commit_pos, new_commit_pos, + changed_deps, clang_change) + logging.debug('Commit message:\n%s', commit_msg) + + _CreateRollBranch(opts.dry_run) + UpdateDepsFile(deps_filename, current_cr_rev, new_cr_rev, changed_deps) + _LocalCommit(commit_msg, opts.dry_run) + _UploadCL(opts.dry_run, opts.rietveld_email) + _SendToCQ(opts.dry_run, opts.skip_cq) + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/files/tools_libyuv/autoroller/unittests/.DS_Store b/files/tools_libyuv/autoroller/unittests/.DS_Store Binary files differnew file mode 100644 index 00000000..70369d69 --- /dev/null +++ b/files/tools_libyuv/autoroller/unittests/.DS_Store diff --git a/files/tools_libyuv/autoroller/unittests/roll_deps_test.py b/files/tools_libyuv/autoroller/unittests/roll_deps_test.py new file mode 100755 index 00000000..025e46e1 --- /dev/null +++ b/files/tools_libyuv/autoroller/unittests/roll_deps_test.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python +# Copyright 2017 The LibYuv Project Authors. All rights reserved. +# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. All contributing project authors may +# be found in the AUTHORS file in the root of the source tree.
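The script above treats DEPS as restricted Python: ParseDepsDict exec()s the file with Var() bound to the local vars dict, and BuildDepsentryDict then splits every 'url@revision' value into a DepsEntry. A self-contained sketch of that round trip, using made-up sample data rather than anything from this CL:

    # Minimal sketch of the exec()-based DEPS parsing in ParseDepsDict and the
    # url@revision split in BuildDepsentryDict. All sample data is illustrative.
    SAMPLE_DEPS = """
    vars = {'chromium_git': 'https://chromium.googlesource.com'}
    deps = {
      'src/buildtools': Var('chromium_git') + '/chromium/buildtools.git' + '@' + 'abc123',
    }
    """

    def parse_deps(deps_content):
      local_scope = {}
      global_scope = {'Var': lambda name: local_scope['vars'][name], 'deps_os': {}}
      exec(deps_content, global_scope, local_scope)
      return local_scope

    entry = parse_deps(SAMPLE_DEPS)['deps']['src/buildtools']
    url, revision = entry.split('@')  # ('.../chromium/buildtools.git', 'abc123')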
+ +import glob +import os +import shutil +import sys +import tempfile +import unittest + + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +PARENT_DIR = os.path.join(SCRIPT_DIR, os.pardir) +sys.path.append(PARENT_DIR) +import roll_deps +from roll_deps import CalculateChangedDeps, GetMatchingDepsEntries, \ + ParseDepsDict, ParseLocalDepsFile, UpdateDepsFile + + +TEST_DATA_VARS = { + 'chromium_git': 'https://chromium.googlesource.com', + 'chromium_revision': '1b9c098a08e40114e44b6c1ec33ddf95c40b901d', +} + +DEPS_ENTRIES = { + 'src/build': 'https://build.com', + 'src/buildtools': 'https://buildtools.com', + 'src/testing/gtest': 'https://gtest.com', + 'src/testing/gmock': 'https://gmock.com', +} + +BUILD_OLD_REV = '52f7afeca991d96d68cf0507e20dbdd5b845691f' +BUILD_NEW_REV = 'HEAD' +BUILDTOOLS_OLD_REV = '64e38f0cebdde27aa0cfb405f330063582f9ac76' +BUILDTOOLS_NEW_REV = '55ad626b08ef971fd82a62b7abb325359542952b' + + +class TestError(Exception): + pass + + +class FakeCmd(object): + def __init__(self): + self.expectations = [] + + def add_expectation(self, *args, **kwargs): + returns = kwargs.pop('_returns', None) + self.expectations.append((args, kwargs, returns)) + + def __call__(self, *args, **kwargs): + if not self.expectations: + raise TestError('Got unexpected\n%s\n%s' % (args, kwargs)) + exp_args, exp_kwargs, exp_returns = self.expectations.pop(0) + if args != exp_args or kwargs != exp_kwargs: + message = 'Expected:\n args: %s\n kwargs: %s\n' % (exp_args, exp_kwargs) + message += 'Got:\n args: %s\n kwargs: %s\n' % (args, kwargs) + raise TestError(message) + return exp_returns + + +class TestRollChromiumRevision(unittest.TestCase): + def setUp(self): + self._output_dir = tempfile.mkdtemp() + for test_file in glob.glob(os.path.join(SCRIPT_DIR, 'testdata', '*')): + shutil.copy(test_file, self._output_dir) + self._libyuv_depsfile = os.path.join(self._output_dir, 'DEPS') + self._old_cr_depsfile = os.path.join(self._output_dir, 'DEPS.chromium.old') + self._new_cr_depsfile = os.path.join(self._output_dir, 'DEPS.chromium.new') + + self.fake = FakeCmd() + self.old_RunCommand = getattr(roll_deps, '_RunCommand') + setattr(roll_deps, '_RunCommand', self.fake) + + def tearDown(self): + shutil.rmtree(self._output_dir, ignore_errors=True) + self.assertEqual(self.fake.expectations, []) + setattr(roll_deps, '_RunCommand', self.old_RunCommand) + + def testUpdateDepsFile(self): + new_rev = 'aaaaabbbbbcccccdddddeeeeefffff0000011111' + + current_rev = TEST_DATA_VARS['chromium_revision'] + UpdateDepsFile(self._libyuv_depsfile, current_rev, new_rev, []) + with open(self._libyuv_depsfile) as deps_file: + deps_contents = deps_file.read() + self.assertTrue(new_rev in deps_contents, + 'Failed to find %s in\n%s' % (new_rev, deps_contents)) + + def testParseDepsDict(self): + with open(self._libyuv_depsfile) as deps_file: + deps_contents = deps_file.read() + local_scope = ParseDepsDict(deps_contents) + vars_dict = local_scope['vars'] + + def assertVar(variable_name): + self.assertEquals(vars_dict[variable_name], TEST_DATA_VARS[variable_name]) + assertVar('chromium_git') + assertVar('chromium_revision') + self.assertEquals(len(local_scope['deps']), 3) + + def testGetMatchingDepsEntriesReturnsPathInSimpleCase(self): + entries = GetMatchingDepsEntries(DEPS_ENTRIES, 'src/testing/gtest') + self.assertEquals(len(entries), 1) + self.assertEquals(entries[0], DEPS_ENTRIES['src/testing/gtest']) + + def testGetMatchingDepsEntriesHandlesSimilarStartingPaths(self): + entries = GetMatchingDepsEntries(DEPS_ENTRIES, 
'src/testing') + self.assertEquals(len(entries), 2) + + def testGetMatchingDepsEntriesHandlesTwoPathsWithIdenticalFirstParts(self): + entries = GetMatchingDepsEntries(DEPS_ENTRIES, 'src/build') + self.assertEquals(len(entries), 1) + self.assertEquals(entries[0], DEPS_ENTRIES['src/build']) + + def testCalculateChangedDeps(self): + _SetupGitLsRemoteCall(self.fake, + 'https://chromium.googlesource.com/chromium/src/build', BUILD_NEW_REV) + libyuv_deps = ParseLocalDepsFile(self._libyuv_depsfile) + new_cr_deps = ParseLocalDepsFile(self._new_cr_depsfile) + changed_deps = CalculateChangedDeps(libyuv_deps, new_cr_deps) + self.assertEquals(len(changed_deps), 2) + self.assertEquals(changed_deps[0].path, 'src/build') + self.assertEquals(changed_deps[0].current_rev, BUILD_OLD_REV) + self.assertEquals(changed_deps[0].new_rev, BUILD_NEW_REV) + + self.assertEquals(changed_deps[1].path, 'src/buildtools') + self.assertEquals(changed_deps[1].current_rev, BUILDTOOLS_OLD_REV) + self.assertEquals(changed_deps[1].new_rev, BUILDTOOLS_NEW_REV) + + +def _SetupGitLsRemoteCall(cmd_fake, url, revision): + cmd = ['git', 'ls-remote', url, revision] + cmd_fake.add_expectation(cmd, _returns=(revision, None)) + + +if __name__ == '__main__': + unittest.main() diff --git a/files/tools_libyuv/autoroller/unittests/testdata/DEPS b/files/tools_libyuv/autoroller/unittests/testdata/DEPS new file mode 100644 index 00000000..9fbb48a7 --- /dev/null +++ b/files/tools_libyuv/autoroller/unittests/testdata/DEPS @@ -0,0 +1,20 @@ +# DEPS file for unit tests. + +vars = { + 'chromium_git': 'https://chromium.googlesource.com', + 'chromium_revision': '1b9c098a08e40114e44b6c1ec33ddf95c40b901d', +} + +deps = { + # Entry that is a directory in Chromium, so we're using a Git subtree mirror for it. + 'src/build': + Var('chromium_git') + '/chromium/src/build' + '@' + '52f7afeca991d96d68cf0507e20dbdd5b845691f', + + # Entry that's also a DEPS entry in the Chromium DEPS file. + 'src/buildtools': + Var('chromium_git') + '/chromium/buildtools.git' + '@' + '64e38f0cebdde27aa0cfb405f330063582f9ac76', + + # Entry only present in libyuv, not Chromium. + 'src/third_party/gflags/src': + Var('chromium_git') + '/external/github.com/gflags/gflags@03bebcb065c83beff83d50ae025a55a4bf94dfca', +} diff --git a/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new b/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new new file mode 100644 index 00000000..d53083ce --- /dev/null +++ b/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new @@ -0,0 +1,13 @@ +# DEPS file for unit tests. + +vars = { + 'chromium_git': 'https://chromium.googlesource.com', + + # This is updated compared to the DEPS.chromium.old file. + 'buildtools_revision': '55ad626b08ef971fd82a62b7abb325359542952b', +} + +deps = { + 'src/buildtools': + Var('chromium_git') + '/chromium/buildtools.git' + '@' + Var('buildtools_revision'), +} diff --git a/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old b/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old new file mode 100644 index 00000000..dd6ddaec --- /dev/null +++ b/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old @@ -0,0 +1,13 @@ +# DEPS file for unit tests. + +vars = { + 'chromium_git': 'https://chromium.googlesource.com', + + # This is an older revision than the DEPS.chromium.new file.
+ 'buildtools_revision': '64e38f0cebdde27aa0cfb405f330063582f9ac76', +} + +deps = { + 'src/buildtools': + Var('chromium_git') + '/chromium/buildtools.git' + '@' + Var('buildtools_revision'), +} diff --git a/files/tools_libyuv/get_landmines.py b/files/tools_libyuv/get_landmines.py new file mode 100755 index 00000000..3dc78bb9 --- /dev/null +++ b/files/tools_libyuv/get_landmines.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python +# Copyright 2016 The LibYuv Project Authors. All rights reserved. +# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. All contributing project authors may +# be found in the AUTHORS file in the root of the source tree. + +""" +This file emits the list of reasons why a particular build needs to be clobbered +(or a list of 'landmines'). +""" + +import os +import sys + +script_dir = os.path.dirname(os.path.realpath(__file__)) +checkout_root = os.path.abspath(os.path.join(script_dir, os.pardir)) +sys.path.insert(0, os.path.join(checkout_root, 'build')) +import landmine_utils + + +distributor = landmine_utils.distributor +gyp_defines = landmine_utils.gyp_defines +gyp_msvs_version = landmine_utils.gyp_msvs_version +platform = landmine_utils.platform + + +def print_landmines(): + """ + ALL LANDMINES ARE EMITTED FROM HERE. + """ + # DO NOT add landmines as part of a regular CL. Landmines are a last-effort + # bandaid fix if a CL that got landed has a build dependency bug and all bots + # need to be cleaned up. If you're writing a new CL that causes build + # dependency problems, fix the dependency problems instead of adding a + # landmine. + # See the Chromium version in src/build/get_landmines.py for usage examples. + print 'Clobber to remove GYP artifacts after switching bots to GN.' + print 'Another try to remove GYP artifacts after switching bots to GN.' + + +def main(): + print_landmines() + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/files/tools_libyuv/msan/OWNERS b/files/tools_libyuv/msan/OWNERS new file mode 100644 index 00000000..60351e7e --- /dev/null +++ b/files/tools_libyuv/msan/OWNERS @@ -0,0 +1,3 @@ +pbos@chromium.org +kjellander@chromium.org + diff --git a/files/tools_libyuv/msan/blacklist.txt b/files/tools_libyuv/msan/blacklist.txt new file mode 100644 index 00000000..8b5e42a7 --- /dev/null +++ b/files/tools_libyuv/msan/blacklist.txt @@ -0,0 +1,9 @@ +# The rules in this file are only applied at compile time. +# Because the Chrome buildsystem does not automatically touch the files +# mentioned here, changing this file requires clobbering all MSan bots. +# +# Please think twice before you add or remove these rules. + +# This is a stripped down copy of Chromium's blacklist.txt, to enable +# adding libyuv-specific blacklist entries. + diff --git a/files/tools_libyuv/ubsan/OWNERS b/files/tools_libyuv/ubsan/OWNERS new file mode 100644 index 00000000..b608519a --- /dev/null +++ b/files/tools_libyuv/ubsan/OWNERS @@ -0,0 +1,4 @@ +pbos@webrtc.org +kjellander@webrtc.org +fbarchard@chromium.org + diff --git a/files/tools_libyuv/ubsan/blacklist.txt b/files/tools_libyuv/ubsan/blacklist.txt new file mode 100644 index 00000000..8bcb2907 --- /dev/null +++ b/files/tools_libyuv/ubsan/blacklist.txt @@ -0,0 +1,15 @@ +############################################################################# +# UBSan blacklist. +# Please think twice before you add or remove these rules. 
+ +# This is a stripped down copy of Chromium's blacklist.txt, to enable +# adding libyuv-specific blacklist entries. + +############################################################################# +# YASM does some funny things that UBsan doesn't like. +# https://crbug.com/489901 +src:*/third_party/yasm/* + +############################################################################# +# Ignore system libraries. +src:*/usr/* diff --git a/files/tools_libyuv/ubsan/vptr_blacklist.txt b/files/tools_libyuv/ubsan/vptr_blacklist.txt new file mode 100644 index 00000000..8ed070c0 --- /dev/null +++ b/files/tools_libyuv/ubsan/vptr_blacklist.txt @@ -0,0 +1,21 @@ +############################################################################# +# UBSan vptr blacklist. +# Function and type based blacklisting uses a mangled name, and it is especially +# tricky to represent C++ types. For now, any possible changes by name mangling +# are simply represented as wildcard expressions of regexp, and thus it might be +# over-blacklisted. +# +# Please think twice before you add or remove these rules. +# +# This is a stripped down copy of Chromium's vptr_blacklist.txt, to enable +# adding libyuv-specific blacklist entries. + +############################################################################# +# Using raw pointer values. +# +# A raw pointer value (16) is used to infer the field offset by +# GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET. + +# Example: +# src:*/third_party/protobuf/src/google/protobuf/compiler/plugin.pb.cc + diff --git a/files/tools_libyuv/valgrind/libyuv_tests.bat b/files/tools_libyuv/valgrind/libyuv_tests.bat new file mode 100644 index 00000000..e37f09eb --- /dev/null +++ b/files/tools_libyuv/valgrind/libyuv_tests.bat @@ -0,0 +1,79 @@ +@echo off
+:: Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
+::
+:: Use of this source code is governed by a BSD-style license
+:: that can be found in the LICENSE file in the root of the source
+:: tree. An additional intellectual property rights grant can be found
+:: in the file PATENTS. All contributing project authors may
+:: be found in the AUTHORS file in the root of the source tree.
+
+:: This script is a copy of chrome_tests.bat with the following changes:
+:: - Invokes libyuv_tests.py instead of chrome_tests.py
+:: - Chromium's Valgrind scripts directory is added to the PYTHONPATH to make
+:: it possible to execute the Python scripts properly.
+
+:: TODO(timurrrr): batch files 'export' all the variables to the parent shell
+set THISDIR=%~dp0
+set TOOL_NAME="unknown"
+
+:: Get the tool name and put it into TOOL_NAME {{{1
+:: NB: SHIFT command doesn't modify %*
+:PARSE_ARGS_LOOP
+ if (%1) == () GOTO:TOOLNAME_NOT_FOUND
+ if %1 == --tool GOTO:TOOLNAME_FOUND
+ SHIFT
+ goto :PARSE_ARGS_LOOP
+
+:TOOLNAME_NOT_FOUND
+echo "Please specify a tool (tsan or drmemory) by using --tool flag"
+exit /B 1
+
+:TOOLNAME_FOUND
+SHIFT
+set TOOL_NAME=%1
+:: }}}
+if "%TOOL_NAME%" == "drmemory" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "drmemory_light" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "drmemory_full" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "drmemory_pattern" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "tsan" GOTO :SETUP_TSAN
+echo "Unknown tool: `%TOOL_NAME%`! Only tsan and drmemory are supported."
+exit /B 1
+
+:SETUP_DRMEMORY
+if NOT "%DRMEMORY_COMMAND%"=="" GOTO :RUN_TESTS
+:: Set up DRMEMORY_COMMAND to invoke Dr. Memory {{{1
+set DRMEMORY_PATH=%THISDIR%..\..\third_party\drmemory
+set DRMEMORY_SFX=%DRMEMORY_PATH%\drmemory-windows-sfx.exe
+if EXIST %DRMEMORY_SFX% GOTO DRMEMORY_BINARY_OK
+echo "Can't find Dr. Memory executables."
+echo "See http://www.chromium.org/developers/how-tos/using-valgrind/dr-memory"
+echo "for the instructions on how to get them."
+exit /B 1
+
+:DRMEMORY_BINARY_OK
+%DRMEMORY_SFX% -o%DRMEMORY_PATH%\unpacked -y
+set DRMEMORY_COMMAND=%DRMEMORY_PATH%\unpacked\bin\drmemory.exe
+:: }}}
+goto :RUN_TESTS
+
+:SETUP_TSAN
+:: Set up PIN_COMMAND to invoke TSan {{{1
+set TSAN_PATH=%THISDIR%..\..\third_party\tsan
+set TSAN_SFX=%TSAN_PATH%\tsan-x86-windows-sfx.exe
+if EXIST %TSAN_SFX% GOTO TSAN_BINARY_OK
+echo "Can't find ThreadSanitizer executables."
+echo "See http://www.chromium.org/developers/how-tos/using-valgrind/threadsanitizer/threadsanitizer-on-windows"
+echo "for the instructions on how to get them."
+exit /B 1
+
+:TSAN_BINARY_OK
+%TSAN_SFX% -o%TSAN_PATH%\unpacked -y
+set PIN_COMMAND=%TSAN_PATH%\unpacked\tsan-x86-windows\tsan.bat
+:: }}}
+goto :RUN_TESTS
+
+:RUN_TESTS
+set PYTHONPATH=%THISDIR%..\python\google;%THISDIR%..\valgrind
+set RUNNING_ON_VALGRIND=yes
+python %THISDIR%libyuv_tests.py %*
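Taken together, the batch file above only unpacks the requested tool and forwards every argument to libyuv_tests.py, so a run is a single command. The build directory and test name below are hypothetical examples, not values from this CL:

    :: Hypothetical invocation; expects Dr. Memory under third_party\drmemory.
    tools_libyuv\valgrind\libyuv_tests.bat --tool drmemory -b out\Release -t libyuv_unittest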
diff --git a/files/tools_libyuv/valgrind/libyuv_tests.py b/files/tools_libyuv/valgrind/libyuv_tests.py new file mode 100755 index 00000000..e780bd95 --- /dev/null +++ b/files/tools_libyuv/valgrind/libyuv_tests.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python +# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved. +# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. All contributing project authors may +# be found in the AUTHORS file in the root of the source tree. + +"""Runs various libyuv tests through valgrind_test.py. + +This script builds on chrome_tests.py in Chrome, but allows running any test +instead of only the hard-coded ones. It uses the -t cmdline flag to do this, and +only supports specifying a single test for each run. + +Suppression files: +The Chrome valgrind directory we use as a DEPS dependency contains the following +suppression files: + valgrind/memcheck/suppressions.txt + valgrind/memcheck/suppressions_mac.txt + valgrind/tsan/suppressions.txt + valgrind/tsan/suppressions_mac.txt + valgrind/tsan/suppressions_win32.txt +Since they're referenced from the chrome_tests.py script, we have similar files +below the directory of this script. When executing, this script will set up both +Chrome's suppression files and our own, so we can easily maintain libyuv +specific suppressions in our own files. +""" + +import logging +import optparse +import os +import sys + +import logging_utils +import path_utils + +import chrome_tests + + +class LibyuvTest(chrome_tests.ChromeTests): + """Class that handles setup of suppressions for libyuv. + + Everything else is inherited from chrome_tests.ChromeTests. + """ + + def _DefaultCommand(self, tool, exe=None, valgrind_test_args=None): + """Override command-building method so we can add more suppressions.""" + cmd = chrome_tests.ChromeTests._DefaultCommand(self, tool, exe, + valgrind_test_args) + # When ChromeTests._DefaultCommand has executed, it has set up suppression + # files based on what's found in the memcheck/ or tsan/ subdirectories of + # this script's location. If Mac or Windows is executing, additional + # platform specific files have also been added. + # Since only the ones located below this directory are added, we must also + # add the ones maintained by Chrome, located in ../../tools/valgrind. + + # The idea is to look for --suppression arguments in the cmd list and add a + # modified copy of each suppression file, for the corresponding file in + # ../../tools/valgrind. + script_dir = path_utils.ScriptDir() + old_base, _ = os.path.split(script_dir) + + checkout_src = os.path.abspath(os.path.join(script_dir, os.pardir, + os.pardir)) + new_dir = os.path.join(checkout_src, 'tools', 'valgrind') + add_suppressions = [] + for token in cmd: + if '--suppressions' in token: + add_suppressions.append(token.replace(script_dir, new_dir)) + return add_suppressions + cmd + + +def main(_): + parser = optparse.OptionParser('usage: %prog -b <dir> -t <test> <test args>') + parser.disable_interspersed_args() + parser.add_option('-b', '--build-dir', + help=('Location of the compiler output. 
Can only be used ' + 'when the test argument does not contain this path.')) + parser.add_option("--target", help="Debug or Release") + parser.add_option('-t', '--test', help='Test to run.') + parser.add_option('', '--baseline', action='store_true', default=False, + help='Generate baseline data instead of validating') + parser.add_option('', '--gtest_filter', + help='Additional arguments to --gtest_filter') + parser.add_option('', '--gtest_repeat', + help='Argument for --gtest_repeat') + parser.add_option("--gtest_shuffle", action="store_true", default=False, + help="Randomize tests' orders on every iteration.") + parser.add_option("--gtest_break_on_failure", action="store_true", + default=False, + help="Drop in to debugger on assertion failure. Also " + "useful for forcing tests to exit with a stack dump " + "on the first assertion failure when running with " + "--gtest_repeat=-1") + parser.add_option('-v', '--verbose', action='store_true', default=False, + help='Verbose output - enable debug log messages') + parser.add_option('', '--tool', dest='valgrind_tool', default='memcheck', + help='Specify a valgrind tool to run the tests under') + parser.add_option('', '--tool_flags', dest='valgrind_tool_flags', default='', + help='Specify custom flags for the selected valgrind tool') + parser.add_option('', '--keep_logs', action='store_true', default=False, + help=('Store memory tool logs in the <tool>.logs directory ' + 'instead of /tmp.\nThis can be useful for tool ' + 'developers/maintainers.\nPlease note that the <tool>' + '.logs directory will be clobbered on tool startup.')) + parser.add_option("--test-launcher-bot-mode", action="store_true", + help="run the tests with --test-launcher-bot-mode") + parser.add_option("--test-launcher-total-shards", type=int, + help="run the tests with --test-launcher-total-shards") + parser.add_option("--test-launcher-shard-index", type=int, + help="run the tests with --test-launcher-shard-index") + options, args = parser.parse_args() + + if options.verbose: + logging_utils.config_root(logging.DEBUG) + else: + logging_utils.config_root() + + if not options.test: + parser.error('--test not specified') + + # Support build dir both with and without the target. + if (options.target and options.build_dir and + not options.build_dir.endswith(options.target)): + options.build_dir = os.path.join(options.build_dir, options.target) + + # If --build_dir is provided, prepend it to the test executable if needed. + test_executable = options.test + if options.build_dir and not test_executable.startswith(options.build_dir): + test_executable = os.path.join(options.build_dir, test_executable) + args = [test_executable] + args + + test = LibyuvTest(options, args, 'cmdline') + return test.Run() + +if __name__ == '__main__': + return_code = main(sys.argv) + sys.exit(return_code) diff --git a/files/tools_libyuv/valgrind/libyuv_tests.sh b/files/tools_libyuv/valgrind/libyuv_tests.sh new file mode 100755 index 00000000..975b5e3e --- /dev/null +++ b/files/tools_libyuv/valgrind/libyuv_tests.sh @@ -0,0 +1,101 @@ +#!/bin/bash +# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved. +# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. All contributing project authors may +# be found in the AUTHORS file in the root of the source tree. 
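The suppression plumbing in LibyuvTest._DefaultCommand above amounts to duplicating every --suppressions argument with its path rebased from this script's directory into Chromium's tools/valgrind, so both Chrome's and libyuv's files get loaded. A standalone Python sketch of that transformation, with illustrative directory names in place of what path_utils would return:

    # Sketch of the --suppressions rebasing in LibyuvTest._DefaultCommand.
    SCRIPT_DIR = '/checkout/tools_libyuv/valgrind'   # illustrative
    CHROME_DIR = '/checkout/tools/valgrind'          # illustrative

    def add_chrome_suppressions(cmd):
      extra = [token.replace(SCRIPT_DIR, CHROME_DIR)
               for token in cmd if '--suppressions' in token]
      return extra + cmd  # Chrome's copies first, then the libyuv originals

    cmd = ['valgrind',
           '--suppressions=' + SCRIPT_DIR + '/memcheck/suppressions.txt']
    print(add_chrome_suppressions(cmd))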
+ +# Set up some paths and re-direct the arguments to libyuv_tests.py + +# This script is a copy of the chrome_tests.sh wrapper script with the following +# changes: +# - The locate_valgrind.sh of Chromium's Valgrind scripts dir is used to locate +# the Valgrind framework install. If it fails, a fallback path is used instead +# (../../chromium/src/third_party/valgrind/linux_x64) and a warning message +# is shown by |show_locate_valgrind_failed_warning|. +# - libyuv_tests.py is invoked instead of chrome_tests.py. +# - Chromium's Valgrind scripts directory is added to the PYTHONPATH to make it +# possible to execute the Python scripts properly. + +export THISDIR=`dirname $0` +ARGV_COPY="$@" + +# We need to set CHROME_VALGRIND iff using Memcheck: +# tools_libyuv/valgrind/libyuv_tests.sh --tool memcheck +# or +# tools_libyuv/valgrind/libyuv_tests.sh --tool=memcheck +tool="memcheck" # Default to memcheck. +while (( "$#" )) +do + if [[ "$1" == "--tool" ]] + then + tool="$2" + shift + elif [[ "$1" =~ --tool=(.*) ]] + then + tool="${BASH_REMATCH[1]}" + fi + shift +done + +NEEDS_VALGRIND=0 + +case "$tool" in + "memcheck") + NEEDS_VALGRIND=1 + ;; +esac + +# For libyuv, we'll use the locate_valgrind.sh script in Chromium's Valgrind +# scripts dir to locate the Valgrind framework install +CHROME_VALGRIND_SCRIPTS=$THISDIR/../../tools/valgrind + +if [ "$NEEDS_VALGRIND" == "1" ] +then + CHROME_VALGRIND=`sh $CHROME_VALGRIND_SCRIPTS/locate_valgrind.sh` + if [ "$CHROME_VALGRIND" = "" ] + then + CHROME_VALGRIND=../../src/third_party/valgrind/linux_x64 + echo + echo "-------------------- WARNING ------------------------" + echo "locate_valgrind.sh failed." + echo "Using $CHROME_VALGRIND as a fallback location." + echo "This might be because:" + echo "1) This is a swarming bot" + echo "2) You haven't set up the valgrind binaries correctly." + echo "In this case, please make sure you have followed the instructions at" + echo "http://www.chromium.org/developers/how-tos/using-valgrind/get-valgrind" + echo "Notice: In the .gclient file, you need to add this for the 'libyuv'" + echo "solution since our directory structure is different from Chromium's:" + echo "\"custom_deps\": {" + echo " \"libyuv/third_party/valgrind\":" + echo " \"https://chromium.googlesource.com/chromium/deps/valgrind/binaries\"," + echo "}," + echo "-----------------------------------------------------" + echo + fi + echo "Using valgrind binaries from ${CHROME_VALGRIND}" + + PATH="${CHROME_VALGRIND}/bin:$PATH" + # We need to set these variables to override default lib paths hard-coded into + # the Valgrind binary. + export VALGRIND_LIB="$CHROME_VALGRIND/lib/valgrind" + export VALGRIND_LIB_INNER="$CHROME_VALGRIND/lib/valgrind" + + # Clean up some /tmp directories that might be stale due to interrupted + # chrome_tests.py execution. + # FYI: + # -mtime +1 <- only print files modified more than 24h ago, + # -print0/-0 are needed to handle possible newlines in the filenames.
+ echo "Cleanup /tmp from Valgrind stuff" + find /tmp -maxdepth 1 \(\ + -name "vgdb-pipe-*" -or -name "vg_logs_*" -or -name "valgrind.*" \ + \) -mtime +1 -print0 | xargs -0 rm -rf +fi + +# Add Chrome's Valgrind scripts dir to the PYTHON_PATH since it contains +# the scripts that are needed for this script to run +PYTHONPATH=$THISDIR/../../tools/python/google:$CHROME_VALGRIND_SCRIPTS python \ + "$THISDIR/libyuv_tests.py" $ARGV_COPY diff --git a/files/tools_libyuv/valgrind/memcheck/OWNERS b/files/tools_libyuv/valgrind/memcheck/OWNERS new file mode 100644 index 00000000..72e8ffc0 --- /dev/null +++ b/files/tools_libyuv/valgrind/memcheck/OWNERS @@ -0,0 +1 @@ +* diff --git a/files/tools_libyuv/valgrind/memcheck/PRESUBMIT.py b/files/tools_libyuv/valgrind/memcheck/PRESUBMIT.py new file mode 100644 index 00000000..03329214 --- /dev/null +++ b/files/tools_libyuv/valgrind/memcheck/PRESUBMIT.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python +# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved. +# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. All contributing project authors may +# be found in the AUTHORS file in the root of the source tree. + +""" +Copied from Chrome's src/tools/valgrind/memcheck/PRESUBMIT.py + +See http://dev.chromium.org/developers/how-tos/depottools/presubmit-scripts +for more details on the presubmit API built into gcl. +""" + +import os +import re +import sys + +def CheckChange(input_api, output_api): + """Checks the memcheck suppressions files for bad data.""" + + # Add the path to the Chrome valgrind dir to the import path: + tools_vg_path = os.path.join(input_api.PresubmitLocalPath(), '..', '..', '..', + 'tools', 'valgrind') + sys.path.append(tools_vg_path) + import suppressions + + sup_regex = re.compile('suppressions.*\.txt$') + suppressions = {} + errors = [] + check_for_memcheck = False + # skip_next_line has 3 possible values: + # - False: don't skip the next line. + # - 'skip_suppression_name': the next line is a suppression name, skip. + # - 'skip_param': the next line is a system call parameter error, skip. + skip_next_line = False + for f in filter(lambda x: sup_regex.search(x.LocalPath()), + input_api.AffectedFiles()): + for line, line_num in zip(f.NewContents(), + xrange(1, len(f.NewContents()) + 1)): + line = line.lstrip() + if line.startswith('#') or not line: + continue + + if skip_next_line: + if skip_next_line == 'skip_suppression_name': + if 'insert_a_suppression_name_here' in line: + errors.append('"insert_a_suppression_name_here" is not a valid ' + 'suppression name') + if suppressions.has_key(line): + if f.LocalPath() == suppressions[line][1]: + errors.append('suppression with name "%s" at %s line %s ' + 'has already been defined at line %s' % + (line, f.LocalPath(), line_num, + suppressions[line][1])) + else: + errors.append('suppression with name "%s" at %s line %s ' + 'has already been defined at %s line %s' % + (line, f.LocalPath(), line_num, + suppressions[line][0], suppressions[line][1])) + else: + suppressions[line] = (f, line_num) + check_for_memcheck = True; + skip_next_line = False + continue + if check_for_memcheck: + if not line.startswith('Memcheck:'): + errors.append('"%s" should be "Memcheck:..." 
in %s line %s' % + (line, f.LocalPath(), line_num)) + check_for_memcheck = False; + if line == '{': + skip_next_line = 'skip_suppression_name' + continue + if line == "Memcheck:Param": + skip_next_line = 'skip_param' + continue + + if (line.startswith('fun:') or line.startswith('obj:') or + line.startswith('Memcheck:') or line == '}' or + line == '...'): + continue + errors.append('"%s" is probably wrong: %s line %s' % (line, f.LocalPath(), + line_num)) + if errors: + return [output_api.PresubmitError('\n'.join(errors))] + return [] + +def CheckChangeOnUpload(input_api, output_api): + return CheckChange(input_api, output_api) + +def CheckChangeOnCommit(input_api, output_api): + return CheckChange(input_api, output_api) + +def GetPreferredTrySlaves(): + # We don't have any memcheck slaves yet, so there's no use for this method. + # When we have, the slave name(s) should be put into this list. + return [] diff --git a/files/tools_libyuv/valgrind/memcheck/suppressions.txt b/files/tools_libyuv/valgrind/memcheck/suppressions.txt new file mode 100644 index 00000000..3ad0c8cc --- /dev/null +++ b/files/tools_libyuv/valgrind/memcheck/suppressions.txt @@ -0,0 +1,5 @@ +# This file is used in addition to the one already maintained in Chrome. +# It acts as a place holder for future additions for this project. +# It must exist for the Python wrapper script to work properly. + + diff --git a/files/tools_libyuv/valgrind/memcheck/suppressions_mac.txt b/files/tools_libyuv/valgrind/memcheck/suppressions_mac.txt new file mode 100644 index 00000000..3ad0c8cc --- /dev/null +++ b/files/tools_libyuv/valgrind/memcheck/suppressions_mac.txt @@ -0,0 +1,5 @@ +# This file is used in addition to the one already maintained in Chrome. +# It acts as a place holder for future additions for this project. +# It must exist for the Python wrapper script to work properly. + + diff --git a/files/tools_libyuv/valgrind/memcheck/suppressions_win32.txt b/files/tools_libyuv/valgrind/memcheck/suppressions_win32.txt new file mode 100644 index 00000000..3ad0c8cc --- /dev/null +++ b/files/tools_libyuv/valgrind/memcheck/suppressions_win32.txt @@ -0,0 +1,5 @@ +# This file is used in addition to the one already maintained in Chrome. +# It acts as a place holder for future additions for this project. +# It must exist for the Python wrapper script to work properly. 
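The PRESUBMIT check above encodes the shape every entry in these (for now deliberately empty) suppression files must take: an opening brace, a suppression name, a Memcheck:* type line, then fun:/obj: frames or '...'. An entry that would pass the checker, with a made-up name and frames:

    {
       libyuv_illustrative_leak_suppression
       Memcheck:Leak
       fun:malloc
       obj:*/libillustrative.so
       ...
    }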
+ + diff --git a/files/unit_test/color_test.cc b/files/unit_test/color_test.cc index 36041d99..0aa7a54a 100644 --- a/files/unit_test/color_test.cc +++ b/files/unit_test/color_test.cc @@ -10,13 +10,13 @@ #include <stdlib.h> +#include "../unit_test/unit_test.h" #include "libyuv/basic_types.h" #include "libyuv/convert.h" #include "libyuv/convert_argb.h" #include "libyuv/convert_from.h" #include "libyuv/convert_from_argb.h" #include "libyuv/cpu_id.h" -#include "../unit_test/unit_test.h" namespace libyuv { @@ -38,110 +38,103 @@ namespace libyuv { #define TESTCS(TESTNAME, YUVTOARGB, ARGBTOYUV, HS1, HS, HN, DIFF) \ TEST_F(LibYUVColorTest, TESTNAME) { \ - const int kPixels = benchmark_width_ * benchmark_height_; \ - const int kHalfPixels = ((benchmark_width_ + 1) / 2) * \ - ((benchmark_height_ + HS1) / HS); \ - align_buffer_page_end(orig_y, kPixels); \ - align_buffer_page_end(orig_u, kHalfPixels); \ - align_buffer_page_end(orig_v, kHalfPixels); \ - align_buffer_page_end(orig_pixels, kPixels * 4); \ - align_buffer_page_end(temp_y, kPixels); \ - align_buffer_page_end(temp_u, kHalfPixels); \ - align_buffer_page_end(temp_v, kHalfPixels); \ - align_buffer_page_end(dst_pixels_opt, kPixels * 4); \ - align_buffer_page_end(dst_pixels_c, kPixels * 4); \ + const int kPixels = benchmark_width_ * benchmark_height_; \ + const int kHalfPixels = \ + ((benchmark_width_ + 1) / 2) * ((benchmark_height_ + HS1) / HS); \ + align_buffer_page_end(orig_y, kPixels); \ + align_buffer_page_end(orig_u, kHalfPixels); \ + align_buffer_page_end(orig_v, kHalfPixels); \ + align_buffer_page_end(orig_pixels, kPixels * 4); \ + align_buffer_page_end(temp_y, kPixels); \ + align_buffer_page_end(temp_u, kHalfPixels); \ + align_buffer_page_end(temp_v, kHalfPixels); \ + align_buffer_page_end(dst_pixels_opt, kPixels * 4); \ + align_buffer_page_end(dst_pixels_c, kPixels * 4); \ \ - MemRandomize(orig_pixels, kPixels * 4); \ - MemRandomize(orig_y, kPixels); \ - MemRandomize(orig_u, kHalfPixels); \ - MemRandomize(orig_v, kHalfPixels); \ - MemRandomize(temp_y, kPixels); \ - MemRandomize(temp_u, kHalfPixels); \ - MemRandomize(temp_v, kHalfPixels); \ - MemRandomize(dst_pixels_opt, kPixels * 4); \ - MemRandomize(dst_pixels_c, kPixels * 4); \ + MemRandomize(orig_pixels, kPixels * 4); \ + MemRandomize(orig_y, kPixels); \ + MemRandomize(orig_u, kHalfPixels); \ + MemRandomize(orig_v, kHalfPixels); \ + MemRandomize(temp_y, kPixels); \ + MemRandomize(temp_u, kHalfPixels); \ + MemRandomize(temp_v, kHalfPixels); \ + MemRandomize(dst_pixels_opt, kPixels * 4); \ + MemRandomize(dst_pixels_c, kPixels * 4); \ \ - /* The test is overall for color conversion matrix being reversible, so */ \ - /* this initializes the pixel with 2x2 blocks to eliminate subsampling. */ \ - uint8* p = orig_y; \ - for (int y = 0; y < benchmark_height_ - HS1; y += HS) { \ - for (int x = 0; x < benchmark_width_ - 1; x += 2) { \ - uint8 r = static_cast<uint8>(fastrand()); \ - p[0] = r; \ - p[1] = r; \ - p[HN] = r; \ - p[HN + 1] = r; \ - p += 2; \ - } \ - if (benchmark_width_ & 1) { \ - uint8 r = static_cast<uint8>(fastrand()); \ - p[0] = r; \ - p[HN] = r; \ - p += 1; \ + /* The test is overall for color conversion matrix being reversible, so */ \ + /* this initializes the pixel with 2x2 blocks to eliminate subsampling. 
*/ \ + uint8* p = orig_y; \ + for (int y = 0; y < benchmark_height_ - HS1; y += HS) { \ + for (int x = 0; x < benchmark_width_ - 1; x += 2) { \ + uint8 r = static_cast<uint8>(fastrand()); \ + p[0] = r; \ + p[1] = r; \ + p[HN] = r; \ + p[HN + 1] = r; \ + p += 2; \ + } \ + if (benchmark_width_ & 1) { \ + uint8 r = static_cast<uint8>(fastrand()); \ + p[0] = r; \ + p[HN] = r; \ + p += 1; \ + } \ + p += HN; \ } \ - p += HN; \ - } \ - if ((benchmark_height_ & 1) && HS == 2) { \ - for (int x = 0; x < benchmark_width_ - 1; x += 2) { \ - uint8 r = static_cast<uint8>(fastrand()); \ - p[0] = r; \ - p[1] = r; \ - p += 2; \ + if ((benchmark_height_ & 1) && HS == 2) { \ + for (int x = 0; x < benchmark_width_ - 1; x += 2) { \ + uint8 r = static_cast<uint8>(fastrand()); \ + p[0] = r; \ + p[1] = r; \ + p += 2; \ + } \ + if (benchmark_width_ & 1) { \ + uint8 r = static_cast<uint8>(fastrand()); \ + p[0] = r; \ + p += 1; \ + } \ } \ - if (benchmark_width_ & 1) { \ - uint8 r = static_cast<uint8>(fastrand()); \ - p[0] = r; \ - p += 1; \ - } \ - } \ - /* Start with YUV converted to ARGB. */ \ - YUVTOARGB(orig_y, benchmark_width_, \ - orig_u, (benchmark_width_ + 1) / 2, \ - orig_v, (benchmark_width_ + 1) / 2, \ - orig_pixels, benchmark_width_ * 4, \ - benchmark_width_, benchmark_height_); \ + /* Start with YUV converted to ARGB. */ \ + YUVTOARGB(orig_y, benchmark_width_, orig_u, (benchmark_width_ + 1) / 2, \ + orig_v, (benchmark_width_ + 1) / 2, orig_pixels, \ + benchmark_width_ * 4, benchmark_width_, benchmark_height_); \ \ - ARGBTOYUV(orig_pixels, benchmark_width_ * 4, \ - temp_y, benchmark_width_, \ - temp_u, (benchmark_width_ + 1) / 2, \ - temp_v, (benchmark_width_ + 1) / 2, \ - benchmark_width_, benchmark_height_); \ + ARGBTOYUV(orig_pixels, benchmark_width_ * 4, temp_y, benchmark_width_, \ + temp_u, (benchmark_width_ + 1) / 2, temp_v, \ + (benchmark_width_ + 1) / 2, benchmark_width_, \ + benchmark_height_); \ \ - MaskCpuFlags(disable_cpu_flags_); \ - YUVTOARGB(temp_y, benchmark_width_, \ - temp_u, (benchmark_width_ + 1) / 2, \ - temp_v, (benchmark_width_ + 1) / 2, \ - dst_pixels_c, benchmark_width_ * 4, \ - benchmark_width_, benchmark_height_); \ - MaskCpuFlags(benchmark_cpu_info_); \ + MaskCpuFlags(disable_cpu_flags_); \ + YUVTOARGB(temp_y, benchmark_width_, temp_u, (benchmark_width_ + 1) / 2, \ + temp_v, (benchmark_width_ + 1) / 2, dst_pixels_c, \ + benchmark_width_ * 4, benchmark_width_, benchmark_height_); \ + MaskCpuFlags(benchmark_cpu_info_); \ \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - YUVTOARGB(temp_y, benchmark_width_, \ - temp_u, (benchmark_width_ + 1) / 2, \ - temp_v, (benchmark_width_ + 1) / 2, \ - dst_pixels_opt, benchmark_width_ * 4, \ - benchmark_width_, benchmark_height_); \ - } \ - /* Test C and SIMD match. */ \ - for (int i = 0; i < kPixels * 4; ++i) { \ - EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \ - } \ - /* Test SIMD is close to original. */ \ - for (int i = 0; i < kPixels * 4; ++i) { \ - EXPECT_NEAR(static_cast<int>(orig_pixels[i]), \ - static_cast<int>(dst_pixels_opt[i]), DIFF); \ - } \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + YUVTOARGB(temp_y, benchmark_width_, temp_u, (benchmark_width_ + 1) / 2, \ + temp_v, (benchmark_width_ + 1) / 2, dst_pixels_opt, \ + benchmark_width_ * 4, benchmark_width_, benchmark_height_); \ + } \ + /* Test C and SIMD match. */ \ + for (int i = 0; i < kPixels * 4; ++i) { \ + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \ + } \ + /* Test SIMD is close to original. 
*/ \ + for (int i = 0; i < kPixels * 4; ++i) { \ + EXPECT_NEAR(static_cast<int>(orig_pixels[i]), \ + static_cast<int>(dst_pixels_opt[i]), DIFF); \ + } \ \ - free_aligned_buffer_page_end(orig_pixels); \ - free_aligned_buffer_page_end(orig_y); \ - free_aligned_buffer_page_end(orig_u); \ - free_aligned_buffer_page_end(orig_v); \ - free_aligned_buffer_page_end(temp_y); \ - free_aligned_buffer_page_end(temp_u); \ - free_aligned_buffer_page_end(temp_v); \ - free_aligned_buffer_page_end(dst_pixels_opt); \ - free_aligned_buffer_page_end(dst_pixels_c); \ -} \ + free_aligned_buffer_page_end(orig_pixels); \ + free_aligned_buffer_page_end(orig_y); \ + free_aligned_buffer_page_end(orig_u); \ + free_aligned_buffer_page_end(orig_v); \ + free_aligned_buffer_page_end(temp_y); \ + free_aligned_buffer_page_end(temp_u); \ + free_aligned_buffer_page_end(temp_v); \ + free_aligned_buffer_page_end(dst_pixels_opt); \ + free_aligned_buffer_page_end(dst_pixels_c); \ + } TESTCS(TestI420, I420ToARGB, ARGBToI420, 1, 2, benchmark_width_, ERROR_FULL) TESTCS(TestI422, I422ToARGB, ARGBToI422, 0, 1, 0, ERROR_FULL) @@ -163,11 +156,8 @@ static void YUVToRGB(int y, int u, int v, int* r, int* g, int* b) { memset(orig_v, v, kHalfPixels); /* YUV converted to ARGB. */ - I422ToARGB(orig_y, kWidth, - orig_u, (kWidth + 1) / 2, - orig_v, (kWidth + 1) / 2, - orig_pixels, kWidth * 4, - kWidth, kHeight); + I422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2, + orig_pixels, kWidth * 4, kWidth, kHeight); *b = orig_pixels[0]; *g = orig_pixels[1]; @@ -189,11 +179,8 @@ static void YUVJToRGB(int y, int u, int v, int* r, int* g, int* b) { memset(orig_v, v, kHalfPixels); /* YUV converted to ARGB. */ - J422ToARGB(orig_y, kWidth, - orig_u, (kWidth + 1) / 2, - orig_v, (kWidth + 1) / 2, - orig_pixels, kWidth * 4, - kWidth, kHeight); + J422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2, + orig_pixels, kWidth * 4, kWidth, kHeight); *b = orig_pixels[0]; *g = orig_pixels[1]; @@ -248,7 +235,7 @@ static void YJToRGB(int y, int* r, int* g, int* b) { #if defined(CLAMPMETHOD_IF) static int RoundToByte(float f) { - int i = ROUND(f); + int i = ROUND(f); if (i < 0) { i = 0; } @@ -259,52 +246,61 @@ static int RoundToByte(float f) { } #elif defined(CLAMPMETHOD_TABLE) static const unsigned char clamptable[811] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, - 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, - 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, - 86, 87, 88, 89, 90, 91, 92, 
93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, - 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, - 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, - 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, - 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, - 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, - 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, - 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, - 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, - 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, - 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 -}; + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, + 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, + 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, + 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, + 99, 100, 101, 102, 103, 
104, 105, 106, 107, 108, 109, 110, 111, 112, 113, + 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, + 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, + 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, + 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, + 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, + 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, + 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, + 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, + 249, 250, 251, 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255}; static int RoundToByte(float f) { return clamptable[ROUND(f) + 276]; @@ -317,7 +313,7 @@ static int RoundToByte(float f) { #elif defined(CLAMPMETHOD_MASK) static int RoundToByte(float f) { int i = ROUND(f); - i = ((-(i) >> 31) & (i)); // clamp to 0. + i = ((-(i) >> 31) & (i)); // clamp to 0. return (((255 - (i)) >> 31) | (i)) & 255; // clamp to 255. 
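   // The branchless clamp above assumes a 32-bit int and a sign-propagating
   // right shift (implementation-defined in C++, but true of the compilers
   // libyuv targets). For i > 0, (-(i)) >> 31 is all ones, so the AND keeps
   // i; for i <= 0 it is zero, clamping to 0. Then (255 - (i)) >> 31 is all
   // ones only when i > 255, so the OR saturates the value and the final
   // "& 255" yields 255.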
} #endif @@ -433,7 +429,6 @@ TEST_F(LibYUVColorTest, TestGreyYUV) { EXPECT_EQ(130, g1); EXPECT_EQ(130, b1); - for (int y = 0; y < 256; ++y) { YUVToRGBReference(y, 128, 128, &r0, &g0, &b0); YUVToRGB(y, 128, 128, &r1, &g1, &b1); @@ -477,7 +472,17 @@ static void PrintHistogram(int rh[256], int gh[256], int bh[256]) { } TEST_F(LibYUVColorTest, TestFullYUV) { - int rh[256] = { 0, }, gh[256] = { 0, }, bh[256] = { 0, }; + int rh[256] = + { + 0, + }, + gh[256] = + { + 0, + }, + bh[256] = { + 0, + }; for (int u = 0; u < 256; ++u) { for (int v = 0; v < 256; ++v) { for (int y2 = 0; y2 < 256; ++y2) { @@ -498,7 +503,17 @@ TEST_F(LibYUVColorTest, TestFullYUV) { } TEST_F(LibYUVColorTest, TestFullYUVJ) { - int rh[256] = { 0, }, gh[256] = { 0, }, bh[256] = { 0, }; + int rh[256] = + { + 0, + }, + gh[256] = + { + 0, + }, + bh[256] = { + 0, + }; for (int u = 0; u < 256; ++u) { for (int v = 0; v < 256; ++v) { for (int y2 = 0; y2 < 256; ++y2) { diff --git a/files/unit_test/compare_test.cc b/files/unit_test/compare_test.cc index a8ce671d..13f74705 100644 --- a/files/unit_test/compare_test.cc +++ b/files/unit_test/compare_test.cc @@ -36,7 +36,8 @@ TEST_F(LibYUVBaseTest, Djb2_Test) { align_buffer_page_end(src_a, kMaxTest); align_buffer_page_end(src_b, kMaxTest); - const char* fox = "The quick brown fox jumps over the lazy dog" + const char* fox = + "The quick brown fox jumps over the lazy dog" " and feels as if he were in the seventh heaven of typography" " together with Hermann Zapf"; uint32 foxhash = HashDjb2(reinterpret_cast<const uint8*>(fox), 131, 5381); @@ -155,21 +156,21 @@ TEST_F(LibYUVBaseTest, BenchmarkARGBDetect_Opt) { } src_a[0] = 0; - fourcc = ARGBDetect(src_a, benchmark_width_ * 4, - benchmark_width_, benchmark_height_); - EXPECT_EQ(libyuv::FOURCC_BGRA, fourcc); + fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_, + benchmark_height_); + EXPECT_EQ(static_cast<uint32>(libyuv::FOURCC_BGRA), fourcc); src_a[0] = 255; src_a[3] = 0; - fourcc = ARGBDetect(src_a, benchmark_width_ * 4, - benchmark_width_, benchmark_height_); - EXPECT_EQ(libyuv::FOURCC_ARGB, fourcc); + fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_, + benchmark_height_); + EXPECT_EQ(static_cast<uint32>(libyuv::FOURCC_ARGB), fourcc); src_a[3] = 255; for (int i = 0; i < benchmark_iterations_; ++i) { - fourcc = ARGBDetect(src_a, benchmark_width_ * 4, - benchmark_width_, benchmark_height_); + fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_, + benchmark_height_); } - EXPECT_EQ(0, fourcc); + EXPECT_EQ(0u, fourcc); free_aligned_buffer_page_end(src_a); } @@ -183,21 +184,21 @@ TEST_F(LibYUVBaseTest, BenchmarkARGBDetect_Unaligned) { } src_a[0 + 1] = 0; - fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, - benchmark_width_, benchmark_height_); - EXPECT_EQ(libyuv::FOURCC_BGRA, fourcc); + fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_, + benchmark_height_); + EXPECT_EQ(static_cast<uint32>(libyuv::FOURCC_BGRA), fourcc); src_a[0 + 1] = 255; src_a[3 + 1] = 0; - fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, - benchmark_width_, benchmark_height_); - EXPECT_EQ(libyuv::FOURCC_ARGB, fourcc); + fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_, + benchmark_height_); + EXPECT_EQ(static_cast<uint32>(libyuv::FOURCC_ARGB), fourcc); src_a[3 + 1] = 255; for (int i = 0; i < benchmark_iterations_; ++i) { - fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, - benchmark_width_, benchmark_height_); + fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_, 
+ benchmark_height_); } - EXPECT_EQ(0, fourcc); + EXPECT_EQ(0u, fourcc); free_aligned_buffer_page_end(src_a); } @@ -220,13 +221,14 @@ TEST_F(LibYUVBaseTest, BenchmarkSumSquareError_Opt) { memset(src_a, 0, kMaxWidth); memset(src_b, 0, kMaxWidth); - int count = benchmark_iterations_ * - ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth); + int count = + benchmark_iterations_ * + ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth); for (int i = 0; i < count; ++i) { h1 = ComputeSumSquareError(src_a, src_b, kMaxWidth); } - EXPECT_EQ(0, h1); + EXPECT_EQ(0u, h1); free_aligned_buffer_page_end(src_a); free_aligned_buffer_page_end(src_b); @@ -242,18 +244,18 @@ TEST_F(LibYUVBaseTest, SumSquareError) { uint64 err; err = ComputeSumSquareError(src_a, src_b, kMaxWidth); - EXPECT_EQ(0, err); + EXPECT_EQ(0u, err); memset(src_a, 1, kMaxWidth); err = ComputeSumSquareError(src_a, src_b, kMaxWidth); - EXPECT_EQ(err, kMaxWidth); + EXPECT_EQ(static_cast<int>(err), kMaxWidth); memset(src_a, 190, kMaxWidth); memset(src_b, 193, kMaxWidth); err = ComputeSumSquareError(src_a, src_b, kMaxWidth); - EXPECT_EQ(kMaxWidth * 3 * 3, err); + EXPECT_EQ(static_cast<int>(err), kMaxWidth * 3 * 3); for (int i = 0; i < kMaxWidth; ++i) { src_a[i] = (fastrand() & 0xff); @@ -284,8 +286,7 @@ TEST_F(LibYUVBaseTest, BenchmarkPsnr_Opt) { double opt_time = get_time(); for (int i = 0; i < benchmark_iterations_; ++i) - CalcFramePsnr(src_a, benchmark_width_, - src_b, benchmark_width_, + CalcFramePsnr(src_a, benchmark_width_, src_b, benchmark_width_, benchmark_width_, benchmark_height_); opt_time = (get_time() - opt_time) / benchmark_iterations_; @@ -309,8 +310,7 @@ TEST_F(LibYUVBaseTest, BenchmarkPsnr_Unaligned) { double opt_time = get_time(); for (int i = 0; i < benchmark_iterations_; ++i) - CalcFramePsnr(src_a + 1, benchmark_width_, - src_b, benchmark_width_, + CalcFramePsnr(src_a + 1, benchmark_width_, src_b, benchmark_width_, benchmark_width_, benchmark_height_); opt_time = (get_time() - opt_time) / benchmark_iterations_; @@ -335,24 +335,24 @@ TEST_F(LibYUVBaseTest, Psnr) { double err; err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride, - src_b + kSrcStride * b + b, kSrcStride, - kSrcWidth, kSrcHeight); + src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, + kSrcHeight); EXPECT_EQ(err, kMaxPsnr); memset(src_a, 255, kSrcPlaneSize); err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride, - src_b + kSrcStride * b + b, kSrcStride, - kSrcWidth, kSrcHeight); + src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, + kSrcHeight); EXPECT_EQ(err, 0.0); memset(src_a, 1, kSrcPlaneSize); err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride, - src_b + kSrcStride * b + b, kSrcStride, - kSrcWidth, kSrcHeight); + src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, + kSrcHeight); EXPECT_GT(err, 48.0); EXPECT_LT(err, 49.0); @@ -362,8 +362,8 @@ TEST_F(LibYUVBaseTest, Psnr) { } err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride, - src_b + kSrcStride * b + b, kSrcStride, - kSrcWidth, kSrcHeight); + src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, + kSrcHeight); EXPECT_GT(err, 2.0); if (kSrcWidth * kSrcHeight >= 256) { @@ -384,14 +384,14 @@ TEST_F(LibYUVBaseTest, Psnr) { double c_err, opt_err; c_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride, - src_b + kSrcStride * b + b, kSrcStride, - kSrcWidth, kSrcHeight); + src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, + kSrcHeight); MaskCpuFlags(benchmark_cpu_info_); opt_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride, - src_b + 
kSrcStride * b + b, kSrcStride, - kSrcWidth, kSrcHeight); + src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, + kSrcHeight); EXPECT_EQ(opt_err, c_err); @@ -411,8 +411,7 @@ TEST_F(LibYUVBaseTest, DISABLED_BenchmarkSsim_Opt) { double opt_time = get_time(); for (int i = 0; i < benchmark_iterations_; ++i) - CalcFrameSsim(src_a, benchmark_width_, - src_b, benchmark_width_, + CalcFrameSsim(src_a, benchmark_width_, src_b, benchmark_width_, benchmark_width_, benchmark_height_); opt_time = (get_time() - opt_time) / benchmark_iterations_; @@ -435,14 +434,14 @@ TEST_F(LibYUVBaseTest, Ssim) { memset(src_a, 0, kSrcPlaneSize); memset(src_b, 0, kSrcPlaneSize); - if (kSrcWidth <=8 || kSrcHeight <= 8) { + if (kSrcWidth <= 8 || kSrcHeight <= 8) { printf("warning - Ssim size too small. Testing function executes.\n"); } double err; err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride, - src_b + kSrcStride * b + b, kSrcStride, - kSrcWidth, kSrcHeight); + src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, + kSrcHeight); if (kSrcWidth > 8 && kSrcHeight > 8) { EXPECT_EQ(err, 1.0); @@ -451,8 +450,8 @@ TEST_F(LibYUVBaseTest, Ssim) { memset(src_a, 255, kSrcPlaneSize); err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride, - src_b + kSrcStride * b + b, kSrcStride, - kSrcWidth, kSrcHeight); + src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, + kSrcHeight); if (kSrcWidth > 8 && kSrcHeight > 8) { EXPECT_LT(err, 0.0001); @@ -461,8 +460,8 @@ TEST_F(LibYUVBaseTest, Ssim) { memset(src_a, 1, kSrcPlaneSize); err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride, - src_b + kSrcStride * b + b, kSrcStride, - kSrcWidth, kSrcHeight); + src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, + kSrcHeight); if (kSrcWidth > 8 && kSrcHeight > 8) { EXPECT_GT(err, 0.0001); @@ -474,8 +473,8 @@ TEST_F(LibYUVBaseTest, Ssim) { } err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride, - src_b + kSrcStride * b + b, kSrcStride, - kSrcWidth, kSrcHeight); + src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, + kSrcHeight); if (kSrcWidth > 8 && kSrcHeight > 8) { EXPECT_GT(err, 0.0); @@ -493,14 +492,14 @@ TEST_F(LibYUVBaseTest, Ssim) { double c_err, opt_err; c_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride, - src_b + kSrcStride * b + b, kSrcStride, - kSrcWidth, kSrcHeight); + src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, + kSrcHeight); MaskCpuFlags(benchmark_cpu_info_); opt_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride, - src_b + kSrcStride * b + b, kSrcStride, - kSrcWidth, kSrcHeight); + src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, + kSrcHeight); if (kSrcWidth > 8 && kSrcHeight > 8) { EXPECT_EQ(opt_err, c_err); diff --git a/files/unit_test/convert_test.cc b/files/unit_test/convert_test.cc index 56a2bfd8..0f1c7430 100644 --- a/files/unit_test/convert_test.cc +++ b/files/unit_test/convert_test.cc @@ -21,470 +21,546 @@ #ifdef HAVE_JPEG #include "libyuv/mjpeg_decoder.h" #endif +#include "../unit_test/unit_test.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" #include "libyuv/video_common.h" -#include "../unit_test/unit_test.h" namespace libyuv { -#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a)) - -#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \ -TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ - const int kHeight = benchmark_height_; \ - align_buffer_page_end(src_y, kWidth * kHeight + OFF); \ - align_buffer_page_end(src_u, \ - SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \ - align_buffer_page_end(src_v, \ - SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \ - align_buffer_page_end(dst_y_c, kWidth * kHeight); \ - align_buffer_page_end(dst_u_c, \ - SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_v_c, \ - SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_y_opt, kWidth * kHeight); \ - align_buffer_page_end(dst_u_opt, \ - SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_v_opt, \ - SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - for (int i = 0; i < kHeight; ++i) \ - for (int j = 0; j < kWidth; ++j) \ - src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \ - for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ - for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ - src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ - (fastrand() & 0xff); \ - src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ - (fastrand() & 0xff); \ - } \ - } \ - memset(dst_y_c, 1, kWidth * kHeight); \ - memset(dst_u_c, 2, SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_v_c, 3, SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_y_opt, 101, kWidth * kHeight); \ - memset(dst_u_opt, 102, SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_v_opt, 103, SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - MaskCpuFlags(disable_cpu_flags_); \ - SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ - src_u + OFF, \ - SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ - src_v + OFF, \ - SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ - dst_y_c, kWidth, \ - dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \ - dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \ - kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ - src_u + OFF, \ - SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ - src_v + OFF, \ - SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ - dst_y_opt, kWidth, \ - dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \ - dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \ - kWidth, NEG kHeight); \ - } \ - int max_diff = 0; \ - for (int i = 0; i < kHeight; ++i) { \ - for (int j = 0; j < kWidth; ++j) { \ - int abs_diff = \ - abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ - static_cast<int>(dst_y_opt[i * kWidth + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ - } \ - } \ - EXPECT_EQ(0, max_diff); \ - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ - for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ - int abs_diff = \ - abs(static_cast<int>(dst_u_c[i * \ - SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ - static_cast<int>(dst_u_opt[i * \ - SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ - } \ - } \ - EXPECT_LE(max_diff, 3); \ - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ - for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ - int abs_diff = \ - abs(static_cast<int>(dst_v_c[i * \ - SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ - static_cast<int>(dst_v_opt[i * \ - SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ - 
if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ - } \ - } \ - EXPECT_LE(max_diff, 3); \ - free_aligned_buffer_page_end(dst_y_c); \ - free_aligned_buffer_page_end(dst_u_c); \ - free_aligned_buffer_page_end(dst_v_c); \ - free_aligned_buffer_page_end(dst_y_opt); \ - free_aligned_buffer_page_end(dst_u_opt); \ - free_aligned_buffer_page_end(dst_v_opt); \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_u); \ - free_aligned_buffer_page_end(src_v); \ -} +#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a)) + +#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \ + TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = benchmark_height_; \ + align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ + align_buffer_page_end(src_u, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \ + OFF); \ + align_buffer_page_end(src_v, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \ + OFF); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight); \ + align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kWidth; ++j) \ + src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ + src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ + (fastrand() & 0xff); \ + src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ + (fastrand() & 0xff); \ + } \ + } \ + memset(dst_y_c, 1, kWidth* kHeight); \ + memset(dst_u_c, 2, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_v_c, 3, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_y_opt, 101, kWidth* kHeight); \ + memset(dst_u_opt, 102, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_v_opt, 103, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + MaskCpuFlags(disable_cpu_flags_); \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \ + dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \ + SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \ + dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_opt, \ + SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ + static_cast<int>(dst_y_opt[i * kWidth + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + 
EXPECT_EQ(0, max_diff); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ + int abs_diff = abs( \ + static_cast<int>(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ + static_cast<int>( \ + dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 3); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ + int abs_diff = abs( \ + static_cast<int>(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ + static_cast<int>( \ + dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 3); \ + free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_u_c); \ + free_aligned_buffer_page_end(dst_v_c); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_u_opt); \ + free_aligned_buffer_page_end(dst_v_opt); \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_u); \ + free_aligned_buffer_page_end(src_v); \ + } -#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ - TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_ - 4, _Any, +, 0) \ - TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Unaligned, +, 1) \ - TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Invert, -, 0) \ - TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Opt, +, 0) +#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0) \ + TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1) \ + TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0) \ + TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0) TESTPLANARTOP(I420, 2, 2, I420, 2, 2) TESTPLANARTOP(I422, 2, 1, I420, 2, 2) TESTPLANARTOP(I444, 1, 1, I420, 2, 2) -TESTPLANARTOP(I411, 4, 1, I420, 2, 2) TESTPLANARTOP(I420, 2, 2, I422, 2, 1) TESTPLANARTOP(I420, 2, 2, I444, 1, 1) -TESTPLANARTOP(I420, 2, 2, I411, 4, 1) TESTPLANARTOP(I420, 2, 2, I420Mirror, 2, 2) TESTPLANARTOP(I422, 2, 1, I422, 2, 1) TESTPLANARTOP(I444, 1, 1, I444, 1, 1) -#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \ -TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ - const int kHeight = benchmark_height_; \ - align_buffer_page_end(src_y, kWidth * kHeight + OFF); \ - align_buffer_page_end(src_u, \ - SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \ - align_buffer_page_end(src_v, \ - SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \ - align_buffer_page_end(dst_y_c, kWidth * kHeight); \ - align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_y_opt, kWidth * kHeight); \ - align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - for (int i = 0; i < kHeight; ++i) \ - for (int j = 0; j < kWidth; ++j) \ - src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \ - for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ - for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ - src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ - (fastrand() & 0xff); \ - src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ - (fastrand() & 0xff); \ - } \ - } \ - memset(dst_y_c, 1, kWidth * kHeight); \ - memset(dst_uv_c, 2, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_y_opt, 101, kWidth * kHeight); \ - memset(dst_uv_opt, 102, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - MaskCpuFlags(disable_cpu_flags_); \ - SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ - src_u + OFF, \ - SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ - src_v + OFF, \ - SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ - dst_y_c, kWidth, \ - dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), \ - kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ - src_u + OFF, \ - SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ - src_v + OFF, \ - SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ - dst_y_opt, kWidth, \ - dst_uv_opt, \ - SUBSAMPLE(kWidth * 2, SUBSAMP_X), \ - kWidth, NEG kHeight); \ - } \ - int max_diff = 0; \ - for (int i = 0; i < kHeight; ++i) { \ - for (int j = 0; j < kWidth; ++j) { \ - int abs_diff = \ - abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ - static_cast<int>(dst_y_opt[i * kWidth + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ - } \ - } \ - EXPECT_LE(max_diff, 1); \ - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ - for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \ - int abs_diff = \ - abs(static_cast<int>(dst_uv_c[i * \ - SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \ - static_cast<int>(dst_uv_opt[i * \ - SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ - } \ - } \ - EXPECT_LE(max_diff, 1); \ - free_aligned_buffer_page_end(dst_y_c); \ - free_aligned_buffer_page_end(dst_uv_c); \ - free_aligned_buffer_page_end(dst_y_opt); \ - free_aligned_buffer_page_end(dst_uv_opt); \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_u); \ - free_aligned_buffer_page_end(src_v); \ -} +// Test Android 420 to I420 +#define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + W1280, N, NEG, OFF, PN, OFF_U, OFF_V) \ + TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##_##PN##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kHeight = benchmark_height_; \ + const int kSizeUV = \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ + align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ + align_buffer_page_end(src_uv, \ + kSizeUV*((PIXEL_STRIDE == 3) ? 3 : 2) + OFF); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight); \ + align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + uint8* src_u = src_uv + OFF_U; \ + uint8* src_v = src_uv + (PIXEL_STRIDE == 1 ? kSizeUV : OFF_V); \ + int src_stride_uv = SUBSAMPLE(kWidth, SUBSAMP_X) * PIXEL_STRIDE; \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kWidth; ++j) \ + src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ + src_u[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \ + (fastrand() & 0xff); \ + src_v[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \ + (fastrand() & 0xff); \ + } \ + } \ + memset(dst_y_c, 1, kWidth* kHeight); \ + memset(dst_u_c, 2, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_v_c, 3, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_y_opt, 101, kWidth* kHeight); \ + memset(dst_u_opt, 102, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_v_opt, 103, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + MaskCpuFlags(disable_cpu_flags_); \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, dst_y_c, \ + kWidth, dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \ + SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, \ + dst_y_opt, kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \ + dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ + static_cast<int>(dst_y_opt[i * kWidth + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_EQ(0, max_diff); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ + int abs_diff = abs( \ + static_cast<int>(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ + static_cast<int>( \ + dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 3); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ + int abs_diff = abs( \ + static_cast<int>(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ + 
static_cast<int>( \ + dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 3); \ + free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_u_c); \ + free_aligned_buffer_page_end(dst_v_c); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_u_opt); \ + free_aligned_buffer_page_end(dst_v_opt); \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_uv); \ + } + +#define TESTAPLANARTOP(SRC_FMT_PLANAR, PN, PIXEL_STRIDE, OFF_U, OFF_V, \ + SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, \ + SUBSAMP_Y) \ + TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, \ + _Any, +, 0, PN, OFF_U, OFF_V) \ + TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, \ + _Unaligned, +, 1, PN, OFF_U, OFF_V) \ + TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, \ + -, 0, PN, OFF_U, OFF_V) \ + TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, \ + 0, PN, OFF_U, OFF_V) + +TESTAPLANARTOP(Android420, I420, 1, 0, 0, 2, 2, I420, 2, 2) +TESTAPLANARTOP(Android420, NV12, 2, 0, 1, 2, 2, I420, 2, 2) +TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2) + +#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \ + TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kHeight = benchmark_height_; \ + align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ + align_buffer_page_end(src_u, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \ + OFF); \ + align_buffer_page_end(src_v, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \ + OFF); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kWidth; ++j) \ + src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ + src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ + (fastrand() & 0xff); \ + src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ + (fastrand() & 0xff); \ + } \ + } \ + memset(dst_y_c, 1, kWidth* kHeight); \ + memset(dst_uv_c, 2, \ + SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_y_opt, 101, kWidth* kHeight); \ + memset(dst_uv_opt, 102, \ + SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + MaskCpuFlags(disable_cpu_flags_); \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \ + dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \ + dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ + static_cast<int>(dst_y_opt[i * kWidth + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 1); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \ + int abs_diff = \ + abs(static_cast<int>( \ + dst_uv_c[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \ + static_cast<int>( \ + dst_uv_opt[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 1); \ + free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_uv_c); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_uv_opt); \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_u); \ + free_aligned_buffer_page_end(src_v); \ + } -#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ - TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_ - 4, _Any, +, 0) \ - TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Unaligned, +, 1) \ - TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, 
SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Invert, -, 0) \ - TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Opt, +, 0) +#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0) \ + TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1) \ + TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0) \ + TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0) TESTPLANARTOBP(I420, 2, 2, NV12, 2, 2) TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2) #define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \ -TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ - const int kHeight = benchmark_height_; \ - align_buffer_page_end(src_y, kWidth * kHeight + OFF); \ - align_buffer_page_end(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \ - align_buffer_page_end(dst_y_c, kWidth * kHeight); \ - align_buffer_page_end(dst_u_c, \ - SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_v_c, \ - SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_y_opt, kWidth * kHeight); \ - align_buffer_page_end(dst_u_opt, \ - SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_v_opt, \ - SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - for (int i = 0; i < kHeight; ++i) \ - for (int j = 0; j < kWidth; ++j) \ - src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \ - for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ - for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ - src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ - (fastrand() & 0xff); \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \ + DOY) \ + TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kHeight = benchmark_height_; \ + align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ + align_buffer_page_end(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \ + OFF); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight); \ + align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kWidth; ++j) \ + src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ + for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ + src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ + (fastrand() & 0xff); \ + } \ } \ - } \ - memset(dst_y_c, 1, kWidth * kHeight); \ - memset(dst_u_c, 2, SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_v_c, 3, SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_y_opt, 101, kWidth * kHeight); \ - memset(dst_u_opt, 102, SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_v_opt, 103, SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - MaskCpuFlags(disable_cpu_flags_); \ - SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ - src_uv + OFF, \ - 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ - dst_y_c, kWidth, \ - dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \ - dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \ - kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ - src_uv + OFF, \ - 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ - dst_y_opt, kWidth, \ - dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \ - dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \ - kWidth, NEG kHeight); \ - } \ - int max_diff = 0; \ - for (int i = 0; i < kHeight; ++i) { \ - for (int j = 0; j < kWidth; ++j) { \ - int abs_diff = \ - abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ - static_cast<int>(dst_y_opt[i * kWidth + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ + memset(dst_y_c, 1, kWidth* kHeight); \ + memset(dst_u_c, 2, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_v_c, 3, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_y_opt, 101, kWidth* kHeight); \ + memset(dst_u_opt, 102, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_v_opt, 103, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + MaskCpuFlags(disable_cpu_flags_); \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y + OFF, kWidth, src_uv + OFF, \ + 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_c : NULL, kWidth, \ + dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \ + SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y + OFF, kWidth, src_uv + OFF, \ + 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? 
dst_y_opt : NULL, \ + kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_opt, \ + SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + if (DOY) { \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ + static_cast<int>(dst_y_opt[i * kWidth + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ } \ + EXPECT_LE(max_diff, 1); \ } \ - } \ - EXPECT_LE(max_diff, 1); \ - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ - for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ - int abs_diff = \ - abs(static_cast<int>(dst_u_c[i * \ - SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ - static_cast<int>(dst_u_opt[i * \ - SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ + int abs_diff = abs( \ + static_cast<int>(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ + static_cast<int>( \ + dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ } \ } \ - } \ - EXPECT_LE(max_diff, 1); \ - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ - for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ - int abs_diff = \ - abs(static_cast<int>(dst_v_c[i * \ - SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ - static_cast<int>(dst_v_opt[i * \ - SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ + EXPECT_LE(max_diff, 1); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ + int abs_diff = abs( \ + static_cast<int>(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ + static_cast<int>( \ + dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ } \ } \ - } \ - EXPECT_LE(max_diff, 1); \ - free_aligned_buffer_page_end(dst_y_c); \ - free_aligned_buffer_page_end(dst_u_c); \ - free_aligned_buffer_page_end(dst_v_c); \ - free_aligned_buffer_page_end(dst_y_opt); \ - free_aligned_buffer_page_end(dst_u_opt); \ - free_aligned_buffer_page_end(dst_v_opt); \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_uv); \ -} + EXPECT_LE(max_diff, 1); \ + free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_u_c); \ + free_aligned_buffer_page_end(dst_v_c); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_u_opt); \ + free_aligned_buffer_page_end(dst_v_opt); \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_uv); \ + } -#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ - TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_ - 4, _Any, +, 0) \ - TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Unaligned, +, 1) \ - TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Invert, -, 0) \ - TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Opt, +, 0) +#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, 
SUBSAMP_Y) \ + TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, 1) \ + TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, \ + 1) \ + TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1) \ + TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1) \ + TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0) TESTBIPLANARTOP(NV12, 2, 2, I420, 2, 2) TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2) -#define ALIGNINT(V, ALIGN) (((V) + (ALIGN) - 1) / (ALIGN) * (ALIGN)) +#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN)) #define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \ -TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ - const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ - const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ - const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ - const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ - align_buffer_page_end(src_y, kWidth * kHeight + OFF); \ - align_buffer_page_end(src_u, kSizeUV + OFF); \ - align_buffer_page_end(src_v, kSizeUV + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF); \ - align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF); \ - for (int i = 0; i < kWidth * kHeight; ++i) { \ - src_y[i + OFF] = (fastrand() & 0xff); \ - } \ - for (int i = 0; i < kSizeUV; ++i) { \ - src_u[i + OFF] = (fastrand() & 0xff); \ - src_v[i + OFF] = (fastrand() & 0xff); \ - } \ - memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \ - memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \ - src_u + OFF, kStrideUV, \ - src_v + OFF, kStrideUV, \ - dst_argb_c + OFF, kStrideB, \ - kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \ - src_u + OFF, kStrideUV, \ - src_v + OFF, kStrideUV, \ - dst_argb_opt + OFF, kStrideB, \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = ((W1280) > 0) ? 
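
// The suffix scheme above is used for every converter in this file:
// _Any runs at benchmark_width_ - 4 so the width is not a multiple of
// the SIMD step and the any-width fallback kicks in; _Unaligned passes
// OFF = 1 to misalign every buffer by one byte; _Invert negates the
// height to request a vertically flipped conversion; _Opt is the
// aligned fast path (and _NullY, above, skips the Y plane). ALIGNINT
// rounds V up to the next multiple of ALIGN, complementing the round-up
// division SUBSAMPLE defined earlier in this file. A quick compile-time
// check of the rounding, mirroring the macro verbatim under a
// different name:

#define ALIGNINT_SKETCH(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN))
static_assert(ALIGNINT_SKETCH(1277, 4) == 1280, "rounds up");
static_assert(ALIGNINT_SKETCH(1280, 4) == 1280, "already aligned");
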
(W1280) : 1; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ + align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ + align_buffer_page_end(src_u, kSizeUV + OFF); \ + align_buffer_page_end(src_v, kSizeUV + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \ + align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + src_y[i + OFF] = (fastrand() & 0xff); \ + } \ + for (int i = 0; i < kSizeUV; ++i) { \ + src_u[i + OFF] = (fastrand() & 0xff); \ + src_v[i + OFF] = (fastrand() & 0xff); \ + } \ + memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \ + memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \ + src_v + OFF, kStrideUV, dst_argb_c + OFF, kStrideB, \ kWidth, NEG kHeight); \ - } \ - int max_diff = 0; \ - /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \ - align_buffer_page_end(dst_argb32_c, kWidth * BPP_C * kHeight); \ - align_buffer_page_end(dst_argb32_opt, kWidth * BPP_C * kHeight); \ - memset(dst_argb32_c, 2, kWidth * BPP_C * kHeight); \ - memset(dst_argb32_opt, 102, kWidth * BPP_C * kHeight); \ - FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, \ - dst_argb32_c, kWidth * BPP_C , \ - kWidth, kHeight); \ - FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, \ - dst_argb32_opt, kWidth * BPP_C , \ - kWidth, kHeight); \ - for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \ - int abs_diff = \ - abs(static_cast<int>(dst_argb32_c[i]) - \ - static_cast<int>(dst_argb32_opt[i])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \ + src_v + OFF, kStrideUV, dst_argb_opt + OFF, \ + kStrideB, kWidth, NEG kHeight); \ } \ - } \ - EXPECT_LE(max_diff, DIFF); \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_u); \ - free_aligned_buffer_page_end(src_v); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ - free_aligned_buffer_page_end(dst_argb32_c); \ - free_aligned_buffer_page_end(dst_argb32_opt); \ -} + int max_diff = 0; \ + /* Convert to ARGB so 565 is expanded to bytes that can be compared. 
*/ \ + align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight); \ + align_buffer_page_end(dst_argb32_opt, kWidth* BPP_C* kHeight); \ + memset(dst_argb32_c, 2, kWidth* BPP_C* kHeight); \ + memset(dst_argb32_opt, 102, kWidth* BPP_C* kHeight); \ + FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, dst_argb32_c, kWidth * BPP_C, \ + kWidth, kHeight); \ + FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, dst_argb32_opt, \ + kWidth * BPP_C, kWidth, kHeight); \ + for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \ + int abs_diff = abs(static_cast<int>(dst_argb32_c[i]) - \ + static_cast<int>(dst_argb32_opt[i])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_u); \ + free_aligned_buffer_page_end(src_v); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + free_aligned_buffer_page_end(dst_argb32_c); \ + free_aligned_buffer_page_end(dst_argb32_opt); \ + } #define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, DIFF, FMT_C, BPP_C) \ - TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, BPP_C) \ - TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, BPP_C) \ - TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \ - TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C) + TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, BPP_C) \ + TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, \ + BPP_C) \ + TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \ + TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C) TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(J420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4) @@ -507,7 +583,6 @@ TESTPLANARTOB(H422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1, 2, ARGB, 4) -TESTPLANARTOB(I411, 4, 1, ARGB, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(J444, 1, 1, ARGB, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1, 2, ARGB, 4) @@ -519,247 +594,275 @@ TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1, 0, ARGB, 4) TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1, 0, ARGB, 4) #define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, W1280, DIFF, N, NEG, OFF, ATTEN) \ -TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ - const int kWidth = ((W1280) > 0) ? 
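
// The FMT_C/BPP_C tail of TESTPLANARTOBI exists because packed outputs
// such as RGB565 cannot be diffed byte-for-byte: both the C and the
// optimized result are first expanded to 32-bit ARGB and only the
// expanded pixels are compared. (Note the I411 instantiation is removed
// by this diff.) A self-contained sketch of the expand-then-compare
// step for RGB565; the helper name and heap buffers are illustrative:

#include <stdint.h>
#include <stdlib.h>               // abs
#include "libyuv/convert_argb.h"  // RGB565ToARGB

int MaxByteDiff565(const uint8_t* a565, const uint8_t* b565, int w, int h) {
  uint8_t* a32 = new uint8_t[w * h * 4];
  uint8_t* b32 = new uint8_t[w * h * 4];
  libyuv::RGB565ToARGB(a565, w * 2, a32, w * 4, w, h);  // expand C result
  libyuv::RGB565ToARGB(b565, w * 2, b32, w * 4, w, h);  // expand opt result
  int max_diff = 0;
  for (int i = 0; i < w * h * 4; ++i) {
    int d = abs(a32[i] - b32[i]);
    if (d > max_diff) max_diff = d;
  }
  delete[] a32;
  delete[] b32;
  return max_diff;
}
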
(W1280) : 1; \ - const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ - const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ - const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ - const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ - align_buffer_page_end(src_y, kWidth * kHeight + OFF); \ - align_buffer_page_end(src_u, kSizeUV + OFF); \ - align_buffer_page_end(src_v, kSizeUV + OFF); \ - align_buffer_page_end(src_a, kWidth * kHeight + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF); \ - align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF); \ - for (int i = 0; i < kWidth * kHeight; ++i) { \ - src_y[i + OFF] = (fastrand() & 0xff); \ - src_a[i + OFF] = (fastrand() & 0xff); \ - } \ - for (int i = 0; i < kSizeUV; ++i) { \ - src_u[i + OFF] = (fastrand() & 0xff); \ - src_v[i + OFF] = (fastrand() & 0xff); \ - } \ - memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \ - memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \ - src_u + OFF, kStrideUV, \ - src_v + OFF, kStrideUV, \ - src_a + OFF, kWidth, \ - dst_argb_c + OFF, kStrideB, \ - kWidth, NEG kHeight, ATTEN); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \ - src_u + OFF, kStrideUV, \ - src_v + OFF, kStrideUV, \ - src_a + OFF, kWidth, \ - dst_argb_opt + OFF, kStrideB, \ - kWidth, NEG kHeight, ATTEN); \ - } \ - int max_diff = 0; \ - for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ - int abs_diff = \ - abs(static_cast<int>(dst_argb_c[i + OFF]) - \ - static_cast<int>(dst_argb_opt[i + OFF])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ + YALIGN, W1280, DIFF, N, NEG, OFF, ATTEN) \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ + align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ + align_buffer_page_end(src_u, kSizeUV + OFF); \ + align_buffer_page_end(src_v, kSizeUV + OFF); \ + align_buffer_page_end(src_a, kWidth* kHeight + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \ + align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + src_y[i + OFF] = (fastrand() & 0xff); \ + src_a[i + OFF] = (fastrand() & 0xff); \ } \ - } \ - EXPECT_LE(max_diff, DIFF); \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_u); \ - free_aligned_buffer_page_end(src_v); \ - free_aligned_buffer_page_end(src_a); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ -} + for (int i = 0; i < kSizeUV; ++i) { \ + src_u[i + OFF] = (fastrand() & 0xff); \ + src_v[i + OFF] = (fastrand() & 0xff); \ + } \ + memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \ + memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \ + src_v + OFF, kStrideUV, src_a + OFF, kWidth, \ + dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight, \ + ATTEN); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \ + src_v + OFF, kStrideUV, src_a + OFF, kWidth, \ + dst_argb_opt + OFF, kStrideB, kWidth, NEG kHeight, \ + ATTEN); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ + int abs_diff = abs(static_cast<int>(dst_argb_c[i + OFF]) - \ + static_cast<int>(dst_argb_opt[i + OFF])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_u); \ + free_aligned_buffer_page_end(src_v); \ + free_aligned_buffer_page_end(src_a); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } -#define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, DIFF) \ - TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0) \ - TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 0) \ - TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0) \ - TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0) \ - TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Premult, +, 0, 1) +#define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, DIFF) \ + TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0) \ + TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 0) \ + TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, DIFF, _Invert, 
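
// TESTQPLANARTOBI covers the four-plane (Y, U, V plus alpha) converters.
// The trailing ATTEN argument is forwarded as the converter's attenuate
// flag, so the _Premult variant below (ATTEN = 1) checks that RGB is
// premultiplied by alpha during conversion. Call sketch with the public
// entry point; the helper name and stride choices are illustrative:

#include <stdint.h>
#include "libyuv/convert_argb.h"  // I420AlphaToARGB

void SketchAlphaToARGB(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                       const uint8_t* a, uint8_t* argb, int w, int h) {
  int half_w = (w + 1) / 2;
  // Final argument 1 = attenuate: premultiply R, G, B by A while converting.
  libyuv::I420AlphaToARGB(y, w, u, half_w, v, half_w, a, w,
                          argb, w * 4, w, h, 1);
}
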
-, 0, 0) \ + TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0) \ + TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, DIFF, _Premult, +, 0, 1) TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2) TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2) #define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ W1280, DIFF, N, NEG, OFF) \ -TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ - const int kHeight = benchmark_height_; \ - const int kStrideB = kWidth * BPP_B; \ - const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ - align_buffer_page_end(src_y, kWidth * kHeight + OFF); \ - align_buffer_page_end(src_uv, \ - kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideB * kHeight); \ - align_buffer_page_end(dst_argb_opt, kStrideB * kHeight); \ - for (int i = 0; i < kHeight; ++i) \ - for (int j = 0; j < kWidth; ++j) \ - src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \ - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ - for (int j = 0; j < kStrideUV * 2; ++j) { \ - src_uv[i * kStrideUV * 2 + j + OFF] = (fastrand() & 0xff); \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = benchmark_height_; \ + const int kStrideB = kWidth * BPP_B; \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ + align_buffer_page_end(src_uv, \ + kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeight); \ + align_buffer_page_end(dst_argb_opt, kStrideB* kHeight); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kWidth; ++j) \ + src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < kStrideUV * 2; ++j) { \ + src_uv[i * kStrideUV * 2 + j + OFF] = (fastrand() & 0xff); \ + } \ } \ - } \ - memset(dst_argb_c, 1, kStrideB * kHeight); \ - memset(dst_argb_opt, 101, kStrideB * kHeight); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \ - src_uv + OFF, kStrideUV * 2, \ - dst_argb_c, kWidth * BPP_B, \ - kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \ - src_uv + OFF, kStrideUV * 2, \ - dst_argb_opt, kWidth * BPP_B, \ - kWidth, NEG kHeight); \ - } \ - /* Convert to ARGB so 565 is expanded to bytes that can be compared. 
*/ \ - align_buffer_page_end(dst_argb32_c, kWidth * 4 * kHeight); \ - align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight); \ - memset(dst_argb32_c, 2, kWidth * 4 * kHeight); \ - memset(dst_argb32_opt, 102, kWidth * 4 * kHeight); \ - FMT_B##ToARGB(dst_argb_c, kStrideB, \ - dst_argb32_c, kWidth * 4, \ - kWidth, kHeight); \ - FMT_B##ToARGB(dst_argb_opt, kStrideB, \ - dst_argb32_opt, kWidth * 4, \ - kWidth, kHeight); \ - int max_diff = 0; \ - for (int i = 0; i < kHeight; ++i) { \ - for (int j = 0; j < kWidth * 4; ++j) { \ - int abs_diff = \ - abs(static_cast<int>(dst_argb32_c[i * kWidth * 4 + j]) - \ - static_cast<int>(dst_argb32_opt[i * kWidth * 4 + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ + memset(dst_argb_c, 1, kStrideB* kHeight); \ + memset(dst_argb_opt, 101, kStrideB* kHeight); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \ + dst_argb_c, kWidth * BPP_B, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \ + dst_argb_opt, kWidth * BPP_B, kWidth, \ + NEG kHeight); \ + } \ + /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \ + align_buffer_page_end(dst_argb32_c, kWidth * 4 * kHeight); \ + align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight); \ + memset(dst_argb32_c, 2, kWidth * 4 * kHeight); \ + memset(dst_argb32_opt, 102, kWidth * 4 * kHeight); \ + FMT_B##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth, \ + kHeight); \ + FMT_B##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \ + kHeight); \ + int max_diff = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth * 4; ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_argb32_c[i * kWidth * 4 + j]) - \ + static_cast<int>(dst_argb32_opt[i * kWidth * 4 + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ } \ } \ - } \ - EXPECT_LE(max_diff, DIFF); \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_uv); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ - free_aligned_buffer_page_end(dst_argb32_c); \ - free_aligned_buffer_page_end(dst_argb32_opt); \ -} + EXPECT_LE(max_diff, DIFF); \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_uv); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + free_aligned_buffer_page_end(dst_argb32_c); \ + free_aligned_buffer_page_end(dst_argb32_opt); \ + } -#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, DIFF) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ - benchmark_width_ - 4, DIFF, _Any, +, 0) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ - benchmark_width_, DIFF, _Unaligned, +, 1) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ - benchmark_width_, DIFF, _Invert, -, 0) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ - benchmark_width_, DIFF, _Opt, +, 0) +#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, DIFF) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + benchmark_width_ - 4, DIFF, _Any, +, 0) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + benchmark_width_, DIFF, _Unaligned, +, 1) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, 
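
// The biplanar-to-RGB tests reuse the same expand-to-ARGB comparison,
// and the NV12-to-RGB565 instantiation below runs with the widest
// tolerance (DIFF = 9) to absorb 565 rounding differences between the
// C and SIMD paths. Minimal call sketch; the helper name is
// illustrative and the UV stride assumes an even width:

#include <stdint.h>
#include "libyuv/convert_argb.h"  // NV12ToARGB

void SketchNV12ToARGB(const uint8_t* y, const uint8_t* uv,
                      uint8_t* argb, int w, int h) {
  // NV12 = full-size Y plane plus one half-height interleaved UV plane.
  libyuv::NV12ToARGB(y, w, uv, w, argb, w * 4, w, h);
}
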
BPP_B, \ + benchmark_width_, DIFF, _Invert, -, 0) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + benchmark_width_, DIFF, _Opt, +, 0) TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4, 2) TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4, 2) TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9) +#ifdef DO_THREE_PLANES +// Do 3 allocations for yuv. conventional but slower. +#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + W1280, DIFF, N, NEG, OFF) \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \ + align_buffer_page_end(src_argb, kStride* kHeight + OFF); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight); \ + align_buffer_page_end(dst_u_c, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_v_c, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_u_opt, \ + kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_v_opt, \ + kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_y_c, 1, kWidth* kHeight); \ + memset(dst_u_c, 2, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_v_c, 3, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_y_opt, 101, kWidth* kHeight); \ + memset(dst_u_opt, 102, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_v_opt, 103, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kStride; ++j) \ + src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_u_c, \ + kStrideUV, dst_v_c, kStrideUV, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \ + dst_u_opt, kStrideUV, dst_v_opt, kStrideUV, \ + kWidth, NEG kHeight); \ + } \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + EXPECT_NEAR(static_cast<int>(dst_y_c[i * kWidth + j]), \ + static_cast<int>(dst_y_opt[i * kWidth + j]), DIFF); \ + } \ + } \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < kStrideUV; ++j) { \ + EXPECT_NEAR(static_cast<int>(dst_u_c[i * kStrideUV + j]), \ + static_cast<int>(dst_u_opt[i * kStrideUV + j]), DIFF); \ + } \ + } \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < kStrideUV; ++j) { \ + EXPECT_NEAR(static_cast<int>(dst_v_c[i * kStrideUV + j]), \ + static_cast<int>(dst_v_opt[i * kStrideUV + j]), DIFF); \ + } \ + } \ + free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_u_c); \ + free_aligned_buffer_page_end(dst_v_c); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_u_opt); \ + free_aligned_buffer_page_end(dst_v_opt); \ + free_aligned_buffer_page_end(src_argb); \ + } +#else #define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ W1280, DIFF, N, NEG, OFF) \ -TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ - const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ - const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ - const int kStride = \ - (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \ - align_buffer_page_end(src_argb, kStride * kHeight + OFF); \ - align_buffer_page_end(dst_y_c, kWidth * kHeight); \ - align_buffer_page_end(dst_u_c, \ - kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_v_c, \ - kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_y_opt, kWidth * kHeight); \ - align_buffer_page_end(dst_u_opt, \ - kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_v_opt, \ - kStrideUV * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_y_c, 1, kWidth * kHeight); \ - memset(dst_u_c, 2, \ - kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_v_c, 3, \ - kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_y_opt, 101, kWidth * kHeight); \ - memset(dst_u_opt, 102, \ - kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_v_opt, 103, \ - kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - for (int i = 0; i < kHeight; ++i) \ - for (int j = 0; j < kStride; ++j) \ - src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \ - dst_y_c, kWidth, \ - dst_u_c, kStrideUV, \ - dst_v_c, kStrideUV, \ - kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \ - dst_y_opt, kWidth, \ - dst_u_opt, kStrideUV, \ - dst_v_opt, kStrideUV, \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \ + align_buffer_page_end(src_argb, kStride* kHeight + OFF); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_c, \ + kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_opt, \ + kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_y_c, 1, kWidth* kHeight); \ + memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_y_opt, 101, kWidth* kHeight); \ + memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kStride; ++j) \ + src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \ + kStrideUV * 2, dst_uv_c + kStrideUV, kStrideUV * 2, \ kWidth, NEG kHeight); \ - } \ - for (int i = 0; i < kHeight; ++i) { \ - for (int j = 0; j < kWidth; ++j) { \ - EXPECT_NEAR(static_cast<int>(dst_y_c[i * kWidth + j]), \ - static_cast<int>(dst_y_opt[i * kWidth + j]), DIFF); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \ + dst_uv_opt, kStrideUV * 2, dst_uv_opt + kStrideUV, \ + kStrideUV * 2, kWidth, NEG kHeight); \ } \ - } \ - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ - for (int j = 0; j < kStrideUV; ++j) { \ - EXPECT_NEAR(static_cast<int>(dst_u_c[i * kStrideUV + j]), \ - static_cast<int>(dst_u_opt[i * kStrideUV + j]), DIFF); \ + for (int i 
= 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + EXPECT_NEAR(static_cast<int>(dst_y_c[i * kWidth + j]), \ + static_cast<int>(dst_y_opt[i * kWidth + j]), DIFF); \ + } \ } \ - } \ - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ - for (int j = 0; j < kStrideUV; ++j) { \ - EXPECT_NEAR(static_cast<int>(dst_v_c[i * \ - kStrideUV + j]), \ - static_cast<int>(dst_v_opt[i * \ - kStrideUV + j]), DIFF); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; ++i) { \ + for (int j = 0; j < kStrideUV; ++j) { \ + EXPECT_NEAR(static_cast<int>(dst_uv_c[i * kStrideUV + j]), \ + static_cast<int>(dst_uv_opt[i * kStrideUV + j]), DIFF); \ + } \ } \ - } \ - free_aligned_buffer_page_end(dst_y_c); \ - free_aligned_buffer_page_end(dst_u_c); \ - free_aligned_buffer_page_end(dst_v_c); \ - free_aligned_buffer_page_end(dst_y_opt); \ - free_aligned_buffer_page_end(dst_u_opt); \ - free_aligned_buffer_page_end(dst_v_opt); \ - free_aligned_buffer_page_end(src_argb); \ -} + free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_uv_c); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_uv_opt); \ + free_aligned_buffer_page_end(src_argb); \ + } +#endif -#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - DIFF) \ - TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_ - 4, DIFF, _Any, +, 0) \ - TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, DIFF, _Unaligned, +, 1) \ - TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, DIFF, _Invert, -, 0) \ - TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, DIFF, _Opt, +, 0) +#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + DIFF) \ + TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_ - 4, DIFF, _Any, +, 0) \ + TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, DIFF, _Unaligned, +, 1) \ + TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, DIFF, _Invert, -, 0) \ + TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, DIFF, _Opt, +, 0) TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4) -#if defined(__arm__) || defined (__aarch64__) +#if defined(__arm__) || defined(__aarch64__) // arm version subsamples by summing 4 pixels then multiplying by matrix with // 4x smaller coefficients which are rounded to nearest integer. TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 4) @@ -777,7 +880,6 @@ TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5) // TODO(fbarchard): Make 1555 neon work same as C code, reduce to diff 9. TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2, 15) TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2, 17) -TESTATOPLANAR(ARGB, 4, 1, I411, 4, 1, 4) TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 2) TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 2) TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 2) @@ -787,183 +889,173 @@ TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1, 2) TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2) TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2) -#define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, \ - SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \ -TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \ - const int kWidth = ((W1280) > 0) ? 
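
// In the default (non-DO_THREE_PLANES) TESTATOPLANARI above, the U and
// V outputs share a single allocation: the converter receives dst_uv
// with stride kStrideUV * 2 for U, and dst_uv + kStrideUV with the same
// stride for V, so U and V rows interleave in one buffer and a single
// loop can compare them. A standalone sketch of the pointer trick; the
// helper name and even-dimension assumption are illustrative:

#include <stdint.h>
#include "libyuv/convert.h"  // ARGBToI420

void SketchSharedUV(const uint8_t* argb, uint8_t* dst_y, uint8_t* dst_uv,
                    int w, int h) {  // dst_uv holds h/2 U rows + h/2 V rows
  int half_w = w / 2;  // assumes even width and height
  libyuv::ARGBToI420(argb, w * 4, dst_y, w,
                     dst_uv, half_w * 2,           // U rows at even offsets
                     dst_uv + half_w, half_w * 2,  // V rows at odd offsets
                     w, h);
}
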
(W1280) : 1; \ - const int kHeight = benchmark_height_; \ - const int kStride = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \ - const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ - align_buffer_page_end(src_argb, kStride * kHeight + OFF); \ - align_buffer_page_end(dst_y_c, kWidth * kHeight); \ - align_buffer_page_end(dst_uv_c, \ - kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_y_opt, kWidth * kHeight); \ - align_buffer_page_end(dst_uv_opt, \ - kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - for (int i = 0; i < kHeight; ++i) \ - for (int j = 0; j < kStride; ++j) \ - src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \ - memset(dst_y_c, 1, kWidth * kHeight); \ - memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_y_opt, 101, kWidth * kHeight); \ - memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \ - dst_y_c, kWidth, dst_uv_c, kStrideUV * 2, \ - kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \ - dst_y_opt, kWidth, \ - dst_uv_opt, kStrideUV * 2, kWidth, NEG kHeight); \ - } \ - int max_diff = 0; \ - for (int i = 0; i < kHeight; ++i) { \ - for (int j = 0; j < kWidth; ++j) { \ - int abs_diff = \ - abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ - static_cast<int>(dst_y_opt[i * kWidth + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ - } \ - } \ - EXPECT_LE(max_diff, 4); \ - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ - for (int j = 0; j < kStrideUV * 2; ++j) { \ - int abs_diff = \ - abs(static_cast<int>(dst_uv_c[i * kStrideUV * 2 + j]) - \ - static_cast<int>(dst_uv_opt[i * kStrideUV * 2 + j])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ - } \ - } \ - EXPECT_LE(max_diff, 4); \ - free_aligned_buffer_page_end(dst_y_c); \ - free_aligned_buffer_page_end(dst_uv_c); \ - free_aligned_buffer_page_end(dst_y_opt); \ - free_aligned_buffer_page_end(dst_uv_opt); \ - free_aligned_buffer_page_end(src_argb); \ -} +#define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, \ + SUBSAMP_Y, W1280, N, NEG, OFF) \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kHeight = benchmark_height_; \ + const int kStride = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + align_buffer_page_end(src_argb, kStride* kHeight + OFF); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_c, \ + kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_opt, \ + kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kStride; ++j) \ + src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \ + memset(dst_y_c, 1, kWidth* kHeight); \ + memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_y_opt, 101, kWidth* kHeight); \ + memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \ + kStrideUV * 2, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \ + dst_uv_opt, kStrideUV * 2, kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ + static_cast<int>(dst_y_opt[i * kWidth + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 4); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < kStrideUV * 2; ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_uv_c[i * kStrideUV * 2 + j]) - \ + static_cast<int>(dst_uv_opt[i * kStrideUV * 2 + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 4); \ + free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_uv_c); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_uv_opt); \ + free_aligned_buffer_page_end(src_argb); \ + } #define TESTATOBIPLANAR(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ - TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_ - 4, _Any, +, 0) \ - TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Unaligned, +, 1) \ - TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Invert, -, 0) \ - TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Opt, +, 0) + TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_ - 4, _Any, +, 0) \ + TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 1) \ + TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Invert, -, 0) \ + TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0) TESTATOBIPLANAR(ARGB, 1, 4, NV12, 2, 2) TESTATOBIPLANAR(ARGB, 1, 4, NV21, 2, 2) TESTATOBIPLANAR(YUY2, 2, 4, NV12, 2, 2) TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2) -#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ - FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ - W1280, DIFF, N, NEG, OFF) \ -TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \ - const int kWidth = ((W1280) > 0) ? 
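
// TESTATOBIPLANARI drives RGB and packed-YUV sources into the biplanar
// (NV12/NV21) destinations; the interleaved UV plane is compared
// directly, with EXPECT_LE(max_diff, 4) leaving headroom for rounding
// differences in the 2x2 subsampling. Call sketch; the helper name is
// illustrative and the UV stride assumes an even width:

#include <stdint.h>
#include "libyuv/convert_from_argb.h"  // ARGBToNV12

void SketchToNV12(const uint8_t* argb, uint8_t* y, uint8_t* uv,
                  int w, int h) {
  libyuv::ARGBToNV12(argb, w * 4, y, w, uv, w, w, h);
}
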
(W1280) : 1; \ - const int kHeight = benchmark_height_; \ - const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ - const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ - const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ - const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ - align_buffer_page_end(src_argb, kStrideA * kHeightA + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \ - align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \ - for (int i = 0; i < kStrideA * kHeightA; ++i) { \ - src_argb[i + OFF] = (fastrand() & 0xff); \ - } \ - memset(dst_argb_c, 1, kStrideB * kHeightB); \ - memset(dst_argb_opt, 101, kStrideB * kHeightB); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_A##To##FMT_B(src_argb + OFF, kStrideA, \ - dst_argb_c, kStrideB, \ - kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_A##To##FMT_B(src_argb + OFF, kStrideA, \ - dst_argb_opt, kStrideB, \ - kWidth, NEG kHeight); \ - } \ - int max_diff = 0; \ - for (int i = 0; i < kStrideB * kHeightB; ++i) { \ - int abs_diff = \ - abs(static_cast<int>(dst_argb_c[i]) - \ - static_cast<int>(dst_argb_opt[i])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ - } \ - EXPECT_LE(max_diff, DIFF); \ - free_aligned_buffer_page_end(src_argb); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ -} +#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ + HEIGHT_B, W1280, DIFF, N, NEG, OFF) \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = benchmark_height_; \ + const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ + const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ + const int kStrideA = \ + (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + const int kStrideB = \ + (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ + align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \ + align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \ + for (int i = 0; i < kStrideA * kHeightA; ++i) { \ + src_argb[i + OFF] = (fastrand() & 0xff); \ + } \ + memset(dst_argb_c, 1, kStrideB* kHeightB); \ + memset(dst_argb_opt, 101, kStrideB* kHeightB); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_A##To##FMT_B(src_argb + OFF, kStrideA, dst_argb_c, kStrideB, kWidth, \ + NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_B(src_argb + OFF, kStrideA, dst_argb_opt, kStrideB, \ + kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kStrideB * kHeightB; ++i) { \ + int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \ + static_cast<int>(dst_argb_opt[i])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + free_aligned_buffer_page_end(src_argb); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } -#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ - FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \ -TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) { \ - for (int times = 0; times < benchmark_iterations_; ++times) { \ - const int kWidth = (fastrand() & 63) + 1; \ - const int kHeight = (fastrand() & 31) + 1; \ - const int kHeightA = 
(kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ - const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ - const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;\ - const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;\ - align_buffer_page_end(src_argb, kStrideA * kHeightA); \ - align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \ - align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \ - for (int i = 0; i < kStrideA * kHeightA; ++i) { \ - src_argb[i] = (fastrand() & 0xff); \ - } \ - memset(dst_argb_c, 123, kStrideB * kHeightB); \ - memset(dst_argb_opt, 123, kStrideB * kHeightB); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_A##To##FMT_B(src_argb, kStrideA, \ - dst_argb_c, kStrideB, \ - kWidth, kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - FMT_A##To##FMT_B(src_argb, kStrideA, \ - dst_argb_opt, kStrideB, \ - kWidth, kHeight); \ - int max_diff = 0; \ - for (int i = 0; i < kStrideB * kHeightB; ++i) { \ - int abs_diff = \ - abs(static_cast<int>(dst_argb_c[i]) - \ - static_cast<int>(dst_argb_opt[i])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ - } \ - EXPECT_LE(max_diff, DIFF); \ - free_aligned_buffer_page_end(src_argb); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ - } \ -} +#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, \ + STRIDE_B, HEIGHT_B, DIFF) \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) { \ + for (int times = 0; times < benchmark_iterations_; ++times) { \ + const int kWidth = (fastrand() & 63) + 1; \ + const int kHeight = (fastrand() & 31) + 1; \ + const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ + const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ + const int kStrideA = \ + (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + const int kStrideB = \ + (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ + align_buffer_page_end(src_argb, kStrideA* kHeightA); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \ + align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \ + for (int i = 0; i < kStrideA * kHeightA; ++i) { \ + src_argb[i] = (fastrand() & 0xff); \ + } \ + memset(dst_argb_c, 123, kStrideB* kHeightB); \ + memset(dst_argb_opt, 123, kStrideB* kHeightB); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_c, kStrideB, kWidth, \ + kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_opt, kStrideB, kWidth, \ + kHeight); \ + int max_diff = 0; \ + for (int i = 0; i < kStrideB * kHeightB; ++i) { \ + int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \ + static_cast<int>(dst_argb_opt[i])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + free_aligned_buffer_page_end(src_argb); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } \ + } -#define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ - FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \ - TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ - FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ - benchmark_width_ - 4, DIFF, _Any, +, 0) \ - TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ - FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ - benchmark_width_, DIFF, _Unaligned, +, 1) \ - TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ - FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ - benchmark_width_, DIFF, _Invert, -, 0) \ - TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ - 
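
// TESTATOBRANDOM complements the fixed-size variants by fuzzing the
// geometry: each iteration picks a fresh width in [1, 64] and height in
// [1, 32], so tiny images, odd sizes and stride padding are all
// exercised against the C reference. The dimension picks come straight
// from the macro:
//   const int kWidth = (fastrand() & 63) + 1;   // 1..64
//   const int kHeight = (fastrand() & 31) + 1;  // 1..32
// fastrand() is the test suite's own lightweight PRNG, declared in the
// unit-test header and used here instead of rand().
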
FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ - benchmark_width_, DIFF, _Opt, +, 0) \ - TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ - FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) +#define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ + HEIGHT_B, DIFF) \ + TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ + HEIGHT_B, benchmark_width_ - 4, DIFF, _Any, +, 0) \ + TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ + HEIGHT_B, benchmark_width_, DIFF, _Unaligned, +, 1) \ + TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ + HEIGHT_B, benchmark_width_, DIFF, _Invert, -, 0) \ + TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ + HEIGHT_B, benchmark_width_, DIFF, _Opt, +, 0) \ + TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ + HEIGHT_B, DIFF) TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0) TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0) @@ -989,6 +1081,7 @@ TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1, 0) TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1, 0) TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1, 4) TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1, 4) +TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1, 0) TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1, 0) TESTATOB(J400, 1, 1, 1, ARGB, 4, 4, 1, 0) TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1, 0) @@ -996,159 +1089,146 @@ TESTATOB(J400, 1, 1, 1, J400, 1, 1, 1, 0) TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0) TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0) -#define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ - FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ - W1280, DIFF, N, NEG, OFF) \ -TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither##N) { \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ - const int kHeight = benchmark_height_; \ - const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ - const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ - const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ - const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ - align_buffer_page_end(src_argb, kStrideA * kHeightA + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \ - align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \ - for (int i = 0; i < kStrideA * kHeightA; ++i) { \ - src_argb[i + OFF] = (fastrand() & 0xff); \ - } \ - memset(dst_argb_c, 1, kStrideB * kHeightB); \ - memset(dst_argb_opt, 101, kStrideB * kHeightB); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, \ - dst_argb_c, kStrideB, \ - NULL, kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, \ - dst_argb_opt, kStrideB, \ - NULL, kWidth, NEG kHeight); \ - } \ - int max_diff = 0; \ - for (int i = 0; i < kStrideB * kHeightB; ++i) { \ - int abs_diff = \ - abs(static_cast<int>(dst_argb_c[i]) - \ - static_cast<int>(dst_argb_opt[i])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ +#define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ + HEIGHT_B, W1280, DIFF, N, NEG, OFF) \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kHeight = benchmark_height_; \ + const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ + const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ + const int kStrideA = \ + (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + const int kStrideB = \ + (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ + align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \ + align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \ + for (int i = 0; i < kStrideA * kHeightA; ++i) { \ + src_argb[i + OFF] = (fastrand() & 0xff); \ + } \ + memset(dst_argb_c, 1, kStrideB* kHeightB); \ + memset(dst_argb_opt, 101, kStrideB* kHeightB); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_c, kStrideB, \ + NULL, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_opt, \ + kStrideB, NULL, kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kStrideB * kHeightB; ++i) { \ + int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \ + static_cast<int>(dst_argb_opt[i])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + free_aligned_buffer_page_end(src_argb); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } + +#define TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, \ + STRIDE_B, HEIGHT_B, DIFF) \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither_Random) { \ + for (int times = 0; times < benchmark_iterations_; ++times) { \ + const int kWidth = (fastrand() & 63) + 1; \ + const int kHeight = (fastrand() & 31) + 1; \ + const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ + const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ + const int kStrideA = \ + (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + const int kStrideB = \ + (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ + align_buffer_page_end(src_argb, kStrideA* kHeightA); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \ + align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \ + for (int i = 0; i < kStrideA * kHeightA; ++i) { \ + src_argb[i] = (fastrand() & 0xff); \ + } \ + memset(dst_argb_c, 123, kStrideB* kHeightB); \ + memset(dst_argb_opt, 123, kStrideB* kHeightB); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_c, kStrideB, NULL, \ + kWidth, kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_opt, kStrideB, \ + NULL, kWidth, kHeight); \ + int max_diff = 0; \ + for (int i = 0; i < kStrideB * kHeightB; ++i) { \ + int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \ + static_cast<int>(dst_argb_opt[i])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + free_aligned_buffer_page_end(src_argb); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ } \ - } \ - EXPECT_LE(max_diff, DIFF); \ - free_aligned_buffer_page_end(src_argb); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ -} + } + +#define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ + HEIGHT_B, DIFF) \ + TESTATOBDI(FMT_A, BPP_A, STRIDE_A, 
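
// The Dither tests call the FMT_A##To##FMT_B##Dither entry points,
// which take a dither-table pointer ahead of width/height; the macros
// pass NULL for that table, which this sketch assumes selects the
// undithered path (an assumption read off the tests, not documented
// behavior). Illustrative call:

#include <stddef.h>
#include <stdint.h>
#include "libyuv/convert_from_argb.h"  // ARGBToRGB565Dither

void SketchDither(const uint8_t* argb, uint8_t* rgb565, int w, int h) {
  // A 4x4 ordered-dither table would normally go where NULL is passed.
  libyuv::ARGBToRGB565Dither(argb, w * 4, rgb565, w * 2,
                             /*dither4x4=*/NULL, w, h);
}
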
HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ + HEIGHT_B, benchmark_width_ - 4, DIFF, _Any, +, 0) \ + TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ + HEIGHT_B, benchmark_width_, DIFF, _Unaligned, +, 1) \ + TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ + HEIGHT_B, benchmark_width_, DIFF, _Invert, -, 0) \ + TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ + HEIGHT_B, benchmark_width_, DIFF, _Opt, +, 0) \ + TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ + HEIGHT_B, DIFF) + +TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0) -#define TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ - FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \ -TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither_Random) { \ - for (int times = 0; times < benchmark_iterations_; ++times) { \ - const int kWidth = (fastrand() & 63) + 1; \ - const int kHeight = (fastrand() & 31) + 1; \ +#define TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, W1280, N, NEG, OFF) \ + TEST_F(LibYUVConvertTest, FMT_ATOB##_Symetric##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = benchmark_height_; \ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ - const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ - const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;\ - const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;\ - align_buffer_page_end(src_argb, kStrideA * kHeightA); \ - align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \ - align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \ + const int kStrideA = \ + (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideA* kHeightA); \ + align_buffer_page_end(dst_argb_opt, kStrideA* kHeightA); \ for (int i = 0; i < kStrideA * kHeightA; ++i) { \ - src_argb[i] = (fastrand() & 0xff); \ + src_argb[i + OFF] = (fastrand() & 0xff); \ } \ - memset(dst_argb_c, 123, kStrideB * kHeightB); \ - memset(dst_argb_opt, 123, kStrideB * kHeightB); \ + memset(dst_argb_c, 1, kStrideA* kHeightA); \ + memset(dst_argb_opt, 101, kStrideA* kHeightA); \ MaskCpuFlags(disable_cpu_flags_); \ - FMT_A##To##FMT_B##Dither(src_argb, kStrideA, \ - dst_argb_c, kStrideB, \ - NULL, kWidth, kHeight); \ + FMT_ATOB(src_argb + OFF, kStrideA, dst_argb_c, kStrideA, kWidth, \ + NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ - FMT_A##To##FMT_B##Dither(src_argb, kStrideA, \ - dst_argb_opt, kStrideB, \ - NULL, kWidth, kHeight); \ - int max_diff = 0; \ - for (int i = 0; i < kStrideB * kHeightB; ++i) { \ - int abs_diff = \ - abs(static_cast<int>(dst_argb_c[i]) - \ - static_cast<int>(dst_argb_opt[i])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_ATOB(src_argb + OFF, kStrideA, dst_argb_opt, kStrideA, kWidth, \ + NEG kHeight); \ + } \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_ATOB(dst_argb_c, kStrideA, dst_argb_c, kStrideA, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + FMT_ATOB(dst_argb_opt, kStrideA, dst_argb_opt, kStrideA, kWidth, \ + NEG kHeight); \ + for (int i = 0; i < kStrideA * kHeightA; ++i) { \ + EXPECT_EQ(src_argb[i + OFF], dst_argb_opt[i]); \ + EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ } \ - EXPECT_LE(max_diff, DIFF); \ free_aligned_buffer_page_end(src_argb); \ free_aligned_buffer_page_end(dst_argb_c); \ 
free_aligned_buffer_page_end(dst_argb_opt); \ - } \ -} - -#define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ - FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \ - TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ - FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ - benchmark_width_ - 4, DIFF, _Any, +, 0) \ - TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ - FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ - benchmark_width_, DIFF, _Unaligned, +, 1) \ - TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ - FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ - benchmark_width_, DIFF, _Invert, -, 0) \ - TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ - FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ - benchmark_width_, DIFF, _Opt, +, 0) \ - TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ - FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) - -TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0) - -#define TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, \ - W1280, N, NEG, OFF) \ -TEST_F(LibYUVConvertTest, FMT_ATOB##_Symetric##N) { \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ - const int kHeight = benchmark_height_; \ - const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ - const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ - align_buffer_page_end(src_argb, kStrideA * kHeightA + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideA * kHeightA); \ - align_buffer_page_end(dst_argb_opt, kStrideA * kHeightA); \ - for (int i = 0; i < kStrideA * kHeightA; ++i) { \ - src_argb[i + OFF] = (fastrand() & 0xff); \ - } \ - memset(dst_argb_c, 1, kStrideA * kHeightA); \ - memset(dst_argb_opt, 101, kStrideA * kHeightA); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_ATOB(src_argb + OFF, kStrideA, \ - dst_argb_c, kStrideA, \ - kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_ATOB(src_argb + OFF, kStrideA, \ - dst_argb_opt, kStrideA, \ - kWidth, NEG kHeight); \ - } \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_ATOB(dst_argb_c, kStrideA, \ - dst_argb_c, kStrideA, \ - kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - FMT_ATOB(dst_argb_opt, kStrideA, \ - dst_argb_opt, kStrideA, \ - kWidth, NEG kHeight); \ - for (int i = 0; i < kStrideA * kHeightA; ++i) { \ - EXPECT_EQ(src_argb[i + OFF], dst_argb_opt[i]); \ - EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ - } \ - free_aligned_buffer_page_end(src_argb); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ -} + } #define TESTSYM(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A) \ - TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, \ - benchmark_width_ - 4, _Any, +, 0) \ - TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, \ - benchmark_width_, _Unaligned, +, 1) \ - TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, \ - benchmark_width_, _Opt, +, 0) + TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_ - 4, _Any, +, \ + 0) \ + TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, _Unaligned, \ + +, 1) \ + TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, _Opt, +, 0) TESTSYM(ARGBToARGB, 4, 4, 1) TESTSYM(ARGBToBGRA, 4, 4, 1) @@ -1174,8 +1254,9 @@ TEST_F(LibYUVConvertTest, Test565) { TEST_F(LibYUVConvertTest, ValidateJpeg) { const int kOff = 10; const int kMinJpeg = 64; - const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ? - benchmark_width_ * benchmark_height_ : kMinJpeg; + const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg + ? 
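
// The _Symetric tests (spelled that way in the source) verify
// conversions that are their own inverse: convert once, convert the
// result again in place, and require the round trip to reproduce the
// input exactly. ARGBToBGRA qualifies because it simply reverses the
// four bytes of each pixel, which is an involution. A sketch of the
// round-trip check; the helper name is illustrative and the in-place
// second call mirrors what the test macro does:

#include <stdint.h>
#include "libyuv/convert_from_argb.h"  // ARGBToBGRA

bool RoundTripsExactly(const uint8_t* src, uint8_t* tmp, int w, int h) {
  libyuv::ARGBToBGRA(src, w * 4, tmp, w * 4, w, h);  // ARGB -> BGRA
  libyuv::ARGBToBGRA(tmp, w * 4, tmp, w * 4, w, h);  // BGRA -> ARGB, in place
  for (int i = 0; i < w * h * 4; ++i) {
    if (tmp[i] != src[i]) return false;
  }
  return true;
}
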
benchmark_width_ * benchmark_height_ + : kMinJpeg; const int kSize = kImageSize + kOff; align_buffer_page_end(orig_pixels, kSize); @@ -1201,8 +1282,9 @@ TEST_F(LibYUVConvertTest, ValidateJpeg) { TEST_F(LibYUVConvertTest, ValidateJpegLarge) { const int kOff = 10; const int kMinJpeg = 64; - const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ? - benchmark_width_ * benchmark_height_ : kMinJpeg; + const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg + ? benchmark_width_ * benchmark_height_ + : kMinJpeg; const int kSize = kImageSize + kOff; const int kMultiple = 10; const int kBufSize = kImageSize * kMultiple + kOff; @@ -1226,8 +1308,9 @@ TEST_F(LibYUVConvertTest, ValidateJpegLarge) { TEST_F(LibYUVConvertTest, InvalidateJpeg) { const int kOff = 10; const int kMinJpeg = 64; - const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ? - benchmark_width_ * benchmark_height_ : kMinJpeg; + const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg + ? benchmark_width_ * benchmark_height_ + : kMinJpeg; const int kSize = kImageSize + kOff; align_buffer_page_end(orig_pixels, kSize); @@ -1280,17 +1363,16 @@ TEST_F(LibYUVConvertTest, FuzzJpeg) { TEST_F(LibYUVConvertTest, MJPGToI420) { const int kOff = 10; const int kMinJpeg = 64; - const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ? - benchmark_width_ * benchmark_height_ : kMinJpeg; + const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg + ? benchmark_width_ * benchmark_height_ + : kMinJpeg; const int kSize = kImageSize + kOff; align_buffer_page_end(orig_pixels, kSize); align_buffer_page_end(dst_y_opt, benchmark_width_ * benchmark_height_); - align_buffer_page_end(dst_u_opt, - SUBSAMPLE(benchmark_width_, 2) * - SUBSAMPLE(benchmark_height_, 2)); - align_buffer_page_end(dst_v_opt, - SUBSAMPLE(benchmark_width_, 2) * - SUBSAMPLE(benchmark_height_, 2)); + align_buffer_page_end(dst_u_opt, SUBSAMPLE(benchmark_width_, 2) * + SUBSAMPLE(benchmark_height_, 2)); + align_buffer_page_end(dst_v_opt, SUBSAMPLE(benchmark_width_, 2) * + SUBSAMPLE(benchmark_height_, 2)); // EOI, SOI to make MJPG appear valid. memset(orig_pixels, 0, kSize); @@ -1300,12 +1382,11 @@ TEST_F(LibYUVConvertTest, MJPGToI420) { orig_pixels[kSize - kOff + 1] = 0xd9; // EOI. for (int times = 0; times < benchmark_iterations_; ++times) { - int ret = MJPGToI420(orig_pixels, kSize, - dst_y_opt, benchmark_width_, - dst_u_opt, SUBSAMPLE(benchmark_width_, 2), - dst_v_opt, SUBSAMPLE(benchmark_width_, 2), - benchmark_width_, benchmark_height_, - benchmark_width_, benchmark_height_); + int ret = + MJPGToI420(orig_pixels, kSize, dst_y_opt, benchmark_width_, dst_u_opt, + SUBSAMPLE(benchmark_width_, 2), dst_v_opt, + SUBSAMPLE(benchmark_width_, 2), benchmark_width_, + benchmark_height_, benchmark_width_, benchmark_height_); // Expect failure because image is not really valid. EXPECT_EQ(1, ret); } @@ -1319,8 +1400,9 @@ TEST_F(LibYUVConvertTest, MJPGToI420) { TEST_F(LibYUVConvertTest, MJPGToARGB) { const int kOff = 10; const int kMinJpeg = 64; - const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ? - benchmark_width_ * benchmark_height_ : kMinJpeg; + const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg + ? 
benchmark_width_ * benchmark_height_ + : kMinJpeg; const int kSize = kImageSize + kOff; align_buffer_page_end(orig_pixels, kSize); align_buffer_page_end(dst_argb_opt, benchmark_width_ * benchmark_height_ * 4); @@ -1333,10 +1415,9 @@ TEST_F(LibYUVConvertTest, MJPGToARGB) { orig_pixels[kSize - kOff + 1] = 0xd9; // EOI. for (int times = 0; times < benchmark_iterations_; ++times) { - int ret = MJPGToARGB(orig_pixels, kSize, - dst_argb_opt, benchmark_width_ * 4, - benchmark_width_, benchmark_height_, - benchmark_width_, benchmark_height_); + int ret = MJPGToARGB(orig_pixels, kSize, dst_argb_opt, benchmark_width_ * 4, + benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_); // Expect failure because image is not really valid. EXPECT_EQ(1, ret); } @@ -1353,66 +1434,53 @@ TEST_F(LibYUVConvertTest, NV12Crop) { const int kWidth = benchmark_width_; const int kHeight = benchmark_height_; const int crop_y = - ((benchmark_height_ - (benchmark_height_ * 360 / 480)) / 2 + 1) & ~1; + ((benchmark_height_ - (benchmark_height_ * 360 / 480)) / 2 + 1) & ~1; const int kDestWidth = benchmark_width_; const int kDestHeight = benchmark_height_ - crop_y * 2; const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); - const int sample_size = kWidth * kHeight + - kStrideUV * - SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; + const int sample_size = + kWidth * kHeight + kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; align_buffer_page_end(src_y, sample_size); uint8* src_uv = src_y + kWidth * kHeight; align_buffer_page_end(dst_y, kDestWidth * kDestHeight); - align_buffer_page_end(dst_u, - SUBSAMPLE(kDestWidth, SUBSAMP_X) * - SUBSAMPLE(kDestHeight, SUBSAMP_Y)); - align_buffer_page_end(dst_v, - SUBSAMPLE(kDestWidth, SUBSAMP_X) * - SUBSAMPLE(kDestHeight, SUBSAMP_Y)); + align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) * + SUBSAMPLE(kDestHeight, SUBSAMP_Y)); + align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) * + SUBSAMPLE(kDestHeight, SUBSAMP_Y)); align_buffer_page_end(dst_y_2, kDestWidth * kDestHeight); - align_buffer_page_end(dst_u_2, - SUBSAMPLE(kDestWidth, SUBSAMP_X) * - SUBSAMPLE(kDestHeight, SUBSAMP_Y)); - align_buffer_page_end(dst_v_2, - SUBSAMPLE(kDestWidth, SUBSAMP_X) * - SUBSAMPLE(kDestHeight, SUBSAMP_Y)); + align_buffer_page_end(dst_u_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) * + SUBSAMPLE(kDestHeight, SUBSAMP_Y)); + align_buffer_page_end(dst_v_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) * + SUBSAMPLE(kDestHeight, SUBSAMP_Y)); for (int i = 0; i < kHeight * kWidth; ++i) { src_y[i] = (fastrand() & 0xff); } - for (int i = 0; i < (SUBSAMPLE(kHeight, SUBSAMP_Y) * - kStrideUV) * 2; ++i) { + for (int i = 0; i < (SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideUV) * 2; ++i) { src_uv[i] = (fastrand() & 0xff); } memset(dst_y, 1, kDestWidth * kDestHeight); - memset(dst_u, 2, SUBSAMPLE(kDestWidth, SUBSAMP_X) * - SUBSAMPLE(kDestHeight, SUBSAMP_Y)); - memset(dst_v, 3, SUBSAMPLE(kDestWidth, SUBSAMP_X) * - SUBSAMPLE(kDestHeight, SUBSAMP_Y)); + memset(dst_u, 2, + SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); + memset(dst_v, 3, + SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); memset(dst_y_2, 1, kDestWidth * kDestHeight); - memset(dst_u_2, 2, SUBSAMPLE(kDestWidth, SUBSAMP_X) * - SUBSAMPLE(kDestHeight, SUBSAMP_Y)); - memset(dst_v_2, 3, SUBSAMPLE(kDestWidth, SUBSAMP_X) * - SUBSAMPLE(kDestHeight, SUBSAMP_Y)); - - ConvertToI420(src_y, sample_size, - dst_y_2, kDestWidth, - dst_u_2, SUBSAMPLE(kDestWidth, SUBSAMP_X), - dst_v_2, SUBSAMPLE(kDestWidth, SUBSAMP_X), - 0, crop_y, - kWidth, 
kHeight, - kDestWidth, kDestHeight, - libyuv::kRotate0, libyuv::FOURCC_NV12); + memset(dst_u_2, 2, + SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); + memset(dst_v_2, 3, + SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); + + ConvertToI420(src_y, sample_size, dst_y_2, kDestWidth, dst_u_2, + SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v_2, + SUBSAMPLE(kDestWidth, SUBSAMP_X), 0, crop_y, kWidth, kHeight, + kDestWidth, kDestHeight, libyuv::kRotate0, libyuv::FOURCC_NV12); NV12ToI420(src_y + crop_y * kWidth, kWidth, - src_uv + (crop_y / 2) * kStrideUV * 2, - kStrideUV * 2, - dst_y, kDestWidth, - dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X), - dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X), - kDestWidth, kDestHeight); + src_uv + (crop_y / 2) * kStrideUV * 2, kStrideUV * 2, dst_y, + kDestWidth, dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v, + SUBSAMPLE(kDestWidth, SUBSAMP_X), kDestWidth, kDestHeight); for (int i = 0; i < kDestHeight; ++i) { for (int j = 0; j < kDestWidth; ++j) { @@ -1452,10 +1520,7 @@ TEST_F(LibYUVConvertTest, TestYToARGB) { for (int i = 0; i < 32; ++i) { printf("%2d %d: %d <-> %d,%d,%d,%d\n", i, y[i], expectedg[i], - argb[i * 4 + 0], - argb[i * 4 + 1], - argb[i * 4 + 2], - argb[i * 4 + 3]); + argb[i * 4 + 0], argb[i * 4 + 1], argb[i * 4 + 2], argb[i * 4 + 3]); } for (int i = 0; i < 32; ++i) { EXPECT_EQ(expectedg[i], argb[i * 4 + 0]); @@ -1463,10 +1528,7 @@ TEST_F(LibYUVConvertTest, TestYToARGB) { } static const uint8 kNoDither4x4[16] = { - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; TEST_F(LibYUVConvertTest, TestNoDither) { @@ -1477,12 +1539,11 @@ TEST_F(LibYUVConvertTest, TestNoDither) { MemRandomize(src_argb, benchmark_width_ * benchmark_height_ * 4); MemRandomize(dst_rgb565, benchmark_width_ * benchmark_height_ * 2); MemRandomize(dst_rgb565dither, benchmark_width_ * benchmark_height_ * 2); - ARGBToRGB565(src_argb, benchmark_width_ * 4, - dst_rgb565, benchmark_width_ * 2, + ARGBToRGB565(src_argb, benchmark_width_ * 4, dst_rgb565, benchmark_width_ * 2, benchmark_width_, benchmark_height_); - ARGBToRGB565Dither(src_argb, benchmark_width_ * 4, - dst_rgb565dither, benchmark_width_ * 2, - kNoDither4x4, benchmark_width_, benchmark_height_); + ARGBToRGB565Dither(src_argb, benchmark_width_ * 4, dst_rgb565dither, + benchmark_width_ * 2, kNoDither4x4, benchmark_width_, + benchmark_height_); for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) { EXPECT_EQ(dst_rgb565[i], dst_rgb565dither[i]); } @@ -1494,10 +1555,7 @@ TEST_F(LibYUVConvertTest, TestNoDither) { // Ordered 4x4 dither for 888 to 565. Values from 0 to 7. 
static const uint8 kDither565_4x4[16] = { - 0, 4, 1, 5, - 6, 2, 7, 3, - 1, 5, 0, 4, - 7, 3, 6, 2, + 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, }; TEST_F(LibYUVConvertTest, TestDither) { @@ -1513,18 +1571,15 @@ TEST_F(LibYUVConvertTest, TestDither) { MemRandomize(dst_rgb565dither, benchmark_width_ * benchmark_height_ * 2); MemRandomize(dst_argb, benchmark_width_ * benchmark_height_ * 4); MemRandomize(dst_argbdither, benchmark_width_ * benchmark_height_ * 4); - ARGBToRGB565(src_argb, benchmark_width_ * 4, - dst_rgb565, benchmark_width_ * 2, + ARGBToRGB565(src_argb, benchmark_width_ * 4, dst_rgb565, benchmark_width_ * 2, benchmark_width_, benchmark_height_); - ARGBToRGB565Dither(src_argb, benchmark_width_ * 4, - dst_rgb565dither, benchmark_width_ * 2, - kDither565_4x4, benchmark_width_, benchmark_height_); - RGB565ToARGB(dst_rgb565, benchmark_width_ * 2, - dst_argb, benchmark_width_ * 4, - benchmark_width_, benchmark_height_); - RGB565ToARGB(dst_rgb565dither, benchmark_width_ * 2, - dst_argbdither, benchmark_width_ * 4, + ARGBToRGB565Dither(src_argb, benchmark_width_ * 4, dst_rgb565dither, + benchmark_width_ * 2, kDither565_4x4, benchmark_width_, + benchmark_height_); + RGB565ToARGB(dst_rgb565, benchmark_width_ * 2, dst_argb, benchmark_width_ * 4, benchmark_width_, benchmark_height_); + RGB565ToARGB(dst_rgb565dither, benchmark_width_ * 2, dst_argbdither, + benchmark_width_ * 4, benchmark_width_, benchmark_height_); for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) { EXPECT_NEAR(dst_argb[i], dst_argbdither[i], 9); @@ -1537,218 +1592,197 @@ TEST_F(LibYUVConvertTest, TestDither) { } #define TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \ -TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##Dither##N) { \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ - const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ - const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ - const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ - const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ - align_buffer_page_end(src_y, kWidth * kHeight + OFF); \ - align_buffer_page_end(src_u, kSizeUV + OFF); \ - align_buffer_page_end(src_v, kSizeUV + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF); \ - align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF); \ - for (int i = 0; i < kWidth * kHeight; ++i) { \ - src_y[i + OFF] = (fastrand() & 0xff); \ - } \ - for (int i = 0; i < kSizeUV; ++i) { \ - src_u[i + OFF] = (fastrand() & 0xff); \ - src_v[i + OFF] = (fastrand() & 0xff); \ - } \ - memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \ - memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_PLANAR##To##FMT_B##Dither(src_y + OFF, kWidth, \ - src_u + OFF, kStrideUV, \ - src_v + OFF, kStrideUV, \ - dst_argb_c + OFF, kStrideB, \ - NULL, kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_PLANAR##To##FMT_B##Dither(src_y + OFF, kWidth, \ - src_u + OFF, kStrideUV, \ - src_v + OFF, kStrideUV, \ - dst_argb_opt + OFF, kStrideB, \ - NULL, kWidth, NEG kHeight); \ - } \ - int max_diff = 0; \ - /* Convert to ARGB so 565 is expanded to bytes that can be compared. 
*/ \ - align_buffer_page_end(dst_argb32_c, kWidth * BPP_C * kHeight); \ - align_buffer_page_end(dst_argb32_opt, kWidth * BPP_C * kHeight); \ - memset(dst_argb32_c, 2, kWidth * BPP_C * kHeight); \ - memset(dst_argb32_opt, 102, kWidth * BPP_C * kHeight); \ - FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, \ - dst_argb32_c, kWidth * BPP_C , \ - kWidth, kHeight); \ - FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, \ - dst_argb32_opt, kWidth * BPP_C , \ - kWidth, kHeight); \ - for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \ - int abs_diff = \ - abs(static_cast<int>(dst_argb32_c[i]) - \ - static_cast<int>(dst_argb32_opt[i])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ + YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##Dither##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ + align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ + align_buffer_page_end(src_u, kSizeUV + OFF); \ + align_buffer_page_end(src_v, kSizeUV + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \ + align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + src_y[i + OFF] = (fastrand() & 0xff); \ } \ - } \ - EXPECT_LE(max_diff, DIFF); \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_u); \ - free_aligned_buffer_page_end(src_v); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ - free_aligned_buffer_page_end(dst_argb32_c); \ - free_aligned_buffer_page_end(dst_argb32_opt); \ -} + for (int i = 0; i < kSizeUV; ++i) { \ + src_u[i + OFF] = (fastrand() & 0xff); \ + src_v[i + OFF] = (fastrand() & 0xff); \ + } \ + memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \ + memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_PLANAR##To##FMT_B##Dither(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \ + src_v + OFF, kStrideUV, dst_argb_c + OFF, \ + kStrideB, NULL, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B##Dither( \ + src_y + OFF, kWidth, src_u + OFF, kStrideUV, src_v + OFF, kStrideUV, \ + dst_argb_opt + OFF, kStrideB, NULL, kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + /* Convert to ARGB so 565 is expanded to bytes that can be compared. 
*/ \ + align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight); \ + align_buffer_page_end(dst_argb32_opt, kWidth* BPP_C* kHeight); \ + memset(dst_argb32_c, 2, kWidth* BPP_C* kHeight); \ + memset(dst_argb32_opt, 102, kWidth* BPP_C* kHeight); \ + FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, dst_argb32_c, kWidth * BPP_C, \ + kWidth, kHeight); \ + FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, dst_argb32_opt, \ + kWidth * BPP_C, kWidth, kHeight); \ + for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \ + int abs_diff = abs(static_cast<int>(dst_argb32_c[i]) - \ + static_cast<int>(dst_argb32_opt[i])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_u); \ + free_aligned_buffer_page_end(src_v); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + free_aligned_buffer_page_end(dst_argb32_c); \ + free_aligned_buffer_page_end(dst_argb32_opt); \ + } #define TESTPLANARTOBD(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, DIFF, FMT_C, BPP_C) \ - TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, BPP_C) \ - TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, BPP_C) \ - TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \ - TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C) + YALIGN, DIFF, FMT_C, BPP_C) \ + TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, \ + BPP_C) \ + TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, \ + BPP_C) \ + TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \ + TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C) TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, 9, ARGB, 4) -#define TESTPTOB(NAME, UYVYTOI420, UYVYTONV12) \ -TEST_F(LibYUVConvertTest, NAME) { \ - const int kWidth = benchmark_width_; \ - const int kHeight = benchmark_height_; \ - \ - align_buffer_page_end(orig_uyvy, \ - 4 * SUBSAMPLE(kWidth, 2) * kHeight); \ - align_buffer_page_end(orig_y, kWidth * kHeight); \ - align_buffer_page_end(orig_u, \ - SUBSAMPLE(kWidth, 2) * \ - SUBSAMPLE(kHeight, 2)); \ - align_buffer_page_end(orig_v, \ - SUBSAMPLE(kWidth, 2) * \ - SUBSAMPLE(kHeight, 2)); \ - \ - align_buffer_page_end(dst_y_orig, kWidth * kHeight); \ - align_buffer_page_end(dst_uv_orig, 2 * \ - SUBSAMPLE(kWidth, 2) * \ - SUBSAMPLE(kHeight, 2)); \ - \ - align_buffer_page_end(dst_y, kWidth * kHeight); \ - align_buffer_page_end(dst_uv, 2 * \ - SUBSAMPLE(kWidth, 2) * \ - SUBSAMPLE(kHeight, 2)); \ - \ - MemRandomize(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \ - \ - /* Convert UYVY to NV12 in 2 steps for reference */ \ - libyuv::UYVYTOI420(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), \ - orig_y, kWidth, \ - orig_u, SUBSAMPLE(kWidth, 2), \ - orig_v, SUBSAMPLE(kWidth, 2), \ - kWidth, kHeight); \ - libyuv::I420ToNV12(orig_y, kWidth, \ - orig_u, SUBSAMPLE(kWidth, 2), \ - orig_v, SUBSAMPLE(kWidth, 2), \ - 
dst_y_orig, kWidth, \ - dst_uv_orig, 2 * SUBSAMPLE(kWidth, 2), \ - kWidth, kHeight); \ - \ - /* Convert to NV12 */ \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - libyuv::UYVYTONV12(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), \ - dst_y, kWidth, \ - dst_uv, 2 * SUBSAMPLE(kWidth, 2), \ - kWidth, kHeight); \ - } \ - \ - for (int i = 0; i < kWidth * kHeight; ++i) { \ - EXPECT_EQ(orig_y[i], dst_y[i]); \ - } \ - for (int i = 0; i < kWidth * kHeight; ++i) { \ - EXPECT_EQ(dst_y_orig[i], dst_y[i]); \ - } \ - for (int i = 0; i < 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2); ++i) { \ - EXPECT_EQ(dst_uv_orig[i], dst_uv[i]); \ - } \ - \ - free_aligned_buffer_page_end(orig_uyvy); \ - free_aligned_buffer_page_end(orig_y); \ - free_aligned_buffer_page_end(orig_u); \ - free_aligned_buffer_page_end(orig_v); \ - free_aligned_buffer_page_end(dst_y_orig); \ - free_aligned_buffer_page_end(dst_uv_orig); \ - free_aligned_buffer_page_end(dst_y); \ - free_aligned_buffer_page_end(dst_uv); \ -} +#define TESTPTOB(NAME, UYVYTOI420, UYVYTONV12) \ + TEST_F(LibYUVConvertTest, NAME) { \ + const int kWidth = benchmark_width_; \ + const int kHeight = benchmark_height_; \ + \ + align_buffer_page_end(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \ + align_buffer_page_end(orig_y, kWidth* kHeight); \ + align_buffer_page_end(orig_u, \ + SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \ + align_buffer_page_end(orig_v, \ + SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \ + \ + align_buffer_page_end(dst_y_orig, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_orig, \ + 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \ + \ + align_buffer_page_end(dst_y, kWidth* kHeight); \ + align_buffer_page_end(dst_uv, \ + 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \ + \ + MemRandomize(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \ + \ + /* Convert UYVY to NV12 in 2 steps for reference */ \ + libyuv::UYVYTOI420(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), orig_y, kWidth, \ + orig_u, SUBSAMPLE(kWidth, 2), orig_v, \ + SUBSAMPLE(kWidth, 2), kWidth, kHeight); \ + libyuv::I420ToNV12(orig_y, kWidth, orig_u, SUBSAMPLE(kWidth, 2), orig_v, \ + SUBSAMPLE(kWidth, 2), dst_y_orig, kWidth, dst_uv_orig, \ + 2 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); \ + \ + /* Convert to NV12 */ \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + libyuv::UYVYTONV12(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), dst_y, kWidth, \ + dst_uv, 2 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); \ + } \ + \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + EXPECT_EQ(orig_y[i], dst_y[i]); \ + } \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + EXPECT_EQ(dst_y_orig[i], dst_y[i]); \ + } \ + for (int i = 0; i < 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2); \ + ++i) { \ + EXPECT_EQ(dst_uv_orig[i], dst_uv[i]); \ + } \ + \ + free_aligned_buffer_page_end(orig_uyvy); \ + free_aligned_buffer_page_end(orig_y); \ + free_aligned_buffer_page_end(orig_u); \ + free_aligned_buffer_page_end(orig_v); \ + free_aligned_buffer_page_end(dst_y_orig); \ + free_aligned_buffer_page_end(dst_uv_orig); \ + free_aligned_buffer_page_end(dst_y); \ + free_aligned_buffer_page_end(dst_uv); \ + } TESTPTOB(TestYUY2ToNV12, YUY2ToI420, YUY2ToNV12) TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12) -#define TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ - W1280, N, NEG, OFF, FMT_C, BPP_C) \ -TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ - const int kHeight = benchmark_height_; \ - const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \ - const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ - const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ - align_buffer_page_end(src_y, kWidth * kHeight + OFF); \ - align_buffer_page_end(src_u, kSizeUV + OFF); \ - align_buffer_page_end(src_v, kSizeUV + OFF); \ - align_buffer_page_end(dst_argb_b, kStrideB * kHeight + OFF); \ - for (int i = 0; i < kWidth * kHeight; ++i) { \ - src_y[i + OFF] = (fastrand() & 0xff); \ - } \ - for (int i = 0; i < kSizeUV; ++i) { \ - src_u[i + OFF] = (fastrand() & 0xff); \ - src_v[i + OFF] = (fastrand() & 0xff); \ - } \ - memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \ - src_u + OFF, kStrideUV, \ - src_v + OFF, kStrideUV, \ - dst_argb_b + OFF, kStrideB, \ - kWidth, NEG kHeight); \ - } \ - /* Convert to a 3rd format in 1 step and 2 steps and compare */ \ - const int kStrideC = kWidth * BPP_C; \ - align_buffer_page_end(dst_argb_c, kStrideC * kHeight + OFF); \ - align_buffer_page_end(dst_argb_bc, kStrideC * kHeight + OFF); \ - memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \ - memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \ - FMT_PLANAR##To##FMT_C(src_y + OFF, kWidth, \ - src_u + OFF, kStrideUV, \ - src_v + OFF, kStrideUV, \ - dst_argb_c + OFF, kStrideC, \ - kWidth, NEG kHeight); \ - /* Convert B to C */ \ - FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, \ - dst_argb_bc + OFF, kStrideC, \ - kWidth, kHeight); \ - for (int i = 0; i < kStrideC * kHeight; ++i) { \ - EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \ - } \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_u); \ - free_aligned_buffer_page_end(src_v); \ - free_aligned_buffer_page_end(dst_argb_b); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_bc); \ -} +#define TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ + W1280, N, NEG, OFF, FMT_C, BPP_C) \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kHeight = benchmark_height_; \ + const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ + align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ + align_buffer_page_end(src_u, kSizeUV + OFF); \ + align_buffer_page_end(src_v, kSizeUV + OFF); \ + align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + src_y[i + OFF] = (fastrand() & 0xff); \ + } \ + for (int i = 0; i < kSizeUV; ++i) { \ + src_u[i + OFF] = (fastrand() & 0xff); \ + src_v[i + OFF] = (fastrand() & 0xff); \ + } \ + memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \ + src_v + OFF, kStrideUV, dst_argb_b + OFF, \ + kStrideB, kWidth, NEG kHeight); \ + } \ + /* Convert to a 3rd format in 1 step and 2 steps and compare */ \ + const int kStrideC = kWidth * BPP_C; \ + align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \ + align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \ + memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \ + memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \ + FMT_PLANAR##To##FMT_C(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \ + src_v + OFF, kStrideUV, dst_argb_c + OFF, kStrideC, \ + kWidth, NEG kHeight); \ + /* Convert B to C */ \ + FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, kStrideC, \ + kWidth, kHeight); \ + for (int i = 0; i < kStrideC * kHeight; ++i) { \ + EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \ + } \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_u); \ + free_aligned_buffer_page_end(src_v); \ + free_aligned_buffer_page_end(dst_argb_b); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_bc); \ + } -#define TESTPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ - FMT_C, BPP_C) \ - TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ - benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C) \ - TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ - benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C) \ - TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ - benchmark_width_, _Invert, -, 0, FMT_C, BPP_C) \ - TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ - benchmark_width_, _Opt, +, 0, FMT_C, BPP_C) +#define TESTPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ + FMT_C, BPP_C) \ + TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ + benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C) \ + TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ + benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C) \ + TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ + benchmark_width_, _Invert, -, 0, FMT_C, BPP_C) \ + TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ + benchmark_width_, _Opt, +, 0, FMT_C, BPP_C) TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ABGR, 4) TESTPLANARTOE(J420, 2, 2, ARGB, 1, 4, ARGB, 4) @@ -1774,7 +1808,6 @@ TESTPLANARTOE(H422, 2, 1, ABGR, 1, 4, ARGB, 4) TESTPLANARTOE(I422, 2, 1, BGRA, 1, 4, ARGB, 4) TESTPLANARTOE(I422, 2, 1, ABGR, 1, 4, ARGB, 4) TESTPLANARTOE(I422, 2, 1, RGBA, 1, 4, ARGB, 4) -TESTPLANARTOE(I411, 4, 1, ARGB, 1, 4, ARGB, 4) TESTPLANARTOE(I444, 1, 1, ARGB, 1, 4, ARGB, 4) 
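
Each TESTPLANARTOE instantiation above checks that converting planar YUV to a third format in one step matches the two-step route (planar to B, then B to C) byte for byte. Below is a minimal standalone sketch of the same check against libyuv's public API; the 64x32 frame size, rand()-based fill, umbrella libyuv.h include, and main() harness are illustrative assumptions, not part of this patch.

// A standalone sketch of the one-step vs. two-step equivalence that
// TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ABGR, 4) verifies. The 64x32
// frame, rand() fill and main() harness are illustrative, not from the patch.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "libyuv.h"

int main() {
  const int kWidth = 64;
  const int kHeight = 32;
  const int kStrideUV = (kWidth + 1) / 2;        // I420 subsamples U/V 2x2.
  const int kSizeY = kWidth * kHeight;
  const int kSizeUV = kStrideUV * ((kHeight + 1) / 2);

  uint8_t* src_y = (uint8_t*)malloc(kSizeY);
  uint8_t* src_u = (uint8_t*)malloc(kSizeUV);
  uint8_t* src_v = (uint8_t*)malloc(kSizeUV);
  uint8_t* argb = (uint8_t*)malloc(kSizeY * 4);  // intermediate format B.
  uint8_t* abgr_1step = (uint8_t*)malloc(kSizeY * 4);
  uint8_t* abgr_2step = (uint8_t*)malloc(kSizeY * 4);

  for (int i = 0; i < kSizeY; ++i) {
    src_y[i] = (uint8_t)(rand() & 0xff);
  }
  for (int i = 0; i < kSizeUV; ++i) {
    src_u[i] = (uint8_t)(rand() & 0xff);
    src_v[i] = (uint8_t)(rand() & 0xff);
  }

  // One step: I420 -> ABGR directly.
  libyuv::I420ToABGR(src_y, kWidth, src_u, kStrideUV, src_v, kStrideUV,
                     abgr_1step, kWidth * 4, kWidth, kHeight);

  // Two steps: I420 -> ARGB, then ARGB -> ABGR.
  libyuv::I420ToARGB(src_y, kWidth, src_u, kStrideUV, src_v, kStrideUV,
                     argb, kWidth * 4, kWidth, kHeight);
  libyuv::ARGBToABGR(argb, kWidth * 4, abgr_2step, kWidth * 4, kWidth,
                     kHeight);

  // The macro compares with EXPECT_EQ per byte; memcmp is the standalone
  // equivalent of that loop.
  int ok = memcmp(abgr_1step, abgr_2step, kSizeY * 4) == 0;
  printf("one-step vs two-step: %s\n", ok ? "identical" : "MISMATCH");

  free(src_y);
  free(src_u);
  free(src_v);
  free(argb);
  free(abgr_1step);
  free(abgr_2step);
  return ok ? 0 : 1;
}

The exact-match comparison is safe here because the second step, ARGB to ABGR, is a lossless channel swap, so both routes apply the YUV-to-RGB math exactly once.
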
TESTPLANARTOE(J444, 1, 1, ARGB, 1, 4, ARGB, 4) TESTPLANARTOE(I444, 1, 1, ABGR, 1, 4, ARGB, 4) @@ -1784,78 +1817,107 @@ TESTPLANARTOE(I422, 2, 1, YUY2, 2, 4, ARGB, 4) TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4) #define TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ - W1280, N, NEG, OFF, FMT_C, BPP_C, ATTEN) \ -TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) { \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ - const int kHeight = benchmark_height_; \ - const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \ - const int kSizeUV = \ - SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y); \ - align_buffer_page_end(src_y, kWidth * kHeight + OFF); \ - align_buffer_page_end(src_u, kSizeUV + OFF); \ - align_buffer_page_end(src_v, kSizeUV + OFF); \ - align_buffer_page_end(src_a, kWidth * kHeight + OFF); \ - align_buffer_page_end(dst_argb_b, kStrideB * kHeight + OFF); \ - for (int i = 0; i < kWidth * kHeight; ++i) { \ - src_y[i + OFF] = (fastrand() & 0xff); \ - src_a[i + OFF] = (fastrand() & 0xff); \ - } \ - for (int i = 0; i < kSizeUV; ++i) { \ - src_u[i + OFF] = (fastrand() & 0xff); \ - src_v[i + OFF] = (fastrand() & 0xff); \ - } \ - memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \ - src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ - src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ - src_a + OFF, kWidth, \ - dst_argb_b + OFF, kStrideB, \ - kWidth, NEG kHeight, ATTEN); \ - } \ - int max_diff = 0; \ - /* Convert to a 3rd format in 1 step and 2 steps and compare */ \ - const int kStrideC = kWidth * BPP_C; \ - align_buffer_page_end(dst_argb_c, kStrideC * kHeight + OFF); \ - align_buffer_page_end(dst_argb_bc, kStrideC * kHeight + OFF); \ - memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \ - memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \ - FMT_PLANAR##To##FMT_C(src_y + OFF, kWidth, \ - src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ - src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ - src_a + OFF, kWidth, \ - dst_argb_c + OFF, kStrideC, \ - kWidth, NEG kHeight, ATTEN); \ - /* Convert B to C */ \ - FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, \ - dst_argb_bc + OFF, kStrideC, \ - kWidth, kHeight); \ - for (int i = 0; i < kStrideC * kHeight; ++i) { \ - EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \ - } \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_u); \ - free_aligned_buffer_page_end(src_v); \ - free_aligned_buffer_page_end(src_a); \ - free_aligned_buffer_page_end(dst_argb_b); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_bc); \ -} + W1280, N, NEG, OFF, FMT_C, BPP_C, ATTEN) \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kHeight = benchmark_height_; \ + const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \ + const int kSizeUV = \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y); \ + align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ + align_buffer_page_end(src_u, kSizeUV + OFF); \ + align_buffer_page_end(src_v, kSizeUV + OFF); \ + align_buffer_page_end(src_a, kWidth* kHeight + OFF); \ + align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + src_y[i + OFF] = (fastrand() & 0xff); \ + src_a[i + OFF] = (fastrand() & 0xff); \ + } \ + for (int i = 0; i < kSizeUV; ++i) { \ + src_u[i + OFF] = (fastrand() & 0xff); \ + src_v[i + OFF] = (fastrand() & 0xff); \ + } \ + memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B( \ + src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ + src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \ + dst_argb_b + OFF, kStrideB, kWidth, NEG kHeight, ATTEN); \ + } \ + /* Convert to a 3rd format in 1 step and 2 steps and compare */ \ + const int kStrideC = kWidth * BPP_C; \ + align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \ + align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \ + memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \ + memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \ + FMT_PLANAR##To##FMT_C( \ + src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ + src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \ + dst_argb_c + OFF, kStrideC, kWidth, NEG kHeight, ATTEN); \ + /* Convert B to C */ \ + FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, kStrideC, \ + kWidth, kHeight); \ + for (int i = 0; i < kStrideC * kHeight; ++i) { \ + EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \ + } \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_u); \ + free_aligned_buffer_page_end(src_v); \ + free_aligned_buffer_page_end(src_a); \ + free_aligned_buffer_page_end(dst_argb_b); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_bc); \ + } -#define TESTQPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ - FMT_C, BPP_C) \ - TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ - benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C, 0) \ - TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ - benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C, 0) \ - TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ - benchmark_width_, _Invert, -, 0, FMT_C, BPP_C, 0) \ - TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ - benchmark_width_, _Opt, +, 0, FMT_C, BPP_C, 0) \ - TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ - benchmark_width_, _Premult, +, 0, FMT_C, BPP_C, 1) +#define TESTQPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ + FMT_C, BPP_C) \ + TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ + benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C, 0) \ + TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ + benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C, 0) \ + TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ + benchmark_width_, _Invert, -, 0, FMT_C, BPP_C, 0) \ + TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ + benchmark_width_, _Opt, +, 0, FMT_C, 
BPP_C, 0) \ + TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ + benchmark_width_, _Premult, +, 0, FMT_C, BPP_C, 1) TESTQPLANARTOE(I420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4) TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4) +TEST_F(LibYUVConvertTest, RotateWithARGBSource) { + // 2x2 frames + uint32_t src[4]; + uint32_t dst[4]; + // some random input + src[0] = 0x11000000; + src[1] = 0x00450000; + src[2] = 0x00009f00; + src[3] = 0x000000ff; + // zeros on destination + dst[0] = 0x00000000; + dst[1] = 0x00000000; + dst[2] = 0x00000000; + dst[3] = 0x00000000; + + int r = ConvertToARGB(reinterpret_cast<uint8_t*>(src), + 16, // input size + reinterpret_cast<uint8_t*>(dst), + 8, // destination stride + 0, // crop_x + 0, // crop_y + 2, // width + 2, // height + 2, // crop width + 2, // crop height + kRotate90, FOURCC_ARGB); + + EXPECT_EQ(r, 0); + // 90 degrees rotation, no conversion + EXPECT_EQ(dst[0], src[2]); + EXPECT_EQ(dst[1], src[0]); + EXPECT_EQ(dst[2], src[3]); + EXPECT_EQ(dst[3], src[1]); +} + } // namespace libyuv diff --git a/files/unit_test/cpu_test.cc b/files/unit_test/cpu_test.cc index 0cd06f9b..048ed31a 100644 --- a/files/unit_test/cpu_test.cc +++ b/files/unit_test/cpu_test.cc @@ -11,10 +11,10 @@ #include <stdlib.h> #include <string.h> +#include "../unit_test/unit_test.h" #include "libyuv/basic_types.h" #include "libyuv/cpu_id.h" #include "libyuv/version.h" -#include "../unit_test/unit_test.h" namespace libyuv { @@ -45,10 +45,14 @@ TEST_F(LibYUVBaseTest, TestCpuHas) { printf("Has FMA3 %x\n", has_fma3); int has_avx3 = TestCpuFlag(kCpuHasAVX3); printf("Has AVX3 %x\n", has_avx3); + int has_f16c = TestCpuFlag(kCpuHasF16C); + printf("Has F16C %x\n", has_f16c); int has_mips = TestCpuFlag(kCpuHasMIPS); printf("Has MIPS %x\n", has_mips); int has_dspr2 = TestCpuFlag(kCpuHasDSPR2); printf("Has DSPR2 %x\n", has_dspr2); + int has_msa = TestCpuFlag(kCpuHasMSA); + printf("Has MSA %x\n", has_msa); } TEST_F(LibYUVBaseTest, TestCpuCompilerEnabled) { @@ -62,19 +66,20 @@ TEST_F(LibYUVBaseTest, TestCpuCompilerEnabled) { printf("x64 build\n"); #endif #ifdef _MSC_VER -printf("_MSC_VER %d\n", _MSC_VER); + printf("_MSC_VER %d\n", _MSC_VER); #endif -#if !defined(LIBYUV_DISABLE_X86) && (defined(GCC_HAS_AVX2) || \ - defined(CLANG_HAS_AVX2) || defined(VISUALC_HAS_AVX2)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(GCC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ + defined(VISUALC_HAS_AVX2)) printf("Has AVX2 1\n"); #else printf("Has AVX2 0\n"); - // If compiler does not support AVX2, the following function not expected: +// If compiler does not support AVX2, the following function not expected: #endif } -#if defined(__i386__) || defined(__x86_64__) || \ - defined(_M_IX86) || defined(_M_X64) +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(_M_X64) TEST_F(LibYUVBaseTest, TestCpuId) { int has_x86 = TestCpuFlag(kCpuHasX86); if (has_x86) { @@ -96,7 +101,7 @@ TEST_F(LibYUVBaseTest, TestCpuId) { cpu_info[3] = 0; printf("Cpu Vendor: %s %x %x %x\n", reinterpret_cast<char*>(&cpu_info[0]), cpu_info[0], cpu_info[1], cpu_info[2]); - EXPECT_EQ(12, strlen(reinterpret_cast<char*>(&cpu_info[0]))); + EXPECT_EQ(12u, strlen(reinterpret_cast<char*>(&cpu_info[0]))); // CPU Family and Model // 3:0 - Stepping @@ -108,8 +113,8 @@ TEST_F(LibYUVBaseTest, TestCpuId) { CpuId(1, 0, cpu_info); int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0); int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0); - printf("Cpu Family %d (0x%x), Model %d 
(0x%x)\n", family, family, - model, model); + printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, model, + model); } } #endif diff --git a/files/unit_test/math_test.cc b/files/unit_test/math_test.cc index 19af9f6b..2b4b57b1 100644 --- a/files/unit_test/math_test.cc +++ b/files/unit_test/math_test.cc @@ -12,11 +12,11 @@ #include <string.h> #include <time.h> +#include "../unit_test/unit_test.h" #include "libyuv/basic_types.h" #include "libyuv/cpu_id.h" #include "libyuv/scale.h" #include "libyuv/scale_row.h" -#include "../unit_test/unit_test.h" namespace libyuv { diff --git a/files/unit_test/planar_test.cc b/files/unit_test/planar_test.cc index bc0eebb5..2d53cc02 100644 --- a/files/unit_test/planar_test.cc +++ b/files/unit_test/planar_test.cc @@ -90,11 +90,11 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) { EXPECT_EQ(0, atten_pixels[0 * 4 + 3]); EXPECT_EQ(64, atten_pixels[128 * 4 + 0]); EXPECT_EQ(32, atten_pixels[128 * 4 + 1]); - EXPECT_EQ(21, atten_pixels[128 * 4 + 2]); + EXPECT_EQ(21, atten_pixels[128 * 4 + 2]); EXPECT_EQ(128, atten_pixels[128 * 4 + 3]); EXPECT_NEAR(255, atten_pixels[255 * 4 + 0], 1); EXPECT_NEAR(127, atten_pixels[255 * 4 + 1], 1); - EXPECT_NEAR(85, atten_pixels[255 * 4 + 2], 1); + EXPECT_NEAR(85, atten_pixels[255 * 4 + 2], 1); EXPECT_EQ(255, atten_pixels[255 * 4 + 3]); free_aligned_buffer_page_end(atten2_pixels); @@ -103,9 +103,13 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) { free_aligned_buffer_page_end(orig_pixels); } -static int TestAttenuateI(int width, int height, int benchmark_iterations, - int disable_cpu_flags, int benchmark_cpu_info, - int invert, int off) { +static int TestAttenuateI(int width, + int height, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info, + int invert, + int off) { if (width < 1) { width = 1; } @@ -121,20 +125,17 @@ static int TestAttenuateI(int width, int height, int benchmark_iterations, memset(dst_argb_opt, 0, kStride * height); MaskCpuFlags(disable_cpu_flags); - ARGBAttenuate(src_argb + off, kStride, - dst_argb_c, kStride, - width, invert * height); + ARGBAttenuate(src_argb + off, kStride, dst_argb_c, kStride, width, + invert * height); MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { - ARGBAttenuate(src_argb + off, kStride, - dst_argb_opt, kStride, - width, invert * height); + ARGBAttenuate(src_argb + off, kStride, dst_argb_opt, kStride, width, + invert * height); } int max_diff = 0; for (int i = 0; i < kStride * height; ++i) { - int abs_diff = - abs(static_cast<int>(dst_argb_c[i]) - - static_cast<int>(dst_argb_opt[i])); + int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - + static_cast<int>(dst_argb_opt[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } @@ -147,39 +148,39 @@ static int TestAttenuateI(int width, int height, int benchmark_iterations, TEST_F(LibYUVPlanarTest, ARGBAttenuate_Any) { int max_diff = TestAttenuateI(benchmark_width_ - 1, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - +1, 0); + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 2); } TEST_F(LibYUVPlanarTest, ARGBAttenuate_Unaligned) { - int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - +1, 1); + int max_diff = + TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 1); EXPECT_LE(max_diff, 2); } TEST_F(LibYUVPlanarTest, ARGBAttenuate_Invert) { - int 
max_diff = TestAttenuateI(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - -1, 0); + int max_diff = + TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, -1, 0); EXPECT_LE(max_diff, 2); } TEST_F(LibYUVPlanarTest, ARGBAttenuate_Opt) { - int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - +1, 0); + int max_diff = + TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 2); } -static int TestUnattenuateI(int width, int height, int benchmark_iterations, - int disable_cpu_flags, int benchmark_cpu_info, - int invert, int off) { +static int TestUnattenuateI(int width, + int height, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info, + int invert, + int off) { if (width < 1) { width = 1; } @@ -191,27 +192,23 @@ static int TestUnattenuateI(int width, int height, int benchmark_iterations, for (int i = 0; i < kStride * height; ++i) { src_argb[i + off] = (fastrand() & 0xff); } - ARGBAttenuate(src_argb + off, kStride, - src_argb + off, kStride, - width, height); + ARGBAttenuate(src_argb + off, kStride, src_argb + off, kStride, width, + height); memset(dst_argb_c, 0, kStride * height); memset(dst_argb_opt, 0, kStride * height); MaskCpuFlags(disable_cpu_flags); - ARGBUnattenuate(src_argb + off, kStride, - dst_argb_c, kStride, - width, invert * height); + ARGBUnattenuate(src_argb + off, kStride, dst_argb_c, kStride, width, + invert * height); MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { - ARGBUnattenuate(src_argb + off, kStride, - dst_argb_opt, kStride, - width, invert * height); + ARGBUnattenuate(src_argb + off, kStride, dst_argb_opt, kStride, width, + invert * height); } int max_diff = 0; for (int i = 0; i < kStride * height; ++i) { - int abs_diff = - abs(static_cast<int>(dst_argb_c[i]) - - static_cast<int>(dst_argb_opt[i])); + int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - + static_cast<int>(dst_argb_opt[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } @@ -224,33 +221,29 @@ static int TestUnattenuateI(int width, int height, int benchmark_iterations, TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Any) { int max_diff = TestUnattenuateI(benchmark_width_ - 1, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - +1, 0); + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 2); } TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Unaligned) { int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - +1, 1); + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, +1, 1); EXPECT_LE(max_diff, 2); } TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Invert) { int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - -1, 0); + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, -1, 0); EXPECT_LE(max_diff, 2); } TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Opt) { int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - +1, 0); + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 2); } @@ 
-268,8 +261,7 @@ TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) { } ARGBComputeCumulativeSum(&orig_pixels[0][0][0], 16 * 4, - &added_pixels[0][0][0], 16 * 4, - 16, 16); + &added_pixels[0][0][0], 16 * 4, 16, 16); for (int y = 0; y < 16; ++y) { for (int x = 0; x < 16; ++x) { @@ -503,10 +495,8 @@ TEST_F(LibYUVPlanarTest, TestARGBColorMatrix) { // Matrix for Sepia. SIMD_ALIGNED(static const int8 kRGBToSepia[]) = { - 17 / 2, 68 / 2, 35 / 2, 0, - 22 / 2, 88 / 2, 45 / 2, 0, - 24 / 2, 98 / 2, 50 / 2, 0, - 0, 0, 0, 64, // Copy alpha. + 17 / 2, 68 / 2, 35 / 2, 0, 22 / 2, 88 / 2, 45 / 2, 0, + 24 / 2, 98 / 2, 50 / 2, 0, 0, 0, 0, 64, // Copy alpha. }; memset(orig_pixels, 0, sizeof(orig_pixels)); @@ -579,10 +569,8 @@ TEST_F(LibYUVPlanarTest, TestRGBColorMatrix) { // Matrix for Sepia. SIMD_ALIGNED(static const int8 kRGBToSepia[]) = { - 17, 68, 35, 0, - 22, 88, 45, 0, - 24, 98, 50, 0, - 0, 0, 0, 0, // Unused but makes matrix 16 bytes. + 17, 68, 35, 0, 22, 88, 45, 0, + 24, 98, 50, 0, 0, 0, 0, 0, // Unused but makes matrix 16 bytes. }; memset(orig_pixels, 0, sizeof(orig_pixels)); @@ -642,10 +630,7 @@ TEST_F(LibYUVPlanarTest, TestARGBColorTable) { // Matrix for Sepia. static const uint8 kARGBTable[256 * 4] = { - 1u, 2u, 3u, 4u, - 5u, 6u, 7u, 8u, - 9u, 10u, 11u, 12u, - 13u, 14u, 15u, 16u, + 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u, 16u, }; orig_pixels[0][0] = 0u; @@ -701,10 +686,7 @@ TEST_F(LibYUVPlanarTest, TestRGBColorTable) { // Matrix for Sepia. static const uint8 kARGBTable[256 * 4] = { - 1u, 2u, 3u, 4u, - 5u, 6u, 7u, 8u, - 9u, 10u, 11u, 12u, - 13u, 14u, 15u, 16u, + 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u, 16u, }; orig_pixels[0][0] = 0u; @@ -762,8 +744,8 @@ TEST_F(LibYUVPlanarTest, TestARGBQuantize) { orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } - ARGBQuantize(&orig_pixels[0][0], 0, - (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 1280, 1); + ARGBQuantize(&orig_pixels[0][0], 0, (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, + 1280, 1); for (int i = 0; i < 1280; ++i) { EXPECT_EQ((i / 8 * 8 + 8 / 2) & 255, orig_pixels[i][0]); @@ -772,8 +754,8 @@ TEST_F(LibYUVPlanarTest, TestARGBQuantize) { EXPECT_EQ(i & 255, orig_pixels[i][3]); } for (int i = 0; i < benchmark_pixels_div1280_; ++i) { - ARGBQuantize(&orig_pixels[0][0], 0, - (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 1280, 1); + ARGBQuantize(&orig_pixels[0][0], 0, (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, + 1280, 1); } } @@ -1020,48 +1002,45 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane) { } } -#define TESTTERP(FMT_A, BPP_A, STRIDE_A, \ - FMT_B, BPP_B, STRIDE_B, \ - W1280, TERP, N, NEG, OFF) \ -TEST_F(LibYUVPlanarTest, ARGBInterpolate##TERP##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ - const int kHeight = benchmark_height_; \ - const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ - const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ - align_buffer_page_end(src_argb_a, kStrideA * kHeight + OFF); \ - align_buffer_page_end(src_argb_b, kStrideA * kHeight + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideB * kHeight); \ - align_buffer_page_end(dst_argb_opt, kStrideB * kHeight); \ - for (int i = 0; i < kStrideA * kHeight; ++i) { \ - src_argb_a[i + OFF] = (fastrand() & 0xff); \ - src_argb_b[i + OFF] = (fastrand() & 0xff); \ - } \ - MaskCpuFlags(disable_cpu_flags_); \ - ARGBInterpolate(src_argb_a + OFF, kStrideA, \ - src_argb_b + OFF, kStrideA, \ - dst_argb_c, kStrideB, \ - kWidth, NEG kHeight, TERP); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - ARGBInterpolate(src_argb_a + OFF, kStrideA, \ - src_argb_b + OFF, kStrideA, \ - dst_argb_opt, kStrideB, \ - kWidth, NEG kHeight, TERP); \ - } \ - for (int i = 0; i < kStrideB * kHeight; ++i) { \ - EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ - } \ - free_aligned_buffer_page_end(src_argb_a); \ - free_aligned_buffer_page_end(src_argb_b); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ -} - -#define TESTINTERPOLATE(TERP) \ - TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_ - 1, TERP, _Any, +, 0) \ - TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Unaligned, +, 1) \ - TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Invert, -, 0) \ - TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Opt, +, 0) +#define TESTTERP(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, STRIDE_B, W1280, TERP, \ + N, NEG, OFF) \ + TEST_F(LibYUVPlanarTest, ARGBInterpolate##TERP##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kHeight = benchmark_height_; \ + const int kStrideA = \ + (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + const int kStrideB = \ + (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ + align_buffer_page_end(src_argb_a, kStrideA* kHeight + OFF); \ + align_buffer_page_end(src_argb_b, kStrideA* kHeight + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeight); \ + align_buffer_page_end(dst_argb_opt, kStrideB* kHeight); \ + for (int i = 0; i < kStrideA * kHeight; ++i) { \ + src_argb_a[i + OFF] = (fastrand() & 0xff); \ + src_argb_b[i + OFF] = (fastrand() & 0xff); \ + } \ + MaskCpuFlags(disable_cpu_flags_); \ + ARGBInterpolate(src_argb_a + OFF, kStrideA, src_argb_b + OFF, kStrideA, \ + dst_argb_c, kStrideB, kWidth, NEG kHeight, TERP); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + ARGBInterpolate(src_argb_a + OFF, kStrideA, src_argb_b + OFF, kStrideA, \ + dst_argb_opt, kStrideB, kWidth, NEG kHeight, TERP); \ + } \ + for (int i = 0; i < kStrideB * kHeight; ++i) { \ + EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ + } \ + free_aligned_buffer_page_end(src_argb_a); \ + free_aligned_buffer_page_end(src_argb_b); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } + +#define TESTINTERPOLATE(TERP) \ + TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_ - 1, TERP, _Any, +, 0) \ + TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Unaligned, +, 1) \ + TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Invert, -, 0) \ + TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Opt, +, 0) TESTINTERPOLATE(0) TESTINTERPOLATE(64) @@ -1069,9 +1048,13 @@ TESTINTERPOLATE(128) TESTINTERPOLATE(192) TESTINTERPOLATE(255) -static int TestBlend(int width, int height, int benchmark_iterations, - int disable_cpu_flags, int benchmark_cpu_info, - int invert, int off) { +static int TestBlend(int width, + int height, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info, + int invert, + int off) { if (width < 1) { width = 1; } @@ -1093,22 +1076,17 @@ static int TestBlend(int width, int height, int benchmark_iterations, memset(dst_argb_opt, 255, kStride * height); MaskCpuFlags(disable_cpu_flags); - ARGBBlend(src_argb_a + off, kStride, - src_argb_b + off, kStride, - dst_argb_c, kStride, - width, invert * height); + ARGBBlend(src_argb_a + off, kStride, src_argb_b + off, kStride, dst_argb_c, + kStride, width, invert * height); MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { - ARGBBlend(src_argb_a + off, kStride, - src_argb_b + off, kStride, - dst_argb_opt, kStride, - width, invert * height); + ARGBBlend(src_argb_a + off, kStride, src_argb_b + off, kStride, + dst_argb_opt, kStride, width, invert * height); } int max_diff = 0; for (int i = 0; i < kStride * height; ++i) { - int abs_diff = - abs(static_cast<int>(dst_argb_c[i]) - - static_cast<int>(dst_argb_opt[i])); + int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - + static_cast<int>(dst_argb_opt[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } @@ -1121,36 +1099,40 @@ static int TestBlend(int width, int height, int benchmark_iterations, } TEST_F(LibYUVPlanarTest, ARGBBlend_Any) { - int max_diff = TestBlend(benchmark_width_ - 4, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 0); + int max_diff = + TestBlend(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 
0); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBBlend_Unaligned) { - int max_diff = TestBlend(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 1); + int max_diff = + TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 1); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBBlend_Invert) { - int max_diff = TestBlend(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, -1, 0); + int max_diff = + TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, -1, 0); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBBlend_Opt) { - int max_diff = TestBlend(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 0); + int max_diff = + TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 1); } -static void TestBlendPlane(int width, int height, int benchmark_iterations, - int disable_cpu_flags, int benchmark_cpu_info, - int invert, int off) { +static void TestBlendPlane(int width, + int height, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info, + int invert, + int off) { if (width < 1) { width = 1; } @@ -1170,21 +1152,15 @@ static void TestBlendPlane(int width, int height, int benchmark_iterations, src_argb_b[i + off] = 255 - (i & 255); } memset(src_argb_alpha + off, 255, width); - BlendPlane(src_argb_a + off, width, - src_argb_b + off, width, - src_argb_alpha + off, width, - dst_argb_opt + off, width, - width, 1); + BlendPlane(src_argb_a + off, width, src_argb_b + off, width, + src_argb_alpha + off, width, dst_argb_opt + off, width, width, 1); for (int i = 0; i < width; ++i) { EXPECT_EQ(src_argb_a[i + off], dst_argb_opt[i + off]); } // Test destination is maintained exactly if alpha is 0. 
memset(src_argb_alpha + off, 0, width); - BlendPlane(src_argb_a + off, width, - src_argb_b + off, width, - src_argb_alpha + off, width, - dst_argb_opt + off, width, - width, 1); + BlendPlane(src_argb_a + off, width, src_argb_b + off, width, + src_argb_alpha + off, width, dst_argb_opt + off, width, width, 1); for (int i = 0; i < width; ++i) { EXPECT_EQ(src_argb_b[i + off], dst_argb_opt[i + off]); } @@ -1195,18 +1171,14 @@ static void TestBlendPlane(int width, int height, int benchmark_iterations, } MaskCpuFlags(disable_cpu_flags); - BlendPlane(src_argb_a + off, width, - src_argb_b + off, width, - src_argb_alpha + off, width, - dst_argb_c + off, width, - width, height); + BlendPlane(src_argb_a + off, width, src_argb_b + off, width, + src_argb_alpha + off, width, dst_argb_c + off, width, width, + invert * height); MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { - BlendPlane(src_argb_a + off, width, - src_argb_b + off, width, - src_argb_alpha + off, width, - dst_argb_opt + off, width, - width, height); + BlendPlane(src_argb_a + off, width, src_argb_b + off, width, + src_argb_alpha + off, width, dst_argb_opt + off, width, width, + invert * height); } for (int i = 0; i < kStride * height; ++i) { EXPECT_EQ(dst_argb_c[i + off], dst_argb_opt[i + off]); @@ -1236,11 +1208,15 @@ TEST_F(LibYUVPlanarTest, BlendPlane_Invert) { disable_cpu_flags_, benchmark_cpu_info_, -1, 1); } -#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a)) +#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a)) -static void TestI420Blend(int width, int height, int benchmark_iterations, - int disable_cpu_flags, int benchmark_cpu_info, - int invert, int off) { +static void TestI420Blend(int width, + int height, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info, + int invert, + int off) { width = ((width) > 0) ? 
(width) : 1; const int kStrideUV = SUBSAMPLE(width, 2); const int kSizeUV = kStrideUV * SUBSAMPLE(height, 2); @@ -1273,30 +1249,18 @@ static void TestI420Blend(int width, int height, int benchmark_iterations, memset(dst_v_opt, 255, kSizeUV + off); MaskCpuFlags(disable_cpu_flags); - I420Blend(src_y0 + off, width, - src_u0 + off, kStrideUV, - src_v0 + off, kStrideUV, - src_y1 + off, width, - src_u1 + off, kStrideUV, - src_v1 + off, kStrideUV, - src_a + off, width, - dst_y_c + off, width, - dst_u_c + off, kStrideUV, - dst_v_c + off, kStrideUV, - width, height); + I420Blend(src_y0 + off, width, src_u0 + off, kStrideUV, src_v0 + off, + kStrideUV, src_y1 + off, width, src_u1 + off, kStrideUV, + src_v1 + off, kStrideUV, src_a + off, width, dst_y_c + off, width, + dst_u_c + off, kStrideUV, dst_v_c + off, kStrideUV, width, + invert * height); MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { - I420Blend(src_y0 + off, width, - src_u0 + off, kStrideUV, - src_v0 + off, kStrideUV, - src_y1 + off, width, - src_u1 + off, kStrideUV, - src_v1 + off, kStrideUV, - src_a + off, width, - dst_y_opt + off, width, - dst_u_opt + off, kStrideUV, - dst_v_opt + off, kStrideUV, - width, height); + I420Blend(src_y0 + off, width, src_u0 + off, kStrideUV, src_v0 + off, + kStrideUV, src_y1 + off, width, src_u1 + off, kStrideUV, + src_v1 + off, kStrideUV, src_a + off, width, dst_y_opt + off, + width, dst_u_opt + off, kStrideUV, dst_v_opt + off, kStrideUV, + width, invert * height); } for (int i = 0; i < width * height; ++i) { EXPECT_EQ(dst_y_c[i + off], dst_y_opt[i + off]); @@ -1323,21 +1287,21 @@ static void TestI420Blend(int width, int height, int benchmark_iterations, TEST_F(LibYUVPlanarTest, I420Blend_Opt) { TestI420Blend(benchmark_width_, benchmark_height_, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 0); + disable_cpu_flags_, benchmark_cpu_info_, +1, 0); } TEST_F(LibYUVPlanarTest, I420Blend_Unaligned) { TestI420Blend(benchmark_width_, benchmark_height_, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 1); + disable_cpu_flags_, benchmark_cpu_info_, +1, 1); } // TODO(fbarchard): DISABLED because _Any uses C. Avoid C and re-enable. TEST_F(LibYUVPlanarTest, DISABLED_I420Blend_Any) { TestI420Blend(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 0); + disable_cpu_flags_, benchmark_cpu_info_, +1, 0); } TEST_F(LibYUVPlanarTest, I420Blend_Invert) { TestI420Blend(benchmark_width_, benchmark_height_, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, -1, 0); + disable_cpu_flags_, benchmark_cpu_info_, -1, 0); } TEST_F(LibYUVPlanarTest, TestAffine) { @@ -1350,10 +1314,10 @@ TEST_F(LibYUVPlanarTest, TestAffine) { } } - float uv_step[4] = { 0.f, 0.f, 0.75f, 0.f }; + float uv_step[4] = {0.f, 0.f, 0.75f, 0.f}; - ARGBAffineRow_C(&orig_pixels_0[0][0], 0, &interpolate_pixels_C[0][0], - uv_step, 1280); + ARGBAffineRow_C(&orig_pixels_0[0][0], 0, &interpolate_pixels_C[0][0], uv_step, + 1280); EXPECT_EQ(0u, interpolate_pixels_C[0][0]); EXPECT_EQ(96u, interpolate_pixels_C[128][0]); EXPECT_EQ(191u, interpolate_pixels_C[255][3]); @@ -1411,19 +1375,15 @@ TEST_F(LibYUVPlanarTest, TestCopyPlane) { // Disable all optimizations. MaskCpuFlags(disable_cpu_flags_); - double c_time = get_time(); for (j = 0; j < benchmark_iterations_; j++) { CopyPlane(orig_y + y_off, y_st, dst_c + y_off, stride, yw, yh); } - c_time = (get_time() - c_time) / benchmark_iterations_; // Enable optimizations. 
MaskCpuFlags(benchmark_cpu_info_); - double opt_time = get_time(); for (j = 0; j < benchmark_iterations_; j++) { CopyPlane(orig_y + y_off, y_st, dst_opt + y_off, stride, yw, yh); } - opt_time = (get_time() - opt_time) / benchmark_iterations_; for (i = 0; i < y_plane_size; ++i) { if (dst_c[i] != dst_opt[i]) @@ -1437,9 +1397,13 @@ TEST_F(LibYUVPlanarTest, TestCopyPlane) { EXPECT_EQ(0, err); } -static int TestMultiply(int width, int height, int benchmark_iterations, - int disable_cpu_flags, int benchmark_cpu_info, - int invert, int off) { +static int TestMultiply(int width, + int height, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info, + int invert, + int off) { if (width < 1) { width = 1; } @@ -1457,22 +1421,17 @@ static int TestMultiply(int width, int height, int benchmark_iterations, memset(dst_argb_opt, 0, kStride * height); MaskCpuFlags(disable_cpu_flags); - ARGBMultiply(src_argb_a + off, kStride, - src_argb_b + off, kStride, - dst_argb_c, kStride, - width, invert * height); + ARGBMultiply(src_argb_a + off, kStride, src_argb_b + off, kStride, dst_argb_c, + kStride, width, invert * height); MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { - ARGBMultiply(src_argb_a + off, kStride, - src_argb_b + off, kStride, - dst_argb_opt, kStride, - width, invert * height); + ARGBMultiply(src_argb_a + off, kStride, src_argb_b + off, kStride, + dst_argb_opt, kStride, width, invert * height); } int max_diff = 0; for (int i = 0; i < kStride * height; ++i) { - int abs_diff = - abs(static_cast<int>(dst_argb_c[i]) - - static_cast<int>(dst_argb_opt[i])); + int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - + static_cast<int>(dst_argb_opt[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } @@ -1486,35 +1445,39 @@ static int TestMultiply(int width, int height, int benchmark_iterations, TEST_F(LibYUVPlanarTest, ARGBMultiply_Any) { int max_diff = TestMultiply(benchmark_width_ - 1, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 0); + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBMultiply_Unaligned) { - int max_diff = TestMultiply(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 1); + int max_diff = + TestMultiply(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 1); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBMultiply_Invert) { - int max_diff = TestMultiply(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, -1, 0); + int max_diff = + TestMultiply(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, -1, 0); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBMultiply_Opt) { - int max_diff = TestMultiply(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 0); + int max_diff = + TestMultiply(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 1); } -static int TestAdd(int width, int height, int benchmark_iterations, - int disable_cpu_flags, int benchmark_cpu_info, - int invert, int off) { +static int TestAdd(int width, + int height, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info, + int invert, + int off) { if (width < 1) { width = 
1; } @@ -1532,22 +1495,17 @@ static int TestAdd(int width, int height, int benchmark_iterations, memset(dst_argb_opt, 0, kStride * height); MaskCpuFlags(disable_cpu_flags); - ARGBAdd(src_argb_a + off, kStride, - src_argb_b + off, kStride, - dst_argb_c, kStride, - width, invert * height); + ARGBAdd(src_argb_a + off, kStride, src_argb_b + off, kStride, dst_argb_c, + kStride, width, invert * height); MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { - ARGBAdd(src_argb_a + off, kStride, - src_argb_b + off, kStride, - dst_argb_opt, kStride, - width, invert * height); + ARGBAdd(src_argb_a + off, kStride, src_argb_b + off, kStride, dst_argb_opt, + kStride, width, invert * height); } int max_diff = 0; for (int i = 0; i < kStride * height; ++i) { - int abs_diff = - abs(static_cast<int>(dst_argb_c[i]) - - static_cast<int>(dst_argb_opt[i])); + int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - + static_cast<int>(dst_argb_opt[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } @@ -1560,36 +1518,40 @@ static int TestAdd(int width, int height, int benchmark_iterations, } TEST_F(LibYUVPlanarTest, ARGBAdd_Any) { - int max_diff = TestAdd(benchmark_width_ - 1, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 0); + int max_diff = + TestAdd(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBAdd_Unaligned) { - int max_diff = TestAdd(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 1); + int max_diff = + TestAdd(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 1); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBAdd_Invert) { - int max_diff = TestAdd(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, -1, 0); + int max_diff = + TestAdd(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, -1, 0); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBAdd_Opt) { - int max_diff = TestAdd(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 0); + int max_diff = + TestAdd(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 1); } -static int TestSubtract(int width, int height, int benchmark_iterations, - int disable_cpu_flags, int benchmark_cpu_info, - int invert, int off) { +static int TestSubtract(int width, + int height, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info, + int invert, + int off) { if (width < 1) { width = 1; } @@ -1607,22 +1569,17 @@ static int TestSubtract(int width, int height, int benchmark_iterations, memset(dst_argb_opt, 0, kStride * height); MaskCpuFlags(disable_cpu_flags); - ARGBSubtract(src_argb_a + off, kStride, - src_argb_b + off, kStride, - dst_argb_c, kStride, - width, invert * height); + ARGBSubtract(src_argb_a + off, kStride, src_argb_b + off, kStride, dst_argb_c, + kStride, width, invert * height); MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { - ARGBSubtract(src_argb_a + off, kStride, - src_argb_b + off, kStride, - dst_argb_opt, kStride, - width, invert * height); + ARGBSubtract(src_argb_a + off, kStride, src_argb_b + off, kStride, + dst_argb_opt, kStride, width, 
invert * height); } int max_diff = 0; for (int i = 0; i < kStride * height; ++i) { - int abs_diff = - abs(static_cast<int>(dst_argb_c[i]) - - static_cast<int>(dst_argb_opt[i])); + int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - + static_cast<int>(dst_argb_opt[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } @@ -1636,35 +1593,39 @@ static int TestSubtract(int width, int height, int benchmark_iterations, TEST_F(LibYUVPlanarTest, ARGBSubtract_Any) { int max_diff = TestSubtract(benchmark_width_ - 1, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 0); + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBSubtract_Unaligned) { - int max_diff = TestSubtract(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 1); + int max_diff = + TestSubtract(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 1); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBSubtract_Invert) { - int max_diff = TestSubtract(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, -1, 0); + int max_diff = + TestSubtract(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, -1, 0); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBSubtract_Opt) { - int max_diff = TestSubtract(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 0); + int max_diff = + TestSubtract(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 1); } -static int TestSobel(int width, int height, int benchmark_iterations, - int disable_cpu_flags, int benchmark_cpu_info, - int invert, int off) { +static int TestSobel(int width, + int height, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info, + int invert, + int off) { if (width < 1) { width = 1; } @@ -1681,20 +1642,17 @@ static int TestSobel(int width, int height, int benchmark_iterations, memset(dst_argb_opt, 0, kStride * height); MaskCpuFlags(disable_cpu_flags); - ARGBSobel(src_argb_a + off, kStride, - dst_argb_c, kStride, - width, invert * height); + ARGBSobel(src_argb_a + off, kStride, dst_argb_c, kStride, width, + invert * height); MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { - ARGBSobel(src_argb_a + off, kStride, - dst_argb_opt, kStride, - width, invert * height); + ARGBSobel(src_argb_a + off, kStride, dst_argb_opt, kStride, width, + invert * height); } int max_diff = 0; for (int i = 0; i < kStride * height; ++i) { - int abs_diff = - abs(static_cast<int>(dst_argb_c[i]) - - static_cast<int>(dst_argb_opt[i])); + int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - + static_cast<int>(dst_argb_opt[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } @@ -1706,36 +1664,40 @@ static int TestSobel(int width, int height, int benchmark_iterations, } TEST_F(LibYUVPlanarTest, ARGBSobel_Any) { - int max_diff = TestSobel(benchmark_width_ - 1, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 0); + int max_diff = + TestSobel(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBSobel_Unaligned) { - int max_diff = 
TestSobel(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 1); + int max_diff = + TestSobel(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 1); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBSobel_Invert) { - int max_diff = TestSobel(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, -1, 0); + int max_diff = + TestSobel(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, -1, 0); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBSobel_Opt) { - int max_diff = TestSobel(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 0); + int max_diff = + TestSobel(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_EQ(0, max_diff); } -static int TestSobelToPlane(int width, int height, int benchmark_iterations, - int disable_cpu_flags, int benchmark_cpu_info, - int invert, int off) { +static int TestSobelToPlane(int width, + int height, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info, + int invert, + int off) { if (width < 1) { width = 1; } @@ -1754,20 +1716,17 @@ static int TestSobelToPlane(int width, int height, int benchmark_iterations, memset(dst_argb_opt, 0, kDstStride * height); MaskCpuFlags(disable_cpu_flags); - ARGBSobelToPlane(src_argb_a + off, kSrcStride, - dst_argb_c, kDstStride, - width, invert * height); + ARGBSobelToPlane(src_argb_a + off, kSrcStride, dst_argb_c, kDstStride, width, + invert * height); MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { - ARGBSobelToPlane(src_argb_a + off, kSrcStride, - dst_argb_opt, kDstStride, + ARGBSobelToPlane(src_argb_a + off, kSrcStride, dst_argb_opt, kDstStride, width, invert * height); } int max_diff = 0; for (int i = 0; i < kDstStride * height; ++i) { - int abs_diff = - abs(static_cast<int>(dst_argb_c[i]) - - static_cast<int>(dst_argb_opt[i])); + int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - + static_cast<int>(dst_argb_opt[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } @@ -1780,39 +1739,39 @@ static int TestSobelToPlane(int width, int height, int benchmark_iterations, TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Any) { int max_diff = TestSobelToPlane(benchmark_width_ - 1, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - +1, 0); + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, +1, 0); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Unaligned) { int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - +1, 1); + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, +1, 1); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Invert) { int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - -1, 0); + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, -1, 0); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Opt) { int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - +1, 0); + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, +1, 
0); EXPECT_EQ(0, max_diff); } -static int TestSobelXY(int width, int height, int benchmark_iterations, - int disable_cpu_flags, int benchmark_cpu_info, - int invert, int off) { +static int TestSobelXY(int width, + int height, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info, + int invert, + int off) { if (width < 1) { width = 1; } @@ -1829,20 +1788,17 @@ static int TestSobelXY(int width, int height, int benchmark_iterations, memset(dst_argb_opt, 0, kStride * height); MaskCpuFlags(disable_cpu_flags); - ARGBSobelXY(src_argb_a + off, kStride, - dst_argb_c, kStride, - width, invert * height); + ARGBSobelXY(src_argb_a + off, kStride, dst_argb_c, kStride, width, + invert * height); MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { - ARGBSobelXY(src_argb_a + off, kStride, - dst_argb_opt, kStride, - width, invert * height); + ARGBSobelXY(src_argb_a + off, kStride, dst_argb_opt, kStride, width, + invert * height); } int max_diff = 0; for (int i = 0; i < kStride * height; ++i) { - int abs_diff = - abs(static_cast<int>(dst_argb_c[i]) - - static_cast<int>(dst_argb_opt[i])); + int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - + static_cast<int>(dst_argb_opt[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } @@ -1855,35 +1811,40 @@ static int TestSobelXY(int width, int height, int benchmark_iterations, TEST_F(LibYUVPlanarTest, ARGBSobelXY_Any) { int max_diff = TestSobelXY(benchmark_width_ - 1, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 0); + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, +1, 0); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBSobelXY_Unaligned) { - int max_diff = TestSobelXY(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 1); + int max_diff = + TestSobelXY(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 1); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBSobelXY_Invert) { - int max_diff = TestSobelXY(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, -1, 0); + int max_diff = + TestSobelXY(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, -1, 0); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBSobelXY_Opt) { - int max_diff = TestSobelXY(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 0); + int max_diff = + TestSobelXY(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_EQ(0, max_diff); } -static int TestBlur(int width, int height, int benchmark_iterations, - int disable_cpu_flags, int benchmark_cpu_info, - int invert, int off, int radius) { +static int TestBlur(int width, + int height, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info, + int invert, + int off, + int radius) { if (width < 1) { width = 1; } @@ -1901,22 +1862,19 @@ static int TestBlur(int width, int height, int benchmark_iterations, memset(dst_argb_opt, 0, kStride * height); MaskCpuFlags(disable_cpu_flags); - ARGBBlur(src_argb_a + off, kStride, - dst_argb_c, kStride, - reinterpret_cast<int32*>(dst_cumsum), width * 4, - width, invert * height, radius); + ARGBBlur(src_argb_a + off, kStride, dst_argb_c, kStride, + reinterpret_cast<int32*>(dst_cumsum), width * 4, width, + invert * height, radius); 
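The dst_cumsum buffer handed to ARGBBlur above is a cumulative-sum (summed-area) table: each entry holds the per-channel sum of all pixels above and to the left, so a box blur of any radius costs four table lookups per pixel instead of a scan of the whole window. A sketch of the lookup, assuming a single-channel table with an extra row and column of zeros (illustrative layout, not necessarily libyuv's internal one):

// sat[y * stride + x] = sum of src over the rectangle [0, x) x [0, y).
// The sum of the box [x0, x1) x [y0, y1) is then:
int32 BoxSum(const int32* sat, int stride, int x0, int y0, int x1, int y1) {
  return sat[y1 * stride + x1] - sat[y1 * stride + x0] -
         sat[y0 * stride + x1] + sat[y0 * stride + x0];
}

Dividing the box sum by its area yields the blurred pixel, which is why the radius-55 blur tested below (kBlurSize) is no more expensive per pixel than the radius-5 variant.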
MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { - ARGBBlur(src_argb_a + off, kStride, - dst_argb_opt, kStride, - reinterpret_cast<int32*>(dst_cumsum), width * 4, - width, invert * height, radius); + ARGBBlur(src_argb_a + off, kStride, dst_argb_opt, kStride, + reinterpret_cast<int32*>(dst_cumsum), width * 4, width, + invert * height, radius); } int max_diff = 0; for (int i = 0; i < kStride * height; ++i) { - int abs_diff = - abs(static_cast<int>(dst_argb_c[i]) - - static_cast<int>(dst_argb_opt[i])); + int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - + static_cast<int>(dst_argb_opt[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } @@ -1930,67 +1888,59 @@ static int TestBlur(int width, int height, int benchmark_iterations, static const int kBlurSize = 55; TEST_F(LibYUVPlanarTest, ARGBBlur_Any) { - int max_diff = TestBlur(benchmark_width_ - 1, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - +1, 0, kBlurSize); + int max_diff = + TestBlur(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSize); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBBlur_Unaligned) { - int max_diff = TestBlur(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - +1, 1, kBlurSize); + int max_diff = + TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 1, kBlurSize); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBBlur_Invert) { - int max_diff = TestBlur(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - -1, 0, kBlurSize); + int max_diff = + TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, -1, 0, kBlurSize); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBBlur_Opt) { - int max_diff = TestBlur(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - +1, 0, kBlurSize); + int max_diff = + TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSize); EXPECT_LE(max_diff, 1); } static const int kBlurSmallSize = 5; TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Any) { - int max_diff = TestBlur(benchmark_width_ - 1, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - +1, 0, kBlurSmallSize); + int max_diff = + TestBlur(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSmallSize); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Unaligned) { - int max_diff = TestBlur(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - +1, 1, kBlurSmallSize); + int max_diff = + TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 1, kBlurSmallSize); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Invert) { - int max_diff = TestBlur(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - -1, 0, kBlurSmallSize); + int max_diff = + TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, -1, 0, kBlurSmallSize); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Opt) { - 
int max_diff = TestBlur(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - +1, 0, kBlurSmallSize); + int max_diff = + TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSmallSize); EXPECT_LE(max_diff, 1); } @@ -2001,10 +1951,10 @@ TEST_F(LibYUVPlanarTest, TestARGBPolynomial) { memset(orig_pixels, 0, sizeof(orig_pixels)); SIMD_ALIGNED(static const float kWarmifyPolynomial[16]) = { - 0.94230f, -3.03300f, -2.92500f, 0.f, // C0 - 0.584500f, 1.112000f, 1.535000f, 1.f, // C1 x - 0.001313f, -0.002503f, -0.004496f, 0.f, // C2 x * x - 0.0f, 0.000006965f, 0.000008781f, 0.f, // C3 x * x * x + 0.94230f, -3.03300f, -2.92500f, 0.f, // C0 + 0.584500f, 1.112000f, 1.535000f, 1.f, // C1 x + 0.001313f, -0.002503f, -0.004496f, 0.f, // C2 x * x + 0.0f, 0.000006965f, 0.000008781f, 0.f, // C3 x * x * x }; // Test blue @@ -2081,6 +2031,139 @@ TEST_F(LibYUVPlanarTest, TestARGBPolynomial) { } } +int TestHalfFloatPlane(int benchmark_width, + int benchmark_height, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info, + float scale, + int mask) { + int i, j; + const int y_plane_size = benchmark_width * benchmark_height * 2; + + align_buffer_page_end(orig_y, y_plane_size * 3); + uint8* dst_opt = orig_y + y_plane_size; + uint8* dst_c = orig_y + y_plane_size * 2; + + MemRandomize(orig_y, y_plane_size); + memset(dst_c, 0, y_plane_size); + memset(dst_opt, 1, y_plane_size); + + for (i = 0; i < y_plane_size / 2; ++i) { + reinterpret_cast<uint16*>(orig_y)[i] &= mask; + } + + // Disable all optimizations. + MaskCpuFlags(disable_cpu_flags); + for (j = 0; j < benchmark_iterations; j++) { + HalfFloatPlane(reinterpret_cast<uint16*>(orig_y), benchmark_width * 2, + reinterpret_cast<uint16*>(dst_c), benchmark_width * 2, scale, + benchmark_width, benchmark_height); + } + + // Enable optimizations. + MaskCpuFlags(benchmark_cpu_info); + for (j = 0; j < benchmark_iterations; j++) { + HalfFloatPlane(reinterpret_cast<uint16*>(orig_y), benchmark_width * 2, + reinterpret_cast<uint16*>(dst_opt), benchmark_width * 2, + scale, benchmark_width, benchmark_height); + } + + int max_diff = 0; + for (i = 0; i < y_plane_size / 2; ++i) { + int abs_diff = abs(static_cast<int>(reinterpret_cast<uint16*>(dst_c)[i]) - + static_cast<int>(reinterpret_cast<uint16*>(dst_opt)[i])); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + + free_aligned_buffer_page_end(orig_y); + return max_diff; +} + +#if defined(__arm__) +static void EnableFlushDenormalToZero(void) { + uint32_t cw; + __asm__ __volatile__( + "vmrs %0, fpscr \n" + "orr %0, %0, #0x1000000 \n" + "vmsr fpscr, %0 \n" + : "=r"(cw)::"memory"); +} +#endif + +// A 5-bit exponent with a bias of 15 will underflow to a denormal if scale +// causes the exponent to be less than 0. 15 - log2(65536) = -1. This shouldn't +// normally happen since scale is 1/(1<<bits) where bits is 9, 10 or 12. + +TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_denormal) { +// 32-bit ARM rounding on the denormal case is off by 1 compared to C.
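To make the denormal case concrete: with scale = 1.0f / 65536.0f, an input of 1 maps to 2^-16. Half precision stores a 5-bit exponent with bias 15, so the smallest normal magnitude is 2^-14, and 2^-16 must be encoded as a denormal with value mantissa * 2^-24. A worked sketch of the expected bit pattern (illustrative only; the SIMD paths may round the low bit differently, which is what the ARM flush-to-zero workaround below sidesteps):

// 1 * (1.0f / 65536.0f) == 2^-16, below the 2^-14 normal minimum for half.
// Denormal halfs encode value = mantissa * 2^-24, so mantissa = 2^8:
uint16 expected_half = static_cast<uint16>((1 << 24) >> 16);  // 0x0100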
+#if defined(__arm__) + EnableFlushDenormalToZero(); +#endif + int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, 1.0f / 65536.0f, 65535); + EXPECT_EQ(0, diff); +} + +TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_One) { + int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, 1.0f, 65535); + EXPECT_LE(diff, 1); +} + +TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_Opt) { + int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, 1.0f / 4096.0f, 65535); + EXPECT_EQ(0, diff); +} + +TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_Opt) { + int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, 1.0f / 1024.0f, 1023); + EXPECT_EQ(0, diff); +} + +TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_9bit_Opt) { + int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, 1.0f / 512.0f, 511); + EXPECT_EQ(0, diff); +} + +TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) { + int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, 1.0f / 4096.0f, 4095); + EXPECT_EQ(0, diff); +} + +TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Offby1) { + int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, 1.0f / 4095.0f, 4095); + EXPECT_EQ(0, diff); +} + +TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_One) { + int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, 1.0f, 2047); + EXPECT_EQ(0, diff); +} + +TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_12bit_One) { + int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, 1.0f, 4095); + EXPECT_LE(diff, 1); +} + TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) { SIMD_ALIGNED(uint8 orig_pixels[1280][4]); SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]); @@ -2170,15 +2253,13 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyAlpha) { memcpy(dst_pixels_c, dst_pixels_opt, kSize); MaskCpuFlags(disable_cpu_flags_); - ARGBCopyAlpha(orig_pixels, benchmark_width_ * 4, - dst_pixels_c, benchmark_width_ * 4, - benchmark_width_, benchmark_height_); + ARGBCopyAlpha(orig_pixels, benchmark_width_ * 4, dst_pixels_c, + benchmark_width_ * 4, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { - ARGBCopyAlpha(orig_pixels, benchmark_width_ * 4, - dst_pixels_opt, benchmark_width_ * 4, - benchmark_width_, benchmark_height_); + ARGBCopyAlpha(orig_pixels, benchmark_width_ * 4, dst_pixels_opt, + benchmark_width_ * 4, benchmark_width_, benchmark_height_); } for (int i = 0; i < kSize; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); @@ -2200,15 +2281,13 @@ TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) { memcpy(dst_pixels_c, dst_pixels_opt, kPixels); MaskCpuFlags(disable_cpu_flags_); - ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, - dst_pixels_c, benchmark_width_, - benchmark_width_, benchmark_height_); + ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_c, + benchmark_width_, benchmark_width_, benchmark_height_); 
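ARGBExtractAlpha copies the alpha byte of every ARGB pixel into a standalone 8-bit plane, which is why the destination stride above is benchmark_width_ rather than benchmark_width_ * 4. A scalar reference for one row (libyuv's ARGB byte order in memory is B, G, R, A, so alpha is byte 3):

for (int x = 0; x < width; ++x) {
  dst_a[x] = src_argb[x * 4 + 3];  // pick out the A byte of each pixel
}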
MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { - ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, - dst_pixels_opt, benchmark_width_, - benchmark_width_, benchmark_height_); + ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_opt, + benchmark_width_, benchmark_width_, benchmark_height_); } for (int i = 0; i < kPixels; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); @@ -2230,15 +2309,13 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) { memcpy(dst_pixels_c, dst_pixels_opt, kPixels * 4); MaskCpuFlags(disable_cpu_flags_); - ARGBCopyYToAlpha(orig_pixels, benchmark_width_, - dst_pixels_c, benchmark_width_ * 4, - benchmark_width_, benchmark_height_); + ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_c, + benchmark_width_ * 4, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { - ARGBCopyYToAlpha(orig_pixels, benchmark_width_, - dst_pixels_opt, benchmark_width_ * 4, - benchmark_width_, benchmark_height_); + ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_opt, + benchmark_width_ * 4, benchmark_width_, benchmark_height_); } for (int i = 0; i < kPixels * 4; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); @@ -2249,9 +2326,14 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) { free_aligned_buffer_page_end(orig_pixels); } -static int TestARGBRect(int width, int height, int benchmark_iterations, - int disable_cpu_flags, int benchmark_cpu_info, - int invert, int off, int bpp) { +static int TestARGBRect(int width, + int height, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info, + int invert, + int off, + int bpp) { if (width < 1) { width = 1; } @@ -2282,9 +2364,8 @@ static int TestARGBRect(int width, int height, int benchmark_iterations, } int max_diff = 0; for (int i = 0; i < kStride * height; ++i) { - int abs_diff = - abs(static_cast<int>(dst_argb_c[i + off]) - - static_cast<int>(dst_argb_opt[i + off])); + int abs_diff = abs(static_cast<int>(dst_argb_c[i + off]) - + static_cast<int>(dst_argb_opt[i + off])); if (abs_diff > max_diff) { max_diff = abs_diff; } @@ -2296,66 +2377,145 @@ static int TestARGBRect(int width, int height, int benchmark_iterations, TEST_F(LibYUVPlanarTest, ARGBRect_Any) { int max_diff = TestARGBRect(benchmark_width_ - 1, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - +1, 0, 4); + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, +1, 0, 4); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBRect_Unaligned) { - int max_diff = TestARGBRect(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - +1, 1, 4); + int max_diff = + TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 1, 4); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBRect_Invert) { - int max_diff = TestARGBRect(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - -1, 0, 4); + int max_diff = + TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, -1, 0, 4); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBRect_Opt) { - int max_diff = TestARGBRect(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - +1, 0, 4); + int max_diff = + TestARGBRect(benchmark_width_, benchmark_height_, 
benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 4); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, SetPlane_Any) { int max_diff = TestARGBRect(benchmark_width_ - 1, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - +1, 0, 1); + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, +1, 0, 1); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, SetPlane_Unaligned) { - int max_diff = TestARGBRect(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - +1, 1, 1); + int max_diff = + TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 1, 1); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, SetPlane_Invert) { - int max_diff = TestARGBRect(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - -1, 0, 1); + int max_diff = + TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, -1, 0, 1); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, SetPlane_Opt) { - int max_diff = TestARGBRect(benchmark_width_, benchmark_height_, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, - +1, 0, 1); + int max_diff = + TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1); EXPECT_EQ(0, max_diff); } +TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) { + const int kPixels = benchmark_width_ * benchmark_height_; + align_buffer_page_end(src_pixels, kPixels * 2); + align_buffer_page_end(tmp_pixels_u, kPixels); + align_buffer_page_end(tmp_pixels_v, kPixels); + align_buffer_page_end(dst_pixels_opt, kPixels * 2); + align_buffer_page_end(dst_pixels_c, kPixels * 2); + + MemRandomize(src_pixels, kPixels * 2); + MemRandomize(tmp_pixels_u, kPixels); + MemRandomize(tmp_pixels_v, kPixels); + MemRandomize(dst_pixels_opt, kPixels * 2); + MemRandomize(dst_pixels_c, kPixels * 2); + + MaskCpuFlags(disable_cpu_flags_); + SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_, + tmp_pixels_v, benchmark_width_, benchmark_width_, + benchmark_height_); + MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_, + dst_pixels_c, benchmark_width_ * 2, benchmark_width_, + benchmark_height_); + MaskCpuFlags(benchmark_cpu_info_); + + SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_, + tmp_pixels_v, benchmark_width_, benchmark_width_, + benchmark_height_); + + for (int i = 0; i < benchmark_iterations_; ++i) { + MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_, + dst_pixels_opt, benchmark_width_ * 2, benchmark_width_, + benchmark_height_); + } + + for (int i = 0; i < kPixels * 2; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + } + + free_aligned_buffer_page_end(src_pixels); + free_aligned_buffer_page_end(tmp_pixels_u); + free_aligned_buffer_page_end(tmp_pixels_v); + free_aligned_buffer_page_end(dst_pixels_opt); + free_aligned_buffer_page_end(dst_pixels_c); +} + +TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) { + const int kPixels = benchmark_width_ * benchmark_height_; + align_buffer_page_end(src_pixels, kPixels * 2); + align_buffer_page_end(tmp_pixels_u, kPixels); + align_buffer_page_end(tmp_pixels_v, kPixels); + align_buffer_page_end(dst_pixels_opt, kPixels * 2); + align_buffer_page_end(dst_pixels_c, kPixels * 2); + + MemRandomize(src_pixels, kPixels * 2); 
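Both UV-plane tests form a round trip: SplitUVPlane deinterleaves a packed UVUVUV... buffer into separate U and V planes, and MergeUVPlane interleaves them back, so the C and optimized outputs can be compared on identical random input. A scalar sketch of one row in each direction (illustrative):

// Split: packed UV -> planar U and V.
for (int x = 0; x < width; ++x) {
  dst_u[x] = src_uv[x * 2 + 0];
  dst_v[x] = src_uv[x * 2 + 1];
}
// Merge: planar U and V -> packed UV.
for (int x = 0; x < width; ++x) {
  dst_uv[x * 2 + 0] = src_u[x];
  dst_uv[x * 2 + 1] = src_v[x];
}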
+ MemRandomize(tmp_pixels_u, kPixels); + MemRandomize(tmp_pixels_v, kPixels); + MemRandomize(dst_pixels_opt, kPixels * 2); + MemRandomize(dst_pixels_c, kPixels * 2); + + MaskCpuFlags(disable_cpu_flags_); + SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_, + tmp_pixels_v, benchmark_width_, benchmark_width_, + benchmark_height_); + MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_, + dst_pixels_c, benchmark_width_ * 2, benchmark_width_, + benchmark_height_); + MaskCpuFlags(benchmark_cpu_info_); + + for (int i = 0; i < benchmark_iterations_; ++i) { + SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, + benchmark_width_, tmp_pixels_v, benchmark_width_, + benchmark_width_, benchmark_height_); + } + MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_, + dst_pixels_opt, benchmark_width_ * 2, benchmark_width_, + benchmark_height_); + + for (int i = 0; i < kPixels * 2; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + } + + free_aligned_buffer_page_end(src_pixels); + free_aligned_buffer_page_end(tmp_pixels_u); + free_aligned_buffer_page_end(tmp_pixels_v); + free_aligned_buffer_page_end(dst_pixels_opt); + free_aligned_buffer_page_end(dst_pixels_c); +} + } // namespace libyuv diff --git a/files/unit_test/rotate_argb_test.cc b/files/unit_test/rotate_argb_test.cc index 9c83c356..d2003895 100644 --- a/files/unit_test/rotate_argb_test.cc +++ b/files/unit_test/rotate_argb_test.cc @@ -10,14 +10,16 @@ #include <stdlib.h> +#include "../unit_test/unit_test.h" #include "libyuv/cpu_id.h" #include "libyuv/rotate_argb.h" -#include "../unit_test/unit_test.h" namespace libyuv { -void TestRotateBpp(int src_width, int src_height, - int dst_width, int dst_height, +void TestRotateBpp(int src_width, + int src_height, + int dst_width, + int dst_height, libyuv::RotationMode mode, int benchmark_iterations, int disable_cpu_flags, @@ -51,26 +53,22 @@ void TestRotateBpp(int src_width, int src_height, if (kBpp == 1) { MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. - RotatePlane(src_argb, src_stride_argb, - dst_argb_c, dst_stride_argb, + RotatePlane(src_argb, src_stride_argb, dst_argb_c, dst_stride_argb, src_width, src_height, mode); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. for (int i = 0; i < benchmark_iterations; ++i) { - RotatePlane(src_argb, src_stride_argb, - dst_argb_opt, dst_stride_argb, + RotatePlane(src_argb, src_stride_argb, dst_argb_opt, dst_stride_argb, src_width, src_height, mode); } } else if (kBpp == 4) { MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. - ARGBRotate(src_argb, src_stride_argb, - dst_argb_c, dst_stride_argb, + ARGBRotate(src_argb, src_stride_argb, dst_argb_c, dst_stride_argb, src_width, src_height, mode); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. 
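TestRotateBpp shows the correctness harness shared by every test in these files: mask off all CPU feature flags so libyuv falls back to its portable C row functions, compute a reference result, then restore the benchmarked feature set (NEON, SSE2, MSA, DSPR2, ...) and run the optimized path for the requested number of iterations, finally comparing the two outputs. As a concrete sketch using the real CopyPlane API (buffer setup elided):

MaskCpuFlags(disable_cpu_flags);              // force the portable C path
CopyPlane(src, width, dst_c, width, width, height);
MaskCpuFlags(benchmark_cpu_info);             // re-enable SIMD paths
for (int i = 0; i < benchmark_iterations; ++i) {
  CopyPlane(src, width, dst_opt, width, width, height);  // doubles as the benchmark
}
for (int i = 0; i < width * height; ++i) {
  EXPECT_EQ(dst_c[i], dst_opt[i]);            // copy and rotate must match exactly
}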
for (int i = 0; i < benchmark_iterations; ++i) { - ARGBRotate(src_argb, src_stride_argb, - dst_argb_opt, dst_stride_argb, + ARGBRotate(src_argb, src_stride_argb, dst_argb_opt, dst_stride_argb, src_width, src_height, mode); } } @@ -85,112 +83,104 @@ void TestRotateBpp(int src_width, int src_height, free_aligned_buffer_page_end(src_argb); } -static void ARGBTestRotate(int src_width, int src_height, - int dst_width, int dst_height, +static void ARGBTestRotate(int src_width, + int src_height, + int dst_width, + int dst_height, libyuv::RotationMode mode, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info) { - TestRotateBpp(src_width, src_height, - dst_width, dst_height, - mode, benchmark_iterations, - disable_cpu_flags, benchmark_cpu_info, 4); + TestRotateBpp(src_width, src_height, dst_width, dst_height, mode, + benchmark_iterations, disable_cpu_flags, benchmark_cpu_info, 4); } TEST_F(LibYUVRotateTest, ARGBRotate0_Opt) { - ARGBTestRotate(benchmark_width_, benchmark_height_, - benchmark_width_, benchmark_height_, - kRotate0, benchmark_iterations_, + ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate0, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, ARGBRotate90_Opt) { - ARGBTestRotate(benchmark_width_, benchmark_height_, - benchmark_height_, benchmark_width_, - kRotate90, benchmark_iterations_, + ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate90, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, ARGBRotate180_Opt) { - ARGBTestRotate(benchmark_width_, benchmark_height_, - benchmark_width_, benchmark_height_, - kRotate180, benchmark_iterations_, + ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate180, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, ARGBRotate270_Opt) { - ARGBTestRotate(benchmark_width_, benchmark_height_, - benchmark_height_, benchmark_width_, - kRotate270, benchmark_iterations_, + ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate270, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } -static void TestRotatePlane(int src_width, int src_height, - int dst_width, int dst_height, +static void TestRotatePlane(int src_width, + int src_height, + int dst_width, + int dst_height, libyuv::RotationMode mode, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info) { - TestRotateBpp(src_width, src_height, - dst_width, dst_height, - mode, benchmark_iterations, - disable_cpu_flags, benchmark_cpu_info, 1); + TestRotateBpp(src_width, src_height, dst_width, dst_height, mode, + benchmark_iterations, disable_cpu_flags, benchmark_cpu_info, 1); } TEST_F(LibYUVRotateTest, RotatePlane0_Opt) { - TestRotatePlane(benchmark_width_, benchmark_height_, - benchmark_width_, benchmark_height_, - kRotate0, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_); + TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate0, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, RotatePlane90_Opt) { - TestRotatePlane(benchmark_width_, benchmark_height_, - benchmark_height_, benchmark_width_, - kRotate90, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_); + TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_height_, + 
benchmark_width_, kRotate90, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, RotatePlane180_Opt) { - TestRotatePlane(benchmark_width_, benchmark_height_, - benchmark_width_, benchmark_height_, - kRotate180, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_); + TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate180, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, RotatePlane270_Opt) { - TestRotatePlane(benchmark_width_, benchmark_height_, - benchmark_height_, benchmark_width_, - kRotate270, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_); + TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate270, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_RotatePlane0_Odd) { TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_width_ - 3, benchmark_height_ - 1, - kRotate0, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_); + benchmark_width_ - 3, benchmark_height_ - 1, kRotate0, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_RotatePlane90_Odd) { TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_height_ - 1, benchmark_width_ - 3, - kRotate90, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_); + benchmark_height_ - 1, benchmark_width_ - 3, kRotate90, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_RotatePlane180_Odd) { TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_width_ - 3, benchmark_height_ - 1, - kRotate180, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_); + benchmark_width_ - 3, benchmark_height_ - 1, kRotate180, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_RotatePlane270_Odd) { TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_height_ - 1, benchmark_width_ - 3, - kRotate270, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_); + benchmark_height_ - 1, benchmark_width_ - 3, kRotate270, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); } } // namespace libyuv diff --git a/files/unit_test/rotate_test.cc b/files/unit_test/rotate_test.cc index 07e2f73a..d04b96e9 100644 --- a/files/unit_test/rotate_test.cc +++ b/files/unit_test/rotate_test.cc @@ -10,17 +10,20 @@ #include <stdlib.h> +#include "../unit_test/unit_test.h" #include "libyuv/cpu_id.h" #include "libyuv/rotate.h" -#include "../unit_test/unit_test.h" namespace libyuv { -static void I420TestRotate(int src_width, int src_height, - int dst_width, int dst_height, +static void I420TestRotate(int src_width, + int src_height, + int dst_width, + int dst_height, libyuv::RotationMode mode, int benchmark_iterations, - int disable_cpu_flags, int benchmark_cpu_info) { + int disable_cpu_flags, + int benchmark_cpu_info) { if (src_width < 1) { src_width = 1; } @@ -50,26 +53,21 @@ static void I420TestRotate(int src_width, int src_height, memset(dst_i420_opt, 3, dst_i420_size); MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. 
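The pointer arithmetic in the I420Rotate calls below carves three I420 planes out of one contiguous allocation. The assumed layout, as a sketch (the elided src_i420_y_size / src_i420_uv_size computations above are taken to match):

// I420: full-resolution Y plane, then 2x2-subsampled U and V planes.
int y_size = width * height;
int uv_stride = (width + 1) / 2;                // round up for odd widths
int uv_size = uv_stride * ((height + 1) / 2);
uint8* y_plane = buffer;
uint8* u_plane = buffer + y_size;
uint8* v_plane = buffer + y_size + uv_size;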
- I420Rotate(src_i420, src_width, - src_i420 + src_i420_y_size, (src_width + 1) / 2, - src_i420 + src_i420_y_size + src_i420_uv_size, (src_width + 1) / 2, - dst_i420_c, dst_width, + I420Rotate(src_i420, src_width, src_i420 + src_i420_y_size, + (src_width + 1) / 2, src_i420 + src_i420_y_size + src_i420_uv_size, + (src_width + 1) / 2, dst_i420_c, dst_width, dst_i420_c + dst_i420_y_size, (dst_width + 1) / 2, dst_i420_c + dst_i420_y_size + dst_i420_uv_size, - (dst_width + 1) / 2, - src_width, src_height, mode); + (dst_width + 1) / 2, src_width, src_height, mode); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. for (int i = 0; i < benchmark_iterations; ++i) { - I420Rotate(src_i420, src_width, - src_i420 + src_i420_y_size, (src_width + 1) / 2, - src_i420 + src_i420_y_size + src_i420_uv_size, - (src_width + 1) / 2, - dst_i420_opt, dst_width, - dst_i420_opt + dst_i420_y_size, (dst_width + 1) / 2, - dst_i420_opt + dst_i420_y_size + dst_i420_uv_size, - (dst_width + 1) / 2, - src_width, src_height, mode); + I420Rotate( + src_i420, src_width, src_i420 + src_i420_y_size, (src_width + 1) / 2, + src_i420 + src_i420_y_size + src_i420_uv_size, (src_width + 1) / 2, + dst_i420_opt, dst_width, dst_i420_opt + dst_i420_y_size, + (dst_width + 1) / 2, dst_i420_opt + dst_i420_y_size + dst_i420_uv_size, + (dst_width + 1) / 2, src_width, src_height, mode); } // Rotation should be exact. @@ -83,30 +81,26 @@ static void I420TestRotate(int src_width, int src_height, } TEST_F(LibYUVRotateTest, I420Rotate0_Opt) { - I420TestRotate(benchmark_width_, benchmark_height_, - benchmark_width_, benchmark_height_, - kRotate0, benchmark_iterations_, + I420TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate0, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, I420Rotate90_Opt) { - I420TestRotate(benchmark_width_, benchmark_height_, - benchmark_height_, benchmark_width_, - kRotate90, benchmark_iterations_, + I420TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate90, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, I420Rotate180_Opt) { - I420TestRotate(benchmark_width_, benchmark_height_, - benchmark_width_, benchmark_height_, - kRotate180, benchmark_iterations_, + I420TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate180, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, I420Rotate270_Opt) { - I420TestRotate(benchmark_width_, benchmark_height_, - benchmark_height_, benchmark_width_, - kRotate270, benchmark_iterations_, + I420TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate270, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } @@ -115,37 +109,40 @@ TEST_F(LibYUVRotateTest, I420Rotate270_Opt) { // tested by passing an odd width command line or environment variable. 
TEST_F(LibYUVRotateTest, DISABLED_I420Rotate0_Odd) { I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_width_ - 3, benchmark_height_ - 1, - kRotate0, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_); + benchmark_width_ - 3, benchmark_height_ - 1, kRotate0, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_I420Rotate90_Odd) { I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_height_ - 1, benchmark_width_ - 3, - kRotate90, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_); + benchmark_height_ - 1, benchmark_width_ - 3, kRotate90, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_I420Rotate180_Odd) { I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_width_ - 3, benchmark_height_ - 1, - kRotate180, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_); + benchmark_width_ - 3, benchmark_height_ - 1, kRotate180, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_I420Rotate270_Odd) { I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_height_ - 1, benchmark_width_ - 3, - kRotate270, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_); + benchmark_height_ - 1, benchmark_width_ - 3, kRotate270, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); } -static void NV12TestRotate(int src_width, int src_height, - int dst_width, int dst_height, +static void NV12TestRotate(int src_width, + int src_height, + int dst_width, + int dst_height, libyuv::RotationMode mode, int benchmark_iterations, - int disable_cpu_flags, int benchmark_cpu_info) { + int disable_cpu_flags, + int benchmark_cpu_info) { if (src_width < 1) { src_width = 1; } @@ -176,23 +173,19 @@ static void NV12TestRotate(int src_width, int src_height, memset(dst_i420_opt, 3, dst_i420_size); MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. - NV12ToI420Rotate(src_nv12, src_width, - src_nv12 + src_nv12_y_size, (src_width + 1) & ~1, - dst_i420_c, dst_width, + NV12ToI420Rotate(src_nv12, src_width, src_nv12 + src_nv12_y_size, + (src_width + 1) & ~1, dst_i420_c, dst_width, dst_i420_c + dst_i420_y_size, (dst_width + 1) / 2, dst_i420_c + dst_i420_y_size + dst_i420_uv_size, - (dst_width + 1) / 2, - src_width, src_height, mode); + (dst_width + 1) / 2, src_width, src_height, mode); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. for (int i = 0; i < benchmark_iterations; ++i) { - NV12ToI420Rotate(src_nv12, src_width, - src_nv12 + src_nv12_y_size, (src_width + 1) & ~1, - dst_i420_opt, dst_width, + NV12ToI420Rotate(src_nv12, src_width, src_nv12 + src_nv12_y_size, + (src_width + 1) & ~1, dst_i420_opt, dst_width, dst_i420_opt + dst_i420_y_size, (dst_width + 1) / 2, dst_i420_opt + dst_i420_y_size + dst_i420_uv_size, - (dst_width + 1) / 2, - src_width, src_height, mode); + (dst_width + 1) / 2, src_width, src_height, mode); } // Rotation should be exact. 
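NV12TestRotate uses the same harness with a two-plane source: NV12 is a full-size Y plane followed by one interleaved UV plane at half vertical resolution. That is why the source chroma stride is (src_width + 1) & ~1 (rounded up to even, two bytes per UV pair) while the planar I420 destination uses (dst_width + 1) / 2 per chroma plane. As a sketch:

// NV12: Y plane, then interleaved UVUVUV... rows at half height.
int y_size = width * height;
int uv_stride = (width + 1) & ~1;   // one U and one V byte per chroma sample pair
uint8* y_plane = buffer;
uint8* uv_plane = buffer + y_size;  // ((height + 1) / 2) rows of uv_stride bytes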
@@ -206,91 +199,79 @@ static void NV12TestRotate(int src_width, int src_height, } TEST_F(LibYUVRotateTest, NV12Rotate0_Opt) { - NV12TestRotate(benchmark_width_, benchmark_height_, - benchmark_width_, benchmark_height_, - kRotate0, benchmark_iterations_, + NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate0, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, NV12Rotate90_Opt) { - NV12TestRotate(benchmark_width_, benchmark_height_, - benchmark_height_, benchmark_width_, - kRotate90, benchmark_iterations_, + NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate90, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, NV12Rotate180_Opt) { - NV12TestRotate(benchmark_width_, benchmark_height_, - benchmark_width_, benchmark_height_, - kRotate180, benchmark_iterations_, + NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate180, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, NV12Rotate270_Opt) { - NV12TestRotate(benchmark_width_, benchmark_height_, - benchmark_height_, benchmark_width_, - kRotate270, benchmark_iterations_, + NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate270, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate0_Odd) { NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_width_ - 3, benchmark_height_ - 1, - kRotate0, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_); + benchmark_width_ - 3, benchmark_height_ - 1, kRotate0, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate90_Odd) { NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_height_ - 1, benchmark_width_ - 3, - kRotate90, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_); + benchmark_height_ - 1, benchmark_width_ - 3, kRotate90, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate180_Odd) { NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_width_ - 3, benchmark_height_ - 1, - kRotate180, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_); + benchmark_width_ - 3, benchmark_height_ - 1, kRotate180, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate270_Odd) { NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_height_ - 1, benchmark_width_ - 3, - kRotate270, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_); + benchmark_height_ - 1, benchmark_width_ - 3, kRotate270, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, NV12Rotate0_Invert) { - NV12TestRotate(benchmark_width_, -benchmark_height_, - benchmark_width_, benchmark_height_, - kRotate0, benchmark_iterations_, + NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_width_, + benchmark_height_, kRotate0, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, NV12Rotate90_Invert) { - NV12TestRotate(benchmark_width_, -benchmark_height_, - benchmark_height_, benchmark_width_, - kRotate90, benchmark_iterations_, + NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_height_, + 
benchmark_width_, kRotate90, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, NV12Rotate180_Invert) { - NV12TestRotate(benchmark_width_, -benchmark_height_, - benchmark_width_, benchmark_height_, - kRotate180, benchmark_iterations_, + NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_width_, + benchmark_height_, kRotate180, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) { - NV12TestRotate(benchmark_width_, -benchmark_height_, - benchmark_height_, benchmark_width_, - kRotate270, benchmark_iterations_, + NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_height_, + benchmark_width_, kRotate270, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } - - - - } // namespace libyuv diff --git a/files/unit_test/scale_argb_test.cc b/files/unit_test/scale_argb_test.cc index f99782f7..d11aec20 100644 --- a/files/unit_test/scale_argb_test.cc +++ b/files/unit_test/scale_argb_test.cc @@ -11,11 +11,11 @@ #include <stdlib.h> #include <time.h> +#include "../unit_test/unit_test.h" #include "libyuv/convert_argb.h" #include "libyuv/cpu_id.h" #include "libyuv/scale_argb.h" #include "libyuv/video_common.h" -#include "../unit_test/unit_test.h" namespace libyuv { @@ -23,18 +23,22 @@ namespace libyuv { #define FILELINESTR(file, line) file ":" STRINGIZE(line) // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. -static int ARGBTestFilter(int src_width, int src_height, - int dst_width, int dst_height, - FilterMode f, int benchmark_iterations, - int disable_cpu_flags, int benchmark_cpu_info) { +static int ARGBTestFilter(int src_width, + int src_height, + int dst_width, + int dst_height, + FilterMode f, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { if (!SizeValid(src_width, src_height, dst_width, dst_height)) { return 0; } int i, j; const int b = 0; // 128 to test for padding/stride. - int64 src_argb_plane_size = (Abs(src_width) + b * 2) * - (Abs(src_height) + b * 2) * 4LL; + int64 src_argb_plane_size = + (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2) * 4LL; int src_stride_argb = (b * 2 + Abs(src_width)) * 4; align_buffer_page_end(src_argb, src_argb_plane_size); @@ -59,21 +63,18 @@ static int ARGBTestFilter(int src_width, int src_height, // Warm up both versions for consistent benchmarks. MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb, - src_width, src_height, - dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb, - dst_width, dst_height, f); + src_width, src_height, dst_argb_c + (dst_stride_argb * b) + b * 4, + dst_stride_argb, dst_width, dst_height, f); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb, - src_width, src_height, - dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb, - dst_width, dst_height, f); + src_width, src_height, dst_argb_opt + (dst_stride_argb * b) + b * 4, + dst_stride_argb, dst_width, dst_height, f); MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. 
double c_time = get_time(); ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb, - src_width, src_height, - dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb, - dst_width, dst_height, f); + src_width, src_height, dst_argb_c + (dst_stride_argb * b) + b * 4, + dst_stride_argb, dst_width, dst_height, f); c_time = (get_time() - c_time); @@ -88,8 +89,8 @@ static int ARGBTestFilter(int src_width, int src_height, opt_time = (get_time() - opt_time) / benchmark_iterations; // Report performance of C vs OPT - printf("filter %d - %8d us C - %8d us OPT\n", - f, static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6)); + printf("filter %d - %8d us C - %8d us OPT\n", f, + static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6)); // C version may be a little off from the optimized. Order of // operations may introduce rounding somewhere. So do a difference @@ -115,10 +116,14 @@ static int ARGBTestFilter(int src_width, int src_height, static const int kTileX = 8; static const int kTileY = 8; -static int TileARGBScale(const uint8* src_argb, int src_stride_argb, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, - int dst_width, int dst_height, +static int TileARGBScale(const uint8* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, FilterMode filtering) { for (int y = 0; y < dst_height; y += kTileY) { for (int x = 0; x < dst_width; x += kTileX) { @@ -130,11 +135,9 @@ static int TileARGBScale(const uint8* src_argb, int src_stride_argb, if (y + clip_height > dst_height) { clip_height = dst_height - y; } - int r = ARGBScaleClip(src_argb, src_stride_argb, - src_width, src_height, - dst_argb, dst_stride_argb, - dst_width, dst_height, - x, y, clip_width, clip_height, filtering); + int r = ARGBScaleClip(src_argb, src_stride_argb, src_width, src_height, + dst_argb, dst_stride_argb, dst_width, dst_height, x, + y, clip_width, clip_height, filtering); if (r) { return r; } @@ -143,16 +146,19 @@ static int TileARGBScale(const uint8* src_argb, int src_stride_argb, return 0; } -static int ARGBClipTestFilter(int src_width, int src_height, - int dst_width, int dst_height, - FilterMode f, int benchmark_iterations) { +static int ARGBClipTestFilter(int src_width, + int src_height, + int dst_width, + int dst_height, + FilterMode f, + int benchmark_iterations) { if (!SizeValid(src_width, src_height, dst_width, dst_height)) { return 0; } const int b = 128; - int64 src_argb_plane_size = (Abs(src_width) + b * 2) * - (Abs(src_height) + b * 2) * 4; + int64 src_argb_plane_size = + (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2) * 4; int src_stride_argb = (b * 2 + Abs(src_width)) * 4; align_buffer_page_end(src_argb, src_argb_plane_size); @@ -184,9 +190,8 @@ static int ARGBClipTestFilter(int src_width, int src_height, // Do full image, no clipping. double c_time = get_time(); ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb, - src_width, src_height, - dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb, - dst_width, dst_height, f); + src_width, src_height, dst_argb_c + (dst_stride_argb * b) + b * 4, + dst_stride_argb, dst_width, dst_height, f); c_time = (get_time() - c_time); // Do tiled image, clipping scale to a tile at a time. @@ -200,8 +205,8 @@ static int ARGBClipTestFilter(int src_width, int src_height, opt_time = (get_time() - opt_time) / benchmark_iterations; // Report performance of Full vs Tiled. 
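TileARGBScale above relies on a property of ARGBScaleClip that is easy to miss: the scale mapping is defined by the full source and destination sizes, and the clip rectangle only restricts which destination pixels get written. Scaling tile by tile therefore stays consistent with one full-frame ARGBScale, within the small per-pixel tolerance the filtered modes are allowed. A hedged usage sketch, with illustrative sizes, that computes a single 8x8 tile of a 640x360 output:

#include <stdint.h>
#include <vector>
#include "libyuv/scale_argb.h"

// Only the 8x8 destination tile at (8, 8) is computed; every other output
// pixel is left untouched.
void ScaleOneTile(const uint8_t* src_argb, int src_w, int src_h) {
  const int dst_w = 640, dst_h = 360;
  std::vector<uint8_t> dst(dst_w * dst_h * 4);
  libyuv::ARGBScaleClip(src_argb, src_w * 4, src_w, src_h,
                        dst.data(), dst_w * 4, dst_w, dst_h,
                        8, 8, 8, 8,  // clip_x, clip_y, clip_width, clip_height
                        libyuv::kFilterBilinear);
}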
- printf("filter %d - %8d us Full - %8d us Tiled\n", - f, static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6)); + printf("filter %d - %8d us Full - %8d us Tiled\n", f, + static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6)); // Compare full scaled image vs tiled image. int max_diff = 0; @@ -226,32 +231,30 @@ static int ARGBClipTestFilter(int src_width, int src_height, #define DX(x, nom, denom) static_cast<int>((Abs(x) / nom) * nom) #define SX(x, nom, denom) static_cast<int>((x / nom) * denom) -#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ - TEST_F(LibYUVScaleTest, ARGBScaleDownBy##name##_##filter) { \ - int diff = ARGBTestFilter(SX(benchmark_width_, nom, denom), \ - SX(benchmark_height_, nom, denom), \ - DX(benchmark_width_, nom, denom), \ - DX(benchmark_height_, nom, denom), \ - kFilter##filter, benchmark_iterations_, \ - disable_cpu_flags_, benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ - } \ - TEST_F(LibYUVScaleTest, ARGBScaleDownClipBy##name##_##filter) { \ - int diff = ARGBClipTestFilter(SX(benchmark_width_, nom, denom), \ - SX(benchmark_height_, nom, denom), \ - DX(benchmark_width_, nom, denom), \ - DX(benchmark_height_, nom, denom), \ - kFilter##filter, benchmark_iterations_); \ - EXPECT_LE(diff, max_diff); \ - } +#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ + TEST_F(LibYUVScaleTest, ARGBScaleDownBy##name##_##filter) { \ + int diff = ARGBTestFilter( \ + SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ + DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ + kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, ARGBScaleDownClipBy##name##_##filter) { \ + int diff = ARGBClipTestFilter( \ + SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ + DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ + kFilter##filter, benchmark_iterations_); \ + EXPECT_LE(diff, max_diff); \ + } // Test a scale factor with all 4 filters. Expect unfiltered to be exact, but // filtering is different fixed point implementations for SSSE3, Neon and C. 
-#define TEST_FACTOR(name, nom, denom) \ - TEST_FACTOR1(name, None, nom, denom, 0) \ - TEST_FACTOR1(name, Linear, nom, denom, 3) \ - TEST_FACTOR1(name, Bilinear, nom, denom, 3) \ - TEST_FACTOR1(name, Box, nom, denom, 3) +#define TEST_FACTOR(name, nom, denom) \ + TEST_FACTOR1(name, None, nom, denom, 0) \ + TEST_FACTOR1(name, Linear, nom, denom, 3) \ + TEST_FACTOR1(name, Bilinear, nom, denom, 3) \ + TEST_FACTOR1(name, Box, nom, denom, 3) TEST_FACTOR(2, 1, 2) TEST_FACTOR(4, 1, 4) @@ -265,39 +268,37 @@ TEST_FACTOR(3, 1, 3) #undef DX #define TEST_SCALETO1(name, width, height, filter, max_diff) \ - TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \ - int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, \ - width, height, \ - kFilter##filter, benchmark_iterations_, \ - disable_cpu_flags_, benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ - } \ - TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \ - int diff = ARGBTestFilter(width, height, \ - Abs(benchmark_width_), Abs(benchmark_height_), \ - kFilter##filter, benchmark_iterations_, \ - disable_cpu_flags_, benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ - } \ - TEST_F(LibYUVScaleTest, name##ClipTo##width##x##height##_##filter) { \ - int diff = ARGBClipTestFilter(benchmark_width_, benchmark_height_, \ - width, height, \ - kFilter##filter, benchmark_iterations_); \ - EXPECT_LE(diff, max_diff); \ - } \ - TEST_F(LibYUVScaleTest, name##ClipFrom##width##x##height##_##filter) { \ - int diff = ARGBClipTestFilter(width, height, \ - Abs(benchmark_width_), \ - Abs(benchmark_height_), \ - kFilter##filter, benchmark_iterations_); \ - EXPECT_LE(diff, max_diff); \ - } + TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \ + int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, width, \ + height, kFilter##filter, benchmark_iterations_, \ + disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \ + int diff = ARGBTestFilter(width, height, Abs(benchmark_width_), \ + Abs(benchmark_height_), kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, name##ClipTo##width##x##height##_##filter) { \ + int diff = \ + ARGBClipTestFilter(benchmark_width_, benchmark_height_, width, height, \ + kFilter##filter, benchmark_iterations_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, name##ClipFrom##width##x##height##_##filter) { \ + int diff = ARGBClipTestFilter(width, height, Abs(benchmark_width_), \ + Abs(benchmark_height_), kFilter##filter, \ + benchmark_iterations_); \ + EXPECT_LE(diff, max_diff); \ + } /// Test scale to a specified size with all 4 filters. -#define TEST_SCALETO(name, width, height) \ - TEST_SCALETO1(name, width, height, None, 0) \ - TEST_SCALETO1(name, width, height, Linear, 3) \ - TEST_SCALETO1(name, width, height, Bilinear, 3) +#define TEST_SCALETO(name, width, height) \ + TEST_SCALETO1(name, width, height, None, 0) \ + TEST_SCALETO1(name, width, height, Linear, 3) \ + TEST_SCALETO1(name, width, height, Bilinear, 3) TEST_SCALETO(ARGBScale, 1, 1) TEST_SCALETO(ARGBScale, 320, 240) @@ -310,31 +311,33 @@ TEST_SCALETO(ARGBScale, 1280, 720) // Scale with YUV conversion to ARGB and clipping. 
LIBYUV_API -int YUVToARGBScaleReference2(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint32 src_fourcc, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, - uint32 dst_fourcc, - int dst_width, int dst_height, - int clip_x, int clip_y, - int clip_width, int clip_height, +int YUVToARGBScaleReference2(const uint8* src_y, + int src_stride_y, + const uint8* src_u, + int src_stride_u, + const uint8* src_v, + int src_stride_v, + uint32 /* src_fourcc */, // TODO: Add support. + int src_width, + int src_height, + uint8* dst_argb, + int dst_stride_argb, + uint32 /* dst_fourcc */, // TODO: Add support. + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, enum FilterMode filtering) { uint8* argb_buffer = static_cast<uint8*>(malloc(src_width * src_height * 4)); int r; - I420ToARGB(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - argb_buffer, src_width * 4, - src_width, src_height); - - r = ARGBScaleClip(argb_buffer, src_width * 4, - src_width, src_height, - dst_argb, dst_stride_argb, - dst_width, dst_height, - clip_x, clip_y, clip_width, clip_height, - filtering); + I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + argb_buffer, src_width * 4, src_width, src_height); + + r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, clip_x, clip_y, + clip_width, clip_height, filtering); free(argb_buffer); return r; } @@ -360,13 +363,15 @@ static void FillRamp(uint8* buf, int width, int height, int v, int dx, int dy) { } // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. -static int YUVToARGBTestFilter(int src_width, int src_height, - int dst_width, int dst_height, - FilterMode f, int benchmark_iterations, - int disable_cpu_flags, int benchmark_cpu_info) { +static int YUVToARGBTestFilter(int src_width, + int src_height, + int dst_width, + int dst_height, + FilterMode f, + int benchmark_iterations) { int64 src_y_plane_size = Abs(src_width) * Abs(src_height); - int64 src_uv_plane_size = ((Abs(src_width) + 1) / 2) * - ((Abs(src_height) + 1) / 2); + int64 src_uv_plane_size = + ((Abs(src_width) + 1) / 2) * ((Abs(src_height) + 1) / 2); int src_stride_y = Abs(src_width); int src_stride_uv = (Abs(src_width) + 1) / 2; @@ -374,8 +379,8 @@ static int YUVToARGBTestFilter(int src_width, int src_height, align_buffer_page_end(src_u, src_uv_plane_size); align_buffer_page_end(src_v, src_uv_plane_size); - int64 dst_argb_plane_size = (dst_width) * (dst_height) * 4LL; - int dst_stride_argb = (dst_width) * 4; + int64 dst_argb_plane_size = (dst_width) * (dst_height)*4LL; + int dst_stride_argb = (dst_width)*4; align_buffer_page_end(dst_argb_c, dst_argb_plane_size); align_buffer_page_end(dst_argb_opt, dst_argb_plane_size); if (!dst_argb_c || !dst_argb_opt || !src_y || !src_u || !src_v) { @@ -390,28 +395,18 @@ static int YUVToARGBTestFilter(int src_width, int src_height, memset(dst_argb_c, 2, dst_argb_plane_size); memset(dst_argb_opt, 3, dst_argb_plane_size); - YUVToARGBScaleReference2(src_y, src_stride_y, - src_u, src_stride_uv, - src_v, src_stride_uv, - libyuv::FOURCC_I420, - src_width, src_height, - dst_argb_c, dst_stride_argb, - libyuv::FOURCC_I420, - dst_width, dst_height, - 0, 0, dst_width, dst_height, - f); + YUVToARGBScaleReference2(src_y, src_stride_y, src_u, src_stride_uv, src_v, + src_stride_uv, libyuv::FOURCC_I420, src_width, + src_height, 
dst_argb_c, dst_stride_argb, + libyuv::FOURCC_I420, dst_width, dst_height, 0, 0, + dst_width, dst_height, f); for (int i = 0; i < benchmark_iterations; ++i) { - YUVToARGBScaleClip(src_y, src_stride_y, - src_u, src_stride_uv, - src_v, src_stride_uv, - libyuv::FOURCC_I420, - src_width, src_height, - dst_argb_opt, dst_stride_argb, - libyuv::FOURCC_I420, - dst_width, dst_height, - 0, 0, dst_width, dst_height, - f); + YUVToARGBScaleClip(src_y, src_stride_y, src_u, src_stride_uv, src_v, + src_stride_uv, libyuv::FOURCC_I420, src_width, + src_height, dst_argb_opt, dst_stride_argb, + libyuv::FOURCC_I420, dst_width, dst_height, 0, 0, + dst_width, dst_height, f); } int max_diff = 0; for (int i = 0; i < dst_height; ++i) { @@ -419,9 +414,7 @@ static int YUVToARGBTestFilter(int src_width, int src_height, int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] - dst_argb_opt[(i * dst_stride_argb) + j]); if (abs_diff > max_diff) { - printf("error %d at %d,%d c %d opt %d", - abs_diff, - j, i, + printf("error %d at %d,%d c %d opt %d", abs_diff, j, i, dst_argb_c[(i * dst_stride_argb) + j], dst_argb_opt[(i * dst_stride_argb) + j]); EXPECT_LE(abs_diff, 40); @@ -439,24 +432,18 @@ static int YUVToARGBTestFilter(int src_width, int src_height, } TEST_F(LibYUVScaleTest, YUVToRGBScaleUp) { - int diff = YUVToARGBTestFilter(benchmark_width_, benchmark_height_, - benchmark_width_ * 3 / 2, - benchmark_height_ * 3 / 2, - libyuv::kFilterBilinear, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_); + int diff = + YUVToARGBTestFilter(benchmark_width_, benchmark_height_, + benchmark_width_ * 3 / 2, benchmark_height_ * 3 / 2, + libyuv::kFilterBilinear, benchmark_iterations_); EXPECT_LE(diff, 10); } TEST_F(LibYUVScaleTest, YUVToRGBScaleDown) { - int diff = YUVToARGBTestFilter(benchmark_width_ * 3 / 2, - benchmark_height_ * 3 / 2, - benchmark_width_, benchmark_height_, - libyuv::kFilterBilinear, - benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_); + int diff = YUVToARGBTestFilter( + benchmark_width_ * 3 / 2, benchmark_height_ * 3 / 2, benchmark_width_, + benchmark_height_, libyuv::kFilterBilinear, benchmark_iterations_); EXPECT_LE(diff, 10); } - } // namespace libyuv diff --git a/files/unit_test/scale_test.cc b/files/unit_test/scale_test.cc index f40443e2..0b4ec30b 100644 --- a/files/unit_test/scale_test.cc +++ b/files/unit_test/scale_test.cc @@ -11,9 +11,9 @@ #include <stdlib.h> #include <time.h> +#include "../unit_test/unit_test.h" #include "libyuv/cpu_id.h" #include "libyuv/scale.h" -#include "../unit_test/unit_test.h" #define STRINGIZE(line) #line #define FILELINESTR(file, line) file ":" STRINGIZE(line) @@ -21,10 +21,14 @@ namespace libyuv { // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. 
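The reference path shown above is deliberately naive: I420ToARGB converts the whole frame into a temporary buffer, then ARGBScaleClip scales it with a clip spanning the entire destination. The fused YUVToARGBScaleClip under test interleaves scaling with conversion, so its rounding can differ, which is why the callers compare against the reference with a tolerance (EXPECT_LE(diff, 10)) rather than demanding an exact match. The same two-step reference, condensed into a sketch with bilinear filtering assumed and error handling elided:

#include <stdint.h>
#include <stdlib.h>
#include "libyuv/convert_argb.h"
#include "libyuv/scale_argb.h"

int ScaleI420ToARGBReference(const uint8_t* y, int y_stride,
                             const uint8_t* u, int u_stride,
                             const uint8_t* v, int v_stride, int sw, int sh,
                             uint8_t* dst, int dst_stride, int dw, int dh) {
  // Step 1: full-frame color conversion into a temporary ARGB buffer.
  uint8_t* argb = static_cast<uint8_t*>(malloc(sw * sh * 4));
  libyuv::I420ToARGB(y, y_stride, u, u_stride, v, v_stride,
                     argb, sw * 4, sw, sh);
  // Step 2: scale, with the clip rectangle covering the whole destination.
  int r = libyuv::ARGBScaleClip(argb, sw * 4, sw, sh, dst, dst_stride, dw, dh,
                                0, 0, dw, dh, libyuv::kFilterBilinear);
  free(argb);
  return r;
}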
-static int TestFilter(int src_width, int src_height, - int dst_width, int dst_height, - FilterMode f, int benchmark_iterations, - int disable_cpu_flags, int benchmark_cpu_info) { +static int TestFilter(int src_width, + int src_height, + int dst_width, + int dst_height, + FilterMode f, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { if (!SizeValid(src_width, src_height, dst_width, dst_height)) { return 0; } @@ -41,9 +45,8 @@ static int TestFilter(int src_width, int src_height, int src_stride_uv = b * 2 + src_width_uv; align_buffer_page_end(src_y, src_y_plane_size) - align_buffer_page_end(src_u, src_uv_plane_size) - align_buffer_page_end(src_v, src_uv_plane_size) - if (!src_y || !src_u || !src_v) { + align_buffer_page_end(src_u, src_uv_plane_size) align_buffer_page_end( + src_v, src_uv_plane_size) if (!src_y || !src_u || !src_v) { printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } @@ -61,13 +64,15 @@ static int TestFilter(int src_width, int src_height, int dst_stride_uv = b * 2 + dst_width_uv; align_buffer_page_end(dst_y_c, dst_y_plane_size) - align_buffer_page_end(dst_u_c, dst_uv_plane_size) - align_buffer_page_end(dst_v_c, dst_uv_plane_size) - align_buffer_page_end(dst_y_opt, dst_y_plane_size) - align_buffer_page_end(dst_u_opt, dst_uv_plane_size) - align_buffer_page_end(dst_v_opt, dst_uv_plane_size) - if (!dst_y_c || !dst_u_c || !dst_v_c || - !dst_y_opt|| !dst_u_opt|| !dst_v_opt) { + align_buffer_page_end(dst_u_c, dst_uv_plane_size) + align_buffer_page_end(dst_v_c, dst_uv_plane_size) + align_buffer_page_end(dst_y_opt, dst_y_plane_size) + align_buffer_page_end(dst_u_opt, dst_uv_plane_size) + align_buffer_page_end( + dst_v_opt, + dst_uv_plane_size) if (!dst_y_c || !dst_u_c || + !dst_v_c || !dst_y_opt || + !dst_u_opt || !dst_v_opt) { printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } @@ -76,12 +81,11 @@ static int TestFilter(int src_width, int src_height, double c_time = get_time(); I420Scale(src_y + (src_stride_y * b) + b, src_stride_y, src_u + (src_stride_uv * b) + b, src_stride_uv, - src_v + (src_stride_uv * b) + b, src_stride_uv, - src_width, src_height, - dst_y_c + (dst_stride_y * b) + b, dst_stride_y, + src_v + (src_stride_uv * b) + b, src_stride_uv, src_width, + src_height, dst_y_c + (dst_stride_y * b) + b, dst_stride_y, dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv, - dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv, - dst_width, dst_height, f); + dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv, dst_width, + dst_height, f); c_time = (get_time() - c_time); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. 
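A formatting side effect is visible in this hunk: align_buffer_page_end is a statement-like macro invoked without a trailing semicolon, so clang-format cannot tell where one statement ends and runs consecutive invocations (and the following if) together on shared lines. Where a macro does not need to introduce declarations, the usual cure is the do/while(0) idiom so every call site ends in a semicolon. A sketch of that alternative, which is not what this CL does, since the CL only reformats:

// Hypothetical variant of the suite's free macro; the real allocation macro
// must declare var and var##_mem, so it cannot be wrapped this way.
#define free_aligned_buffer_page_end_v2(var) \
  do {                                       \
    free(var##_mem);                         \
    var = 0;                                 \
  } while (0)
// Call sites then parse as ordinary statements:
//   free_aligned_buffer_page_end_v2(dst_y_c);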
@@ -89,19 +93,16 @@ static int TestFilter(int src_width, int src_height, for (i = 0; i < benchmark_iterations; ++i) { I420Scale(src_y + (src_stride_y * b) + b, src_stride_y, src_u + (src_stride_uv * b) + b, src_stride_uv, - src_v + (src_stride_uv * b) + b, src_stride_uv, - src_width, src_height, - dst_y_opt + (dst_stride_y * b) + b, dst_stride_y, + src_v + (src_stride_uv * b) + b, src_stride_uv, src_width, + src_height, dst_y_opt + (dst_stride_y * b) + b, dst_stride_y, dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv, - dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv, - dst_width, dst_height, f); + dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv, dst_width, + dst_height, f); } opt_time = (get_time() - opt_time) / benchmark_iterations; // Report performance of C vs OPT - printf("filter %d - %8d us C - %8d us OPT\n", - f, - static_cast<int>(c_time * 1e6), - static_cast<int>(opt_time * 1e6)); + printf("filter %d - %8d us C - %8d us OPT\n", f, + static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6)); // C version may be a little off from the optimized. Order of // operations may introduce rounding somewhere. So do a difference @@ -133,25 +134,27 @@ static int TestFilter(int src_width, int src_height, } } - free_aligned_buffer_page_end(dst_y_c) - free_aligned_buffer_page_end(dst_u_c) - free_aligned_buffer_page_end(dst_v_c) - free_aligned_buffer_page_end(dst_y_opt) - free_aligned_buffer_page_end(dst_u_opt) - free_aligned_buffer_page_end(dst_v_opt) + free_aligned_buffer_page_end(dst_y_c) free_aligned_buffer_page_end(dst_u_c) + free_aligned_buffer_page_end(dst_v_c) + free_aligned_buffer_page_end(dst_y_opt) + free_aligned_buffer_page_end(dst_u_opt) + free_aligned_buffer_page_end(dst_v_opt) - free_aligned_buffer_page_end(src_y) - free_aligned_buffer_page_end(src_u) - free_aligned_buffer_page_end(src_v) + free_aligned_buffer_page_end(src_y) + free_aligned_buffer_page_end(src_u) + free_aligned_buffer_page_end(src_v) - return max_diff; + return max_diff; } // Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference. // 0 = exact. 
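The benchmark skeleton in this hunk repeats across the whole suite: one untimed reference pass, then a timed loop of benchmark_iterations optimized passes whose total is divided by the iteration count. Reduced to a reusable sketch, with std::chrono standing in for the suite's get_time() helper:

#include <chrono>

// Returns the amortized wall-clock seconds per call of fn.
template <typename F>
double SecondsPerCall(F&& fn, int iterations) {
  const auto start = std::chrono::steady_clock::now();
  for (int i = 0; i < iterations; ++i) {
    fn();  // e.g. an I420Scale call with optimizations enabled
  }
  const std::chrono::duration<double> elapsed =
      std::chrono::steady_clock::now() - start;
  return elapsed.count() / iterations;
}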
-static int TestFilter_16(int src_width, int src_height, - int dst_width, int dst_height, - FilterMode f, int benchmark_iterations) { +static int TestFilter_16(int src_width, + int src_height, + int dst_width, + int dst_height, + FilterMode f, + int benchmark_iterations) { if (!SizeValid(src_width, src_height, dst_width, dst_height)) { return 0; } @@ -161,20 +164,18 @@ static int TestFilter_16(int src_width, int src_height, int src_width_uv = (Abs(src_width) + 1) >> 1; int src_height_uv = (Abs(src_height) + 1) >> 1; - int64 src_y_plane_size = (Abs(src_width) + b * 2) * - (Abs(src_height) + b * 2); + int64 src_y_plane_size = (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2); int64 src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2); int src_stride_y = b * 2 + Abs(src_width); int src_stride_uv = b * 2 + src_width_uv; - align_buffer_page_end(src_y, src_y_plane_size) - align_buffer_page_end(src_u, src_uv_plane_size) - align_buffer_page_end(src_v, src_uv_plane_size) - align_buffer_page_end(src_y_16, src_y_plane_size * 2) - align_buffer_page_end(src_u_16, src_uv_plane_size * 2) - align_buffer_page_end(src_v_16, src_uv_plane_size * 2) - uint16* p_src_y_16 = reinterpret_cast<uint16*>(src_y_16); + align_buffer_page_end(src_y, src_y_plane_size) align_buffer_page_end( + src_u, src_uv_plane_size) align_buffer_page_end(src_v, src_uv_plane_size) + align_buffer_page_end(src_y_16, src_y_plane_size * 2) + align_buffer_page_end(src_u_16, src_uv_plane_size * 2) + align_buffer_page_end(src_v_16, src_uv_plane_size * 2) + uint16* p_src_y_16 = reinterpret_cast<uint16*>(src_y_16); uint16* p_src_u_16 = reinterpret_cast<uint16*>(src_u_16); uint16* p_src_v_16 = reinterpret_cast<uint16*>(src_v_16); @@ -205,34 +206,33 @@ static int TestFilter_16(int src_width, int src_height, int dst_stride_uv = b * 2 + dst_width_uv; align_buffer_page_end(dst_y_8, dst_y_plane_size) - align_buffer_page_end(dst_u_8, dst_uv_plane_size) - align_buffer_page_end(dst_v_8, dst_uv_plane_size) - align_buffer_page_end(dst_y_16, dst_y_plane_size * 2) - align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2) - align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2) - - uint16* p_dst_y_16 = reinterpret_cast<uint16*>(dst_y_16); + align_buffer_page_end(dst_u_8, dst_uv_plane_size) + align_buffer_page_end(dst_v_8, dst_uv_plane_size) + align_buffer_page_end(dst_y_16, dst_y_plane_size * 2) + align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2) + align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2) + + uint16* p_dst_y_16 = + reinterpret_cast<uint16*>(dst_y_16); uint16* p_dst_u_16 = reinterpret_cast<uint16*>(dst_u_16); uint16* p_dst_v_16 = reinterpret_cast<uint16*>(dst_v_16); I420Scale(src_y + (src_stride_y * b) + b, src_stride_y, src_u + (src_stride_uv * b) + b, src_stride_uv, - src_v + (src_stride_uv * b) + b, src_stride_uv, - src_width, src_height, - dst_y_8 + (dst_stride_y * b) + b, dst_stride_y, + src_v + (src_stride_uv * b) + b, src_stride_uv, src_width, + src_height, dst_y_8 + (dst_stride_y * b) + b, dst_stride_y, dst_u_8 + (dst_stride_uv * b) + b, dst_stride_uv, - dst_v_8 + (dst_stride_uv * b) + b, dst_stride_uv, - dst_width, dst_height, f); + dst_v_8 + (dst_stride_uv * b) + b, dst_stride_uv, dst_width, + dst_height, f); for (i = 0; i < benchmark_iterations; ++i) { I420Scale_16(p_src_y_16 + (src_stride_y * b) + b, src_stride_y, p_src_u_16 + (src_stride_uv * b) + b, src_stride_uv, - p_src_v_16 + (src_stride_uv * b) + b, src_stride_uv, - src_width, src_height, - p_dst_y_16 + (dst_stride_y * b) + b, dst_stride_y, + p_src_v_16 + 
(src_stride_uv * b) + b, src_stride_uv, src_width, + src_height, p_dst_y_16 + (dst_stride_y * b) + b, dst_stride_y, p_dst_u_16 + (dst_stride_uv * b) + b, dst_stride_uv, - p_dst_v_16 + (dst_stride_uv * b) + b, dst_stride_uv, - dst_width, dst_height, f); + p_dst_v_16 + (dst_stride_uv * b) + b, dst_stride_uv, dst_width, + dst_height, f); } // Expect an exact match @@ -262,21 +262,20 @@ static int TestFilter_16(int src_width, int src_height, } } - free_aligned_buffer_page_end(dst_y_8) - free_aligned_buffer_page_end(dst_u_8) - free_aligned_buffer_page_end(dst_v_8) - free_aligned_buffer_page_end(dst_y_16) - free_aligned_buffer_page_end(dst_u_16) - free_aligned_buffer_page_end(dst_v_16) - - free_aligned_buffer_page_end(src_y) - free_aligned_buffer_page_end(src_u) - free_aligned_buffer_page_end(src_v) - free_aligned_buffer_page_end(src_y_16) - free_aligned_buffer_page_end(src_u_16) - free_aligned_buffer_page_end(src_v_16) - - return max_diff; + free_aligned_buffer_page_end(dst_y_8) free_aligned_buffer_page_end(dst_u_8) + free_aligned_buffer_page_end(dst_v_8) + free_aligned_buffer_page_end(dst_y_16) + free_aligned_buffer_page_end(dst_u_16) + free_aligned_buffer_page_end(dst_v_16) + + free_aligned_buffer_page_end(src_y) + free_aligned_buffer_page_end(src_u) + free_aligned_buffer_page_end(src_v) + free_aligned_buffer_page_end(src_y_16) + free_aligned_buffer_page_end(src_u_16) + free_aligned_buffer_page_end(src_v_16) + + return max_diff; } // The following adjustments in dimensions ensure the scale factor will be @@ -285,32 +284,30 @@ static int TestFilter_16(int src_width, int src_height, #define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2) #define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2) -#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ - TEST_F(LibYUVScaleTest, ScaleDownBy##name##_##filter) { \ - int diff = TestFilter(SX(benchmark_width_, nom, denom), \ - SX(benchmark_height_, nom, denom), \ - DX(benchmark_width_, nom, denom), \ - DX(benchmark_height_, nom, denom), \ - kFilter##filter, benchmark_iterations_, \ - disable_cpu_flags_, benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ - } \ - TEST_F(LibYUVScaleTest, DISABLED_ScaleDownBy##name##_##filter##_16) { \ - int diff = TestFilter_16(SX(benchmark_width_, nom, denom), \ - SX(benchmark_height_, nom, denom), \ - DX(benchmark_width_, nom, denom), \ - DX(benchmark_height_, nom, denom), \ - kFilter##filter, benchmark_iterations_); \ - EXPECT_LE(diff, max_diff); \ - } +#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ + TEST_F(LibYUVScaleTest, ScaleDownBy##name##_##filter) { \ + int diff = TestFilter( \ + SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ + DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ + kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, DISABLED_ScaleDownBy##name##_##filter##_16) { \ + int diff = TestFilter_16( \ + SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ + DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ + kFilter##filter, benchmark_iterations_); \ + EXPECT_LE(diff, max_diff); \ + } // Test a scale factor with all 4 filters. Expect unfiltered to be exact, but // filtering is different fixed point implementations for SSSE3, Neon and C. 
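TestFilter_16 above exercises the 16-bit scaler against the 8-bit one: both sets of planes are filled with identical values and I420Scale_16 is expected to match I420Scale exactly. A minimal stand-alone call with hypothetical sizes and flat mid-gray planes; note that strides for the _16 variant are in uint16 elements, as the test's pointer arithmetic above implies:

#include <stdint.h>
#include <vector>
#include "libyuv/scale.h"

void ScaleI420_16Sketch() {
  const int sw = 64, sh = 64, dw = 32, dh = 32;  // illustrative sizes
  std::vector<uint16_t> sy(sw * sh, 512);        // e.g. 10-bit mid gray
  std::vector<uint16_t> su(sw / 2 * sh / 2, 512), sv(sw / 2 * sh / 2, 512);
  std::vector<uint16_t> dy(dw * dh), du(dw / 2 * dh / 2), dv(dw / 2 * dh / 2);
  libyuv::I420Scale_16(sy.data(), sw, su.data(), sw / 2, sv.data(), sw / 2,
                       sw, sh, dy.data(), dw, du.data(), dw / 2, dv.data(),
                       dw / 2, dw, dh, libyuv::kFilterBox);
}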
-#define TEST_FACTOR(name, nom, denom, boxdiff) \ - TEST_FACTOR1(name, None, nom, denom, 0) \ - TEST_FACTOR1(name, Linear, nom, denom, 3) \ - TEST_FACTOR1(name, Bilinear, nom, denom, 3) \ - TEST_FACTOR1(name, Box, nom, denom, boxdiff) +#define TEST_FACTOR(name, nom, denom, boxdiff) \ + TEST_FACTOR1(name, None, nom, denom, 0) \ + TEST_FACTOR1(name, Linear, nom, denom, 3) \ + TEST_FACTOR1(name, Bilinear, nom, denom, 3) \ + TEST_FACTOR1(name, Box, nom, denom, boxdiff) TEST_FACTOR(2, 1, 2, 0) TEST_FACTOR(4, 1, 4, 0) @@ -323,42 +320,40 @@ TEST_FACTOR(3, 1, 3, 0) #undef SX #undef DX -#define TEST_SCALETO1(name, width, height, filter, max_diff) \ - TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \ - int diff = TestFilter(benchmark_width_, benchmark_height_, \ - width, height, \ - kFilter##filter, benchmark_iterations_, \ - disable_cpu_flags_, benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ - } \ - TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \ - int diff = TestFilter(width, height, \ - Abs(benchmark_width_), Abs(benchmark_height_), \ - kFilter##filter, benchmark_iterations_, \ - disable_cpu_flags_, benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ - } \ - TEST_F(LibYUVScaleTest, \ - DISABLED_##name##To##width##x##height##_##filter##_16) { \ - int diff = TestFilter_16(benchmark_width_, benchmark_height_, \ - width, height, \ - kFilter##filter, benchmark_iterations_); \ - EXPECT_LE(diff, max_diff); \ - } \ - TEST_F(LibYUVScaleTest, \ - DISABLED_##name##From##width##x##height##_##filter##_16) { \ - int diff = TestFilter_16(width, height, \ - Abs(benchmark_width_), Abs(benchmark_height_), \ - kFilter##filter, benchmark_iterations_); \ - EXPECT_LE(diff, max_diff); \ - } +#define TEST_SCALETO1(name, width, height, filter, max_diff) \ + TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \ + int diff = TestFilter(benchmark_width_, benchmark_height_, width, height, \ + kFilter##filter, benchmark_iterations_, \ + disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \ + int diff = TestFilter(width, height, Abs(benchmark_width_), \ + Abs(benchmark_height_), kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, \ + DISABLED_##name##To##width##x##height##_##filter##_16) { \ + int diff = TestFilter_16(benchmark_width_, benchmark_height_, width, \ + height, kFilter##filter, benchmark_iterations_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, \ + DISABLED_##name##From##width##x##height##_##filter##_16) { \ + int diff = TestFilter_16(width, height, Abs(benchmark_width_), \ + Abs(benchmark_height_), kFilter##filter, \ + benchmark_iterations_); \ + EXPECT_LE(diff, max_diff); \ + } // Test scale to a specified size with all 4 filters. 
-#define TEST_SCALETO(name, width, height) \ - TEST_SCALETO1(name, width, height, None, 0) \ - TEST_SCALETO1(name, width, height, Linear, 0) \ - TEST_SCALETO1(name, width, height, Bilinear, 0) \ - TEST_SCALETO1(name, width, height, Box, 0) +#define TEST_SCALETO(name, width, height) \ + TEST_SCALETO1(name, width, height, None, 0) \ + TEST_SCALETO1(name, width, height, Linear, 0) \ + TEST_SCALETO1(name, width, height, Bilinear, 0) \ + TEST_SCALETO1(name, width, height, Box, 0) TEST_SCALETO(Scale, 1, 1) TEST_SCALETO(Scale, 320, 240) diff --git a/files/unit_test/unit_test.cc b/files/unit_test/unit_test.cc index e75510fd..7f8bcf8f 100644 --- a/files/unit_test/unit_test.cc +++ b/files/unit_test/unit_test.cc @@ -25,18 +25,21 @@ unsigned int fastrand_seed = 0xfb; DEFINE_int32(libyuv_width, 0, "width of test image."); DEFINE_int32(libyuv_height, 0, "height of test image."); DEFINE_int32(libyuv_repeat, 0, "number of times to repeat test."); -DEFINE_int32(libyuv_flags, 0, - "cpu flags for reference code. 1 = C, -1 = SIMD"); -DEFINE_int32(libyuv_cpu_info, 0, +DEFINE_int32(libyuv_flags, 0, "cpu flags for reference code. 1 = C, -1 = SIMD"); +DEFINE_int32(libyuv_cpu_info, + 0, "cpu flags for benchmark code. 1 = C, -1 = SIMD"); // For quicker unittests, default is 128 x 72. But when benchmarking, // default to 720p. Allow size to specify. // Set flags to -1 for benchmarking to avoid slower C code. -LibYUVConvertTest::LibYUVConvertTest() : - benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128), - benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) { +LibYUVConvertTest::LibYUVConvertTest() + : benchmark_iterations_(BENCHMARK_ITERATIONS), + benchmark_width_(128), + benchmark_height_(72), + disable_cpu_flags_(1), + benchmark_cpu_info_(-1) { const char* repeat = getenv("LIBYUV_REPEAT"); if (repeat) { benchmark_iterations_ = atoi(repeat); // NOLINT @@ -76,19 +79,26 @@ LibYUVConvertTest::LibYUVConvertTest() : if (FLAGS_libyuv_cpu_info) { benchmark_cpu_info_ = FLAGS_libyuv_cpu_info; } - benchmark_pixels_div256_ = static_cast<int>(( - static_cast<double>(Abs(benchmark_width_)) * - static_cast<double>(Abs(benchmark_height_)) * - static_cast<double>(benchmark_iterations_) + 255.0) / 256.0); - benchmark_pixels_div1280_ = static_cast<int>(( - static_cast<double>(Abs(benchmark_width_)) * - static_cast<double>(Abs(benchmark_height_)) * - static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0); + benchmark_pixels_div256_ = + static_cast<int>((static_cast<double>(Abs(benchmark_width_)) * + static_cast<double>(Abs(benchmark_height_)) * + static_cast<double>(benchmark_iterations_) + + 255.0) / + 256.0); + benchmark_pixels_div1280_ = + static_cast<int>((static_cast<double>(Abs(benchmark_width_)) * + static_cast<double>(Abs(benchmark_height_)) * + static_cast<double>(benchmark_iterations_) + + 1279.0) / + 1280.0); } -LibYUVColorTest::LibYUVColorTest() : - benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128), - benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) { +LibYUVColorTest::LibYUVColorTest() + : benchmark_iterations_(BENCHMARK_ITERATIONS), + benchmark_width_(128), + benchmark_height_(72), + disable_cpu_flags_(1), + benchmark_cpu_info_(-1) { const char* repeat = getenv("LIBYUV_REPEAT"); if (repeat) { benchmark_iterations_ = atoi(repeat); // NOLINT @@ -128,19 +138,26 @@ LibYUVColorTest::LibYUVColorTest() : if (FLAGS_libyuv_cpu_info) { benchmark_cpu_info_ = FLAGS_libyuv_cpu_info; } - benchmark_pixels_div256_ = static_cast<int>(( - 
static_cast<double>(Abs(benchmark_width_)) * - static_cast<double>(Abs(benchmark_height_)) * - static_cast<double>(benchmark_iterations_) + 255.0) / 256.0); - benchmark_pixels_div1280_ = static_cast<int>(( - static_cast<double>(Abs(benchmark_width_)) * - static_cast<double>(Abs(benchmark_height_)) * - static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0); + benchmark_pixels_div256_ = + static_cast<int>((static_cast<double>(Abs(benchmark_width_)) * + static_cast<double>(Abs(benchmark_height_)) * + static_cast<double>(benchmark_iterations_) + + 255.0) / + 256.0); + benchmark_pixels_div1280_ = + static_cast<int>((static_cast<double>(Abs(benchmark_width_)) * + static_cast<double>(Abs(benchmark_height_)) * + static_cast<double>(benchmark_iterations_) + + 1279.0) / + 1280.0); } -LibYUVScaleTest::LibYUVScaleTest() : - benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128), - benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) { +LibYUVScaleTest::LibYUVScaleTest() + : benchmark_iterations_(BENCHMARK_ITERATIONS), + benchmark_width_(128), + benchmark_height_(72), + disable_cpu_flags_(1), + benchmark_cpu_info_(-1) { const char* repeat = getenv("LIBYUV_REPEAT"); if (repeat) { benchmark_iterations_ = atoi(repeat); // NOLINT @@ -180,19 +197,26 @@ LibYUVScaleTest::LibYUVScaleTest() : if (FLAGS_libyuv_cpu_info) { benchmark_cpu_info_ = FLAGS_libyuv_cpu_info; } - benchmark_pixels_div256_ = static_cast<int>(( - static_cast<double>(Abs(benchmark_width_)) * - static_cast<double>(Abs(benchmark_height_)) * - static_cast<double>(benchmark_iterations_) + 255.0) / 256.0); - benchmark_pixels_div1280_ = static_cast<int>(( - static_cast<double>(Abs(benchmark_width_)) * - static_cast<double>(Abs(benchmark_height_)) * - static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0); + benchmark_pixels_div256_ = + static_cast<int>((static_cast<double>(Abs(benchmark_width_)) * + static_cast<double>(Abs(benchmark_height_)) * + static_cast<double>(benchmark_iterations_) + + 255.0) / + 256.0); + benchmark_pixels_div1280_ = + static_cast<int>((static_cast<double>(Abs(benchmark_width_)) * + static_cast<double>(Abs(benchmark_height_)) * + static_cast<double>(benchmark_iterations_) + + 1279.0) / + 1280.0); } -LibYUVRotateTest::LibYUVRotateTest() : - benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128), - benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) { +LibYUVRotateTest::LibYUVRotateTest() + : benchmark_iterations_(BENCHMARK_ITERATIONS), + benchmark_width_(128), + benchmark_height_(72), + disable_cpu_flags_(1), + benchmark_cpu_info_(-1) { const char* repeat = getenv("LIBYUV_REPEAT"); if (repeat) { benchmark_iterations_ = atoi(repeat); // NOLINT @@ -232,19 +256,26 @@ LibYUVRotateTest::LibYUVRotateTest() : if (FLAGS_libyuv_cpu_info) { benchmark_cpu_info_ = FLAGS_libyuv_cpu_info; } - benchmark_pixels_div256_ = static_cast<int>(( - static_cast<double>(Abs(benchmark_width_)) * - static_cast<double>(Abs(benchmark_height_)) * - static_cast<double>(benchmark_iterations_) + 255.0) / 256.0); - benchmark_pixels_div1280_ = static_cast<int>(( - static_cast<double>(Abs(benchmark_width_)) * - static_cast<double>(Abs(benchmark_height_)) * - static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0); + benchmark_pixels_div256_ = + static_cast<int>((static_cast<double>(Abs(benchmark_width_)) * + static_cast<double>(Abs(benchmark_height_)) * + static_cast<double>(benchmark_iterations_) + + 255.0) / + 256.0); + benchmark_pixels_div1280_ = + 
static_cast<int>((static_cast<double>(Abs(benchmark_width_)) * + static_cast<double>(Abs(benchmark_height_)) * + static_cast<double>(benchmark_iterations_) + + 1279.0) / + 1280.0); } -LibYUVPlanarTest::LibYUVPlanarTest() : - benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128), - benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) { +LibYUVPlanarTest::LibYUVPlanarTest() + : benchmark_iterations_(BENCHMARK_ITERATIONS), + benchmark_width_(128), + benchmark_height_(72), + disable_cpu_flags_(1), + benchmark_cpu_info_(-1) { const char* repeat = getenv("LIBYUV_REPEAT"); if (repeat) { benchmark_iterations_ = atoi(repeat); // NOLINT @@ -284,19 +315,26 @@ LibYUVPlanarTest::LibYUVPlanarTest() : if (FLAGS_libyuv_cpu_info) { benchmark_cpu_info_ = FLAGS_libyuv_cpu_info; } - benchmark_pixels_div256_ = static_cast<int>(( - static_cast<double>(Abs(benchmark_width_)) * - static_cast<double>(Abs(benchmark_height_)) * - static_cast<double>(benchmark_iterations_) + 255.0) / 256.0); - benchmark_pixels_div1280_ = static_cast<int>(( - static_cast<double>(Abs(benchmark_width_)) * - static_cast<double>(Abs(benchmark_height_)) * - static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0); + benchmark_pixels_div256_ = + static_cast<int>((static_cast<double>(Abs(benchmark_width_)) * + static_cast<double>(Abs(benchmark_height_)) * + static_cast<double>(benchmark_iterations_) + + 255.0) / + 256.0); + benchmark_pixels_div1280_ = + static_cast<int>((static_cast<double>(Abs(benchmark_width_)) * + static_cast<double>(Abs(benchmark_height_)) * + static_cast<double>(benchmark_iterations_) + + 1279.0) / + 1280.0); } -LibYUVBaseTest::LibYUVBaseTest() : - benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128), - benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) { +LibYUVBaseTest::LibYUVBaseTest() + : benchmark_iterations_(BENCHMARK_ITERATIONS), + benchmark_width_(128), + benchmark_height_(72), + disable_cpu_flags_(1), + benchmark_cpu_info_(-1) { const char* repeat = getenv("LIBYUV_REPEAT"); if (repeat) { benchmark_iterations_ = atoi(repeat); // NOLINT @@ -336,14 +374,18 @@ LibYUVBaseTest::LibYUVBaseTest() : if (FLAGS_libyuv_cpu_info) { benchmark_cpu_info_ = FLAGS_libyuv_cpu_info; } - benchmark_pixels_div256_ = static_cast<int>(( - static_cast<double>(Abs(benchmark_width_)) * - static_cast<double>(Abs(benchmark_height_)) * - static_cast<double>(benchmark_iterations_) + 255.0) / 256.0); - benchmark_pixels_div1280_ = static_cast<int>(( - static_cast<double>(Abs(benchmark_width_)) * - static_cast<double>(Abs(benchmark_height_)) * - static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0); + benchmark_pixels_div256_ = + static_cast<int>((static_cast<double>(Abs(benchmark_width_)) * + static_cast<double>(Abs(benchmark_height_)) * + static_cast<double>(benchmark_iterations_) + + 255.0) / + 256.0); + benchmark_pixels_div1280_ = + static_cast<int>((static_cast<double>(Abs(benchmark_width_)) * + static_cast<double>(Abs(benchmark_height_)) * + static_cast<double>(benchmark_iterations_) + + 1279.0) / + 1280.0); } int main(int argc, char** argv) { diff --git a/files/unit_test/unit_test.h b/files/unit_test/unit_test.h index f2c4bef0..f7d60a76 100644 --- a/files/unit_test/unit_test.h +++ b/files/unit_test/unit_test.h @@ -14,8 +14,8 @@ #ifdef WIN32 #include <windows.h> #else -#include <sys/time.h> #include <sys/resource.h> +#include <sys/time.h> #endif #include <gtest/gtest.h> @@ -54,8 +54,10 @@ static __inline int Abs(int v) { static const int kMaxWidth = 32768; static 
const int kMaxHeight = 32768; -static inline bool SizeValid(int src_width, int src_height, - int dst_width, int dst_height) { +static inline bool SizeValid(int src_width, + int src_height, + int dst_width, + int dst_height) { if (src_width > kMaxWidth || src_height > kMaxHeight || dst_width > kMaxWidth || dst_height > kMaxHeight) { printf("Warning - size too large to test. Skipping\n"); @@ -64,15 +66,16 @@ static inline bool SizeValid(int src_width, int src_height, return true; } -#define align_buffer_page_end(var, size) \ - uint8* var; \ - uint8* var##_mem; \ - var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095 + 63) & ~4095)); \ - var = (uint8*)((intptr_t)(var##_mem + (((size) + 4095 + 63) & ~4095) - \ - (size)) & ~63); +#define align_buffer_page_end(var, size) \ + uint8* var; \ + uint8* var##_mem; \ + var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095 + 63) & ~4095)); \ + var = (uint8*)((intptr_t)(var##_mem + (((size) + 4095 + 63) & ~4095) - \ + (size)) & \ + ~63); #define free_aligned_buffer_page_end(var) \ - free(var##_mem); \ + free(var##_mem); \ var = 0; #ifdef WIN32 @@ -122,78 +125,78 @@ class LibYUVColorTest : public ::testing::Test { protected: LibYUVColorTest(); - int benchmark_iterations_; // Default 1. Use 1000 for benchmarking. - int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA. - int benchmark_height_; // Default 720. Use 360 for benchmarking VGA. + int benchmark_iterations_; // Default 1. Use 1000 for benchmarking. + int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA. + int benchmark_height_; // Default 720. Use 360 for benchmarking VGA. int benchmark_pixels_div256_; // Total pixels to benchmark / 256. int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280. - int disable_cpu_flags_; // Default 1. Use -1 for benchmarking. - int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD. + int disable_cpu_flags_; // Default 1. Use -1 for benchmarking. + int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD. }; class LibYUVConvertTest : public ::testing::Test { protected: LibYUVConvertTest(); - int benchmark_iterations_; // Default 1. Use 1000 for benchmarking. - int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA. - int benchmark_height_; // Default 720. Use 360 for benchmarking VGA. + int benchmark_iterations_; // Default 1. Use 1000 for benchmarking. + int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA. + int benchmark_height_; // Default 720. Use 360 for benchmarking VGA. int benchmark_pixels_div256_; // Total pixels to benchmark / 256. int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280. - int disable_cpu_flags_; // Default 1. Use -1 for benchmarking. - int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD. + int disable_cpu_flags_; // Default 1. Use -1 for benchmarking. + int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD. }; class LibYUVScaleTest : public ::testing::Test { protected: LibYUVScaleTest(); - int benchmark_iterations_; // Default 1. Use 1000 for benchmarking. - int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA. - int benchmark_height_; // Default 720. Use 360 for benchmarking VGA. + int benchmark_iterations_; // Default 1. Use 1000 for benchmarking. + int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA. + int benchmark_height_; // Default 720. Use 360 for benchmarking VGA. int benchmark_pixels_div256_; // Total pixels to benchmark / 256. 
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280. - int disable_cpu_flags_; // Default 1. Use -1 for benchmarking. - int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD. + int disable_cpu_flags_; // Default 1. Use -1 for benchmarking. + int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD. }; class LibYUVRotateTest : public ::testing::Test { protected: LibYUVRotateTest(); - int benchmark_iterations_; // Default 1. Use 1000 for benchmarking. - int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA. - int benchmark_height_; // Default 720. Use 360 for benchmarking VGA. + int benchmark_iterations_; // Default 1. Use 1000 for benchmarking. + int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA. + int benchmark_height_; // Default 720. Use 360 for benchmarking VGA. int benchmark_pixels_div256_; // Total pixels to benchmark / 256. int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280. - int disable_cpu_flags_; // Default 1. Use -1 for benchmarking. - int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD. + int disable_cpu_flags_; // Default 1. Use -1 for benchmarking. + int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD. }; class LibYUVPlanarTest : public ::testing::Test { protected: LibYUVPlanarTest(); - int benchmark_iterations_; // Default 1. Use 1000 for benchmarking. - int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA. - int benchmark_height_; // Default 720. Use 360 for benchmarking VGA. + int benchmark_iterations_; // Default 1. Use 1000 for benchmarking. + int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA. + int benchmark_height_; // Default 720. Use 360 for benchmarking VGA. int benchmark_pixels_div256_; // Total pixels to benchmark / 256. int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280. - int disable_cpu_flags_; // Default 1. Use -1 for benchmarking. - int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD. + int disable_cpu_flags_; // Default 1. Use -1 for benchmarking. + int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD. }; class LibYUVBaseTest : public ::testing::Test { protected: LibYUVBaseTest(); - int benchmark_iterations_; // Default 1. Use 1000 for benchmarking. - int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA. - int benchmark_height_; // Default 720. Use 360 for benchmarking VGA. + int benchmark_iterations_; // Default 1. Use 1000 for benchmarking. + int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA. + int benchmark_height_; // Default 720. Use 360 for benchmarking VGA. int benchmark_pixels_div256_; // Total pixels to benchmark / 256. int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280. - int disable_cpu_flags_; // Default 1. Use -1 for benchmarking. - int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD. + int disable_cpu_flags_; // Default 1. Use -1 for benchmarking. + int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD. }; #endif // UNIT_TEST_UNIT_TEST_H_ NOLINT diff --git a/files/unit_test/video_common_test.cc b/files/unit_test/video_common_test.cc index ac97d0f3..f16b6772 100644 --- a/files/unit_test/video_common_test.cc +++ b/files/unit_test/video_common_test.cc @@ -11,26 +11,23 @@ #include <stdlib.h> #include <string.h> -#include "libyuv/video_common.h" #include "../unit_test/unit_test.h" +#include "libyuv/video_common.h" namespace libyuv { // Tests FourCC codes in video common, which are used for ConvertToI420(). 
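A note on the align_buffer_page_end macro reformatted in the unit_test.h hunk above: it rounds the allocation up to whole 4 KB pages, then returns a 64-byte-aligned pointer placed so the usable bytes end at, or within 63 bytes of, the final page boundary. Reads past the end of the buffer thus tend to fault immediately, which is how the suite catches SIMD overreads. The same arithmetic as a function, with hypothetical names:

#include <stdint.h>
#include <stdlib.h>

// Returns a 64-byte-aligned pointer whose `size` bytes end near a 4 KB page
// boundary; *mem_out receives the raw allocation to hand back to free().
uint8_t* AllocBufferPageEnd(size_t size, uint8_t** mem_out) {
  const size_t rounded = (size + 4095 + 63) & ~(size_t)4095;  // whole pages
  uint8_t* mem = static_cast<uint8_t*>(malloc(rounded));
  *mem_out = mem;
  return (uint8_t*)((intptr_t)(mem + rounded - size) & ~(intptr_t)63);
}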
static bool TestValidChar(uint32 onecc) { - if ((onecc >= '0' && onecc <= '9') || - (onecc >= 'A' && onecc <= 'Z') || - (onecc >= 'a' && onecc <= 'z') || - (onecc == ' ') || (onecc == 0xff)) { + if ((onecc >= '0' && onecc <= '9') || (onecc >= 'A' && onecc <= 'Z') || + (onecc >= 'a' && onecc <= 'z') || (onecc == ' ') || (onecc == 0xff)) { return true; } return false; } static bool TestValidFourCC(uint32 fourcc, int bpp) { - if (!TestValidChar(fourcc & 0xff) || - !TestValidChar((fourcc >> 8) & 0xff) || + if (!TestValidChar(fourcc & 0xff) || !TestValidChar((fourcc >> 8) & 0xff) || !TestValidChar((fourcc >> 16) & 0xff) || !TestValidChar((fourcc >> 24) & 0xff)) { return false; @@ -42,23 +39,23 @@ static bool TestValidFourCC(uint32 fourcc, int bpp) { } TEST_F(LibYUVBaseTest, TestCanonicalFourCC) { - EXPECT_EQ(FOURCC_I420, CanonicalFourCC(FOURCC_IYUV)); - EXPECT_EQ(FOURCC_I420, CanonicalFourCC(FOURCC_YU12)); - EXPECT_EQ(FOURCC_I422, CanonicalFourCC(FOURCC_YU16)); - EXPECT_EQ(FOURCC_I444, CanonicalFourCC(FOURCC_YU24)); - EXPECT_EQ(FOURCC_YUY2, CanonicalFourCC(FOURCC_YUYV)); - EXPECT_EQ(FOURCC_YUY2, CanonicalFourCC(FOURCC_YUVS)); - EXPECT_EQ(FOURCC_UYVY, CanonicalFourCC(FOURCC_HDYC)); - EXPECT_EQ(FOURCC_UYVY, CanonicalFourCC(FOURCC_2VUY)); - EXPECT_EQ(FOURCC_MJPG, CanonicalFourCC(FOURCC_JPEG)); - EXPECT_EQ(FOURCC_MJPG, CanonicalFourCC(FOURCC_DMB1)); - EXPECT_EQ(FOURCC_RAW, CanonicalFourCC(FOURCC_RGB3)); - EXPECT_EQ(FOURCC_24BG, CanonicalFourCC(FOURCC_BGR3)); - EXPECT_EQ(FOURCC_BGRA, CanonicalFourCC(FOURCC_CM32)); - EXPECT_EQ(FOURCC_RAW, CanonicalFourCC(FOURCC_CM24)); - EXPECT_EQ(FOURCC_RGBO, CanonicalFourCC(FOURCC_L555)); - EXPECT_EQ(FOURCC_RGBP, CanonicalFourCC(FOURCC_L565)); - EXPECT_EQ(FOURCC_RGBO, CanonicalFourCC(FOURCC_5551)); + EXPECT_EQ(static_cast<uint32>(FOURCC_I420), CanonicalFourCC(FOURCC_IYUV)); + EXPECT_EQ(static_cast<uint32>(FOURCC_I420), CanonicalFourCC(FOURCC_YU12)); + EXPECT_EQ(static_cast<uint32>(FOURCC_I422), CanonicalFourCC(FOURCC_YU16)); + EXPECT_EQ(static_cast<uint32>(FOURCC_I444), CanonicalFourCC(FOURCC_YU24)); + EXPECT_EQ(static_cast<uint32>(FOURCC_YUY2), CanonicalFourCC(FOURCC_YUYV)); + EXPECT_EQ(static_cast<uint32>(FOURCC_YUY2), CanonicalFourCC(FOURCC_YUVS)); + EXPECT_EQ(static_cast<uint32>(FOURCC_UYVY), CanonicalFourCC(FOURCC_HDYC)); + EXPECT_EQ(static_cast<uint32>(FOURCC_UYVY), CanonicalFourCC(FOURCC_2VUY)); + EXPECT_EQ(static_cast<uint32>(FOURCC_MJPG), CanonicalFourCC(FOURCC_JPEG)); + EXPECT_EQ(static_cast<uint32>(FOURCC_MJPG), CanonicalFourCC(FOURCC_DMB1)); + EXPECT_EQ(static_cast<uint32>(FOURCC_RAW), CanonicalFourCC(FOURCC_RGB3)); + EXPECT_EQ(static_cast<uint32>(FOURCC_24BG), CanonicalFourCC(FOURCC_BGR3)); + EXPECT_EQ(static_cast<uint32>(FOURCC_BGRA), CanonicalFourCC(FOURCC_CM32)); + EXPECT_EQ(static_cast<uint32>(FOURCC_RAW), CanonicalFourCC(FOURCC_CM24)); + EXPECT_EQ(static_cast<uint32>(FOURCC_RGBO), CanonicalFourCC(FOURCC_L555)); + EXPECT_EQ(static_cast<uint32>(FOURCC_RGBP), CanonicalFourCC(FOURCC_L565)); + EXPECT_EQ(static_cast<uint32>(FOURCC_RGBO), CanonicalFourCC(FOURCC_5551)); } TEST_F(LibYUVBaseTest, TestFourCC) { @@ -66,7 +63,6 @@ TEST_F(LibYUVBaseTest, TestFourCC) { EXPECT_TRUE(TestValidFourCC(FOURCC_I420, FOURCC_BPP_I420)); EXPECT_TRUE(TestValidFourCC(FOURCC_I422, FOURCC_BPP_I422)); EXPECT_TRUE(TestValidFourCC(FOURCC_I444, FOURCC_BPP_I444)); - EXPECT_TRUE(TestValidFourCC(FOURCC_I411, FOURCC_BPP_I411)); EXPECT_TRUE(TestValidFourCC(FOURCC_I400, FOURCC_BPP_I400)); EXPECT_TRUE(TestValidFourCC(FOURCC_NV21, FOURCC_BPP_NV21)); 
EXPECT_TRUE(TestValidFourCC(FOURCC_NV12, FOURCC_BPP_NV12)); @@ -78,7 +74,7 @@ TEST_F(LibYUVBaseTest, TestFourCC) { EXPECT_TRUE(TestValidFourCC(FOURCC_BGRA, FOURCC_BPP_BGRA)); EXPECT_TRUE(TestValidFourCC(FOURCC_ABGR, FOURCC_BPP_ABGR)); EXPECT_TRUE(TestValidFourCC(FOURCC_24BG, FOURCC_BPP_24BG)); - EXPECT_TRUE(TestValidFourCC(FOURCC_RAW, FOURCC_BPP_RAW)); + EXPECT_TRUE(TestValidFourCC(FOURCC_RAW, FOURCC_BPP_RAW)); EXPECT_TRUE(TestValidFourCC(FOURCC_RGBA, FOURCC_BPP_RGBA)); EXPECT_TRUE(TestValidFourCC(FOURCC_RGBP, FOURCC_BPP_RGBP)); EXPECT_TRUE(TestValidFourCC(FOURCC_RGBO, FOURCC_BPP_RGBO)); @@ -101,7 +97,7 @@ TEST_F(LibYUVBaseTest, TestFourCC) { EXPECT_TRUE(TestValidFourCC(FOURCC_RGB3, FOURCC_BPP_RGB3)); EXPECT_TRUE(TestValidFourCC(FOURCC_BGR3, FOURCC_BPP_BGR3)); EXPECT_TRUE(TestValidFourCC(FOURCC_H264, FOURCC_BPP_H264)); - EXPECT_TRUE(TestValidFourCC(FOURCC_ANY, FOURCC_BPP_ANY)); + EXPECT_TRUE(TestValidFourCC(FOURCC_ANY, FOURCC_BPP_ANY)); } } // namespace libyuv diff --git a/files/util/compare.cc b/files/util/compare.cc index c36c0fa5..ef0beefa 100644 --- a/files/util/compare.cc +++ b/files/util/compare.cc @@ -39,10 +39,12 @@ int main(int argc, char** argv) { int amt2 = 0; do { amt1 = static_cast<int>(fread(buf1, 1, kBlockSize, fin1)); - if (amt1 > 0) hash1 = libyuv::HashDjb2(buf1, amt1, hash1); + if (amt1 > 0) + hash1 = libyuv::HashDjb2(buf1, amt1, hash1); if (fin2) { amt2 = static_cast<int>(fread(buf2, 1, kBlockSize, fin2)); - if (amt2 > 0) hash2 = libyuv::HashDjb2(buf2, amt2, hash2); + if (amt2 > 0) + hash2 = libyuv::HashDjb2(buf2, amt2, hash2); int amt_min = (amt1 < amt2) ? amt1 : amt2; size_min += amt_min; sum_square_err += libyuv::ComputeSumSquareError(buf1, buf2, amt_min); @@ -52,8 +54,8 @@ int main(int argc, char** argv) { printf("hash1 %x", hash1); if (fin2) { printf(", hash2 %x", hash2); - double mse = static_cast<double>(sum_square_err) / - static_cast<double>(size_min); + double mse = + static_cast<double>(sum_square_err) / static_cast<double>(size_min); printf(", mse %.2f", mse); double psnr = libyuv::SumSquareErrorToPsnr(sum_square_err, size_min); printf(", psnr %.2f\n", psnr); diff --git a/files/util/convert.cc b/files/util/convert.cc index 5f071416..acaf43ad 100644 --- a/files/util/convert.cc +++ b/files/util/convert.cc @@ -29,13 +29,13 @@ bool verbose = false; bool attenuate = false; bool unattenuate = false; int image_width = 0, image_height = 0; // original width and height -int dst_width = 0, dst_height = 0; // new width and height +int dst_width = 0, dst_height = 0; // new width and height int fileindex_org = 0; // argv argument contains the original file name. int fileindex_rec = 0; // argv argument contains the reconstructed file name. -int num_rec = 0; // Number of reconstructed images. -int num_skip_org = 0; // Number of frames to skip in original. -int num_frames = 0; // Number of frames to convert. -int filter = 1; // Bilinear filter for scaling. +int num_rec = 0; // Number of reconstructed images. +int num_skip_org = 0; // Number of frames to skip in original. +int num_frames = 0; // Number of frames to convert. +int filter = 1; // Bilinear filter for scaling. static __inline uint32 Abs(int32 v) { return v >= 0 ? v : -v; @@ -48,8 +48,8 @@ bool ExtractResolutionFromFilename(const char* name, // Isolate the .width_height. section of the filename by searching for a // dot or underscore followed by a digit. for (int i = 0; name[i]; ++i) { - if ((name[i] == '.' || name[i] == '_') && - name[i + 1] >= '0' && name[i + 1] <= '9') { + if ((name[i] == '.' 
|| name[i] == '_') && name[i + 1] >= '0' && + name[i + 1] <= '9') { int n = sscanf(name + i + 1, "%dx%d", width_ptr, height_ptr); // NOLINT if (2 == n) { return true; @@ -59,13 +59,14 @@ bool ExtractResolutionFromFilename(const char* name, return false; } -void PrintHelp(const char * program) { +void PrintHelp(const char* program) { printf("%s [-options] src_argb.raw dst_yuv.raw\n", program); - printf(" -s <width> <height> .... specify source resolution. " - "Optional if name contains\n" - " resolution (ie. " - "name.1920x800_24Hz_P420.yuv)\n" - " Negative value mirrors.\n"); + printf( + " -s <width> <height> .... specify source resolution. " + "Optional if name contains\n" + " resolution (ie. " + "name.1920x800_24Hz_P420.yuv)\n" + " Negative value mirrors.\n"); printf(" -d <width> <height> .... specify destination resolution.\n"); printf(" -f <filter> ............ 0 = point, 1 = bilinear (default).\n"); printf(" -skip <src_argb> ....... Number of frame to skip of src_argb\n"); @@ -78,7 +79,8 @@ void PrintHelp(const char * program) { } void ParseOptions(int argc, const char* argv[]) { - if (argc <= 1) PrintHelp(argv[0]); + if (argc <= 1) + PrintHelp(argv[0]); for (int c = 1; c < argc; ++c) { if (!strcmp(argv[c], "-v")) { verbose = true; @@ -89,17 +91,17 @@ void ParseOptions(int argc, const char* argv[]) { } else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) { PrintHelp(argv[0]); } else if (!strcmp(argv[c], "-s") && c + 2 < argc) { - image_width = atoi(argv[++c]); // NOLINT - image_height = atoi(argv[++c]); // NOLINT + image_width = atoi(argv[++c]); // NOLINT + image_height = atoi(argv[++c]); // NOLINT } else if (!strcmp(argv[c], "-d") && c + 2 < argc) { - dst_width = atoi(argv[++c]); // NOLINT - dst_height = atoi(argv[++c]); // NOLINT + dst_width = atoi(argv[++c]); // NOLINT + dst_height = atoi(argv[++c]); // NOLINT } else if (!strcmp(argv[c], "-skip") && c + 1 < argc) { - num_skip_org = atoi(argv[++c]); // NOLINT + num_skip_org = atoi(argv[++c]); // NOLINT } else if (!strcmp(argv[c], "-frames") && c + 1 < argc) { - num_frames = atoi(argv[++c]); // NOLINT + num_frames = atoi(argv[++c]); // NOLINT } else if (!strcmp(argv[c], "-f") && c + 1 < argc) { - filter = atoi(argv[++c]); // NOLINT + filter = atoi(argv[++c]); // NOLINT } else if (argv[c][0] == '-') { fprintf(stderr, "Unknown option. 
%s\n", argv[c]); } else if (fileindex_org == 0) { @@ -127,11 +129,9 @@ void ParseOptions(int argc, const char* argv[]) { int org_width, org_height; int rec_width, rec_height; bool org_res_avail = ExtractResolutionFromFilename(argv[fileindex_org], - &org_width, - &org_height); + &org_width, &org_height); bool rec_res_avail = ExtractResolutionFromFilename(argv[fileindex_rec], - &rec_width, - &rec_height); + &rec_width, &rec_height); if (image_width == 0 || image_height == 0) { if (org_res_avail) { image_width = org_width; @@ -158,10 +158,14 @@ void ParseOptions(int argc, const char* argv[]) { static const int kTileX = 32; static const int kTileY = 32; -static int TileARGBScale(const uint8* src_argb, int src_stride_argb, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, - int dst_width, int dst_height, +static int TileARGBScale(const uint8* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, libyuv::FilterMode filtering) { for (int y = 0; y < dst_height; y += kTileY) { for (int x = 0; x < dst_width; x += kTileX) { @@ -173,11 +177,10 @@ static int TileARGBScale(const uint8* src_argb, int src_stride_argb, if (y + clip_height > dst_height) { clip_height = dst_height - y; } - int r = libyuv::ARGBScaleClip(src_argb, src_stride_argb, - src_width, src_height, - dst_argb, dst_stride_argb, - dst_width, dst_height, - x, y, clip_width, clip_height, filtering); + int r = libyuv::ARGBScaleClip(src_argb, src_stride_argb, src_width, + src_height, dst_argb, dst_stride_argb, + dst_width, dst_height, x, y, clip_width, + clip_height, filtering); if (r) { return r; } @@ -197,8 +200,8 @@ int main(int argc, const char* argv[]) { } // Open all files to convert to - FILE** file_rec = new FILE* [num_rec]; - memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT + FILE** file_rec = new FILE*[num_rec]; + memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { file_rec[cur_rec] = fopen(argv[fileindex_rec + cur_rec], "wb"); if (file_rec[cur_rec] == NULL) { @@ -222,8 +225,8 @@ int main(int argc, const char* argv[]) { // Input is YUV if (org_is_yuv) { const int y_size = Abs(image_width) * Abs(image_height); - const int uv_size = ((Abs(image_width) + 1) / 2) * - ((Abs(image_height) + 1) / 2); + const int uv_size = + ((Abs(image_width) + 1) / 2) * ((Abs(image_height) + 1) / 2); org_size = y_size + 2 * uv_size; // YUV original. } @@ -233,8 +236,8 @@ int main(int argc, const char* argv[]) { const size_t total_size = y_size + 2 * uv_size; #if defined(_MSC_VER) _fseeki64(file_org, - static_cast<__int64>(num_skip_org) * - static_cast<__int64>(org_size), SEEK_SET); + static_cast<__int64>(num_skip_org) * static_cast<__int64>(org_size), + SEEK_SET); #else fseek(file_org, num_skip_org * total_size, SEEK_SET); #endif @@ -256,18 +259,18 @@ int main(int argc, const char* argv[]) { } if (verbose) { - printf("Size: %dx%d to %dx%d\n", image_width, image_height, - dst_width, dst_height); + printf("Size: %dx%d to %dx%d\n", image_width, image_height, dst_width, + dst_height); } int number_of_frames; - for (number_of_frames = 0; ; ++number_of_frames) { + for (number_of_frames = 0;; ++number_of_frames) { if (num_frames && number_of_frames >= num_frames) break; // Load original YUV or ARGB frame. 
- size_t bytes_org = fread(ch_org, sizeof(uint8), - static_cast<size_t>(org_size), file_org); + size_t bytes_org = + fread(ch_org, sizeof(uint8), static_cast<size_t>(org_size), file_org); if (bytes_org < static_cast<size_t>(org_size)) break; @@ -290,22 +293,17 @@ int main(int argc, const char* argv[]) { int half_src_height = (src_height + 1) / 2; int half_dst_width = (dst_width + 1) / 2; int half_dst_height = (dst_height + 1) / 2; - I420Scale(ch_org, src_width, - ch_org + src_width * src_height, half_src_width, - ch_org + src_width * src_height + - half_src_width * half_src_height, half_src_width, - image_width, image_height, - ch_rec, dst_width, - ch_rec + dst_width * dst_height, half_dst_width, - ch_rec + dst_width * dst_height + - half_dst_width * half_dst_height, half_dst_width, - dst_width, dst_height, - static_cast<libyuv::FilterMode>(filter)); + I420Scale( + ch_org, src_width, ch_org + src_width * src_height, half_src_width, + ch_org + src_width * src_height + half_src_width * half_src_height, + half_src_width, image_width, image_height, ch_rec, dst_width, + ch_rec + dst_width * dst_height, half_dst_width, + ch_rec + dst_width * dst_height + half_dst_width * half_dst_height, + half_dst_width, dst_width, dst_height, + static_cast<libyuv::FilterMode>(filter)); } else { - TileARGBScale(ch_org, Abs(image_width) * 4, - image_width, image_height, - ch_dst, dst_width * 4, - dst_width, dst_height, + TileARGBScale(ch_org, Abs(image_width) * 4, image_width, image_height, + ch_dst, dst_width * 4, dst_width, dst_height, static_cast<libyuv::FilterMode>(filter)); } bool rec_is_yuv = strstr(argv[fileindex_rec + cur_rec], "_P420.") != NULL; @@ -321,25 +319,24 @@ int main(int argc, const char* argv[]) { if (!org_is_yuv && rec_is_yuv) { int half_width = (dst_width + 1) / 2; int half_height = (dst_height + 1) / 2; - libyuv::ARGBToI420(ch_dst, dst_width * 4, - ch_rec, dst_width, - ch_rec + dst_width * dst_height, half_width, - ch_rec + dst_width * dst_height + - half_width * half_height, half_width, - dst_width, dst_height); + libyuv::ARGBToI420( + ch_dst, dst_width * 4, ch_rec, dst_width, + ch_rec + dst_width * dst_height, half_width, + ch_rec + dst_width * dst_height + half_width * half_height, + half_width, dst_width, dst_height); } // Output YUV or ARGB frame. 
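The I420Scale call above packs all the plane-pointer arithmetic into its argument list: for a W x H I420 frame the Y plane is W*H bytes and each chroma plane is ceil(W/2) x ceil(H/2), so the U plane starts at W*H and the V plane half_w*half_h bytes after that. A sketch of the same layout math; I420View and MapI420 are illustrative, not libyuv types:

    #include <cstdint>
    #include <cstdio>

    // Illustrative view of a contiguous I420 buffer: Y plane, then U, then V.
    struct I420View {
      uint8_t* y; int y_stride;
      uint8_t* u; int u_stride;
      uint8_t* v; int v_stride;
    };

    static I420View MapI420(uint8_t* buf, int width, int height) {
      const int half_w = (width + 1) / 2;   // chroma dimensions round up,
      const int half_h = (height + 1) / 2;  // matching the (w + 1) / 2 above
      I420View view;
      view.y = buf;                      view.y_stride = width;
      view.u = buf + width * height;     view.u_stride = half_w;
      view.v = view.u + half_w * half_h; view.v_stride = half_w;
      return view;
    }

    int main() {
      const int w = 1920, h = 801;  // odd height exercises the rounding
      const int half_w = (w + 1) / 2, half_h = (h + 1) / 2;
      const size_t total = static_cast<size_t>(w) * h +
                           2 * static_cast<size_t>(half_w) * half_h;
      uint8_t* buf = new uint8_t[total];
      I420View view = MapI420(buf, w, h);
      printf("I420 %dx%d: %zu bytes, V plane at offset %td\n",
             w, h, total, view.v - buf);
      delete[] buf;
      return 0;
    }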
if (rec_is_yuv) { - size_t bytes_rec = fwrite(ch_rec, sizeof(uint8), - static_cast<size_t>(total_size), - file_rec[cur_rec]); + size_t bytes_rec = + fwrite(ch_rec, sizeof(uint8), static_cast<size_t>(total_size), + file_rec[cur_rec]); if (bytes_rec < static_cast<size_t>(total_size)) break; } else { - size_t bytes_rec = fwrite(ch_dst, sizeof(uint8), - static_cast<size_t>(dst_size), - file_rec[cur_rec]); + size_t bytes_rec = + fwrite(ch_dst, sizeof(uint8), static_cast<size_t>(dst_size), + file_rec[cur_rec]); if (bytes_rec < static_cast<size_t>(dst_size)) break; } diff --git a/files/util/cpuid.c b/files/util/cpuid.c index 94e245b1..9716f115 100644 --- a/files/util/cpuid.c +++ b/files/util/cpuid.c @@ -79,6 +79,7 @@ int main(int argc, const char* argv[]) { int has_avx3 = TestCpuFlag(kCpuHasAVX3); int has_erms = TestCpuFlag(kCpuHasERMS); int has_fma3 = TestCpuFlag(kCpuHasFMA3); + int has_f16c = TestCpuFlag(kCpuHasF16C); printf("Has SSE2 %x\n", has_sse2); printf("Has SSSE3 %x\n", has_ssse3); printf("Has SSE4.1 %x\n", has_sse41); @@ -88,6 +89,7 @@ int main(int argc, const char* argv[]) { printf("Has AVX3 %x\n", has_avx3); printf("Has ERMS %x\n", has_erms); printf("Has FMA3 %x\n", has_fma3); + printf("Has F16C %x\n", has_f16c); } return 0; } diff --git a/files/util/psnr.cc b/files/util/psnr.cc index 52b04bd5..27f876c0 100644 --- a/files/util/psnr.cc +++ b/files/util/psnr.cc @@ -27,7 +27,7 @@ typedef unsigned __int64 uint64; #else // COMPILER_MSVC #if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__) typedef unsigned long uint64; // NOLINT -#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__) +#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__) typedef unsigned long long uint64; // NOLINT #endif // __LP64__ #endif // _MSC_VER @@ -39,85 +39,81 @@ typedef unsigned long long uint64; // NOLINT !defined(__aarch64__) #define HAS_SUMSQUAREERROR_NEON static uint32 SumSquareError_NEON(const uint8* src_a, - const uint8* src_b, int count) { + const uint8* src_b, + int count) { volatile uint32 sse; - asm volatile ( - "vmov.u8 q7, #0 \n" - "vmov.u8 q9, #0 \n" - "vmov.u8 q8, #0 \n" - "vmov.u8 q10, #0 \n" + asm volatile( + "vmov.u8 q7, #0 \n" + "vmov.u8 q9, #0 \n" + "vmov.u8 q8, #0 \n" + "vmov.u8 q10, #0 \n" - "1: \n" - "vld1.u8 {q0}, [%0]! \n" - "vld1.u8 {q1}, [%1]! \n" - "vsubl.u8 q2, d0, d2 \n" - "vsubl.u8 q3, d1, d3 \n" - "vmlal.s16 q7, d4, d4 \n" - "vmlal.s16 q8, d6, d6 \n" - "vmlal.s16 q8, d5, d5 \n" - "vmlal.s16 q10, d7, d7 \n" - "subs %2, %2, #16 \n" - "bhi 1b \n" + "1: \n" + "vld1.u8 {q0}, [%0]! \n" + "vld1.u8 {q1}, [%1]! 
\n" + "vsubl.u8 q2, d0, d2 \n" + "vsubl.u8 q3, d1, d3 \n" + "vmlal.s16 q7, d4, d4 \n" + "vmlal.s16 q8, d6, d6 \n" + "vmlal.s16 q8, d5, d5 \n" + "vmlal.s16 q10, d7, d7 \n" + "subs %2, %2, #16 \n" + "bhi 1b \n" - "vadd.u32 q7, q7, q8 \n" - "vadd.u32 q9, q9, q10 \n" - "vadd.u32 q10, q7, q9 \n" - "vpaddl.u32 q1, q10 \n" - "vadd.u64 d0, d2, d3 \n" - "vmov.32 %3, d0[0] \n" - : "+r"(src_a), - "+r"(src_b), - "+r"(count), - "=r"(sse) - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10"); + "vadd.u32 q7, q7, q8 \n" + "vadd.u32 q9, q9, q10 \n" + "vadd.u32 q10, q7, q9 \n" + "vpaddl.u32 q1, q10 \n" + "vadd.u64 d0, d2, d3 \n" + "vmov.32 %3, d0[0] \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10"); return sse; } #elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #define HAS_SUMSQUAREERROR_NEON static uint32 SumSquareError_NEON(const uint8* src_a, - const uint8* src_b, int count) { + const uint8* src_b, + int count) { volatile uint32 sse; - asm volatile ( - "eor v16.16b, v16.16b, v16.16b \n" - "eor v18.16b, v18.16b, v18.16b \n" - "eor v17.16b, v17.16b, v17.16b \n" - "eor v19.16b, v19.16b, v19.16b \n" + asm volatile( + "eor v16.16b, v16.16b, v16.16b \n" + "eor v18.16b, v18.16b, v18.16b \n" + "eor v17.16b, v17.16b, v17.16b \n" + "eor v19.16b, v19.16b, v19.16b \n" - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" - "ld1 {v1.16b}, [%1], #16 \n" - "subs %w2, %w2, #16 \n" - "usubl v2.8h, v0.8b, v1.8b \n" - "usubl2 v3.8h, v0.16b, v1.16b \n" - "smlal v16.4s, v2.4h, v2.4h \n" - "smlal v17.4s, v3.4h, v3.4h \n" - "smlal2 v18.4s, v2.8h, v2.8h \n" - "smlal2 v19.4s, v3.8h, v3.8h \n" - "b.gt 1b \n" + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" + "ld1 {v1.16b}, [%1], #16 \n" + "subs %w2, %w2, #16 \n" + "usubl v2.8h, v0.8b, v1.8b \n" + "usubl2 v3.8h, v0.16b, v1.16b \n" + "smlal v16.4s, v2.4h, v2.4h \n" + "smlal v17.4s, v3.4h, v3.4h \n" + "smlal2 v18.4s, v2.8h, v2.8h \n" + "smlal2 v19.4s, v3.8h, v3.8h \n" + "b.gt 1b \n" - "add v16.4s, v16.4s, v17.4s \n" - "add v18.4s, v18.4s, v19.4s \n" - "add v19.4s, v16.4s, v18.4s \n" - "addv s0, v19.4s \n" - "fmov %w3, s0 \n" - : "+r"(src_a), - "+r"(src_b), - "+r"(count), - "=r"(sse) - : - : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); + "add v16.4s, v16.4s, v17.4s \n" + "add v18.4s, v18.4s, v19.4s \n" + "add v19.4s, v16.4s, v18.4s \n" + "addv s0, v19.4s \n" + "fmov %w3, s0 \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) + : + : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); return sse; } #elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) #define HAS_SUMSQUAREERROR_SSE2 -__declspec(naked) -static uint32 SumSquareError_SSE2(const uint8* /*src_a*/, - const uint8* /*src_b*/, int /*count*/) { +__declspec(naked) static uint32 SumSquareError_SSE2(const uint8* /*src_a*/, + const uint8* /*src_b*/, + int /*count*/) { __asm { - mov eax, [esp + 4] // src_a - mov edx, [esp + 8] // src_b - mov ecx, [esp + 12] // count + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count pxor xmm0, xmm0 pxor xmm5, xmm5 sub edx, eax @@ -151,47 +147,49 @@ static uint32 SumSquareError_SSE2(const uint8* /*src_a*/, #elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) #define HAS_SUMSQUAREERROR_SSE2 static uint32 SumSquareError_SSE2(const uint8* src_a, - const uint8* src_b, int count) { + const uint8* src_b, + int count) { uint32 sse; - asm volatile ( // NOLINT - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm5 
\n" - "sub %0,%1 \n" + asm volatile( // NOLINT + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm5 \n" + "sub %0,%1 \n" - "1: \n" - "movdqu (%0),%%xmm1 \n" - "movdqu (%0,%1,1),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "movdqu %%xmm1,%%xmm3 \n" - "psubusb %%xmm2,%%xmm1 \n" - "psubusb %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm1 \n" - "movdqu %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm2 \n" - "pmaddwd %%xmm1,%%xmm1 \n" - "pmaddwd %%xmm2,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" - "paddd %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" - "ja 1b \n" + "1: \n" + "movdqu (%0),%%xmm1 \n" + "movdqu (%0,%1,1),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movdqu %%xmm1,%%xmm3 \n" + "psubusb %%xmm2,%%xmm1 \n" + "psubusb %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm1 \n" + "movdqu %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm2 \n" + "pmaddwd %%xmm1,%%xmm1 \n" + "pmaddwd %%xmm2,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" + "paddd %%xmm2,%%xmm0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" - "pshufd $0xee,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "pshufd $0x1,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0,%3 \n" + "pshufd $0xee,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "pshufd $0x1,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0,%3 \n" - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "=g"(sse) // %3 - : - : "memory", "cc" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=g"(sse) // %3 + : + : "memory", "cc" #if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + , + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif - ); // NOLINT + ); // NOLINT return sse; } #endif // LIBYUV_DISABLE_X86 etc @@ -199,20 +197,22 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, #if defined(HAS_SUMSQUAREERROR_SSE2) #if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) static __inline void __cpuid(int cpu_info[4], int info_type) { - asm volatile ( // NOLINT - "mov %%ebx, %%edi \n" - "cpuid \n" - "xchg %%edi, %%ebx \n" - : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(info_type)); + asm volatile( // NOLINT + "mov %%ebx, %%edi \n" + "cpuid \n" + "xchg %%edi, %%ebx \n" + : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), + "=d"(cpu_info[3]) + : "a"(info_type)); } // For gcc/clang but not clangcl. 
-#elif (defined(__i386__) || defined(__x86_64__)) && !defined(_MSC_VER) +#elif !defined(_MSC_VER) && (defined(__i386__) || defined(__x86_64__)) static __inline void __cpuid(int cpu_info[4], int info_type) { - asm volatile ( // NOLINT - "cpuid \n" - : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(info_type)); + asm volatile( // NOLINT + "cpuid \n" + : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), + "=d"(cpu_info[3]) + : "a"(info_type)); } #endif @@ -229,7 +229,8 @@ static int CpuHasSSE2() { #endif // HAS_SUMSQUAREERROR_SSE2 static uint32 SumSquareError_C(const uint8* src_a, - const uint8* src_b, int count) { + const uint8* src_b, + int count) { uint32 sse = 0u; for (int x = 0; x < count; ++x) { int diff = src_a[x] - src_b[x]; @@ -239,9 +240,10 @@ static uint32 SumSquareError_C(const uint8* src_a, } double ComputeSumSquareError(const uint8* src_a, - const uint8* src_b, int count) { - uint32 (*SumSquareError)(const uint8* src_a, - const uint8* src_b, int count) = SumSquareError_C; + const uint8* src_b, + int count) { + uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) = + SumSquareError_C; #if defined(HAS_SUMSQUAREERROR_NEON) SumSquareError = SumSquareError_NEON; #endif @@ -253,7 +255,7 @@ double ComputeSumSquareError(const uint8* src_a, const int kBlockSize = 1 << 15; uint64 sse = 0; #ifdef _OPENMP -#pragma omp parallel for reduction(+: sse) +#pragma omp parallel for reduction(+ : sse) #endif for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { sse += SumSquareError(src_a + i, src_b + i, kBlockSize); diff --git a/files/util/psnr_main.cc b/files/util/psnr_main.cc index 0518ab84..4d930be4 100644 --- a/files/util/psnr_main.cc +++ b/files/util/psnr_main.cc @@ -71,8 +71,8 @@ bool ExtractResolutionFromFilename(const char* name, // Isolate the .width_height. section of the filename by searching for a // dot or underscore followed by a digit. for (int i = 0; name[i]; ++i) { - if ((name[i] == '.' || name[i] == '_') && - name[i + 1] >= '0' && name[i + 1] <= '9') { + if ((name[i] == '.' || name[i] == '_') && name[i + 1] >= '0' && + name[i + 1] <= '9') { int n = sscanf(name + i + 1, "%dx%d", width_ptr, height_ptr); // NOLINT if (2 == n) { return true; @@ -88,7 +88,7 @@ bool ExtractResolutionFromFilename(const char* name, return false; } fseek(file_org, 0, SEEK_END); - size_t total_size = ftell(file_org); + size_t total_size = ftell(file_org); fseek(file_org, 0, SEEK_SET); uint8* const ch_org = new uint8[total_size]; memset(ch_org, 0, total_size); @@ -109,8 +109,10 @@ bool ExtractResolutionFromFilename(const char* name, // This can be useful when comparing codecs that are inconsistant about Y uint8 ScaleY(uint8 y) { int ny = (y - 16) * 256 / 224; - if (ny < 0) ny = 0; - if (ny > 255) ny = 255; + if (ny < 0) + ny = 0; + if (ny > 255) + ny = 255; return static_cast<uint8>(ny); } @@ -119,16 +121,18 @@ double GetMSE(double sse, double size) { return sse / size; } -void PrintHelp(const char * program) { +void PrintHelp(const char* program) { printf("%s [-options] org_seq rec_seq [rec_seq2.. etc]\n", program); #ifdef HAVE_JPEG printf("jpeg or raw YUV 420 supported.\n"); #endif printf("options:\n"); - printf(" -s <width> <height> .... specify YUV size, mandatory if none of the " - "sequences have the\n"); - printf(" resolution embedded in their filename (ie. " - "name.1920x800_24Hz_P420.yuv)\n"); + printf( + " -s <width> <height> .... 
specify YUV size, mandatory if none of the " + "sequences have the\n"); + printf( + " resolution embedded in their filename (ie. " + "name.1920x800_24Hz_P420.yuv)\n"); printf(" -psnr .................. compute PSNR (default)\n"); printf(" -ssim .................. compute SSIM\n"); printf(" -mse ................... compute MSE\n"); @@ -146,7 +150,8 @@ void PrintHelp(const char * program) { } void ParseOptions(int argc, const char* argv[]) { - if (argc <= 1) PrintHelp(argv[0]); + if (argc <= 1) + PrintHelp(argv[0]); for (int c = 1; c < argc; ++c) { if (!strcmp(argv[c], "-v")) { verbose = true; @@ -168,16 +173,16 @@ void ParseOptions(int argc, const char* argv[]) { } else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) { PrintHelp(argv[0]); } else if (!strcmp(argv[c], "-s") && c + 2 < argc) { - image_width = atoi(argv[++c]); // NOLINT - image_height = atoi(argv[++c]); // NOLINT + image_width = atoi(argv[++c]); // NOLINT + image_height = atoi(argv[++c]); // NOLINT } else if (!strcmp(argv[c], "-skip") && c + 2 < argc) { - num_skip_org = atoi(argv[++c]); // NOLINT - num_skip_rec = atoi(argv[++c]); // NOLINT + num_skip_org = atoi(argv[++c]); // NOLINT + num_skip_rec = atoi(argv[++c]); // NOLINT } else if (!strcmp(argv[c], "-frames") && c + 1 < argc) { - num_frames = atoi(argv[++c]); // NOLINT + num_frames = atoi(argv[++c]); // NOLINT #ifdef _OPENMP } else if (!strcmp(argv[c], "-t") && c + 1 < argc) { - num_threads = atoi(argv[++c]); // NOLINT + num_threads = atoi(argv[++c]); // NOLINT #endif } else if (argv[c][0] == '-') { fprintf(stderr, "Unknown option. %s\n", argv[c]); @@ -206,11 +211,9 @@ void ParseOptions(int argc, const char* argv[]) { int org_width, org_height; int rec_width, rec_height; bool org_res_avail = ExtractResolutionFromFilename(argv[fileindex_org], - &org_width, - &org_height); + &org_width, &org_height); bool rec_res_avail = ExtractResolutionFromFilename(argv[fileindex_rec], - &rec_width, - &rec_height); + &rec_width, &rec_height); if (org_res_avail) { if (rec_res_avail) { if ((org_width == rec_width) && (org_height == rec_height)) { @@ -234,11 +237,15 @@ void ParseOptions(int argc, const char* argv[]) { } } -bool UpdateMetrics(uint8* ch_org, uint8* ch_rec, - const int y_size, const int uv_size, const size_t total_size, +bool UpdateMetrics(uint8* ch_org, + uint8* ch_rec, + const int y_size, + const int uv_size, + const size_t total_size, int number_of_frames, metric* cur_distortion_psnr, - metric* distorted_frame, bool do_psnr) { + metric* distorted_frame, + bool do_psnr) { const int uv_offset = (do_swap_uv ? 
uv_size : 0); const uint8* const u_org = ch_org + y_size + uv_offset; const uint8* const u_rec = ch_rec + y_size; @@ -247,11 +254,11 @@ bool UpdateMetrics(uint8* ch_org, uint8* ch_rec, if (do_psnr) { #ifdef HAVE_JPEG double y_err = static_cast<double>( - libyuv::ComputeSumSquareError(ch_org, ch_rec, y_size)); + libyuv::ComputeSumSquareError(ch_org, ch_rec, y_size)); double u_err = static_cast<double>( - libyuv::ComputeSumSquareError(u_org, u_rec, uv_size)); + libyuv::ComputeSumSquareError(u_org, u_rec, uv_size)); double v_err = static_cast<double>( - libyuv::ComputeSumSquareError(v_org, v_rec, uv_size)); + libyuv::ComputeSumSquareError(v_org, v_rec, uv_size)); #else double y_err = ComputeSumSquareError(ch_org, ch_rec, y_size); double u_err = ComputeSumSquareError(u_org, u_rec, uv_size); @@ -265,17 +272,17 @@ bool UpdateMetrics(uint8* ch_org, uint8* ch_rec, distorted_frame->y = ComputePSNR(y_err, static_cast<double>(y_size)); distorted_frame->u = ComputePSNR(u_err, static_cast<double>(uv_size)); distorted_frame->v = ComputePSNR(v_err, static_cast<double>(uv_size)); - distorted_frame->all = ComputePSNR(total_err, - static_cast<double>(total_size)); + distorted_frame->all = + ComputePSNR(total_err, static_cast<double>(total_size)); } else { distorted_frame->y = CalcSSIM(ch_org, ch_rec, image_width, image_height); - distorted_frame->u = CalcSSIM(u_org, u_rec, (image_width + 1) / 2, - (image_height + 1) / 2); - distorted_frame->v = CalcSSIM(v_org, v_rec, (image_width + 1) / 2, - (image_height + 1) / 2); + distorted_frame->u = + CalcSSIM(u_org, u_rec, (image_width + 1) / 2, (image_height + 1) / 2); + distorted_frame->v = + CalcSSIM(v_org, v_rec, (image_width + 1) / 2, (image_height + 1) / 2); distorted_frame->all = - (distorted_frame->y + distorted_frame->u + distorted_frame->v) - / total_size; + (distorted_frame->y + distorted_frame->u + distorted_frame->v) / + total_size; distorted_frame->y /= y_size; distorted_frame->u /= uv_size; distorted_frame->v /= uv_size; @@ -330,8 +337,8 @@ int main(int argc, const char* argv[]) { } // Open all files to compare to - FILE** file_rec = new FILE* [num_rec]; - memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT + FILE** file_rec = new FILE*[num_rec]; + memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { file_rec[cur_rec] = fopen(argv[fileindex_rec + cur_rec], "rb"); if (file_rec[cur_rec] == NULL) { @@ -347,20 +354,21 @@ int main(int argc, const char* argv[]) { const int y_size = image_width * image_height; const int uv_size = ((image_width + 1) / 2) * ((image_height + 1) / 2); - const size_t total_size = y_size + 2 * uv_size; // NOLINT + const size_t total_size = y_size + 2 * uv_size; // NOLINT #if defined(_MSC_VER) - _fseeki64(file_org, - static_cast<__int64>(num_skip_org) * - static_cast<__int64>(total_size), SEEK_SET); + _fseeki64( + file_org, + static_cast<__int64>(num_skip_org) * static_cast<__int64>(total_size), + SEEK_SET); #else fseek(file_org, num_skip_org * total_size, SEEK_SET); #endif for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { #if defined(_MSC_VER) - _fseeki64(file_rec[cur_rec], - static_cast<__int64>(num_skip_rec) * - static_cast<__int64>(total_size), - SEEK_SET); + _fseeki64( + file_rec[cur_rec], + static_cast<__int64>(num_skip_rec) * static_cast<__int64>(total_size), + SEEK_SET); #else fseek(file_rec[cur_rec], num_skip_rec * total_size, SEEK_SET); #endif @@ -420,7 +428,7 @@ int main(int argc, const char* argv[]) { } int number_of_frames; - for (number_of_frames = 0; ; 
++number_of_frames) { + for (number_of_frames = 0;; ++number_of_frames) { if (num_frames && number_of_frames >= num_frames) break; @@ -432,17 +440,11 @@ int main(int argc, const char* argv[]) { memcpy(ch_jpeg, ch_org, bytes_org); memset(ch_org, 0, total_size); - if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_org, - ch_org, - image_width, - ch_org + y_size, - (image_width + 1) / 2, + if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_org, ch_org, image_width, + ch_org + y_size, (image_width + 1) / 2, ch_org + y_size + uv_size, - (image_width + 1) / 2, - image_width, - image_height, - image_width, - image_height)) { + (image_width + 1) / 2, image_width, + image_height, image_width, image_height)) { delete[] ch_jpeg; break; } @@ -453,8 +455,8 @@ int main(int argc, const char* argv[]) { } for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { - size_t bytes_rec = fread(ch_rec, sizeof(uint8), - total_size, file_rec[cur_rec]); + size_t bytes_rec = + fread(ch_rec, sizeof(uint8), total_size, file_rec[cur_rec]); if (bytes_rec < total_size) { #ifdef HAVE_JPEG // Try parsing file as a jpeg. @@ -462,17 +464,11 @@ int main(int argc, const char* argv[]) { memcpy(ch_jpeg, ch_rec, bytes_rec); memset(ch_rec, 0, total_size); - if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_rec, - ch_rec, - image_width, - ch_rec + y_size, - (image_width + 1) / 2, + if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_rec, ch_rec, image_width, + ch_rec + y_size, (image_width + 1) / 2, ch_rec + y_size + uv_size, - (image_width + 1) / 2, - image_width, - image_height, - image_width, - image_height)) { + (image_width + 1) / 2, image_width, + image_height, image_width, image_height)) { delete[] ch_jpeg; break; } @@ -488,10 +484,8 @@ int main(int argc, const char* argv[]) { if (do_psnr) { metric distorted_frame; metric* cur_distortion_psnr = &distortion_psnr[cur_rec]; - bool ismin = UpdateMetrics(ch_org, ch_rec, - y_size, uv_size, total_size, - number_of_frames, - cur_distortion_psnr, + bool ismin = UpdateMetrics(ch_org, ch_rec, y_size, uv_size, total_size, + number_of_frames, cur_distortion_psnr, &distorted_frame, true); if (verbose) { printf("\t%10.6f", distorted_frame.y); @@ -504,10 +498,8 @@ int main(int argc, const char* argv[]) { if (do_ssim) { metric distorted_frame; metric* cur_distortion_ssim = &distortion_ssim[cur_rec]; - bool ismin = UpdateMetrics(ch_org, ch_rec, - y_size, uv_size, total_size, - number_of_frames, - cur_distortion_ssim, + bool ismin = UpdateMetrics(ch_org, ch_rec, y_size, uv_size, total_size, + number_of_frames, cur_distortion_ssim, &distorted_frame, false); if (verbose) { printf("\t%10.6f", distorted_frame.y); @@ -543,24 +535,20 @@ int main(int argc, const char* argv[]) { } if (do_psnr) { - const double global_psnr_y = ComputePSNR( - cur_distortion_psnr->global_y, - static_cast<double>(y_size) * number_of_frames); - const double global_psnr_u = ComputePSNR( - cur_distortion_psnr->global_u, - static_cast<double>(uv_size) * number_of_frames); - const double global_psnr_v = ComputePSNR( - cur_distortion_psnr->global_v, - static_cast<double>(uv_size) * number_of_frames); - const double global_psnr_all = ComputePSNR( - cur_distortion_psnr->global_all, - static_cast<double>(total_size) * number_of_frames); - printf("Global:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", - global_psnr_y, - global_psnr_u, - global_psnr_v, - global_psnr_all, - number_of_frames); + const double global_psnr_y = + ComputePSNR(cur_distortion_psnr->global_y, + static_cast<double>(y_size) * number_of_frames); + const double global_psnr_u = + 
ComputePSNR(cur_distortion_psnr->global_u, + static_cast<double>(uv_size) * number_of_frames); + const double global_psnr_v = + ComputePSNR(cur_distortion_psnr->global_v, + static_cast<double>(uv_size) * number_of_frames); + const double global_psnr_all = + ComputePSNR(cur_distortion_psnr->global_all, + static_cast<double>(total_size) * number_of_frames); + printf("Global:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", global_psnr_y, + global_psnr_u, global_psnr_v, global_psnr_all, number_of_frames); if (show_name) { printf("\t%s", argv[fileindex_rec + cur_rec]); } @@ -570,20 +558,14 @@ int main(int argc, const char* argv[]) { if (!quiet) { printf("Avg:"); if (do_psnr) { - printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", - cur_distortion_psnr->y, - cur_distortion_psnr->u, - cur_distortion_psnr->v, - cur_distortion_psnr->all, - number_of_frames); + printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", cur_distortion_psnr->y, + cur_distortion_psnr->u, cur_distortion_psnr->v, + cur_distortion_psnr->all, number_of_frames); } if (do_ssim) { - printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", - cur_distortion_ssim->y, - cur_distortion_ssim->u, - cur_distortion_ssim->v, - cur_distortion_ssim->all, - number_of_frames); + printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", cur_distortion_ssim->y, + cur_distortion_ssim->u, cur_distortion_ssim->v, + cur_distortion_ssim->all, number_of_frames); } if (show_name) { printf("\t%s", argv[fileindex_rec + cur_rec]); @@ -594,19 +576,15 @@ int main(int argc, const char* argv[]) { printf("Min:"); if (do_psnr) { printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", - cur_distortion_psnr->min_y, - cur_distortion_psnr->min_u, - cur_distortion_psnr->min_v, - cur_distortion_psnr->min_all, - cur_distortion_psnr->min_frame); + cur_distortion_psnr->min_y, cur_distortion_psnr->min_u, + cur_distortion_psnr->min_v, cur_distortion_psnr->min_all, + cur_distortion_psnr->min_frame); } if (do_ssim) { printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", - cur_distortion_ssim->min_y, - cur_distortion_ssim->min_u, - cur_distortion_ssim->min_v, - cur_distortion_ssim->min_all, - cur_distortion_ssim->min_frame); + cur_distortion_ssim->min_y, cur_distortion_ssim->min_u, + cur_distortion_ssim->min_v, cur_distortion_ssim->min_all, + cur_distortion_ssim->min_frame); } if (show_name) { printf("\t%s", argv[fileindex_rec + cur_rec]); @@ -615,20 +593,20 @@ int main(int argc, const char* argv[]) { } if (do_mse) { - double global_mse_y = GetMSE(cur_distortion_psnr->global_y, - static_cast<double>(y_size) * number_of_frames); - double global_mse_u = GetMSE(cur_distortion_psnr->global_u, - static_cast<double>(uv_size) * number_of_frames); - double global_mse_v = GetMSE(cur_distortion_psnr->global_v, - static_cast<double>(uv_size) * number_of_frames); - double global_mse_all = GetMSE(cur_distortion_psnr->global_all, - static_cast<double>(total_size) * number_of_frames); - printf("MSE:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", - global_mse_y, - global_mse_u, - global_mse_v, - global_mse_all, - number_of_frames); + double global_mse_y = + GetMSE(cur_distortion_psnr->global_y, + static_cast<double>(y_size) * number_of_frames); + double global_mse_u = + GetMSE(cur_distortion_psnr->global_u, + static_cast<double>(uv_size) * number_of_frames); + double global_mse_v = + GetMSE(cur_distortion_psnr->global_v, + static_cast<double>(uv_size) * number_of_frames); + double global_mse_all = + GetMSE(cur_distortion_psnr->global_all, + static_cast<double>(total_size) * number_of_frames); + printf("MSE:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", 
global_mse_y, + global_mse_u, global_mse_v, global_mse_all, number_of_frames); if (show_name) { printf("\t%s", argv[fileindex_rec + cur_rec]); } diff --git a/files/util/ssim.cc b/files/util/ssim.cc index 5a6399b7..43e725d8 100644 --- a/files/util/ssim.cc +++ b/files/util/ssim.cc @@ -16,11 +16,11 @@ extern "C" { #endif -typedef unsigned int uint32; // NOLINT -typedef unsigned short uint16; // NOLINT +typedef unsigned int uint32; // NOLINT +typedef unsigned short uint16; // NOLINT #if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \ - (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2))) + (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2))) #define __SSE2__ #endif #if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__) @@ -38,22 +38,29 @@ enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 }; // The maximum value (11 x 11) must be less than 128 to avoid sign // problems during the calls to _mm_mullo_epi16(). static const int K[KERNEL_SIZE] = { - 1, 3, 7, 11, 7, 3, 1 // ~11 * exp(-0.3 * i * i) + 1, 3, 7, 11, 7, 3, 1 // ~11 * exp(-0.3 * i * i) }; static const double kiW[KERNEL + 1 + 1] = { - 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j] - 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j] - 1. / 1056., // 1 / sum(i:0..5, j..6) K[i]*K[j] - 1. / 957., // 1 / sum(i:0..4, j..6) K[i]*K[j] - 1. / 726., // 1 / sum(i:0..3, j..6) K[i]*K[j] + 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j] + 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j] + 1. / 1056., // 1 / sum(i:0..5, j..6) K[i]*K[j] + 1. / 957., // 1 / sum(i:0..4, j..6) K[i]*K[j] + 1. / 726., // 1 / sum(i:0..3, j..6) K[i]*K[j] }; #if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__) -#define PWEIGHT(A, B) static_cast<uint16>(K[(A)] * K[(B)]) // weight product -#define MAKE_WEIGHT(L) \ - { { { PWEIGHT(L, 0), PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), \ - PWEIGHT(L, 4), PWEIGHT(L, 5), PWEIGHT(L, 6), 0 } } } +#define PWEIGHT(A, B) static_cast<uint16>(K[(A)] * K[(B)]) // weight product +#define MAKE_WEIGHT(L) \ + { \ + { \ + { \ + PWEIGHT(L, 0) \ + , PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), PWEIGHT(L, 4), \ + PWEIGHT(L, 5), PWEIGHT(L, 6), 0 \ + } \ + } \ + } // We need this union trick to be able to initialize constant static __m128i // values. We can't call _mm_set_epi16() for static compile-time initialization. @@ -62,32 +69,36 @@ static const struct { uint16 i16_[8]; __m128i m_; } values_; -} W0 = MAKE_WEIGHT(0), - W1 = MAKE_WEIGHT(1), - W2 = MAKE_WEIGHT(2), +} W0 = MAKE_WEIGHT(0), W1 = MAKE_WEIGHT(1), W2 = MAKE_WEIGHT(2), W3 = MAKE_WEIGHT(3); - // ... the rest is symmetric. +// ... the rest is symmetric. #undef MAKE_WEIGHT #undef PWEIGHT #endif // Common final expression for SSIM, once the weighted sums are known. -static double FinalizeSSIM(double iw, double xm, double ym, - double xxm, double xym, double yym) { +static double FinalizeSSIM(double iw, + double xm, + double ym, + double xxm, + double xym, + double yym) { const double iwx = xm * iw; const double iwy = ym * iw; double sxx = xxm * iw - iwx * iwx; double syy = yym * iw - iwy * iwy; // small errors are possible, due to rounding. Clamp to zero. - if (sxx < 0.) sxx = 0.; - if (syy < 0.) syy = 0.; + if (sxx < 0.) + sxx = 0.; + if (syy < 0.) + syy = 0.; const double sxsy = sqrt(sxx * syy); const double sxy = xym * iw - iwx * iwy; static const double C11 = (0.01 * 0.01) * (255 * 255); static const double C22 = (0.03 * 0.03) * (255 * 255); static const double C33 = (0.015 * 0.015) * (255 * 255); const double l = (2. 
* iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11); - const double c = (2. * sxsy + C22) / (sxx + syy + C22); + const double c = (2. * sxsy + C22) / (sxx + syy + C22); const double s = (sxy + C33) / (sxsy + C33); return l * c * s; } @@ -98,15 +109,21 @@ static double FinalizeSSIM(double iw, double xm, double ym, // Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1) // with a diff of 255, squared. The maximum error is thus 0x4388241, // which fits into 32 bits integers. -double GetSSIM(const uint8 *org, const uint8 *rec, - int xo, int yo, int W, int H, int stride) { +double GetSSIM(const uint8* org, + const uint8* rec, + int xo, + int yo, + int W, + int H, + int stride) { uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0; org += (yo - KERNEL) * stride; org += (xo - KERNEL); rec += (yo - KERNEL) * stride; rec += (xo - KERNEL); for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) { - if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) continue; + if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) + continue; const int Wy = K[y_]; for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) { const int Wxy = Wy * K[x_]; @@ -114,8 +131,8 @@ double GetSSIM(const uint8 *org, const uint8 *rec, const int org_x = org[x_]; const int rec_x = rec[x_]; ws += Wxy; - xm += Wxy * org_x; - ym += Wxy * rec_x; + xm += Wxy * org_x; + ym += Wxy * rec_x; xxm += Wxy * org_x * org_x; xym += Wxy * org_x * rec_x; yym += Wxy * rec_x * rec_x; @@ -125,8 +142,11 @@ double GetSSIM(const uint8 *org, const uint8 *rec, return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym); } -double GetSSIMFullKernel(const uint8 *org, const uint8 *rec, - int xo, int yo, int stride, +double GetSSIMFullKernel(const uint8* org, + const uint8* rec, + int xo, + int yo, + int stride, double area_weight) { uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0; @@ -161,8 +181,8 @@ double GetSSIMFullKernel(const uint8 *org, const uint8 *rec, const int ll2 = rec[dy2 - x]; const int lr2 = rec[dy2 + x]; - xm += Wxy * (ul1 + ur1 + ll1 + lr1); - ym += Wxy * (ul2 + ur2 + ll2 + lr2); + xm += Wxy * (ul1 + ur1 + ll1 + lr1); + ym += Wxy * (ul2 + ur2 + ll2 + lr2); xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1); xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2); yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2); @@ -189,8 +209,8 @@ double GetSSIMFullKernel(const uint8 *org, const uint8 *rec, const int l2 = rec[-y]; const int r2 = rec[y]; - xm += Wxy * (u1 + d1 + l1 + r1); - ym += Wxy * (u2 + d2 + l2 + r2); + xm += Wxy * (u1 + d1 + l1 + r1); + ym += Wxy * (u2 + d2 + l2 + r2); xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1); xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2); yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2); @@ -201,13 +221,13 @@ double GetSSIMFullKernel(const uint8 *org, const uint8 *rec, const int s1 = org[0]; const int s2 = rec[0]; - xm += Wxy * s1; - ym += Wxy * s2; + xm += Wxy * s1; + ym += Wxy * s2; xxm += Wxy * s1 * s1; xym += Wxy * s1 * s2; yym += Wxy * s2 * s2; -#else // __SSE2__ +#else // __SSE2__ org += (yo - KERNEL) * stride + (xo - KERNEL); rec += (yo - KERNEL) * stride + (xo - KERNEL); @@ -221,29 +241,31 @@ double GetSSIMFullKernel(const uint8 *org, const uint8 *rec, // Read 8 pixels at line #L, and convert to 16bit, perform weighting // and acccumulate. 
-#define LOAD_LINE_PAIR(L, WEIGHT) do { \ - const __m128i v0 = \ - _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L) * stride)); \ - const __m128i v1 = \ - _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L) * stride)); \ - const __m128i w0 = _mm_unpacklo_epi8(v0, zero); \ - const __m128i w1 = _mm_unpacklo_epi8(v1, zero); \ - const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_); \ - const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_); \ - x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero)); \ - y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero)); \ - x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero)); \ - y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero)); \ - xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0)); \ - xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1)); \ - yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1)); \ -} while (0) - -#define ADD_AND_STORE_FOUR_EPI32(M, OUT) do { \ - uint32 tmp[4]; \ - _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \ - (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0]; \ -} while (0) +#define LOAD_LINE_PAIR(L, WEIGHT) \ + do { \ + const __m128i v0 = \ + _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L)*stride)); \ + const __m128i v1 = \ + _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L)*stride)); \ + const __m128i w0 = _mm_unpacklo_epi8(v0, zero); \ + const __m128i w1 = _mm_unpacklo_epi8(v1, zero); \ + const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_); \ + const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_); \ + x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero)); \ + y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero)); \ + x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero)); \ + y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero)); \ + xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0)); \ + xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1)); \ + yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1)); \ + } while (0) + +#define ADD_AND_STORE_FOUR_EPI32(M, OUT) \ + do { \ + uint32 tmp[4]; \ + _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \ + (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0]; \ + } while (0) LOAD_LINE_PAIR(0, W0); LOAD_LINE_PAIR(1, W1); @@ -266,10 +288,14 @@ double GetSSIMFullKernel(const uint8 *org, const uint8 *rec, return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym); } -static int start_max(int x, int y) { return (x > y) ? x : y; } +static int start_max(int x, int y) { + return (x > y) ? x : y; +} -double CalcSSIM(const uint8 *org, const uint8 *rec, - const int image_width, const int image_height) { +double CalcSSIM(const uint8* org, + const uint8* rec, + const int image_width, + const int image_height) { double SSIM = 0.; const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL; const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL; @@ -284,7 +310,7 @@ double CalcSSIM(const uint8 *org, const uint8 *rec, } #ifdef _OPENMP - #pragma omp parallel for reduction(+: SSIM) +#pragma omp parallel for reduction(+ : SSIM) #endif for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) { for (int i = 0; i < KERNEL_X; ++i) { @@ -302,8 +328,8 @@ double CalcSSIM(const uint8 *org, const uint8 *rec, // NOTE: we could use similar method for the left-most pixels too. 
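FinalizeSSIM above is the standard SSIM combination of luminance (l), contrast (c) and structure (s) terms, computed from kernel-weighted sums of pixels, squares and cross products. Restating the same arithmetic standalone, with a made-up constant-patch input to show the identity case evaluates to exactly 1:

    #include <cmath>
    #include <cstdio>

    // Same arithmetic as FinalizeSSIM in util/ssim.cc: iw is the inverse of the
    // total kernel weight; the *m arguments are kernel-weighted sums of pixels,
    // squares and cross products.
    static double FinalizeSSIM(double iw, double xm, double ym,
                               double xxm, double xym, double yym) {
      const double iwx = xm * iw;  // weighted mean of org
      const double iwy = ym * iw;  // weighted mean of rec
      double sxx = xxm * iw - iwx * iwx;  // variances; clamp rounding noise to 0
      double syy = yym * iw - iwy * iwy;
      if (sxx < 0.) sxx = 0.;
      if (syy < 0.) syy = 0.;
      const double sxsy = std::sqrt(sxx * syy);
      const double sxy = xym * iw - iwx * iwy;  // covariance
      static const double C11 = (0.01 * 0.01) * (255 * 255);
      static const double C22 = (0.03 * 0.03) * (255 * 255);
      static const double C33 = (0.015 * 0.015) * (255 * 255);
      const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
      const double c = (2. * sxsy + C22) / (sxx + syy + C22);
      const double s = (sxy + C33) / (sxsy + C33);
      return l * c * s;
    }

    int main() {
      // Identical 7x7 patches of constant value 128 under the full kernel:
      // means match, variances vanish, SSIM is exactly 1.
      const double w = 1089.0;  // sum of K[i]*K[j] for the full kernel
      const double xm = 128.0 * w, xxm = 128.0 * 128.0 * w;
      printf("SSIM = %f\n", FinalizeSSIM(1.0 / w, xm, xm, xxm, xxm, xxm));
      return 0;
    }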
const int kScratchWidth = 8; const int kScratchStride = kScratchWidth + KERNEL + 1; - uint8 scratch_org[KERNEL_SIZE * kScratchStride] = { 0 }; - uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = { 0 }; + uint8 scratch_org[KERNEL_SIZE * kScratchStride] = {0}; + uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = {0}; for (int k = 0; k < KERNEL_SIZE; ++k) { const int offset = @@ -311,9 +337,9 @@ double CalcSSIM(const uint8 *org, const uint8 *rec, memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth); memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth); } - for (int k = 0; k <= KERNEL_X + 1; ++k) { - SSIM += GetSSIMFullKernel(scratch_org, scratch_rec, - KERNEL + k, KERNEL, kScratchStride, kiW[k]); + for (int k = 0; k <= KERNEL_X + 1; ++k) { + SSIM += GetSSIMFullKernel(scratch_org, scratch_rec, KERNEL + k, KERNEL, + kScratchStride, kiW[k]); } } } @@ -333,4 +359,3 @@ double CalcSSIM(const uint8 *org, const uint8 *rec, #ifdef __cplusplus } // extern "C" #endif - diff --git a/files/util/ssim.h b/files/util/ssim.h index 430eb71c..4647f45d 100644 --- a/files/util/ssim.h +++ b/files/util/ssim.h @@ -10,7 +10,7 @@ // Get SSIM for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format -#ifndef UTIL_SSIM_H_ // NOLINT +#ifndef UTIL_SSIM_H_ #define UTIL_SSIM_H_ #include <math.h> // For log10() @@ -24,8 +24,10 @@ typedef unsigned char uint8; #define UINT8_TYPE_DEFINED #endif -double CalcSSIM(const uint8* org, const uint8* rec, - const int image_width, const int image_height); +double CalcSSIM(const uint8* org, + const uint8* rec, + const int image_width, + const int image_height); double CalcLSSIM(double ssim); @@ -33,4 +35,4 @@ double CalcLSSIM(double ssim); } // extern "C" #endif -#endif // UTIL_SSIM_H_ // NOLINT +#endif // UTIL_SSIM_H_
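The summary lines printed by the psnr tool are simple transforms of the accumulated sums: MSE is sse/size (GetMSE above), PSNR is 10*log10(255^2/MSE), and LSSIM maps average SSIM onto a dB-like scale. A compact sketch of those relations; the CalcLSSIM body is an assumption based on its conventional definition, since this diff only shows its declaration:

    #include <cmath>
    #include <cstdio>

    static double GetMSE(double sse, double size) { return sse / size; }

    // PSNR from MSE over 8-bit samples; equivalent to ComputePSNR(sse, size).
    static double MseToPsnr(double mse) {
      return 10.0 * std::log10(255.0 * 255.0 / mse);
    }

    // Assumed definition of CalcLSSIM (body not shown in this diff): a dB-style
    // mapping that spreads SSIM values near 1.0 apart.
    static double CalcLSSIM(double ssim) {
      return -10.0 * std::log10(1.0 - ssim);
    }

    int main() {
      const double mse = GetMSE(162500.0, 65000.0);             // 2.5
      printf("PSNR %.2f dB\n", MseToPsnr(mse));                 // ~44.15 dB
      printf("LSSIM %.2f for SSIM 0.99\n", CalcLSSIM(0.99));    // 20.00
      return 0;
    }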