author     Yuan Tong <tongyuan200097@gmail.com>      2021-02-03 14:21:07 +0800
committer  Frank Barchard <fbarchard@chromium.org>   2021-02-03 10:53:02 +0000
commit     fc61dde1eb4b7807201fa20cd0a7d023363558b2 (patch)
tree       9cb82fea30f1da03c77e51d4f8f8c6ed6d9d6024
parent     c28d4049364d75710b1c49697a5814ab572af641 (diff)
download   libyuv-fc61dde1eb4b7807201fa20cd0a7d023363558b2.tar.gz
Add special optimization for I420ToI444 and I422ToI444
These functions use a (bi)linear filter to scale the U and V planes up to the size of the Y plane.
This helps improve the quality of YUV-to-RGB conversion.
Also added 10-bit and 12-bit versions:
I010ToI410
I210ToI410
I012ToI412
I212ToI412
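For illustration, a minimal caller-side sketch of the new 8-bit entry point. The helper name and stride choices are illustrative only, not part of this change:

```c
#include <stdint.h>
#include "libyuv/convert.h"

// Hypothetical helper: upsample an I420 frame to I444 in caller-allocated
// buffers. In I444 the U and V planes match the Y plane dimensions, so a
// stride of |width| works for all three destination planes here.
int UpsampleToI444(const uint8_t* src_y, const uint8_t* src_u,
                   const uint8_t* src_v, uint8_t* dst_y, uint8_t* dst_u,
                   uint8_t* dst_v, int width, int height) {
  // I420 chroma is subsampled 2x2, so its strides are (width + 1) / 2.
  // Returns 0 on success, -1 on invalid dimensions.
  return I420ToI444(src_y, width, src_u, (width + 1) / 2, src_v,
                    (width + 1) / 2, dst_y, width, dst_u, width, dst_v, width,
                    width, height);
}
```

The 10/12-bit variants (I010ToI410 etc.) take the same argument order with uint16_t planes.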
libyuv_unittest --gtest_filter=LibYUVConvertTest.I42*ToI444*:LibYUVConvertTest.I*1*ToI41*
R=fbarchard@chromium.org
Change-Id: Ie4a711a5ba28f2ff1f44c021f7a5c149022264c5
Bug: libyuv:872
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2658097
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
-rw-r--r--  README.chromium             |    2
-rw-r--r--  include/libyuv/convert.h    |   74
-rw-r--r--  include/libyuv/scale.h      |   12
-rw-r--r--  include/libyuv/scale_row.h  |  163
-rw-r--r--  include/libyuv/version.h    |    2
-rw-r--r--  source/convert_from.cc      |   96
-rw-r--r--  source/scale.cc             |  280
-rw-r--r--  source/scale_any.cc         |  185
-rw-r--r--  source/scale_common.cc      |   89
-rw-r--r--  source/scale_gcc.cc         |  834
-rw-r--r--  source/scale_neon.cc        |  194
-rw-r--r--  source/scale_neon64.cc      |  190
-rw-r--r--  unit_test/convert_test.cc   |   66
13 files changed, 2156 insertions(+), 31 deletions(-)
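The heart of the change is a pair of small fixed-point kernels, added as C reference code in scale_common.cc and then vectorized for SSE2/SSSE3/AVX2/NEON. Horizontal 2x upsampling blends the two nearest source samples 3:1 with rounding; the bilinear (2x2) version is the tensor product of those weights, 9:3:3:1 over a 2x2 neighborhood, corresponding to the quarter-offset sample positions drawn in the scale_common.cc comments below. A scalar sketch of the interior arithmetic (boundary columns are handled separately by the _Any_ wrappers in scale_any.cc; this sketch assumes one readable sample of slack past the interior of the source row):

```c
#include <stdint.h>

// 2x horizontal upsample, interior samples: each source sample yields two
// outputs at -1/4 and +1/4 offsets:
//   out_even = (3*s[x] + s[x+1] + 2) >> 2
//   out_odd  = (s[x] + 3*s[x+1] + 2) >> 2
static void Up2LinearRow(const uint8_t* src, uint8_t* dst, int dst_width) {
  int src_width = dst_width >> 1;
  for (int x = 0; x < src_width; ++x) {
    dst[2 * x + 0] = (3 * src[x] + src[x + 1] + 2) >> 2;
    dst[2 * x + 1] = (src[x] + 3 * src[x + 1] + 2) >> 2;
  }
}

// 2x bilinear upsample: reads two source rows s (near) and t (far) and
// writes two destination rows d and e; the weights are (3,1)/4 horizontally
// times (3,1)/4 vertically, i.e. (9,3,3,1)/16, with +8 for rounding.
static void Up2BilinearRows(const uint8_t* s, const uint8_t* t,
                            uint8_t* d, uint8_t* e, int dst_width) {
  int src_width = dst_width >> 1;
  for (int x = 0; x < src_width; ++x) {
    d[2 * x + 0] = (9 * s[x] + 3 * s[x + 1] + 3 * t[x] + t[x + 1] + 8) >> 4;
    d[2 * x + 1] = (3 * s[x] + 9 * s[x + 1] + t[x] + 3 * t[x + 1] + 8) >> 4;
    e[2 * x + 0] = (3 * s[x] + s[x + 1] + 9 * t[x] + 3 * t[x + 1] + 8) >> 4;
    e[2 * x + 1] = (s[x] + 3 * s[x + 1] + 3 * t[x] + 9 * t[x + 1] + 8) >> 4;
  }
}
```

The SIMD versions in scale_gcc.cc and scale_neon.cc compute exactly these expressions, 8 to 32 outputs at a time.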
diff --git a/README.chromium b/README.chromium index f49e57c8..d27d1aa3 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1772 +Version: 1774 License: BSD License File: LICENSE diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index 026b153c..50ffc2f0 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -89,6 +89,23 @@ int I422ToI420(const uint8_t* src_y, int width, int height); +// Convert I422 to I444. +LIBYUV_API +int I422ToI444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + // Convert I422 to NV21. LIBYUV_API int I422ToNV21(const uint8_t* src_y, @@ -122,6 +139,23 @@ int I420Copy(const uint8_t* src_y, int width, int height); +// Convert I420 to I444. +LIBYUV_API +int I420ToI444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + // Copy I010 to I010 #define I010ToI010 I010Copy #define H010ToH010 I010Copy @@ -159,6 +193,46 @@ int I010ToI420(const uint16_t* src_y, int width, int height); +// Convert I010 to I410 +LIBYUV_API +int I010ToI410(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert I012 to I412 +#define I012ToI412 I010ToI410 + +// Convert I212 to I412 +LIBYUV_API +int I210ToI410(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert I212 to I412 +#define I212ToI412 I210ToI410 + // Convert I400 (grey) to I420. LIBYUV_API int I400ToI420(const uint8_t* src_y, diff --git a/include/libyuv/scale.h b/include/libyuv/scale.h index add5a9eb..d06f8b52 100644 --- a/include/libyuv/scale.h +++ b/include/libyuv/scale.h @@ -49,6 +49,18 @@ void ScalePlane_16(const uint16_t* src, int dst_height, enum FilterMode filtering); +// Sample is expected to be in the low 12 bits. +LIBYUV_API +void ScalePlane_12(const uint16_t* src, + int src_stride, + int src_width, + int src_height, + uint16_t* dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering); + // Scales a YUV 4:2:0 image from the src width and height to the // dst width and height. 
// If filtering is kFilterNone, a simple nearest-neighbor algorithm is diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index a386d499..ee77d228 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -77,6 +77,12 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) #define HAS_SCALEUVROWDOWN2BOX_SSSE3 +#define HAS_SCALECOLUP2LINEAR_SSE2 +#define HAS_SCALECOLUP2LINEAR_SSSE3 +#define HAS_SCALEROWUP2LINEAR_SSE2 +#define HAS_SCALEROWUP2LINEAR_SSSE3 +#define HAS_SCALECOLUP2LINEAR_16_SSE2 +#define HAS_SCALEROWUP2LINEAR_16_SSE2 #endif // The following are available for gcc/clang x86 platforms, but @@ -86,6 +92,10 @@ extern "C" { (defined(__x86_64__) || defined(__i386__)) && !defined(_MSC_VER) && \ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) #define HAS_SCALEUVROWDOWN2BOX_AVX2 +#define HAS_SCALECOLUP2LINEAR_AVX2 +#define HAS_SCALEROWUP2LINEAR_AVX2 +#define HAS_SCALECOLUP2LINEAR_16_AVX2 +#define HAS_SCALEROWUP2LINEAR_16_AVX2 #endif // The following are available on all x86 platforms, but @@ -114,6 +124,10 @@ extern "C" { #define HAS_SCALEROWDOWN4_NEON #define HAS_SCALEUVROWDOWN2BOX_NEON #define HAS_SCALEUVROWDOWNEVEN_NEON +#define HAS_SCALECOLUP2LINEAR_NEON +#define HAS_SCALEROWUP2LINEAR_NEON +#define HAS_SCALECOLUP2LINEAR_16_NEON +#define HAS_SCALEROWUP2LINEAR_16_NEON #endif #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) @@ -279,6 +293,40 @@ void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* d, int dst_width); + +void ScaleRowUp2_Linear_C(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleRowUp2_Linear_Any_C(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_Any_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleRowUp2_Linear_16_Any_C(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_16_Any_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + void ScaleCols_C(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, @@ -508,6 +556,88 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); + +void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width); 
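All of the row functions declared above are selected at run time in scale.cc: the function pointer starts at the portable _Any_C wrapper and is upgraded when the CPU reports the corresponding feature. A condensed sketch of that pattern (the real code in ScalePlaneUp2_Linear below has SSSE3, AVX2 and NEON branches of the same shape):

```c
// Default to the portable C wrapper, then upgrade per detected CPU feature.
void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) =
    ScaleRowUp2_Linear_Any_C;
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
  ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2;  // later branches may override
}
#endif
```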
+void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleRowUp2_Linear_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + void ScaleRowDown2_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -1143,6 +1273,39 @@ void ScaleRowDown38_2_Box_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); +void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleRowUp2_Linear_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_16_Any_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); void ScaleAddRow_Any_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 086738ef..ff3c9dec 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1773 +#define LIBYUV_VERSION 1774 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert_from.cc b/source/convert_from.cc index f2cfc1d8..6524f969 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -159,6 +159,102 @@ int I420ToI444(const uint8_t* src_y, dst_uv_height); } +// 420 chroma to 444 chroma, 
10/12 bit version +LIBYUV_API +int I010ToI410(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + if (width == 0 || height == 0) { + return -1; + } + + if (dst_y) { + ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + } + ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), + SUBSAMPLE(height, 1, 1), dst_u, dst_stride_u, Abs(width), + Abs(height), kFilterBilinear); + ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), + SUBSAMPLE(height, 1, 1), dst_v, dst_stride_v, Abs(width), + Abs(height), kFilterBilinear); + return 0; +} + +// 422 chroma to 444 chroma, 10/12 bit version +LIBYUV_API +int I210ToI410(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + if (width == 0 || height == 0) { + return -1; + } + + if (dst_y) { + ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + } + ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u, + dst_stride_u, Abs(width), Abs(height), kFilterBilinear); + ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v, + dst_stride_v, Abs(width), Abs(height), kFilterBilinear); + return 0; +} + +// 422 chroma is 1/2 width, 1x height +// 444 chroma is 1x width, 1x height +LIBYUV_API +int I422ToI444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + if (width == 0 || height == 0) { + return -1; + } + + if (dst_y) { + ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + } + ScalePlane(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u, + dst_stride_u, Abs(width), Abs(height), kFilterBilinear); + ScalePlane(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v, + dst_stride_v, Abs(width), Abs(height), kFilterBilinear); + return 0; +} + // Copy to I400. Source can be I420,422,444,400,NV12,NV21 LIBYUV_API int I400Copy(const uint8_t* src_y, diff --git a/source/scale.cc b/source/scale.cc index cf3c0332..34c05699 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -1336,6 +1336,238 @@ void ScalePlaneBilinearUp(int src_width, } } +// Scale plane, horizontally 2 times, vertically any time. +// Uses linear filter horizontally, nearest vertically. +// This is an optimized version for scaling up a plane to 2 times of +// its original width, using linear interpolation. +// This is used to scale U and V planes of I422 to I444. +void ScalePlaneUp2_Linear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { + void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) = + ScaleRowUp2_Linear_Any_C; + int i; + int y; + int dy; + + // This function can only scale up by 2 times horizontally. 
+ assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1); + +#ifdef HAS_SCALEROWUP2LINEAR_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_NEON; + } +#endif + + if (dst_height == 1) { + ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr, + dst_width); + } else { + dy = FixedDiv(src_height - 1, dst_height - 1); + y = (1 << 15) - 1; + for (i = 0; i < dst_height; ++i) { + ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width); + dst_ptr += dst_stride; + y += dy; + } + } +} + +// Scale plane, 2 times. +// This is an optimized version for scaling up a plane to 2 times of +// its original size, using bilinear interpolation. +// This is used to scale U and V planes of I420 to I444. +void ScalePlaneUp2_Bilinear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { + void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleRowUp2_Bilinear_Any_C; + int x; + + // This function can only scale up by 2 times. + assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1); + assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1); + +#ifdef HAS_SCALEROWUP2LINEAR_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON; + } +#endif + + if (src_height == 1) { + Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width); + } else { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + // TODO test performance of writing one row of destination at a time + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } + } +} + +// Scale at most 14bit plane, horizontally 2 times. +// This is an optimized version for scaling up a plane to 2 times of +// its original width, using linear interpolation. +// stride is in count of uint16_t. +// This is used to scale U and V planes of I210 to I410 and I212 to I412. +void ScalePlaneUp2_16_Linear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + int i; + int y; + int dy; + + // This function can only scale up by 2 times horizontally. 
+ assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1); + +#ifdef HAS_SCALEROWUP2LINEAR_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON; + } +#endif + + if (dst_height == 1) { + ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr, + dst_width); + } else { + dy = FixedDiv(src_height - 1, dst_height - 1); + y = (1 << 15) - 1; + for (i = 0; i < dst_height; ++i) { + ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width); + dst_ptr += dst_stride; + y += dy; + } + } +} + +// Scale at most 12bit plane, up 2 times. +// This is an optimized version for scaling up a plane to 2 times of +// its original size, using bilinear interpolation. +// stride is in count of uint16_t. +// This is used to scale U and V planes of I010 to I410 and I012 to I412. +void ScalePlaneUp2_16_Bilinear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleRowUp2_Bilinear_16_Any_C; + int x; + + // This function can only scale up by 2 times. + assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1); + assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1); + +#ifdef HAS_SCALEROWUP2LINEAR_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_NEON; + } +#endif + + if (src_height == 1) { + Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width); + } else { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } + } +} + void ScalePlaneBilinearUp_16(int src_width, int src_height, int dst_width, @@ -1627,6 +1859,17 @@ void ScalePlane(const uint8_t* src, dst_stride, src, dst); return; } + if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { + ScalePlaneUp2_Linear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && + (filtering == kFilterBilinear || filtering == kFilterBox)) { + ScalePlaneUp2_Bilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } if (filtering && dst_height > src_height) { ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); @@ -1724,6 +1967,43 @@ void ScalePlane_16(const uint16_t* src, dst_stride, src, dst); } +LIBYUV_API +void ScalePlane_12(const uint16_t* src, + int src_stride, + int src_width, + int src_height, + uint16_t* dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering) { + // 
Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); + + // Negative height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * src_stride; + src_stride = -src_stride; + } + + if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { + ScalePlaneUp2_16_Linear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && + (filtering == kFilterBilinear || filtering == kFilterBox)) { + ScalePlaneUp2_16_Bilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + + ScalePlane_16(src, src_stride, src_width, src_height, dst, dst_stride, + dst_width, dst_height, filtering); +} + // Scale an I420 image. // This function in turn calls a scaling function for each plane. diff --git a/source/scale_any.cc b/source/scale_any.cc index c93d70c5..5fd27ae6 100644 --- a/source/scale_any.cc +++ b/source/scale_any.cc @@ -609,6 +609,191 @@ CANY(ScaleARGBFilterCols_Any_MSA, #endif #undef CANY +// Scale up horizontally 2 times using linear filter. +#define SUH2LANY(NAME, SIMD, C, MASK, PTYPE) \ + void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \ + int work_width = (dst_width - 1) & ~1; \ + int r = work_width & MASK; \ + int n = work_width & ~MASK; \ + dst_ptr[0] = src_ptr[0]; \ + if (work_width > 0) { \ + if (n != 0) { \ + SIMD(src_ptr, dst_ptr + 1, n); \ + } \ + C(src_ptr + (n / 2), dst_ptr + n + 1, r); \ + } \ + dst_ptr[dst_width - 1] = src_ptr[(dst_width / 2) - 1]; \ + } + +// Even the C version need to be wrapped, because boundary pixels have to +// be handled differently + +SUH2LANY(ScaleRowUp2_Linear_Any_C, + ScaleRowUp2_Linear_C, + ScaleRowUp2_Linear_C, + 0, + uint8_t) + +SUH2LANY(ScaleRowUp2_Linear_16_Any_C, + ScaleRowUp2_Linear_16_C, + ScaleRowUp2_Linear_16_C, + 0, + uint16_t) + +#ifdef HAS_SCALECOLUP2LINEAR_SSE2 +SUH2LANY(ScaleRowUp2_Linear_Any_SSE2, + ScaleRowUp2_Linear_SSE2, + ScaleRowUp2_Linear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALECOLUP2LINEAR_SSSE3 +SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3, + ScaleRowUp2_Linear_SSSE3, + ScaleRowUp2_Linear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2 +SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2, + ScaleRowUp2_Linear_16_SSE2, + ScaleRowUp2_Linear_16_C, + 15, + uint16_t) +#endif + +#ifdef HAS_SCALECOLUP2LINEAR_AVX2 +SUH2LANY(ScaleRowUp2_Linear_Any_AVX2, + ScaleRowUp2_Linear_AVX2, + ScaleRowUp2_Linear_C, + 31, + uint8_t) +#endif + +#ifdef HAS_SCALECOLUP2LINEAR_16_AVX2 +SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2, + ScaleRowUp2_Linear_16_AVX2, + ScaleRowUp2_Linear_16_C, + 15, + uint16_t) +#endif + +#ifdef HAS_SCALECOLUP2LINEAR_NEON +SUH2LANY(ScaleRowUp2_Linear_Any_NEON, + ScaleRowUp2_Linear_NEON, + ScaleRowUp2_Linear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALECOLUP2LINEAR_16_NEON +SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON, + ScaleRowUp2_Linear_16_NEON, + ScaleRowUp2_Linear_16_C, + 15, + uint16_t) +#endif + +#undef SUH2LANY + +// Scale up 2 times using bilinear filter. 
+// This function produces 2 rows at a time +#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \ + void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \ + ptrdiff_t dst_stride, int dst_width) { \ + int work_width = (dst_width - 1) & ~1; \ + int r = work_width & MASK; \ + int n = work_width & ~MASK; \ + const PTYPE* sa = src_ptr; \ + const PTYPE* sb = src_ptr + src_stride; \ + PTYPE* da = dst_ptr; \ + PTYPE* db = dst_ptr + dst_stride; \ + da[0] = (3 * sa[0] + sb[0]) >> 2; \ + db[0] = (sa[0] + 3 * sb[0]) >> 2; \ + if (work_width > 0) { \ + if (n != 0) { \ + SIMD(sa, sb - sa, da + 1, db - da, n); \ + } \ + C(sa + (n / 2), sb - sa, da + n + 1, db - da, r); \ + } \ + da[dst_width - 1] = \ + (3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2]) >> 2; \ + db[dst_width - 1] = \ + (sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2]) >> 2; \ + } + +SU2BLANY(ScaleRowUp2_Bilinear_Any_C, + ScaleRowUp2_Bilinear_C, + ScaleRowUp2_Bilinear_C, + 0, + uint8_t) + +SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C, + ScaleRowUp2_Bilinear_16_C, + ScaleRowUp2_Bilinear_16_C, + 0, + uint16_t) + +#ifdef HAS_SCALEROWUP2LINEAR_SSE2 +SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2, + ScaleRowUp2_Bilinear_SSE2, + ScaleRowUp2_Bilinear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2 +SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2, + ScaleRowUp2_Bilinear_16_SSE2, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_SSSE3 +SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3, + ScaleRowUp2_Bilinear_SSSE3, + ScaleRowUp2_Bilinear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_AVX2 +SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2, + ScaleRowUp2_Bilinear_AVX2, + ScaleRowUp2_Bilinear_C, + 31, + uint8_t) +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 +SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2, + ScaleRowUp2_Bilinear_16_AVX2, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_NEON +SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON, + ScaleRowUp2_Bilinear_NEON, + ScaleRowUp2_Bilinear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_16_NEON +SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON, + ScaleRowUp2_Bilinear_16_NEON, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) +#endif + +#undef SU2BLANY + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/scale_common.cc b/source/scale_common.cc index 81959925..f53e2de9 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -400,6 +400,95 @@ void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr, } } +// sample position: (O is src sample position, X is dst sample position) +// +// v dst_ptr at here v stop at here +// X O X X O X X O X X O X X O X +// ^ src_ptr at here +void ScaleRowUp2_Linear_C(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2; + dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2; + } +} + +// sample position: (O is src sample position, X is dst sample position) +// +// src_ptr at here +// X v X X X X X X X X X +// O O O O O +// X X X X X X X X X X +// ^ dst_ptr at here ^ stop at here +// X X X X X X X X X X +// O O O O O +// X X X X X X X X X X +void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = 
src_ptr + src_stride; + uint8_t* d = dst_ptr; + uint8_t* e = dst_ptr + dst_stride; + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + d[2 * x + 0] = + (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4; + d[2 * x + 1] = + (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4; + e[2 * x + 0] = + (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4; + e[2 * x + 1] = + (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4; + } +} + +// only suitable for at most 14bit range. +void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2; + dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2; + } +} + +// Only suitable for at most 12bit range. +void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + uint16_t* d = dst_ptr; + uint16_t* e = dst_ptr + dst_stride; + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + d[2 * x + 0] = + (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4; + d[2 * x + 1] = + (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4; + e[2 * x + 0] = + (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4; + e[2 * x + 1] = + (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4; + } +} + // Scales a single row of pixels using point sampling. 
void ScaleCols_C(uint8_t* dst_ptr, const uint8_t* src_ptr, diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index e575ee18..cfbbba98 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -785,6 +785,836 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, "xmm7"); } +#ifdef HAS_SCALECOLUP2LINEAR_SSE2 +void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + + "pxor %%xmm0,%%xmm0 \n" // 0 + "pcmpeqw %%xmm6,%%xmm6 \n" + "psrlw $15,%%xmm6 \n" + "psllw $1,%%xmm6 \n" // all 2 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm1 \n" // 01234567 + "movq 1(%0),%%xmm2 \n" // 12345678 + "movdqa %%xmm1,%%xmm3 \n" + "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 + "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 + "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 + "movdqa %%xmm1,%%xmm4 \n" + "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) + "movdqa %%xmm2,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) + "paddw %%xmm5,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "paddw %%xmm6,%%xmm4 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) + "paddw %%xmm5,%%xmm5 \n" + "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo) + "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo) + + "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) + "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) + "paddw %%xmm2,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) + "paddw %%xmm6,%%xmm1 \n" + "paddw %%xmm3,%%xmm3 \n" + "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) + "psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) + + "packuswb %%xmm1,%%xmm5 \n" + "movdqu %%xmm5,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_SSE2 +void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + + LABELALIGN + "1: \n" + "pxor %%xmm0,%%xmm0 \n" // 0 + // above line + "movq (%0),%%xmm1 \n" // 01234567 + "movq 1(%0),%%xmm2 \n" // 12345678 + "movdqa %%xmm1,%%xmm3 \n" + "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 + "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 + "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 + + "movdqa %%xmm1,%%xmm4 \n" + "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) + "movdqa %%xmm2,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) + "paddw %%xmm5,%%xmm4 \n" // near+far + "movdqa %%xmm3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) + "paddw %%xmm5,%%xmm5 \n" // 2*near + "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo) + + "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) + "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) + "paddw %%xmm2,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) + "paddw %%xmm3,%%xmm3 \n" // 2*near + "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + // below line + "movq (%0,%3),%%xmm6 \n" // 01234567 + "movq 1(%0,%3),%%xmm2 \n" // 12345678 + "movdqa %%xmm6,%%xmm3 \n" + "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 + "punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677 + "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 + + "movdqa %%xmm6,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16) + "movdqa %%xmm2,%%xmm7 \n" + "punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16) + "paddw %%xmm7,%%xmm5 \n" // near+far + "movdqa %%xmm3,%%xmm7 \n" + "punpcklbw %%xmm0,%%xmm7 \n" // 
01122334 (16) + "paddw %%xmm7,%%xmm7 \n" // 2*near + "paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo) + + "punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16) + "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) + "paddw %%xmm6,%%xmm2 \n" // near+far + "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) + "paddw %%xmm3,%%xmm3 \n" // 2*near + "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi) + + // xmm4 xmm1 + // xmm5 xmm2 + "pcmpeqw %%xmm0,%%xmm0 \n" + "psrlw $15,%%xmm0 \n" + "psllw $3,%%xmm0 \n" // all 8 + + "movdqa %%xmm4,%%xmm3 \n" + "movdqa %%xmm5,%%xmm6 \n" + "psllw $1,%%xmm3 \n" // 6*near+2*far (1, lo) + "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo) + "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo) + "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo) + "psrlw $4,%%xmm3 \n" // ^ div by 16 + + "movdqa %%xmm1,%%xmm7 \n" + "movdqa %%xmm2,%%xmm6 \n" + "psllw $1,%%xmm7 \n" // 6*near+2*far (1, hi) + "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi) + "paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi) + "paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi) + "psrlw $4,%%xmm7 \n" // ^ div by 16 + + "packuswb %%xmm7,%%xmm3 \n" + "movdqu %%xmm3,(%1) \n" // save above line + + "movdqa %%xmm5,%%xmm3 \n" + "paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo) + "psllw $1,%%xmm3 \n" // 6*near+2*far (2, lo) + "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo) + "psrlw $4,%%xmm5 \n" // ^ div by 16 + + "movdqa %%xmm2,%%xmm3 \n" + "paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi) + "psllw $1,%%xmm3 \n" // 6*near+2*far (2, hi) + "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) + "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi) + "psrlw $4,%%xmm2 \n" // ^ div by 16 + + "packuswb %%xmm2,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4) \n" // save below line + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2 +void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + + "pxor %%xmm0,%%xmm0 \n" // 0 + "pcmpeqw %%xmm6,%%xmm6 \n" + "psrlw $15,%%xmm6 \n" + "psllw $1,%%xmm6 \n" // all 2 + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm1 \n" // 01234567 (16) + "movdqu 2(%0),%%xmm2 \n" // 12345678 (16) + "movdqa %%xmm1,%%xmm4 \n" + "punpcklwd %%xmm4,%%xmm4 \n" // 00112233 (16) + "movdqa %%xmm2,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" // 11223344 (16) + "paddw %%xmm5,%%xmm4 \n" + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm2,%%xmm5 \n" // 01122334 (16) + "psllw $1,%%xmm5 \n" + "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo) + "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo) + "movdqu %%xmm5,(%1) \n" + + "movdqa %%xmm1,%%xmm3 \n" + "punpckhwd %%xmm2,%%xmm3 \n" // 45566778 (16) + "punpckhwd %%xmm1,%%xmm1 \n" // 44556677 (16) + "punpckhwd %%xmm2,%%xmm2 \n" // 55667788 (16) + "paddw %%xmm2,%%xmm1 \n" + "paddw %%xmm6,%%xmm1 \n" + "psllw $1,%%xmm3 \n" + "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) + "psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) + "movdqu %%xmm1,0x10(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", 
"xmm6"); +} +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 +void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + + "pxor %%xmm0,%%xmm0 \n" // 0 + "pcmpeqw %%xmm7,%%xmm7 \n" + "psrlw $15,%%xmm7 \n" + "psllw $3,%%xmm7 \n" // all 8 + + LABELALIGN + "1: \n" + // above line + "movdqu (%0),%%xmm1 \n" // 01234567 (16) + "movdqu 2(%0),%%xmm2 \n" // 12345678 (16) + "movdqa %%xmm1,%%xmm4 \n" + "punpcklwd %%xmm4,%%xmm4 \n" // 00112233 (16) + "movdqa %%xmm2,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" // 11223344 (16) + "paddw %%xmm5,%%xmm4 \n" + "movdqa %%xmm1,%%xmm5 \n" + "punpcklwd %%xmm2,%%xmm5 \n" // 01122334 (16) + "paddw %%xmm5,%%xmm5 \n" + "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo) + + "movdqa %%xmm1,%%xmm3 \n" + "punpckhwd %%xmm2,%%xmm3 \n" // 45566778 (16) + "punpckhwd %%xmm1,%%xmm1 \n" // 44556677 (16) + "punpckhwd %%xmm2,%%xmm2 \n" // 55667788 (16) + "paddw %%xmm2,%%xmm1 \n" + "paddw %%xmm3,%%xmm3 \n" + "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + // below line + "movdqu (%0,%3,2),%%xmm6 \n" // 01234567 (16) + "movdqu 2(%0,%3,2),%%xmm2 \n" // 12345678 (16) + "movdqa %%xmm6,%%xmm3 \n" + "punpcklwd %%xmm3,%%xmm3 \n" // 00112233 (16) + "movdqa %%xmm2,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" // 11223344 (16) + "paddw %%xmm5,%%xmm3 \n" + "movdqa %%xmm6,%%xmm5 \n" + "punpcklwd %%xmm2,%%xmm5 \n" // 01122334 (16) + "paddw %%xmm5,%%xmm5 \n" + "paddw %%xmm3,%%xmm5 \n" // 3*near+far (2, lo) + + "movdqa %%xmm6,%%xmm3 \n" + "punpckhwd %%xmm2,%%xmm3 \n" // 45566778 (16) + "punpckhwd %%xmm6,%%xmm6 \n" // 44556677 (16) + "punpckhwd %%xmm2,%%xmm2 \n" // 55667788 (16) + "paddw %%xmm6,%%xmm2 \n" + "paddw %%xmm3,%%xmm3 \n" + "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi) + + // xmm4 xmm1 + // xmm5 xmm2 + + "movdqa %%xmm4,%%xmm3 \n" + "movdqa %%xmm5,%%xmm6 \n" + "psllw $1,%%xmm3 \n" // 6*near+2*far (1, lo) + "paddw %%xmm7,%%xmm6 \n" // 3*near+far+8 (2, lo) + "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo) + "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo) + "psrlw $4,%%xmm3 \n" // ^ div by 16 + "movdqu %%xmm3,(%1) \n" + + "movdqa %%xmm1,%%xmm3 \n" + "movdqa %%xmm2,%%xmm6 \n" + "psllw $1,%%xmm3 \n" // 6*near+2*far (1, hi) + "paddw %%xmm7,%%xmm6 \n" // 3*near+far+8 (2, hi) + "paddw %%xmm1,%%xmm3 \n" // 9*near+3*far (1, hi) + "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, hi) + "psrlw $4,%%xmm3 \n" // ^ div by 16 + "movdqu %%xmm3,0x10(%1) \n" + + "movdqa %%xmm5,%%xmm3 \n" + "paddw %%xmm7,%%xmm4 \n" // 3*near+far+8 (1, lo) + "psllw $1,%%xmm3 \n" // 6*near+2*far (2, lo) + "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) + "psrlw $4,%%xmm5 \n" // ^ div by 16 + "movdqu %%xmm5,(%1,%4,2) \n" + + "movdqa %%xmm2,%%xmm3 \n" + "paddw %%xmm7,%%xmm1 \n" // 3*near+far+8 (1, hi) + "psllw $1,%%xmm3 \n" // 6*near+2*far (2, hi) + "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) + "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) + "psrlw $4,%%xmm2 \n" // ^ div by 16 + "movdqu %%xmm2,0x10(%1,%4,2) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_SCALECOLUP2LINEAR_SSSE3 +static const uvec8 kLinearMadd31_SSSE3 = {3, 1, 1, 3, 3, 1, 1, 3, + 3, 1, 1, 3, 3, 1, 
1, 3}; + +void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + + "pcmpeqw %%xmm4,%%xmm4 \n" + "psrlw $15,%%xmm4 \n" + "psllw $1,%%xmm4 \n" // all 2 + "movdqu %3,%%xmm3 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 01234567 + "movq 1(%0),%%xmm1 \n" // 12345678 + "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 + "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 + "movdqa %%xmm0,%%xmm2 \n" + "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 + "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 + "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi) + "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo) + "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) + "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) + "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) + "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) + "vpackuswb %%xmm2,%%xmm0,%%xmm0 \n" + "vmovdqu %%xmm0,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearMadd31_SSSE3) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_SSSE3 +void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + + "pcmpeqw %%xmm6,%%xmm6 \n" + "psrlw $15,%%xmm6 \n" + "psllw $3,%%xmm6 \n" // all 8 + "movdqu %5,%%xmm7 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 01234567 + "movq 1(%0),%%xmm1 \n" // 12345678 + "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 + "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 + "movdqa %%xmm0,%%xmm2 \n" + "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 + "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 + "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi) + "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo) + + "movq (%0,%3),%%xmm1 \n" + "movq 1(%0,%3),%%xmm4 \n" + "punpcklwd %%xmm1,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm4 \n" + "movdqa %%xmm1,%%xmm3 \n" + "punpckhdq %%xmm4,%%xmm3 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) + "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) + + // xmm0 xmm2 + // xmm1 xmm3 + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) + + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) + "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) + "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) + + "movdqa %%xmm2,%%xmm0 \n" + "movdqa %%xmm3,%%xmm1 \n" + "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) + "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) + "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) + "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) + "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) + + "movdqa %%xmm3,%%xmm1 \n" + "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) + "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) + "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) + "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) + "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) + + "packuswb %%xmm0,%%xmm4 \n" + "movdqu %%xmm4,(%1) \n" // store above 
+ "packuswb %%xmm1,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4) \n" // store below + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kLinearMadd31_SSSE3) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_SCALECOLUP2LINEAR_AVX2 +static const lvec8 kLinearMadd31_AVX2 = {3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1, + 3, 3, 1, 1, 3, 3, 1, 1, 3, 3, 1, + 1, 3, 3, 1, 1, 3, 3, 1, 1, 3}; + +void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $15,%%ymm4,%%ymm4 \n" + "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 + "vmovdqu %3,%%ymm3 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF + "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" + "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) + "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo) + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) + "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 16 sample to 32 sample + "sub $0x20,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearMadd31_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_AVX2 +void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + + "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrlw $15,%%ymm6,%%ymm6 \n" + "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 + "vmovdqu %5,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF + "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" + "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) + "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo) + + "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF + "vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0 + "vpermq $0b11011000,%%ymm2,%%ymm2 \n" + "vpermq $0b11011000,%%ymm3,%%ymm3 \n" + "vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n" + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" + "vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n" + "vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) + "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo) + + // ymm0 ymm1 + // ymm2 ymm3 + + "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) + "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) + "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddw 
%%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) + "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) + + "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) + + "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" + "vmovdqu %%ymm5,(%1,%4) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 16 sample to 32 sample + "sub $0x20,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kLinearMadd31_AVX2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_SCALECOLUP2LINEAR_16_AVX2 +static const lvec16 kLinearMadd31_16_AVX2 = {3, 1, 1, 3, 3, 1, 1, 3, + 3, 1, 1, 3, 3, 1, 1, 3}; + +void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + + "vmovdqu %3,%%ymm3 \n" + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $15,%%ymm4,%%ymm4 \n" + "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b) + "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b) + + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 + + "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767 + "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878 + "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878 + "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656 + "vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo) + "vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) + "vpackssdw %%ymm1,%%ymm0,%%ymm0 \n" // 3*near+far + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 + "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearMadd31_16_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} + +// This version can handle full 16bit range but is slower +void ScaleRowUp2_Linear_16_AVX2_Full(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + + "vmovdqu %3,%%ymm3 \n" + "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $31,%%ymm4,%%ymm4 \n" + "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b) + "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b) + + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 + + "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767 + "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878 + "vpunpckhqdq 
%%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878 + "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656 + "vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo) + "vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) + "vpaddd %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) + "vpaddd %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) + "vpsrad $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrad $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackssdw %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearMadd31_16_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 +void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + + "vmovdqu %5,%%ymm5 \n" + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $15,%%ymm4,%%ymm4 \n" + "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8 + + LABELALIGN + "1: \n" + + "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b) + "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b) + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 + "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767 + "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878 + "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878 + "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656 + "vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo) + "vpmaddwd %%ymm5,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) + "vpackssdw %%ymm1,%%ymm0,%%ymm2 \n" // 3*near+far (1) + + "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b) + "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b) + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 + "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767 + "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878 + "vpunpckhqdq %%ymm1,%%ymm0,%%ymm3 \n" // 2323343467677878 + "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656 + "vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (2, lo) + "vpmaddwd %%ymm5,%%ymm3,%%ymm1 \n" // 3*near+far (2, hi) + "vpackssdw %%ymm1,%%ymm0,%%ymm3 \n" // 3*near+far (2) + + "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1) + "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2) + "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1) + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1) + "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 + "vmovdqu %%ymm0,(%1) \n" // store above + + "vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2) + "vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1) + "vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2) + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2) + "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 + "vmovdqu %%ymm0,(%1,%4,2) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kLinearMadd31_16_AVX2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +// This version can handle full 16bit range but is slower. 
+void ScaleRowUp2_Bilinear_16_AVX2_Full(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + + "vmovdqu %5,%%ymm7 \n" + "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrld $31,%%ymm6,%%ymm6 \n" + "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 + + LABELALIGN + "1: \n" + + "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b) + "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b) + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 + "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767 + "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878 + "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878 + "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656 + "vpmaddwd %%ymm7,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo) + "vpmaddwd %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) + + "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b) + "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b) + "vpermq $0b11011000,%%ymm2,%%ymm2 \n" // 0123000045670000 + "vpermq $0b11011000,%%ymm3,%%ymm3 \n" // 1234000056780000 + "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" // 0101232345456767 + "vpunpckldq %%ymm3,%%ymm3,%%ymm3 \n" // 1212343456567878 + "vpunpckhqdq %%ymm3,%%ymm2,%%ymm4 \n" // 2323343467677878 + "vpunpcklqdq %%ymm3,%%ymm2,%%ymm3 \n" // 0101121245455656 + "vpmaddwd %%ymm7,%%ymm3,%%ymm2 \n" // 3*near+far (2, lo) + "vpmaddwd %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) + + "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) + "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) + "vpsrad $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) + "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrad $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) + + "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrad $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) + + "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrad $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackssdw %%ymm0,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackssdw %%ymm2,%%ymm5,%%ymm5 \n" + "vmovdqu %%ymm5,(%1,%4,2) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kLinearMadd31_16_AVX2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + // Reads 16xN bytes and produces 16 shorts at a time. 
// Reads 16xN bytes and produces 16 shorts at a time.
void ScaleAddRow_SSE2(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
@@ -946,8 +1776,8 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
        "x"(kFsub80),  // %8
        "x"(kFadd40)   // %9
#else
-      "m"(kFsub80),  // %8
-      "m"(kFadd40)   // %9
+        "m"(kFsub80),  // %8
+        "m"(kFadd40)   // %9
#endif
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
diff --git a/source/scale_neon.cc b/source/scale_neon.cc
index 20e5b9af..51061655 100644
--- a/source/scale_neon.cc
+++ b/source/scale_neon.cc
@@ -504,6 +504,200 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
      : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
}

+void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int dst_width) {
+  const uint8_t* src_temp = src_ptr + 1;
+  asm volatile(
+
+      "vmov.u16 q15, #3 \n"
+
+      "1: \n"
+      "vld1.8 {d0}, [%0]! \n" // 01234567
+      "vld1.8 {d2}, [%3]! \n" // 12345678
+
+      "vmovl.u8 q0, d0 \n" // 01234567 (16b)
+      "vmovl.u8 q1, d2 \n" // 12345678 (16b)
+      "vmov q2, q0 \n"
+      "vmla.u16 q2, q1, q15 \n" // 3*near+far (odd)
+      "vmla.u16 q1, q0, q15 \n" // 3*near+far (even)
+
+      "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even)
+      "vrshrn.u16 d1, q2, #2 \n" // 3/4*near+1/4*far (odd)
+
+      "vst2.8 {d0, d1}, [%1]! \n" // store
+      "subs %2, %2, #16 \n" // 8 samples -> 16 samples
+      "bgt 1b \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width), // %2
+        "+r"(src_temp)   // %3
+      :
+      : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List
+  );
+}
+
+void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               ptrdiff_t dst_stride,
+                               int dst_width) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
+  uint8_t* dst_ptr1 = dst_ptr + dst_stride;
+  const uint8_t* src_temp = src_ptr + 1;
+  const uint8_t* src_temp1 = src_ptr1 + 1;
+
+  asm volatile(
+
+      "vmov.u16 q15, #3 \n"
+
+      "1: \n"
+      "vld1.8 {d0}, [%0]! \n" // 01234567
+      "vld1.8 {d2}, [%5]! \n" // 12345678
+
+      "vmovl.u8 q0, d0 \n" // 01234567 (16b)
+      "vmovl.u8 q1, d2 \n" // 12345678 (16b)
+      "vmov q2, q0 \n"
+      "vmla.u16 q0, q1, q15 \n" // 3*near+far (1, odd)
+      "vmla.u16 q1, q2, q15 \n" // 3*near+far (1, even)
+
+      "vld1.8 {d4}, [%1]! \n" // 01234567
+      "vld1.8 {d6}, [%6]! \n" // 12345678
+
+      "vmovl.u8 q2, d4 \n" // 01234567 (16b)
+      "vmovl.u8 q3, d6 \n" // 12345678 (16b)
+      "vmov q4, q2 \n"
+      "vmla.u16 q2, q3, q15 \n" // 3*near+far (2, odd)
+      "vmla.u16 q3, q4, q15 \n" // 3*near+far (2, even)
+
+      //  e   o
+      //  q1  q0
+      //  q3  q2
+
+      "vmov q4, q2 \n"
+      "vmov q5, q3 \n"
+      "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd)
+      "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even)
+      "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd)
+      "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even)
+
+      //  e   o
+      //  q5  q4
+      //  q1  q0
+
+      "vrshrn.u16 d2, q1, #4 \n" // 2, even
+      "vrshrn.u16 d3, q0, #4 \n" // 2, odd
+      "vrshrn.u16 d0, q5, #4 \n" // 1, even
+      "vrshrn.u16 d1, q4, #4 \n" // 1, odd
+
+      "vst2.8 {d0, d1}, [%2]! \n" // store 1
+      "vst2.8 {d2, d3}, [%3]! \n" // store 2
+      "subs %4, %4, #16 \n" // 8 samples -> 16 samples
+      "bgt 1b \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(src_ptr1),  // %1
+        "+r"(dst_ptr),   // %2
+        "+r"(dst_ptr1),  // %3
+        "+r"(dst_width), // %4
+        "+r"(src_temp),  // %5
+        "+r"(src_temp1)  // %6
+      :
+      : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
+        "q15" // Clobber List
+  );
+}
+
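Note that the bilinear paths never form the 9,3,3,1 weights directly: they reuse the per-row 3*near+far terms and apply the same 3:1 blend vertically, which is why each output row costs only one extra multiply-accumulate. A quick illustrative check of that factorization (not part of the change):

#include <stdint.h>

/* Vertical 3:1 blend of the two horizontal 3*near+far terms... */
static uint16_t Composed(uint16_t n0, uint16_t f0, uint16_t n1, uint16_t f1) {
  uint32_t h0 = 3u * n0 + f0; /* horizontal pass, this row */
  uint32_t h1 = 3u * n1 + f1; /* horizontal pass, other row */
  return (uint16_t)((3u * h0 + h1 + 8u) >> 4);
}

/* ...expands to exactly the direct 9,3,3,1 filter with +8 rounding, /16. */
static uint16_t Direct(uint16_t n0, uint16_t f0, uint16_t n1, uint16_t f1) {
  return (uint16_t)((9u * n0 + 3u * f0 + 3u * n1 + f1 + 8u) >> 4);
}

Since 3*(3*n0 + f0) + (3*n1 + f1) = 9*n0 + 3*f0 + 3*n1 + f1, the two functions agree for all inputs.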
\n" // 01234567 (16b) + "vld1.16 {q0}, [%3]! \n" // 12345678 (16b) + + "vmovq q2, q0 \n" + "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd) + "vmla.u16 q1, q2, q15 \n" // 3*near+far (even) + + "vrshr.u16 q0, q0, #2 \n" // 3/4*near+1/4*far (odd) + "vrshr.u16 q1, q1, #2 \n" // 3/4*near+1/4*far (even) + + "vst2.16 {d0, d1, d2, d3}, [%1]! \n" // store + "subs %2, %2, #16 \n" // 8 sample -> 16 sample + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_temp) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List + ); +} + +void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* src_ptr1 = src_ptr + src_stride; + uint16_t* dst_ptr1 = dst_ptr + dst_stride; + const uint16_t* src_temp = src_ptr + 1; + const uint16_t* src_temp1 = src_ptr1 + 1; + + asm volatile( + + "vmov.u16 q15, #3 \n" + + "1: \n" + "add %5, %0, #2 \n" + "vld1.16 {q0}, [%0]! \n" // 01234567 (16b) + "vld1.16 {q1}, [%5]! \n" // 12345678 (16b) + + "vmovq q2, q0 \n" + "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd) + "vmla.u16 q1, q2, q15 \n" // 3*near+far (even) + + "add %5, %1, #2 \n" + "vld1.16 {q2}, [%1]! \n" // 01234567 (16b) + "vld1.16 {q3}, [%6]! \n" // 12345678 (16b) + + "vmovq q4, q2 \n" + "vmla.u16 q2, q3, q15 \n" // 3*near+far (odd) + "vmla.u16 q3, q4, q15 \n" // 3*near+far (even) + + "vmovq q4, q2 \n" + "vmovq q5, q3 \n" + "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd) + "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even) + "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd) + "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even) + + "vrshr.u16 q2, q1, #4 \n" // 2, even + "vrshr.u16 q3, q0, #4 \n" // 2, odd + "vrshr.u16 q0, q5, #4 \n" // 1, even + "vrshr.u16 q1, q4, #4 \n" // 1, odd + + "vst2.16 {d0, d1, d2, d3}, [%2]! \n" // store + "vst2.16 {d4, d5, d6, d7}, [%3]! \n" // store + "subs %4, %4, #16 \n" // 8 sample -> 16 sample + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_ptr1), // %3 + "+r"(dst_width), // %4 + "+r"(src_temp), // %5 + "+r"(src_temp1) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", + "q15" // Clobber List + ); +} + // Add a row of bytes to a row of shorts. Used for box filter. // Reads 16 bytes and accumulates to 16 shorts at a time. 
// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index 185591cb..514dde4c 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -535,6 +535,196 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
        "v19", "v30", "v31", "memory", "cc");
}

+void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int dst_width) {
+  const uint8_t* src_temp = src_ptr + 1;
+  asm volatile(
+
+      "movi v31.8b, #3 \n"
+
+      "1: \n"
+      "ldr d0, [%0], #8 \n" // 01234567
+      "ldr d1, [%1], #8 \n" // 12345678
+      "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+      "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b)
+      "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b)
+
+      "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd)
+      "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even)
+
+      "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd)
+      "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even)
+
+      "st2 {v1.8b, v2.8b}, [%2], #16 \n" // store
+      "subs %w3, %w3, #16 \n" // 8 samples -> 16 samples
+      "b.gt 1b \n"
+      : "+r"(src_ptr),  // %0
+        "+r"(src_temp), // %1
+        "+r"(dst_ptr),  // %2
+        "+r"(dst_width) // %3
+      :
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List
+  );
+}
+
+void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               ptrdiff_t dst_stride,
+                               int dst_width) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
+  uint8_t* dst_ptr1 = dst_ptr + dst_stride;
+  const uint8_t* src_temp = src_ptr + 1;
+  const uint8_t* src_temp1 = src_ptr1 + 1;
+
+  asm volatile(
+
+      "movi v31.8b, #3 \n"
+      "movi v30.8h, #3 \n"
+
+      "1: \n"
+      "ldr d0, [%0], #8 \n" // 01234567
+      "ldr d1, [%2], #8 \n" // 12345678
+      "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+      "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b)
+      "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b)
+      "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd)
+      "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even)
+
+      "ldr d0, [%1], #8 \n"
+      "ldr d1, [%3], #8 \n"
+      "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+
+      "ushll v4.8h, v0.8b, #0 \n" // 01234567 (16b)
+      "ushll v5.8h, v1.8b, #0 \n" // 12345678 (16b)
+      "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd)
+      "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even)
+
+      "mov v0.16b, v4.16b \n"
+      "mov v1.16b, v5.16b \n"
+      "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd)
+      "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even)
+      "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd)
+      "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even)
+
+      "rshrn v2.8b, v2.8h, #4 \n" // 2, odd
+      "rshrn v1.8b, v3.8h, #4 \n" // 2, even
+      "rshrn v4.8b, v4.8h, #4 \n" // 1, odd
+      "rshrn v3.8b, v5.8h, #4 \n" // 1, even
+
+      "st2 {v1.8b, v2.8b}, [%5], #16 \n" // store 2
+      "st2 {v3.8b, v4.8b}, [%4], #16 \n" // store 1
+      "subs %w6, %w6, #16 \n" // 8 samples -> 16 samples
+      "b.gt 1b \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(src_ptr1),  // %1
+        "+r"(src_temp),  // %2
+        "+r"(src_temp1), // %3
+        "+r"(dst_ptr),   // %4
+        "+r"(dst_ptr1),  // %5
+        "+r"(dst_width)  // %6
+      :
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
+        "v31" // Clobber List
+  );
+}
+
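Each of these row kernels steps 16 output pixels per iteration, so a plane-level caller has to pair them with a fallback for leftover columns. A hypothetical dispatch sketch follows; the real selection logic, including the _Any wrappers that mop up remainders, lives in source/scale.cc and source/scale_any.cc, and every name here other than ScaleRowUp2_Bilinear_NEON is an assumption:

#include <stddef.h>
#include <stdint.h>

typedef void (*RowUp2BilinearFn)(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                 uint8_t* dst_ptr, ptrdiff_t dst_stride,
                                 int dst_width);

void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride,
                               uint8_t* dst_ptr, ptrdiff_t dst_stride,
                               int dst_width);
/* Assumed scalar fallback with the same row signature. */
void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr, ptrdiff_t src_stride,
                            uint8_t* dst_ptr, ptrdiff_t dst_stride,
                            int dst_width);

/* Pick the NEON kernel only when the CPU has NEON and the width matches
   its 16-pixel step; otherwise use the scalar row function. */
static RowUp2BilinearFn ChooseRowUp2Bilinear(int dst_width, int has_neon) {
#if defined(__ARM_NEON__) || defined(__aarch64__)
  if (has_neon && (dst_width % 16) == 0) {
    return ScaleRowUp2_Bilinear_NEON;
  }
#endif
  (void)has_neon;
  (void)dst_width;
  return ScaleRowUp2_Bilinear_C;
}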
+void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
+                                uint16_t* dst_ptr,
+                                int dst_width) {
+  const uint16_t* src_temp = src_ptr + 1;
+  asm volatile(
+
+      "movi v31.8h, #3 \n"
+
+      "1: \n"
+      "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b)
+      "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b)
+      "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+      "mov v2.16b, v0.16b \n"
+      "mla v0.8h, v1.8h, v31.8h \n" // 3*near+far (odd)
+      "mla v1.8h, v2.8h, v31.8h \n" // 3*near+far (even)
+
+      "urshr v2.8h, v0.8h, #2 \n" // 3/4*near+1/4*far (odd)
+      "urshr v1.8h, v1.8h, #2 \n" // 3/4*near+1/4*far (even)
+
+      "st2 {v1.8h, v2.8h}, [%2], #32 \n" // store
+      "subs %w3, %w3, #16 \n" // 8 samples -> 16 samples
+      "b.gt 1b \n"
+      : "+r"(src_ptr),  // %0
+        "+r"(src_temp), // %1
+        "+r"(dst_ptr),  // %2
+        "+r"(dst_width) // %3
+      :
+      : "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List
+  );
+}
+
+void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint16_t* dst_ptr,
+                                  ptrdiff_t dst_stride,
+                                  int dst_width) {
+  const uint16_t* src_ptr1 = src_ptr + src_stride;
+  uint16_t* dst_ptr1 = dst_ptr + dst_stride;
+  const uint16_t* src_temp = src_ptr + 1;
+  const uint16_t* src_temp1 = src_ptr1 + 1;
+
+  asm volatile(
+
+      "movi v31.8h, #3 \n"
+
+      "1: \n"
+      "ld1 {v2.8h}, [%0], #16 \n" // 01234567 (16b)
+      "ld1 {v3.8h}, [%2], #16 \n" // 12345678 (16b)
+      "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+      "mov v0.16b, v2.16b \n"
+      "mla v2.8h, v3.8h, v31.8h \n" // 3*near+far (1, odd)
+      "mla v3.8h, v0.8h, v31.8h \n" // 3*near+far (1, even)
+
+      "ld1 {v4.8h}, [%1], #16 \n" // 01234567 (16b)
+      "ld1 {v5.8h}, [%3], #16 \n" // 12345678 (16b)
+      "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+
+      "mov v0.16b, v4.16b \n"
+      "mla v4.8h, v5.8h, v31.8h \n" // 3*near+far (2, odd)
+      "mla v5.8h, v0.8h, v31.8h \n" // 3*near+far (2, even)
+
+      "mov v0.16b, v4.16b \n"
+      "mov v1.16b, v5.16b \n"
+      "mla v4.8h, v2.8h, v31.8h \n" // 9 3 3 1 (1, odd)
+      "mla v5.8h, v3.8h, v31.8h \n" // 9 3 3 1 (1, even)
+      "mla v2.8h, v0.8h, v31.8h \n" // 9 3 3 1 (2, odd)
+      "mla v3.8h, v1.8h, v31.8h \n" // 9 3 3 1 (2, even)
+
+      "urshr v2.8h, v2.8h, #4 \n" // 2, odd
+      "urshr v1.8h, v3.8h, #4 \n" // 2, even
+      "urshr v4.8h, v4.8h, #4 \n" // 1, odd
+      "urshr v3.8h, v5.8h, #4 \n" // 1, even
+
+      "st2 {v3.8h, v4.8h}, [%4], #32 \n" // store 1
+      "st2 {v1.8h, v2.8h}, [%5], #32 \n" // store 2
+
+      "subs %w6, %w6, #16 \n" // 8 samples -> 16 samples
+      "b.gt 1b \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(src_ptr1),  // %1
+        "+r"(src_temp),  // %2
+        "+r"(src_temp1), // %3
+        "+r"(dst_ptr),   // %4
+        "+r"(dst_ptr1),  // %5
+        "+r"(dst_width)  // %6
+      :
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+        "v31" // Clobber List
+  );
+}
+
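Downstream, these kernels back the new chroma-upsampling converters. A minimal usage sketch for the 10-bit path (caller-supplied buffers; strides are in elements, I010 chroma planes are (width+1)/2 wide, and I410 planes are full width; the wrapper function name is illustrative):

#include "libyuv/convert.h"

int UpsampleI010ToI410(const uint16_t* src_y, const uint16_t* src_u,
                       const uint16_t* src_v, uint16_t* dst_y,
                       uint16_t* dst_u, uint16_t* dst_v,
                       int width, int height) {
  int half_width = (width + 1) / 2;
  return I010ToI410(src_y, width, src_u, half_width, src_v, half_width,
                    dst_y, width, dst_u, width, dst_v, width,
                    width, height); /* returns 0 on success */
}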
// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index c4ee33b1..c180811a 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -49,7 +49,8 @@ namespace libyuv {
#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                       SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
-                      DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF) \
+                      DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
+                      SRC_DEPTH) \
  TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
    static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
    static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
@@ -81,6 +82,16 @@ namespace libyuv {
    MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \
    MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
    MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
+    SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
+    SRC_T* src_u_p = reinterpret_cast<SRC_T*>(src_u + OFF); \
+    SRC_T* src_v_p = reinterpret_cast<SRC_T*>(src_v + OFF); \
+    for (int i = 0; i < kWidth * kHeight; ++i) { \
+      src_y_p[i] = src_y_p[i] & ((1 << SRC_DEPTH) - 1); \
+    } \
+    for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight; ++i) { \
+      src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1); \
+      src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1); \
+    } \
    memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
    memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
    memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
@@ -89,9 +100,7 @@ namespace libyuv {
    memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
    MaskCpuFlags(disable_cpu_flags_); \
    SRC_FMT_PLANAR##To##FMT_PLANAR( \
-        reinterpret_cast<SRC_T*>(src_y + OFF), kWidth, \
-        reinterpret_cast<SRC_T*>(src_u + OFF), kSrcHalfWidth, \
-        reinterpret_cast<SRC_T*>(src_v + OFF), kSrcHalfWidth, \
+        src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \
        reinterpret_cast<DST_T*>(dst_y_c), kWidth, \
        reinterpret_cast<DST_T*>(dst_u_c), kDstHalfWidth, \
        reinterpret_cast<DST_T*>(dst_v_c), kDstHalfWidth, kWidth, \
@@ -99,9 +108,7 @@ namespace libyuv {
    MaskCpuFlags(benchmark_cpu_info_); \
    for (int i = 0; i < benchmark_iterations_; ++i) { \
      SRC_FMT_PLANAR##To##FMT_PLANAR( \
-          reinterpret_cast<SRC_T*>(src_y + OFF), kWidth, \
-          reinterpret_cast<SRC_T*>(src_u + OFF), kSrcHalfWidth, \
-          reinterpret_cast<SRC_T*>(src_v + OFF), kSrcHalfWidth, \
+          src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \
          reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \
          reinterpret_cast<DST_T*>(dst_u_opt), kDstHalfWidth, \
          reinterpret_cast<DST_T*>(dst_v_opt), kDstHalfWidth, kWidth, \
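The new SRC_DEPTH parameter exists because MemRandomize fills the 16-bit source planes with full-range values, while the 10- and 12-bit converters are only defined for samples in the low SRC_DEPTH bits. In isolation, the masking step is just the following (a standalone sketch, not the macro itself):

#include <stdint.h>

/* Clamp random test samples to the declared bit depth, e.g. depth 10
   keeps values in [0, 1023] so an I010 path never sees illegal input. */
static void MaskToDepth(uint16_t* samples, int count, int depth) {
  uint16_t mask = (uint16_t)((1u << depth) - 1);
  for (int i = 0; i < count; ++i) {
    samples[i] &= mask;
  }
}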
@@ -127,34 +134,39 @@
#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                      SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
-                      DST_SUBSAMP_X, DST_SUBSAMP_Y) \
+                      DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
  TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
                 FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
-                 benchmark_width_ - 4, _Any, +, 0) \
+                 benchmark_width_ - 4, _Any, +, 0, SRC_DEPTH) \
  TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
                 FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
-                 benchmark_width_, _Unaligned, +, 1) \
+                 benchmark_width_, _Unaligned, +, 1, SRC_DEPTH) \
  TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
                 FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
-                 benchmark_width_, _Invert, -, 0) \
+                 benchmark_width_, _Invert, -, 0, SRC_DEPTH) \
  TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
                 FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
-                 benchmark_width_, _Opt, +, 0)
-
-TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2)
-TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I420, uint8_t, 1, 2, 2)
-TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I420, uint8_t, 1, 2, 2)
-TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I422, uint8_t, 1, 2, 1)
-TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I444, uint8_t, 1, 1, 1)
-TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420Mirror, uint8_t, 1, 2, 2)
-TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I422, uint8_t, 1, 2, 1)
-TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I444, uint8_t, 1, 1, 1)
-TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2)
-TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2)
-TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I010, uint16_t, 2, 2, 2)
-TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2)
-TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2)
-TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2)
+                 benchmark_width_, _Opt, +, 0, SRC_DEPTH)
+
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I420, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I420, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I422, uint8_t, 1, 2, 1, 8)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I444, uint8_t, 1, 1, 1, 8)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420Mirror, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I422, uint8_t, 1, 2, 1, 8)
+TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I444, uint8_t, 1, 1, 1, 8)
+TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I444, uint8_t, 1, 1, 1, 8)
+TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2, 10)
+TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 10)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I010, uint16_t, 2, 2, 2, 8)
+TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2, 10)
+TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2, 10)
+TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2, 8)
+TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I410, uint16_t, 2, 1, 1, 10)
+TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I410, uint16_t, 2, 1, 1, 10)
+TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I412, uint16_t, 2, 1, 1, 12)
+TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I412, uint16_t, 2, 1, 1, 12)

// Test Android 420 to I420
#define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \