author     Yuan Tong <tongyuan200097@gmail.com>      2021-02-03 14:21:07 +0800
committer  Frank Barchard <fbarchard@chromium.org>   2021-02-03 10:53:02 +0000
commit     fc61dde1eb4b7807201fa20cd0a7d023363558b2 (patch)
tree       9cb82fea30f1da03c77e51d4f8f8c6ed6d9d6024
parent     c28d4049364d75710b1c49697a5814ab572af641 (diff)
download   libyuv-fc61dde1eb4b7807201fa20cd0a7d023363558b2.tar.gz
Add special optimization for I420ToI444 and I422ToI444
These functions use a (bi)linear filter to scale the U and V planes up to
the size of the Y plane, which improves the quality of subsequent YUV to
RGB conversion.

Also added 10-bit and 12-bit versions:
I010ToI410, I210ToI410, I012ToI412, I212ToI412

libyuv_unittest --gtest_filter=LibYUVConvertTest.I42*ToI444*:LibYUVConvertTest.I*1*ToI41*

R=fbarchard@chromium.org

Change-Id: Ie4a711a5ba28f2ff1f44c021f7a5c149022264c5
Bug: libyuv:872
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2658097
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
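A usage sketch (not part of the commit): the call below is based only on the I420ToI444 signature added to include/libyuv/convert.h in this change; the 64x48 frame size and the calloc-backed buffers are illustrative assumptions. The 10-bit and 12-bit entry points (I010ToI410, I210ToI410 and their #define aliases) take the same argument order with uint16_t planes and strides counted in uint16_t units.

// Minimal sketch: upsample an I420 (4:2:0) frame to I444 (4:4:4).
#include <stdint.h>
#include <stdlib.h>
#include "libyuv/convert.h"

int main(void) {
  const int width = 64, height = 48;  // arbitrary example size
  uint8_t* src_y = (uint8_t*)calloc((size_t)width * height, 1);
  uint8_t* src_u = (uint8_t*)calloc((size_t)(width / 2) * (height / 2), 1);
  uint8_t* src_v = (uint8_t*)calloc((size_t)(width / 2) * (height / 2), 1);
  uint8_t* dst_y = (uint8_t*)calloc((size_t)width * height, 1);
  uint8_t* dst_u = (uint8_t*)calloc((size_t)width * height, 1);  // 4:4:4 chroma is full size
  uint8_t* dst_v = (uint8_t*)calloc((size_t)width * height, 1);

  // Strides are in bytes for 8-bit planes; the source chroma stride is width / 2.
  int ret = I420ToI444(src_y, width, src_u, width / 2, src_v, width / 2,
                       dst_y, width, dst_u, width, dst_v, width,
                       width, height);

  free(src_y); free(src_u); free(src_v);
  free(dst_y); free(dst_u); free(dst_v);
  return ret;  // 0 on success, -1 on bad arguments
}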
-rw-r--r--  README.chromium               2
-rw-r--r--  include/libyuv/convert.h     74
-rw-r--r--  include/libyuv/scale.h       12
-rw-r--r--  include/libyuv/scale_row.h  163
-rw-r--r--  include/libyuv/version.h      2
-rw-r--r--  source/convert_from.cc       96
-rw-r--r--  source/scale.cc             280
-rw-r--r--  source/scale_any.cc         185
-rw-r--r--  source/scale_common.cc       89
-rw-r--r--  source/scale_gcc.cc         834
-rw-r--r--  source/scale_neon.cc        194
-rw-r--r--  source/scale_neon64.cc      190
-rw-r--r--  unit_test/convert_test.cc    66
13 files changed, 2156 insertions, 31 deletions
diff --git a/README.chromium b/README.chromium
index f49e57c8..d27d1aa3 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1772
+Version: 1774
License: BSD
License File: LICENSE
diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h
index 026b153c..50ffc2f0 100644
--- a/include/libyuv/convert.h
+++ b/include/libyuv/convert.h
@@ -89,6 +89,23 @@ int I422ToI420(const uint8_t* src_y,
int width,
int height);
+// Convert I422 to I444.
+LIBYUV_API
+int I422ToI444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
// Convert I422 to NV21.
LIBYUV_API
int I422ToNV21(const uint8_t* src_y,
@@ -122,6 +139,23 @@ int I420Copy(const uint8_t* src_y,
int width,
int height);
+// Convert I420 to I444.
+LIBYUV_API
+int I420ToI444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
// Copy I010 to I010
#define I010ToI010 I010Copy
#define H010ToH010 I010Copy
@@ -159,6 +193,46 @@ int I010ToI420(const uint16_t* src_y,
int width,
int height);
+// Convert I010 to I410
+LIBYUV_API
+int I010ToI410(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I012 to I412
+#define I012ToI412 I010ToI410
+
+// Convert I210 to I410
+LIBYUV_API
+int I210ToI410(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I212 to I412
+#define I212ToI412 I210ToI410
+
// Convert I400 (grey) to I420.
LIBYUV_API
int I400ToI420(const uint8_t* src_y,
diff --git a/include/libyuv/scale.h b/include/libyuv/scale.h
index add5a9eb..d06f8b52 100644
--- a/include/libyuv/scale.h
+++ b/include/libyuv/scale.h
@@ -49,6 +49,18 @@ void ScalePlane_16(const uint16_t* src,
int dst_height,
enum FilterMode filtering);
+// Sample is expected to be in the low 12 bits.
+LIBYUV_API
+void ScalePlane_12(const uint16_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint16_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
// Scales a YUV 4:2:0 image from the src width and height to the
// dst width and height.
// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index a386d499..ee77d228 100644
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -77,6 +77,12 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_SCALEUVROWDOWN2BOX_SSSE3
+#define HAS_SCALECOLUP2LINEAR_SSE2
+#define HAS_SCALECOLUP2LINEAR_SSSE3
+#define HAS_SCALEROWUP2LINEAR_SSE2
+#define HAS_SCALEROWUP2LINEAR_SSSE3
+#define HAS_SCALECOLUP2LINEAR_16_SSE2
+#define HAS_SCALEROWUP2LINEAR_16_SSE2
#endif
// The following are available for gcc/clang x86 platforms, but
@@ -86,6 +92,10 @@ extern "C" {
(defined(__x86_64__) || defined(__i386__)) && !defined(_MSC_VER) && \
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_SCALEUVROWDOWN2BOX_AVX2
+#define HAS_SCALECOLUP2LINEAR_AVX2
+#define HAS_SCALEROWUP2LINEAR_AVX2
+#define HAS_SCALECOLUP2LINEAR_16_AVX2
+#define HAS_SCALEROWUP2LINEAR_16_AVX2
#endif
// The following are available on all x86 platforms, but
@@ -114,6 +124,10 @@ extern "C" {
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEUVROWDOWN2BOX_NEON
#define HAS_SCALEUVROWDOWNEVEN_NEON
+#define HAS_SCALECOLUP2LINEAR_NEON
+#define HAS_SCALEROWUP2LINEAR_NEON
+#define HAS_SCALECOLUP2LINEAR_16_NEON
+#define HAS_SCALEROWUP2LINEAR_16_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@@ -279,6 +293,40 @@ void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* d,
int dst_width);
+
+void ScaleRowUp2_Linear_C(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_Any_C(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_Any_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_Any_C(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_Any_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+
void ScaleCols_C(uint8_t* dst_ptr,
const uint8_t* src_ptr,
int dst_width,
@@ -508,6 +556,88 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
+
+void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+
void ScaleRowDown2_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
@@ -1143,6 +1273,39 @@ void ScaleRowDown38_2_Box_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
+void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_Any_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+
void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
void ScaleAddRow_Any_NEON(const uint8_t* src_ptr,
uint16_t* dst_ptr,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 086738ef..ff3c9dec 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1773
+#define LIBYUV_VERSION 1774
#endif // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/convert_from.cc b/source/convert_from.cc
index f2cfc1d8..6524f969 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -159,6 +159,102 @@ int I420ToI444(const uint8_t* src_y,
dst_uv_height);
}
+// 420 chroma to 444 chroma, 10/12 bit version
+LIBYUV_API
+int I010ToI410(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ if (width == 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ }
+ ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1),
+ SUBSAMPLE(height, 1, 1), dst_u, dst_stride_u, Abs(width),
+ Abs(height), kFilterBilinear);
+ ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1),
+ SUBSAMPLE(height, 1, 1), dst_v, dst_stride_v, Abs(width),
+ Abs(height), kFilterBilinear);
+ return 0;
+}
+
+// 422 chroma to 444 chroma, 10/12 bit version
+LIBYUV_API
+int I210ToI410(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ if (width == 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ }
+ ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u,
+ dst_stride_u, Abs(width), Abs(height), kFilterBilinear);
+ ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v,
+ dst_stride_v, Abs(width), Abs(height), kFilterBilinear);
+ return 0;
+}
+
+// 422 chroma is 1/2 width, 1x height
+// 444 chroma is 1x width, 1x height
+LIBYUV_API
+int I422ToI444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ if (width == 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ }
+ ScalePlane(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u,
+ dst_stride_u, Abs(width), Abs(height), kFilterBilinear);
+ ScalePlane(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v,
+ dst_stride_v, Abs(width), Abs(height), kFilterBilinear);
+ return 0;
+}
+
// Copy to I400. Source can be I420,422,444,400,NV12,NV21
LIBYUV_API
int I400Copy(const uint8_t* src_y,
diff --git a/source/scale.cc b/source/scale.cc
index cf3c0332..34c05699 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -1336,6 +1336,238 @@ void ScalePlaneBilinearUp(int src_width,
}
}
+// Scale plane, horizontally up by 2 times, vertically by any ratio.
+// Uses a linear filter horizontally, nearest-neighbor vertically.
+// This is an optimized path for scaling a plane up to twice its
+// original width, using linear interpolation.
+// This is used to scale the U and V planes of I422 to I444.
+void ScalePlaneUp2_Linear(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr) {
+ void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) =
+ ScaleRowUp2_Linear_Any_C;
+ int i;
+ int y;
+ int dy;
+
+ // This function can only scale up by 2 times horizontally.
+ assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
+
+#ifdef HAS_SCALEROWUP2LINEAR_SSE2
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2LINEAR_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2LINEAR_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp = ScaleRowUp2_Linear_Any_NEON;
+ }
+#endif
+
+ if (dst_height == 1) {
+ ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr,
+ dst_width);
+ } else {
+ dy = FixedDiv(src_height - 1, dst_height - 1);
+ y = (1 << 15) - 1;
+ for (i = 0; i < dst_height; ++i) {
+ ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width);
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+ }
+}
+
+// Scale plane, up by 2 times in both dimensions.
+// This is an optimized path for scaling a plane up to twice its
+// original size, using bilinear interpolation.
+// This is used to scale the U and V planes of I420 to I444.
+void ScalePlaneUp2_Bilinear(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr) {
+ void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+ ScaleRowUp2_Bilinear_Any_C;
+ int x;
+
+ // This function can only scale up by 2 times.
+ assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
+ assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);
+
+#ifdef HAS_SCALEROWUP2LINEAR_SSE2
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2LINEAR_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2LINEAR_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON;
+ }
+#endif
+
+ if (src_height == 1) {
+ Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width);
+ } else {
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ dst_ptr += dst_stride;
+ for (x = 0; x < src_height - 1; ++x) {
+ Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
+ src_ptr += src_stride;
+ // TODO: test the performance of writing one destination row at a time.
+ dst_ptr += 2 * dst_stride;
+ }
+ if (!(dst_height & 1)) {
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ }
+ }
+}
+
+// Scale a plane of at most 14-bit samples, horizontally up by 2 times.
+// This is an optimized path for scaling a plane up to twice its
+// original width, using linear interpolation.
+// The strides are in counts of uint16_t.
+// This is used to scale the U and V planes of I210 to I410 and I212 to I412.
+void ScalePlaneUp2_16_Linear(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr) {
+ void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_16_Any_C;
+ int i;
+ int y;
+ int dy;
+
+ // This function can only scale up by 2 times horizontally.
+ assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
+
+#ifdef HAS_SCALEROWUP2LINEAR_SSE2
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2LINEAR_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2LINEAR_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON;
+ }
+#endif
+
+ if (dst_height == 1) {
+ ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr,
+ dst_width);
+ } else {
+ dy = FixedDiv(src_height - 1, dst_height - 1);
+ y = (1 << 15) - 1;
+ for (i = 0; i < dst_height; ++i) {
+ ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width);
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+ }
+}
+
+// Scale a plane of at most 12-bit samples, up by 2 times in both dimensions.
+// This is an optimized path for scaling a plane up to twice its
+// original size, using bilinear interpolation.
+// The strides are in counts of uint16_t.
+// This is used to scale the U and V planes of I010 to I410 and I012 to I412.
+void ScalePlaneUp2_16_Bilinear(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr) {
+ void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+ uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+ ScaleRowUp2_Bilinear_16_Any_C;
+ int x;
+
+ // This function can only scale up by 2 times.
+ assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
+ assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);
+
+#ifdef HAS_SCALEROWUP2LINEAR_SSE2
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSE2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2LINEAR_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2LINEAR_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_NEON;
+ }
+#endif
+
+ if (src_height == 1) {
+ Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width);
+ } else {
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ dst_ptr += dst_stride;
+ for (x = 0; x < src_height - 1; ++x) {
+ Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += 2 * dst_stride;
+ }
+ if (!(dst_height & 1)) {
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ }
+ }
+}
+
void ScalePlaneBilinearUp_16(int src_width,
int src_height,
int dst_width,
@@ -1627,6 +1859,17 @@ void ScalePlane(const uint8_t* src,
dst_stride, src, dst);
return;
}
+ if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) {
+ ScalePlaneUp2_Linear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return;
+ }
+ if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
+ (filtering == kFilterBilinear || filtering == kFilterBox)) {
+ ScalePlaneUp2_Bilinear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return;
+ }
if (filtering && dst_height > src_height) {
ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
@@ -1724,6 +1967,43 @@ void ScalePlane_16(const uint16_t* src,
dst_stride, src, dst);
}
+LIBYUV_API
+void ScalePlane_12(const uint16_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint16_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+ filtering);
+
+ // Negative height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+ if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) {
+ ScalePlaneUp2_16_Linear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return;
+ }
+ if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
+ (filtering == kFilterBilinear || filtering == kFilterBox)) {
+ ScalePlaneUp2_16_Bilinear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return;
+ }
+
+ ScalePlane_16(src, src_stride, src_width, src_height, dst, dst_stride,
+ dst_width, dst_height, filtering);
+}
+
// Scale an I420 image.
// This function in turn calls a scaling function for each plane.
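An aside on the vertical stepping used by ScalePlaneUp2_Linear and ScalePlaneUp2_16_Linear above (not part of the commit): the loop picks the nearest source row with 16.16 fixed-point arithmetic. The sketch below reproduces that row selection standalone, assuming FixedDiv(a, b) behaves as (a << 16) / b, which is how the loop consumes it.

// Sketch of the nearest-row selection in ScalePlaneUp2_Linear above.
// Assumption: FixedDiv(a, b) is the 16.16 fixed-point quotient (a << 16) / b.
#include <stdio.h>

int main(void) {
  const int src_height = 4, dst_height = 8;               // example 2x upscale
  int dy = ((src_height - 1) << 16) / (dst_height - 1);   // step per output row
  int y = (1 << 15) - 1;                                  // start just below +0.5
  for (int i = 0; i < dst_height; ++i) {
    printf("dst row %d reads src row %d\n", i, y >> 16);
    y += dy;
  }
  // Prints source rows 0,0,1,1,2,2,3,3: each source row feeds two output
  // rows, i.e. nearest-neighbor vertically, while each row is upsampled
  // 2x horizontally by a ScaleRowUp2_Linear_* kernel.
  return 0;
}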
diff --git a/source/scale_any.cc b/source/scale_any.cc
index c93d70c5..5fd27ae6 100644
--- a/source/scale_any.cc
+++ b/source/scale_any.cc
@@ -609,6 +609,191 @@ CANY(ScaleARGBFilterCols_Any_MSA,
#endif
#undef CANY
+// Scale up horizontally 2 times using linear filter.
+#define SUH2LANY(NAME, SIMD, C, MASK, PTYPE) \
+ void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \
+ int work_width = (dst_width - 1) & ~1; \
+ int r = work_width & MASK; \
+ int n = work_width & ~MASK; \
+ dst_ptr[0] = src_ptr[0]; \
+ if (work_width > 0) { \
+ if (n != 0) { \
+ SIMD(src_ptr, dst_ptr + 1, n); \
+ } \
+ C(src_ptr + (n / 2), dst_ptr + n + 1, r); \
+ } \
+ dst_ptr[dst_width - 1] = src_ptr[(dst_width / 2) - 1]; \
+ }
+
+// Even the C version needs to be wrapped, because the boundary pixels have
+// to be handled differently.
+
+SUH2LANY(ScaleRowUp2_Linear_Any_C,
+ ScaleRowUp2_Linear_C,
+ ScaleRowUp2_Linear_C,
+ 0,
+ uint8_t)
+
+SUH2LANY(ScaleRowUp2_Linear_16_Any_C,
+ ScaleRowUp2_Linear_16_C,
+ ScaleRowUp2_Linear_16_C,
+ 0,
+ uint16_t)
+
+#ifdef HAS_SCALECOLUP2LINEAR_SSE2
+SUH2LANY(ScaleRowUp2_Linear_Any_SSE2,
+ ScaleRowUp2_Linear_SSE2,
+ ScaleRowUp2_Linear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALECOLUP2LINEAR_SSSE3
+SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3,
+ ScaleRowUp2_Linear_SSSE3,
+ ScaleRowUp2_Linear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
+SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2,
+ ScaleRowUp2_Linear_16_SSE2,
+ ScaleRowUp2_Linear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALECOLUP2LINEAR_AVX2
+SUH2LANY(ScaleRowUp2_Linear_Any_AVX2,
+ ScaleRowUp2_Linear_AVX2,
+ ScaleRowUp2_Linear_C,
+ 31,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALECOLUP2LINEAR_16_AVX2
+SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2,
+ ScaleRowUp2_Linear_16_AVX2,
+ ScaleRowUp2_Linear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALECOLUP2LINEAR_NEON
+SUH2LANY(ScaleRowUp2_Linear_Any_NEON,
+ ScaleRowUp2_Linear_NEON,
+ ScaleRowUp2_Linear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALECOLUP2LINEAR_16_NEON
+SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON,
+ ScaleRowUp2_Linear_16_NEON,
+ ScaleRowUp2_Linear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#undef SUH2LANY
+
+// Scale up 2 times using a bilinear filter.
+// This function produces 2 output rows at a time.
+#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \
+ void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \
+ ptrdiff_t dst_stride, int dst_width) { \
+ int work_width = (dst_width - 1) & ~1; \
+ int r = work_width & MASK; \
+ int n = work_width & ~MASK; \
+ const PTYPE* sa = src_ptr; \
+ const PTYPE* sb = src_ptr + src_stride; \
+ PTYPE* da = dst_ptr; \
+ PTYPE* db = dst_ptr + dst_stride; \
+ da[0] = (3 * sa[0] + sb[0]) >> 2; \
+ db[0] = (sa[0] + 3 * sb[0]) >> 2; \
+ if (work_width > 0) { \
+ if (n != 0) { \
+ SIMD(sa, sb - sa, da + 1, db - da, n); \
+ } \
+ C(sa + (n / 2), sb - sa, da + n + 1, db - da, r); \
+ } \
+ da[dst_width - 1] = \
+ (3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2]) >> 2; \
+ db[dst_width - 1] = \
+ (sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2]) >> 2; \
+ }
+
+SU2BLANY(ScaleRowUp2_Bilinear_Any_C,
+ ScaleRowUp2_Bilinear_C,
+ ScaleRowUp2_Bilinear_C,
+ 0,
+ uint8_t)
+
+SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C,
+ ScaleRowUp2_Bilinear_16_C,
+ ScaleRowUp2_Bilinear_16_C,
+ 0,
+ uint16_t)
+
+#ifdef HAS_SCALEROWUP2LINEAR_SSE2
+SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2,
+ ScaleRowUp2_Bilinear_SSE2,
+ ScaleRowUp2_Bilinear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
+SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2,
+ ScaleRowUp2_Bilinear_16_SSE2,
+ ScaleRowUp2_Bilinear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
+SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3,
+ ScaleRowUp2_Bilinear_SSSE3,
+ ScaleRowUp2_Bilinear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2LINEAR_AVX2
+SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2,
+ ScaleRowUp2_Bilinear_AVX2,
+ ScaleRowUp2_Bilinear_C,
+ 31,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
+SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2,
+ ScaleRowUp2_Bilinear_16_AVX2,
+ ScaleRowUp2_Bilinear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2LINEAR_NEON
+SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON,
+ ScaleRowUp2_Bilinear_NEON,
+ ScaleRowUp2_Bilinear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2LINEAR_16_NEON
+SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON,
+ ScaleRowUp2_Bilinear_16_NEON,
+ ScaleRowUp2_Bilinear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#undef SU2BLANY
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/source/scale_common.cc b/source/scale_common.cc
index 81959925..f53e2de9 100644
--- a/source/scale_common.cc
+++ b/source/scale_common.cc
@@ -400,6 +400,95 @@ void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
}
}
+// sample position: (O is src sample position, X is dst sample position)
+//
+// v dst_ptr at here v stop at here
+// X O X X O X X O X X O X X O X
+// ^ src_ptr at here
+void ScaleRowUp2_Linear_C(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ int src_width = dst_width >> 1;
+ int x;
+ assert((dst_width % 2 == 0) && (dst_width >= 0));
+ for (x = 0; x < src_width; ++x) {
+ dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2;
+ dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2;
+ }
+}
+
+// sample position: (O is src sample position, X is dst sample position)
+//
+// src_ptr at here
+// X v X X X X X X X X X
+// O O O O O
+// X X X X X X X X X X
+// ^ dst_ptr at here ^ stop at here
+// X X X X X X X X X X
+// O O O O O
+// X X X X X X X X X X
+void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
+ uint8_t* d = dst_ptr;
+ uint8_t* e = dst_ptr + dst_stride;
+ int src_width = dst_width >> 1;
+ int x;
+ assert((dst_width % 2 == 0) && (dst_width >= 0));
+ for (x = 0; x < src_width; ++x) {
+ d[2 * x + 0] =
+ (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4;
+ d[2 * x + 1] =
+ (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4;
+ e[2 * x + 0] =
+ (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4;
+ e[2 * x + 1] =
+ (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4;
+ }
+}
+
+// Only suitable for at most 14-bit range.
+void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ int src_width = dst_width >> 1;
+ int x;
+ assert((dst_width % 2 == 0) && (dst_width >= 0));
+ for (x = 0; x < src_width; ++x) {
+ dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2;
+ dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2;
+ }
+}
+
+// Only suitable for at most 12-bit range.
+void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint16_t* s = src_ptr;
+ const uint16_t* t = src_ptr + src_stride;
+ uint16_t* d = dst_ptr;
+ uint16_t* e = dst_ptr + dst_stride;
+ int src_width = dst_width >> 1;
+ int x;
+ assert((dst_width % 2 == 0) && (dst_width >= 0));
+ for (x = 0; x < src_width; ++x) {
+ d[2 * x + 0] =
+ (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4;
+ d[2 * x + 1] =
+ (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4;
+ e[2 * x + 0] =
+ (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4;
+ e[2 * x + 1] =
+ (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4;
+ }
+}
+
// Scales a single row of pixels using point sampling.
void ScaleCols_C(uint8_t* dst_ptr,
const uint8_t* src_ptr,
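A small aside before the x86 kernels (not part of the commit): the "9 3 3 1 + 8" comments that recur in scale_gcc.cc and scale_neon.cc below are the 2-D bilinear weights implied by ScaleRowUp2_Bilinear_C above, i.e. the outer product of the 1-D linear weights (3, 1) / 4 applied horizontally and vertically, plus a rounding term of 8 before the shift by 4. The check below just restates that relationship.

// The 2-D weights (9,3,3,1)/16 in ScaleRowUp2_Bilinear_C are the outer
// product of the 1-D weights (3,1)/4 used by ScaleRowUp2_Linear_C.
#include <assert.h>

int main(void) {
  const int h[2] = {3, 1};  // horizontal near/far weights (divide by 4)
  const int v[2] = {3, 1};  // vertical near/far weights (divide by 4)
  int w[2][2];
  for (int r = 0; r < 2; ++r) {
    for (int c = 0; c < 2; ++c) {
      w[r][c] = v[r] * h[c];  // combined weight (divide by 16)
    }
  }
  assert(w[0][0] == 9 && w[0][1] == 3 && w[1][0] == 3 && w[1][1] == 1);
  // The "+ 8" before ">> 4" in the kernels is the rounding term (16 / 2).
  return 0;
}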
diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc
index e575ee18..cfbbba98 100644
--- a/source/scale_gcc.cc
+++ b/source/scale_gcc.cc
@@ -785,6 +785,836 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"xmm7");
}
+#ifdef HAS_SCALECOLUP2LINEAR_SSE2
+void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+
+ "pxor %%xmm0,%%xmm0 \n" // 0
+ "pcmpeqw %%xmm6,%%xmm6 \n"
+ "psrlw $15,%%xmm6 \n"
+ "psllw $1,%%xmm6 \n" // all 2
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm1 \n" // 01234567
+ "movq 1(%0),%%xmm2 \n" // 12345678
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
+ "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677
+ "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16)
+ "movdqa %%xmm2,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16)
+ "paddw %%xmm5,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm5 \n"
+ "paddw %%xmm6,%%xmm4 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16)
+ "paddw %%xmm5,%%xmm5 \n"
+ "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo)
+ "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo)
+
+ "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16)
+ "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
+ "paddw %%xmm2,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
+ "paddw %%xmm6,%%xmm1 \n"
+ "paddw %%xmm3,%%xmm3 \n"
+ "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
+ "psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
+
+ "packuswb %%xmm1,%%xmm5 \n"
+ "movdqu %%xmm5,(%1) \n"
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2LINEAR_SSE2
+void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "pxor %%xmm0,%%xmm0 \n" // 0
+ // above line
+ "movq (%0),%%xmm1 \n" // 01234567
+ "movq 1(%0),%%xmm2 \n" // 12345678
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
+ "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677
+ "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
+
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16)
+ "movdqa %%xmm2,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16)
+ "paddw %%xmm5,%%xmm4 \n" // near+far
+ "movdqa %%xmm3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16)
+ "paddw %%xmm5,%%xmm5 \n" // 2*near
+ "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo)
+
+ "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16)
+ "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
+ "paddw %%xmm2,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
+ "paddw %%xmm3,%%xmm3 \n" // 2*near
+ "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
+
+ // below line
+ "movq (%0,%3),%%xmm6 \n" // 01234567
+ "movq 1(%0,%3),%%xmm2 \n" // 12345678
+ "movdqa %%xmm6,%%xmm3 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
+ "punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677
+ "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
+
+ "movdqa %%xmm6,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16)
+ "movdqa %%xmm2,%%xmm7 \n"
+ "punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16)
+ "paddw %%xmm7,%%xmm5 \n" // near+far
+ "movdqa %%xmm3,%%xmm7 \n"
+ "punpcklbw %%xmm0,%%xmm7 \n" // 01122334 (16)
+ "paddw %%xmm7,%%xmm7 \n" // 2*near
+ "paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo)
+
+ "punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16)
+ "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
+ "paddw %%xmm6,%%xmm2 \n" // near+far
+ "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
+ "paddw %%xmm3,%%xmm3 \n" // 2*near
+ "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi)
+
+ // xmm4 xmm1
+ // xmm5 xmm2
+ "pcmpeqw %%xmm0,%%xmm0 \n"
+ "psrlw $15,%%xmm0 \n"
+ "psllw $3,%%xmm0 \n" // all 8
+
+ "movdqa %%xmm4,%%xmm3 \n"
+ "movdqa %%xmm5,%%xmm6 \n"
+ "psllw $1,%%xmm3 \n" // 6*near+2*far (1, lo)
+ "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo)
+ "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo)
+ "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrlw $4,%%xmm3 \n" // ^ div by 16
+
+ "movdqa %%xmm1,%%xmm7 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "psllw $1,%%xmm7 \n" // 6*near+2*far (1, hi)
+ "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi)
+ "paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi)
+ "paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrlw $4,%%xmm7 \n" // ^ div by 16
+
+ "packuswb %%xmm7,%%xmm3 \n"
+ "movdqu %%xmm3,(%1) \n" // save above line
+
+ "movdqa %%xmm5,%%xmm3 \n"
+ "paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo)
+ "psllw $1,%%xmm3 \n" // 6*near+2*far (2, lo)
+ "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo)
+ "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo)
+ "psrlw $4,%%xmm5 \n" // ^ div by 16
+
+ "movdqa %%xmm2,%%xmm3 \n"
+ "paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi)
+ "psllw $1,%%xmm3 \n" // 6*near+2*far (2, hi)
+ "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
+ "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi)
+ "psrlw $4,%%xmm2 \n" // ^ div by 16
+
+ "packuswb %%xmm2,%%xmm5 \n"
+ "movdqu %%xmm5,(%1,%4) \n" // save below line
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
+void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+
+ "pxor %%xmm0,%%xmm0 \n" // 0
+ "pcmpeqw %%xmm6,%%xmm6 \n"
+ "psrlw $15,%%xmm6 \n"
+ "psllw $1,%%xmm6 \n" // all 2
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm1 \n" // 01234567 (16)
+ "movdqu 2(%0),%%xmm2 \n" // 12345678 (16)
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklwd %%xmm4,%%xmm4 \n" // 00112233 (16)
+ "movdqa %%xmm2,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n" // 11223344 (16)
+ "paddw %%xmm5,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm2,%%xmm5 \n" // 01122334 (16)
+ "psllw $1,%%xmm5 \n"
+ "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo)
+ "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo)
+ "movdqu %%xmm5,(%1) \n"
+
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpckhwd %%xmm2,%%xmm3 \n" // 45566778 (16)
+ "punpckhwd %%xmm1,%%xmm1 \n" // 44556677 (16)
+ "punpckhwd %%xmm2,%%xmm2 \n" // 55667788 (16)
+ "paddw %%xmm2,%%xmm1 \n"
+ "paddw %%xmm6,%%xmm1 \n"
+ "psllw $1,%%xmm3 \n"
+ "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
+ "psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
+ "movdqu %%xmm1,0x10(%1) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
+void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+
+ "pxor %%xmm0,%%xmm0 \n" // 0
+ "pcmpeqw %%xmm7,%%xmm7 \n"
+ "psrlw $15,%%xmm7 \n"
+ "psllw $3,%%xmm7 \n" // all 8
+
+ LABELALIGN
+ "1: \n"
+ // above line
+ "movdqu (%0),%%xmm1 \n" // 01234567 (16)
+ "movdqu 2(%0),%%xmm2 \n" // 12345678 (16)
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklwd %%xmm4,%%xmm4 \n" // 00112233 (16)
+ "movdqa %%xmm2,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n" // 11223344 (16)
+ "paddw %%xmm5,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "punpcklwd %%xmm2,%%xmm5 \n" // 01122334 (16)
+ "paddw %%xmm5,%%xmm5 \n"
+ "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo)
+
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpckhwd %%xmm2,%%xmm3 \n" // 45566778 (16)
+ "punpckhwd %%xmm1,%%xmm1 \n" // 44556677 (16)
+ "punpckhwd %%xmm2,%%xmm2 \n" // 55667788 (16)
+ "paddw %%xmm2,%%xmm1 \n"
+ "paddw %%xmm3,%%xmm3 \n"
+ "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
+
+ // below line
+ "movdqu (%0,%3,2),%%xmm6 \n" // 01234567 (16)
+ "movdqu 2(%0,%3,2),%%xmm2 \n" // 12345678 (16)
+ "movdqa %%xmm6,%%xmm3 \n"
+ "punpcklwd %%xmm3,%%xmm3 \n" // 00112233 (16)
+ "movdqa %%xmm2,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n" // 11223344 (16)
+ "paddw %%xmm5,%%xmm3 \n"
+ "movdqa %%xmm6,%%xmm5 \n"
+ "punpcklwd %%xmm2,%%xmm5 \n" // 01122334 (16)
+ "paddw %%xmm5,%%xmm5 \n"
+ "paddw %%xmm3,%%xmm5 \n" // 3*near+far (2, lo)
+
+ "movdqa %%xmm6,%%xmm3 \n"
+ "punpckhwd %%xmm2,%%xmm3 \n" // 45566778 (16)
+ "punpckhwd %%xmm6,%%xmm6 \n" // 44556677 (16)
+ "punpckhwd %%xmm2,%%xmm2 \n" // 55667788 (16)
+ "paddw %%xmm6,%%xmm2 \n"
+ "paddw %%xmm3,%%xmm3 \n"
+ "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi)
+
+ // xmm4 xmm1
+ // xmm5 xmm2
+
+ "movdqa %%xmm4,%%xmm3 \n"
+ "movdqa %%xmm5,%%xmm6 \n"
+ "psllw $1,%%xmm3 \n" // 6*near+2*far (1, lo)
+ "paddw %%xmm7,%%xmm6 \n" // 3*near+far+8 (2, lo)
+ "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo)
+ "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrlw $4,%%xmm3 \n" // ^ div by 16
+ "movdqu %%xmm3,(%1) \n"
+
+ "movdqa %%xmm1,%%xmm3 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "psllw $1,%%xmm3 \n" // 6*near+2*far (1, hi)
+ "paddw %%xmm7,%%xmm6 \n" // 3*near+far+8 (2, hi)
+ "paddw %%xmm1,%%xmm3 \n" // 9*near+3*far (1, hi)
+ "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrlw $4,%%xmm3 \n" // ^ div by 16
+ "movdqu %%xmm3,0x10(%1) \n"
+
+ "movdqa %%xmm5,%%xmm3 \n"
+ "paddw %%xmm7,%%xmm4 \n" // 3*near+far+8 (1, lo)
+ "psllw $1,%%xmm3 \n" // 6*near+2*far (2, lo)
+ "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo)
+ "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "psrlw $4,%%xmm5 \n" // ^ div by 16
+ "movdqu %%xmm5,(%1,%4,2) \n"
+
+ "movdqa %%xmm2,%%xmm3 \n"
+ "paddw %%xmm7,%%xmm1 \n" // 3*near+far+8 (1, hi)
+ "psllw $1,%%xmm3 \n" // 6*near+2*far (2, hi)
+ "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
+ "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "psrlw $4,%%xmm2 \n" // ^ div by 16
+ "movdqu %%xmm2,0x10(%1,%4,2) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_SCALECOLUP2LINEAR_SSSE3
+static const uvec8 kLinearMadd31_SSSE3 = {3, 1, 1, 3, 3, 1, 1, 3,
+ 3, 1, 1, 3, 3, 1, 1, 3};
+
+void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+
+ "pcmpeqw %%xmm4,%%xmm4 \n"
+ "psrlw $15,%%xmm4 \n"
+ "psllw $1,%%xmm4 \n" // all 2
+ "movdqu %3,%%xmm3 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 01234567
+ "movq 1(%0),%%xmm1 \n" // 12345678
+ "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
+ "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
+ "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
+ "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi)
+ "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo)
+ "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
+ "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
+ "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
+ "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
+ "vpackuswb %%xmm2,%%xmm0,%%xmm0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kLinearMadd31_SSSE3) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
+void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+
+ "pcmpeqw %%xmm6,%%xmm6 \n"
+ "psrlw $15,%%xmm6 \n"
+ "psllw $3,%%xmm6 \n" // all 8
+ "movdqu %5,%%xmm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 01234567
+ "movq 1(%0),%%xmm1 \n" // 12345678
+ "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
+ "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
+ "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
+ "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi)
+ "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo)
+
+ "movq (%0,%3),%%xmm1 \n"
+ "movq 1(%0,%3),%%xmm4 \n"
+ "punpcklwd %%xmm1,%%xmm1 \n"
+ "punpcklwd %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpckhdq %%xmm4,%%xmm3 \n"
+ "punpckldq %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
+ "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
+
+ // xmm0 xmm2
+ // xmm1 xmm3
+
+ "movdqa %%xmm0,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
+ "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
+ "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
+ "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
+
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
+ "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
+ "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
+ "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
+
+ "movdqa %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm3,%%xmm1 \n"
+ "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
+ "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
+ "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
+ "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
+
+ "movdqa %%xmm3,%%xmm1 \n"
+ "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
+ "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
+ "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
+ "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
+ "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
+
+ "packuswb %%xmm0,%%xmm4 \n"
+ "movdqu %%xmm4,(%1) \n" // store above
+ "packuswb %%xmm1,%%xmm5 \n"
+ "movdqu %%xmm5,(%1,%4) \n" // store below
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kLinearMadd31_SSSE3) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_SCALECOLUP2LINEAR_AVX2
+static const lvec8 kLinearMadd31_AVX2 = {3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1,
+ 3, 3, 1, 1, 3, 3, 1, 1, 3, 3, 1,
+ 1, 3, 3, 1, 1, 3, 3, 1, 1, 3};
+
+void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+
+ "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $15,%%ymm4,%%ymm4 \n"
+ "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
+ "vmovdqu %3,%%ymm3 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
+ "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
+ "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
+ "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
+ "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 16 sample to 32 sample
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kLinearMadd31_AVX2) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2LINEAR_AVX2
+void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+
+ "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpsrlw $15,%%ymm6,%%ymm6 \n"
+ "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
+ "vmovdqu %5,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
+ "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
+
+ "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
+ "vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
+ "vpermq $0b11011000,%%ymm2,%%ymm2 \n"
+ "vpermq $0b11011000,%%ymm3,%%ymm3 \n"
+ "vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n"
+ "vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
+
+ // ymm0 ymm1
+ // ymm2 ymm3
+
+ "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
+ "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
+ "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
+ "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
+
+ "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
+ "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
+ "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
+ "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
+
+ "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
+ "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
+ "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
+
+ "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
+ "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
+ "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
+ "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
+
+ "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
+ "vmovdqu %%ymm4,(%1) \n" // store above
+ "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
+ "vmovdqu %%ymm5,(%1,%4) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 16 sample to 32 sample
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kLinearMadd31_AVX2) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_SCALECOLUP2LINEAR_16_AVX2
+static const lvec16 kLinearMadd31_16_AVX2 = {3, 1, 1, 3, 3, 1, 1, 3,
+ 3, 1, 1, 3, 3, 1, 1, 3};
+
+void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+
+ "vmovdqu %3,%%ymm3 \n"
+ "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $15,%%ymm4,%%ymm4 \n"
+ "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
+
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
+
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
+ "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
+ "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
+ "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
+ "vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
+ "vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
+ "vpackssdw %%ymm1,%%ymm0,%%ymm0 \n" // 3*near+far
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2
+ "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far
+ "vmovdqu %%ymm0,(%1) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kLinearMadd31_16_AVX2) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+
+// This version can handle the full 16-bit range, but is slower.
+void ScaleRowUp2_Linear_16_AVX2_Full(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+
+ "vmovdqu %3,%%ymm3 \n"
+ "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrld $31,%%ymm4,%%ymm4 \n"
+ "vpslld $1,%%ymm4,%%ymm4 \n" // all 2
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
+
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
+
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
+ "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
+ "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
+ "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
+ "vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
+ "vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
+ "vpaddd %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
+ "vpaddd %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
+ "vpsrad $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
+ "vpsrad $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
+ "vpackssdw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kLinearMadd31_16_AVX2) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
+void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+
+ "vmovdqu %5,%%ymm5 \n"
+ "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $15,%%ymm4,%%ymm4 \n"
+ "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8
+
+ LABELALIGN
+ "1: \n"
+
+ "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
+ "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
+ "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
+ "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
+ "vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
+ "vpmaddwd %%ymm5,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
+ "vpackssdw %%ymm1,%%ymm0,%%ymm2 \n" // 3*near+far (1)
+
+ "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b)
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
+ "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
+ "vpunpckhqdq %%ymm1,%%ymm0,%%ymm3 \n" // 2323343467677878
+ "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
+ "vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (2, lo)
+ "vpmaddwd %%ymm5,%%ymm3,%%ymm1 \n" // 3*near+far (2, hi)
+ "vpackssdw %%ymm1,%%ymm0,%%ymm3 \n" // 3*near+far (2)
+
+ "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1)
+ "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2)
+ "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1)
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1)
+ "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
+ "vmovdqu %%ymm0,(%1) \n" // store above
+
+ "vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2)
+ "vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1)
+ "vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2)
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2)
+ "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
+ "vmovdqu %%ymm0,(%1,%4,2) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kLinearMadd31_16_AVX2) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+// This version can handle the full 16-bit range, but is slower.
+void ScaleRowUp2_Bilinear_16_AVX2_Full(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+
+ "vmovdqu %5,%%ymm7 \n"
+ "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpsrld $31,%%ymm6,%%ymm6 \n"
+ "vpslld $3,%%ymm6,%%ymm6 \n" // all 8
+
+ LABELALIGN
+ "1: \n"
+
+ "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
+ "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
+ "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
+ "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
+ "vpmaddwd %%ymm7,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
+ "vpmaddwd %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
+
+ "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b)
+ "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b)
+ "vpermq $0b11011000,%%ymm2,%%ymm2 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm3,%%ymm3 \n" // 1234000056780000
+ "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" // 0101232345456767
+ "vpunpckldq %%ymm3,%%ymm3,%%ymm3 \n" // 1212343456567878
+ "vpunpckhqdq %%ymm3,%%ymm2,%%ymm4 \n" // 2323343467677878
+ "vpunpcklqdq %%ymm3,%%ymm2,%%ymm3 \n" // 0101121245455656
+ "vpmaddwd %%ymm7,%%ymm3,%%ymm2 \n" // 3*near+far (2, lo)
+ "vpmaddwd %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
+
+ "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
+ "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
+ "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
+ "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "vpsrad $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
+
+ "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
+ "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
+ "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
+ "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "vpsrad $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
+
+ "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
+ "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
+ "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "vpsrad $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
+
+ "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
+ "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
+ "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
+ "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "vpsrad $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
+
+ "vpackssdw %%ymm0,%%ymm4,%%ymm4 \n"
+ "vmovdqu %%ymm4,(%1) \n" // store above
+ "vpackssdw %%ymm2,%%ymm5,%%ymm5 \n"
+ "vmovdqu %%ymm5,(%1,%4,2) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kLinearMadd31_16_AVX2) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
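For reference, a scalar sketch of the 9:3:3:1 two-row bilinear kernel that both 16-bit AVX2 paths above compute (illustrative names, not the library's C reference verbatim). Each output sample mixes the near and far samples of two source rows with weights 9, 3, 3 and 1, adds a rounding bias of 8 and shifts right by 4, matching the "9 3 3 1 + 8" and "div by 16" comments. With full-range 16-bit input the 9*near term alone no longer fits in 16 bits, which is why the _Full variant keeps 32-bit intermediates (vpaddd/vpsrad) rather than packing back to words with vpackssdw before the vertical pass.

#include <stdint.h>

// Scalar model of the 2x2 bilinear upsample; s = near source row, t = far
// source row, d/e = the two destination rows. Assumes dst_width is even.
static void ScaleRowUp2_Bilinear_16_Sketch(const uint16_t* s, const uint16_t* t,
                                           uint16_t* d, uint16_t* e,
                                           int dst_width) {
  int x;
  for (x = 0; x < dst_width / 2; ++x) {
    uint32_t s0 = s[x], s1 = s[x + 1], t0 = t[x], t1 = t[x + 1];
    d[2 * x + 0] = (uint16_t)((9 * s0 + 3 * s1 + 3 * t0 + 1 * t1 + 8) >> 4);
    d[2 * x + 1] = (uint16_t)((3 * s0 + 9 * s1 + 1 * t0 + 3 * t1 + 8) >> 4);
    e[2 * x + 0] = (uint16_t)((3 * s0 + 1 * s1 + 9 * t0 + 3 * t1 + 8) >> 4);
    e[2 * x + 1] = (uint16_t)((1 * s0 + 3 * s1 + 3 * t0 + 9 * t1 + 8) >> 4);
  }
}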
// Reads 16xN bytes and produces 16 shorts at a time.
void ScaleAddRow_SSE2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
@@ -946,8 +1776,8 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
"x"(kFsub80), // %8
"x"(kFadd40) // %9
#else
- "m"(kFsub80), // %8
- "m"(kFadd40) // %9
+ "m"(kFsub80), // %8
+ "m"(kFadd40) // %9
#endif
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
diff --git a/source/scale_neon.cc b/source/scale_neon.cc
index 20e5b9af..51061655 100644
--- a/source/scale_neon.cc
+++ b/source/scale_neon.cc
@@ -504,6 +504,200 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
}
+void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_temp = src_ptr + 1;
+ asm volatile(
+
+ "vmov.u16 q15, #3 \n"
+
+ "1: \n"
+ "vld1.8 {d0}, [%0]! \n" // 01234567
+ "vld1.8 {d2}, [%3]! \n" // 12345678
+
+ "vmovl.u8 q0, d0 \n" // 01234567 (16b)
+ "vmovl.u8 q1, d2 \n" // 12345678 (16b)
+ "vmovq q2, q0 \n"
+ "vmla.u16 q2, q1, q15 \n" // 3*near+far (odd)
+ "vmla.u16 q1, q0, q15 \n" // 3*near+far (even)
+
+ "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (odd)
+ "vrshrn.u16 d1, q2, #2 \n" // 3/4*near+1/4*far (even)
+
+ "vst2.8 {d0, d1}, [%1]! \n" // store
+ "subs %2, %2, #16 \n" // 8 sample -> 16 sample
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_temp) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List
+ );
+}
+
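A scalar sketch of the horizontal 3:1 linear filter the loop above vectorizes (illustrative naming; the +2 bias matches the rounding done by rshrn #2). Each pair of neighbouring source samples yields two destination samples, one biased toward the left sample and one toward the right:

#include <stdint.h>

// Scalar model of the 2x linear (horizontal-only) upsample. Assumes dst_width is even.
static void ScaleRowUp2_Linear_Sketch(const uint8_t* src, uint8_t* dst, int dst_width) {
  int x;
  for (x = 0; x < dst_width / 2; ++x) {
    dst[2 * x + 0] = (uint8_t)((3 * src[x] + src[x + 1] + 2) >> 2);  // near*3 + far (even)
    dst[2 * x + 1] = (uint8_t)((src[x] + 3 * src[x + 1] + 2) >> 2);  // near + far*3 (odd)
  }
}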
+void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ uint8_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint8_t* src_temp = src_ptr + 1;
+ const uint8_t* src_temp1 = src_ptr1 + 1;
+
+ asm volatile(
+
+ "vmov.u16 q15, #3 \n"
+
+ "1: \n"
+ "vld1.8 {d0}, [%0]! \n" // 01234567
+ "vld1.8 {d2}, [%5]! \n" // 12345678
+
+ "vmovl.u8 q0, d0 \n" // 01234567 (16b)
+ "vmovl.u8 q1, d2 \n" // 12345678 (16b)
+ "vmovq q2, q0 \n"
+ "vmla.u16 q0, q1, q15 \n" // 3*near+far (1, odd)
+ "vmla.u16 q1, q2, q15 \n" // 3*near+far (1, even)
+
+ "vld1.8 {d4}, [%1]! \n" // 01234567
+ "vld1.8 {d6}, [%6]! \n" // 12345678
+
+ "vmovl.u8 q2, d4 \n" // 01234567 (16b)
+ "vmovl.u8 q3, d6 \n" // 12345678 (16b)
+ "vmovq q4, q2 \n"
+ "vmla.u16 q2, q3, q15 \n" // 3*near+far (2, odd)
+ "vmla.u16 q3, q4, q15 \n" // 3*near+far (2, even)
+
+ // e o
+ // q1 q0
+ // q3 q2
+
+ "vmovq q4, q2 \n"
+ "vmovq q5, q3 \n"
+ "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd)
+ "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even)
+ "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd)
+ "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even)
+
+ // e o
+ // q5 q4
+ // q1 q0
+
+ "vrshrn.u16 d2, q1, #4 \n" // 2, even
+ "vrshrn.u16 d3, q0, #4 \n" // 2, odd
+ "vrshrn.u16 d0, q5, #4 \n" // 1, even
+ "vrshrn.u16 d1, q4, #4 \n" // 1, odd
+
+ "vst2.8 {d0, d1}, [%2]! \n" // store
+ "vst2.8 {d2, d3}, [%3]! \n" // store
+ "subs %4, %4, #16 \n" // 8 sample -> 16 sample
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_ptr1), // %3
+ "+r"(dst_width), // %4
+ "+r"(src_temp), // %5
+ "+r"(src_temp1) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
+ "q15" // Clobber List
+ );
+}
+
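The bilinear variant consumes two source rows and produces two destination rows per call, so a plane-level caller advances one source row and two destination rows per iteration. A simplified, hypothetical driver loop is sketched below (edge rows and odd widths omitted; in the patch the real dispatch lives in scale.cc and falls back to the _Any wrappers in scale_any.cc when the width is not a multiple of 16):

#include <stddef.h>
#include <stdint.h>

// Row function added above (AArch32 NEON build).
void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride,
                               uint8_t* dst_ptr, ptrdiff_t dst_stride,
                               int dst_width);

// Hypothetical inner loop: upsample a chroma plane 2x in both directions.
// Input rows y and y+1 feed output rows 2*y+1 and 2*y+2; the first and last
// output rows would be handled separately in the real code.
static void UpsamplePlane2x_Sketch(const uint8_t* src, int src_stride,
                                   int src_width, int src_height,
                                   uint8_t* dst, int dst_stride) {
  int y;
  for (y = 0; y < src_height - 1; ++y) {
    ScaleRowUp2_Bilinear_NEON(src + y * src_stride, src_stride,
                              dst + (2 * y + 1) * dst_stride, dst_stride,
                              2 * src_width);
  }
}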
+void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ const uint16_t* src_temp = src_ptr + 1;
+ asm volatile(
+
+ "vmov.u16 q15, #3 \n"
+
+ "1: \n"
+ "vld1.16 {q1}, [%0]! \n" // 01234567 (16b)
+ "vld1.16 {q0}, [%3]! \n" // 12345678 (16b)
+
+ "vmovq q2, q0 \n"
+ "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd)
+ "vmla.u16 q1, q2, q15 \n" // 3*near+far (even)
+
+ "vrshr.u16 q0, q0, #2 \n" // 3/4*near+1/4*far (odd)
+ "vrshr.u16 q1, q1, #2 \n" // 3/4*near+1/4*far (even)
+
+ "vst2.16 {d0, d1, d2, d3}, [%1]! \n" // store
+ "subs %2, %2, #16 \n" // 8 sample -> 16 sample
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_temp) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+ uint16_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint16_t* src_temp = src_ptr + 1;
+ const uint16_t* src_temp1 = src_ptr1 + 1;
+
+ asm volatile(
+
+ "vmov.u16 q15, #3 \n"
+
+ "1: \n"
+ "add %5, %0, #2 \n"
+ "vld1.16 {q0}, [%0]! \n" // 01234567 (16b)
+ "vld1.16 {q1}, [%5]! \n" // 12345678 (16b)
+
+ "vmovq q2, q0 \n"
+ "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd)
+ "vmla.u16 q1, q2, q15 \n" // 3*near+far (even)
+
+ "add %5, %1, #2 \n"
+ "vld1.16 {q2}, [%1]! \n" // 01234567 (16b)
+ "vld1.16 {q3}, [%6]! \n" // 12345678 (16b)
+
+ "vmovq q4, q2 \n"
+ "vmla.u16 q2, q3, q15 \n" // 3*near+far (odd)
+ "vmla.u16 q3, q4, q15 \n" // 3*near+far (even)
+
+ "vmovq q4, q2 \n"
+ "vmovq q5, q3 \n"
+ "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd)
+ "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even)
+ "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd)
+ "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even)
+
+ "vrshr.u16 q2, q1, #4 \n" // 2, even
+ "vrshr.u16 q3, q0, #4 \n" // 2, odd
+ "vrshr.u16 q0, q5, #4 \n" // 1, even
+ "vrshr.u16 q1, q4, #4 \n" // 1, odd
+
+ "vst2.16 {d0, d1, d2, d3}, [%2]! \n" // store
+ "vst2.16 {d4, d5, d6, d7}, [%3]! \n" // store
+ "subs %4, %4, #16 \n" // 8 sample -> 16 sample
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_ptr1), // %3
+ "+r"(dst_width), // %4
+ "+r"(src_temp), // %5
+ "+r"(src_temp1) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
+ "q15" // Clobber List
+ );
+}
+
// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index 185591cb..514dde4c 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -535,6 +535,196 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
"v19", "v30", "v31", "memory", "cc");
}
+void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_temp = src_ptr + 1;
+ asm volatile(
+
+ "movi v31.8b, #3 \n"
+
+ "1: \n"
+ "ldr d0, [%0], #8 \n" // 01234567
+ "ldr d1, [%1], #8 \n" // 12345678
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b)
+ "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b)
+
+ "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd)
+ "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even)
+
+ "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd)
+ "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even)
+
+ "st2 {v1.8b, v2.8b}, [%2], #16 \n" // store
+ "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_temp), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ uint8_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint8_t* src_temp = src_ptr + 1;
+ const uint8_t* src_temp1 = src_ptr1 + 1;
+
+ asm volatile(
+
+ "movi v31.8b, #3 \n"
+ "movi v30.8h, #3 \n"
+
+ "1: \n"
+ "ldr d0, [%0], #8 \n" // 01234567
+ "ldr d1, [%2], #8 \n" // 12345678
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b)
+ "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b)
+ "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd)
+ "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even)
+
+ "ldr d0, [%1], #8 \n"
+ "ldr d1, [%3], #8 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v4.8h, v0.8b, #0 \n" // 01234567 (16b)
+ "ushll v5.8h, v1.8b, #0 \n" // 12345678 (16b)
+ "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd)
+ "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even)
+
+ "mov v0.8h, v4.8h \n"
+ "mov v1.8h, v5.8h \n"
+ "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd)
+ "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even)
+ "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd)
+ "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even)
+
+ "rshrn v2.8b, v2.8h, #4 \n" // 2, odd
+ "rshrn v1.8b, v3.8h, #4 \n" // 2, even
+ "rshrn v4.8b, v4.8h, #4 \n" // 1, odd
+ "rshrn v3.8b, v5.8h, #4 \n" // 1, even
+
+ "st2 {v1.8b, v2.8b}, [%5], #16 \n" // store 1
+ "st2 {v3.8b, v4.8b}, [%4], #16 \n" // store 2
+ "subs %w6, %w6, #16 \n" // 8 sample -> 16 sample
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(src_temp), // %2
+ "+r"(src_temp1), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_ptr1), // %5
+ "+r"(dst_width) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
+ "v31" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ const uint16_t* src_temp = src_ptr + 1;
+ asm volatile(
+
+ "movi v31.8h, #3 \n"
+
+ "1: \n"
+ "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b)
+ "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "mov v2.8h, v0.8h \n"
+ "mla v0.8h, v1.8h, v31.8h \n" // 3*near+far (odd)
+ "mla v1.8h, v2.8h, v31.8h \n" // 3*near+far (even)
+
+ "urshr v2.8h, v0.8h, #2 \n" // 3/4*near+1/4*far (odd)
+ "urshr v1.8h, v1.8h, #2 \n" // 3/4*near+1/4*far (even)
+
+ "st2 {v1.8h, v2.8h}, [%2], #32 \n" // store
+ "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_temp), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+ uint16_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint16_t* src_temp = src_ptr + 1;
+ const uint16_t* src_temp1 = src_ptr1 + 1;
+
+ asm volatile(
+
+ "movi v31.8h, #3 \n"
+
+ "1: \n"
+ "ld1 {v2.8h}, [%0], #16 \n" // 01234567 (16b)
+ "ld1 {v3.8h}, [%2], #16 \n" // 12345678 (16b)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "mov v0.8h, v2.8h \n"
+ "mla v2.8h, v3.8h, v31.8h \n" // 3*near+far (odd)
+ "mla v3.8h, v0.8h, v31.8h \n" // 3*near+far (even)
+
+ "ld1 {v4.8h}, [%1], #16 \n" // 01234567 (16b)
+ "ld1 {v5.8h}, [%3], #16 \n" // 12345678 (16b)
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+
+ "mov v0.8h, v4.8h \n"
+ "mla v4.8h, v5.8h, v31.8h \n" // 3*near+far (odd)
+ "mla v5.8h, v0.8h, v31.8h \n" // 3*near+far (even)
+
+ "mov v0.8h, v4.8h \n"
+ "mov v1.8h, v5.8h \n"
+ "mla v4.8h, v2.8h, v31.8h \n" // 9 3 3 1 (1, odd)
+ "mla v5.8h, v3.8h, v31.8h \n" // 9 3 3 1 (1, even)
+ "mla v2.8h, v0.8h, v31.8h \n" // 9 3 3 1 (2, odd)
+ "mla v3.8h, v1.8h, v31.8h \n" // 9 3 3 1 (2, even)
+
+ "urshr v2.8h, v2.8h, #4 \n" // 2, odd
+ "urshr v1.8h, v3.8h, #4 \n" // 2, even
+ "urshr v4.8h, v4.8h, #4 \n" // 1, odd
+ "urshr v3.8h, v5.8h, #4 \n" // 1, even
+
+ "st2 {v3.8h, v4.8h}, [%4], #32 \n" // store 1
+ "st2 {v1.8h, v2.8h}, [%5], #32 \n" // store 2
+
+ "subs %w6, %w6, #16 \n" // 8 sample -> 16 sample
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(src_temp), // %2
+ "+r"(src_temp1), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_ptr1), // %5
+ "+r"(dst_width) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v31" // Clobber List
+ );
+}
+
// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index c4ee33b1..c180811a 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -49,7 +49,8 @@ namespace libyuv {
#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
- DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF) \
+ DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
+ SRC_DEPTH) \
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
@@ -81,6 +82,16 @@ namespace libyuv {
MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \
MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
+ SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
+ SRC_T* src_u_p = reinterpret_cast<SRC_T*>(src_u + OFF); \
+ SRC_T* src_v_p = reinterpret_cast<SRC_T*>(src_v + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y_p[i] = src_y_p[i] & ((1 << SRC_DEPTH) - 1); \
+ } \
+ for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight; ++i) { \
+ src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1); \
+ src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1); \
+ } \
memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
@@ -89,9 +100,7 @@ namespace libyuv {
memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
- reinterpret_cast<SRC_T*>(src_y + OFF), kWidth, \
- reinterpret_cast<SRC_T*>(src_u + OFF), kSrcHalfWidth, \
- reinterpret_cast<SRC_T*>(src_v + OFF), kSrcHalfWidth, \
+ src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \
reinterpret_cast<DST_T*>(dst_y_c), kWidth, \
reinterpret_cast<DST_T*>(dst_u_c), kDstHalfWidth, \
reinterpret_cast<DST_T*>(dst_v_c), kDstHalfWidth, kWidth, \
@@ -99,9 +108,7 @@ namespace libyuv {
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
- reinterpret_cast<SRC_T*>(src_y + OFF), kWidth, \
- reinterpret_cast<SRC_T*>(src_u + OFF), kSrcHalfWidth, \
- reinterpret_cast<SRC_T*>(src_v + OFF), kSrcHalfWidth, \
+ src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \
reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \
reinterpret_cast<DST_T*>(dst_u_opt), kDstHalfWidth, \
reinterpret_cast<DST_T*>(dst_v_opt), kDstHalfWidth, kWidth, \
@@ -127,34 +134,39 @@ namespace libyuv {
#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
- DST_SUBSAMP_X, DST_SUBSAMP_Y) \
+ DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
- benchmark_width_ - 4, _Any, +, 0) \
+ benchmark_width_ - 4, _Any, +, 0, SRC_DEPTH) \
TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
- benchmark_width_, _Unaligned, +, 1) \
+ benchmark_width_, _Unaligned, +, 1, SRC_DEPTH) \
TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
- benchmark_width_, _Invert, -, 0) \
+ benchmark_width_, _Invert, -, 0, SRC_DEPTH) \
TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
- benchmark_width_, _Opt, +, 0)
-
-TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2)
-TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I420, uint8_t, 1, 2, 2)
-TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I420, uint8_t, 1, 2, 2)
-TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I422, uint8_t, 1, 2, 1)
-TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I444, uint8_t, 1, 1, 1)
-TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420Mirror, uint8_t, 1, 2, 2)
-TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I422, uint8_t, 1, 2, 1)
-TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I444, uint8_t, 1, 1, 1)
-TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2)
-TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2)
-TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I010, uint16_t, 2, 2, 2)
-TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2)
-TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2)
-TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2)
+ benchmark_width_, _Opt, +, 0, SRC_DEPTH)
+
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I420, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I420, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I422, uint8_t, 1, 2, 1, 8)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I444, uint8_t, 1, 1, 1, 8)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420Mirror, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I422, uint8_t, 1, 2, 1, 8)
+TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I444, uint8_t, 1, 1, 1, 8)
+TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I444, uint8_t, 1, 1, 1, 8)
+TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2, 10)
+TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 10)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I010, uint16_t, 2, 2, 2, 8)
+TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2, 10)
+TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2, 10)
+TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2, 8)
+TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I410, uint16_t, 2, 1, 1, 10)
+TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I410, uint16_t, 2, 1, 1, 10)
+TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I412, uint16_t, 2, 1, 1, 12)
+TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I412, uint16_t, 2, 1, 1, 12)
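The new SRC_DEPTH argument keeps the randomized source planes inside the nominal range of each format, so 10- and 12-bit inputs never carry out-of-range bits into the conversion under test. A minimal illustration of the masking the macro applies (hypothetical sample value, not from the test):

uint16_t v = 0xABCD;             // random 16-bit test sample
v &= (uint16_t)((1 << 10) - 1);  // for SRC_DEPTH == 10: keep bits 0..9 -> 0x03CD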
// Test Android 420 to I420
#define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \