diff options
Diffstat (limited to 'files/source/scale.cc')
-rw-r--r-- | files/source/scale.cc | 852 |
1 files changed, 749 insertions, 103 deletions
diff --git a/files/source/scale.cc b/files/source/scale.cc index ab085496..e1335f1e 100644 --- a/files/source/scale.cc +++ b/files/source/scale.cc @@ -17,6 +17,7 @@ #include "libyuv/planar_functions.h" // For CopyPlane #include "libyuv/row.h" #include "libyuv/scale_row.h" +#include "libyuv/scale_uv.h" // For UVScale #ifdef __cplusplus namespace libyuv { @@ -28,6 +29,7 @@ static __inline int Abs(int v) { } #define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) +#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) // Scale plane, 1/2 // This is an optimized version for scaling down a plane to 1/2 of @@ -49,7 +51,7 @@ static void ScalePlaneDown2(int src_width, ? ScaleRowDown2_C : (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C); - int row_stride = src_stride << 1; + int row_stride = src_stride * 2; (void)src_width; (void)src_height; if (!filtering) { @@ -118,18 +120,18 @@ static void ScalePlaneDown2(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN2_MMI) - if (TestCpuFlag(kCpuHasMMI)) { +#if defined(HAS_SCALEROWDOWN2_LSX) + if (TestCpuFlag(kCpuHasLSX)) { ScaleRowDown2 = filtering == kFilterNone - ? ScaleRowDown2_Any_MMI - : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MMI - : ScaleRowDown2Box_Any_MMI); - if (IS_ALIGNED(dst_width, 8)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MMI + ? ScaleRowDown2_Any_LSX + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_LSX + : ScaleRowDown2Box_Any_LSX); + if (IS_ALIGNED(dst_width, 32)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_LSX : (filtering == kFilterLinear - ? ScaleRowDown2Linear_MMI - : ScaleRowDown2Box_MMI); + ? ScaleRowDown2Linear_LSX + : ScaleRowDown2Box_LSX); } } #endif @@ -161,7 +163,7 @@ static void ScalePlaneDown2_16(int src_width, ? ScaleRowDown2_16_C : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C : ScaleRowDown2Box_16_C); - int row_stride = src_stride << 1; + int row_stride = src_stride * 2; (void)src_width; (void)src_height; if (!filtering) { @@ -184,14 +186,6 @@ static void ScalePlaneDown2_16(int src_width, : ScaleRowDown2Box_16_SSE2); } #endif -#if defined(HAS_SCALEROWDOWN2_16_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_MMI - : (filtering == kFilterLinear - ? ScaleRowDown2Linear_16_MMI - : ScaleRowDown2Box_16_MMI); - } -#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -221,7 +215,7 @@ static void ScalePlaneDown4(int src_width, void (*ScaleRowDown4)(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) = filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C; - int row_stride = src_stride << 2; + int row_stride = src_stride * 4; (void)src_width; (void)src_height; if (!filtering) { @@ -264,12 +258,12 @@ static void ScalePlaneDown4(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN4_MMI) - if (TestCpuFlag(kCpuHasMMI)) { +#if defined(HAS_SCALEROWDOWN4_LSX) + if (TestCpuFlag(kCpuHasLSX)) { ScaleRowDown4 = - filtering ? ScaleRowDown4Box_Any_MMI : ScaleRowDown4_Any_MMI; - if (IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_MMI : ScaleRowDown4_MMI; + filtering ? ScaleRowDown4Box_Any_LSX : ScaleRowDown4_Any_LSX; + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_LSX : ScaleRowDown4_LSX; } } #endif @@ -297,7 +291,7 @@ static void ScalePlaneDown4_16(int src_width, void (*ScaleRowDown4)(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, int dst_width) = filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C; - int row_stride = src_stride << 2; + int row_stride = src_stride * 4; (void)src_width; (void)src_height; if (!filtering) { @@ -316,11 +310,6 @@ static void ScalePlaneDown4_16(int src_width, filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2; } #endif -#if defined(HAS_SCALEROWDOWN4_16_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_MMI : ScaleRowDown4_16_MMI; - } -#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -398,6 +387,26 @@ static void ScalePlaneDown34(int src_width, } } #endif +#if defined(HAS_SCALEROWDOWN34_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_LSX; + ScaleRowDown34_1 = ScaleRowDown34_Any_LSX; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_LSX; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_LSX; + } + if (dst_width % 48 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_LSX; + ScaleRowDown34_1 = ScaleRowDown34_LSX; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_LSX; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_LSX; + } + } + } +#endif #if defined(HAS_SCALEROWDOWN34_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { if (!filtering) { @@ -613,6 +622,26 @@ static void ScalePlaneDown38(int src_width, } } #endif +#if defined(HAS_SCALEROWDOWN38_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_Any_LSX; + ScaleRowDown38_2 = ScaleRowDown38_Any_LSX; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_LSX; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_LSX; + } + if (dst_width % 12 == 0) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_LSX; + ScaleRowDown38_2 = ScaleRowDown38_LSX; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_LSX; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_LSX; + } + } + } +#endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); @@ -886,11 +915,11 @@ static void ScalePlaneBox(int src_width, } } #endif -#if defined(HAS_SCALEADDROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleAddRow = ScaleAddRow_Any_MMI; - if (IS_ALIGNED(src_width, 8)) { - ScaleAddRow = ScaleAddRow_MMI; +#if defined(HAS_SCALEADDROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ScaleAddRow = ScaleAddRow_Any_LSX; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_LSX; } } #endif @@ -898,7 +927,7 @@ static void ScalePlaneBox(int src_width, for (j = 0; j < dst_height; ++j) { int boxheight; int iy = y >> 16; - const uint8_t* src = src_ptr + iy * src_stride; + const uint8_t* src = src_ptr + iy * (int64_t)src_stride; y += dy; if (y > max_y) { y = max_y; @@ -949,15 +978,10 @@ static void ScalePlaneBox_16(int src_width, } #endif -#if defined(HAS_SCALEADDROW_16_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(src_width, 4)) { - ScaleAddRow = ScaleAddRow_16_MMI; - } -#endif for (j = 0; j < dst_height; ++j) { int boxheight; int iy = y >> 16; - const uint16_t* src = src_ptr + iy * src_stride; + const uint16_t* src = src_ptr + iy * (int64_t)src_stride; y += dy; if (y > max_y) { y = max_y; @@ -1038,11 +1062,11 @@ void ScalePlaneBilinearDown(int src_width, } } #endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_MMI; +#if defined(HAS_INTERPOLATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow = InterpolateRow_Any_LSX; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_LSX; } } #endif @@ -1068,13 +1092,21 @@ void ScalePlaneBilinearDown(int src_width, } } #endif +#if defined(HAS_SCALEFILTERCOLS_LSX) + if (TestCpuFlag(kCpuHasLSX) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_LSX; + if (IS_ALIGNED(dst_width, 16)) { + ScaleFilterCols = ScaleFilterCols_LSX; + } + } +#endif if (y > max_y) { y = max_y; } for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint8_t* src = src_ptr + yi * src_stride; + const uint8_t* src = src_ptr + yi * (int64_t)src_stride; if (filtering == kFilterLinear) { ScaleFilterCols(dst_ptr, src, dst_width, x, dx); } else { @@ -1123,7 +1155,7 @@ void ScalePlaneBilinearDown_16(int src_width, #if defined(HAS_INTERPOLATEROW_16_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_16_SSE2; + InterpolateRow = InterpolateRow_16_Any_SSE2; if (IS_ALIGNED(src_width, 16)) { InterpolateRow = InterpolateRow_16_SSE2; } @@ -1131,7 +1163,7 @@ void ScalePlaneBilinearDown_16(int src_width, #endif #if defined(HAS_INTERPOLATEROW_16_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_16_SSSE3; + InterpolateRow = InterpolateRow_16_Any_SSSE3; if (IS_ALIGNED(src_width, 16)) { InterpolateRow = InterpolateRow_16_SSSE3; } @@ -1139,7 +1171,7 @@ void ScalePlaneBilinearDown_16(int src_width, #endif #if defined(HAS_INTERPOLATEROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_16_AVX2; + InterpolateRow = InterpolateRow_16_Any_AVX2; if (IS_ALIGNED(src_width, 32)) { InterpolateRow = InterpolateRow_16_AVX2; } @@ -1147,7 +1179,7 @@ void ScalePlaneBilinearDown_16(int src_width, #endif #if defined(HAS_INTERPOLATEROW_16_NEON) if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_16_NEON; + InterpolateRow = InterpolateRow_16_Any_NEON; if (IS_ALIGNED(src_width, 16)) { InterpolateRow = InterpolateRow_16_NEON; } @@ -1165,7 +1197,7 @@ void ScalePlaneBilinearDown_16(int src_width, for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint16_t* src = src_ptr + yi * src_stride; + const uint16_t* src = src_ptr + yi * (int64_t)src_stride; if (filtering == kFilterLinear) { ScaleFilterCols(dst_ptr, src, dst_width, x, dx); } else { @@ -1258,6 +1290,14 @@ void ScalePlaneBilinearUp(int src_width, } } #endif +#if defined(HAS_SCALEFILTERCOLS_LSX) + if (filtering && TestCpuFlag(kCpuHasLSX) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_LSX; + if (IS_ALIGNED(dst_width, 16)) { + ScaleFilterCols = ScaleFilterCols_LSX; + } + } +#endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleFilterCols = ScaleColsUp2_C; #if defined(HAS_SCALECOLS_SSE2) @@ -1265,11 +1305,6 @@ void ScalePlaneBilinearUp(int src_width, ScaleFilterCols = ScaleColsUp2_SSE2; } #endif -#if defined(HAS_SCALECOLS_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { - ScaleFilterCols = ScaleColsUp2_MMI; - } -#endif } if (y > max_y) { @@ -1277,7 +1312,7 @@ void ScalePlaneBilinearUp(int src_width, } { int yi = y >> 16; - const uint8_t* src = src_ptr + yi * src_stride; + const uint8_t* src = src_ptr + yi * (int64_t)src_stride; // Allocate 2 row buffers. const int kRowSize = (dst_width + 31) & ~31; @@ -1292,7 +1327,9 @@ void ScalePlaneBilinearUp(int src_width, src += src_stride; } ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); - src += src_stride; + if (src_height > 2) { + src += src_stride; + } for (j = 0; j < dst_height; ++j) { yi = y >> 16; @@ -1300,14 +1337,16 @@ void ScalePlaneBilinearUp(int src_width, if (y > max_y) { y = max_y; yi = y >> 16; - src = src_ptr + yi * src_stride; + src = src_ptr + yi * (int64_t)src_stride; } if (yi != lasty) { ScaleFilterCols(rowptr, src, dst_width, x, dx); rowptr += rowstride; rowstride = -rowstride; lasty = yi; - src += src_stride; + if ((y + 65536) < max_y) { + src += src_stride; + } } } if (filtering == kFilterLinear) { @@ -1323,6 +1362,327 @@ void ScalePlaneBilinearUp(int src_width, } } +// Scale plane, horizontally up by 2 times. +// Uses linear filter horizontally, nearest vertically. +// This is an optimized version for scaling up a plane to 2 times of +// its original width, using linear interpolation. +// This is used to scale U and V planes of I422 to I444. +void ScalePlaneUp2_Linear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { + void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) = + ScaleRowUp2_Linear_Any_C; + int i; + int y; + int dy; + + // This function can only scale up by 2 times horizontally. + assert(src_width == ((dst_width + 1) / 2)); + +#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_NEON; + } +#endif + + if (dst_height == 1) { + ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr, + dst_width); + } else { + dy = FixedDiv(src_height - 1, dst_height - 1); + y = (1 << 15) - 1; + for (i = 0; i < dst_height; ++i) { + ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width); + dst_ptr += dst_stride; + y += dy; + } + } +} + +// Scale plane, up by 2 times. +// This is an optimized version for scaling up a plane to 2 times of +// its original size, using bilinear interpolation. +// This is used to scale U and V planes of I420 to I444. +void ScalePlaneUp2_Bilinear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { + void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleRowUp2_Bilinear_Any_C; + int x; + + // This function can only scale up by 2 times. + assert(src_width == ((dst_width + 1) / 2)); + assert(src_height == ((dst_height + 1) / 2)); + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON; + } +#endif + + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + // TODO(fbarchard): Test performance of writing one row of destination at a + // time. + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } +} + +// Scale at most 14 bit plane, horizontally up by 2 times. +// This is an optimized version for scaling up a plane to 2 times of +// its original width, using linear interpolation. +// stride is in count of uint16_t. +// This is used to scale U and V planes of I210 to I410 and I212 to I412. +void ScalePlaneUp2_12_Linear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + int i; + int y; + int dy; + + // This function can only scale up by 2 times horizontally. + assert(src_width == ((dst_width + 1) / 2)); + +#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_12_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON; + } +#endif + + if (dst_height == 1) { + ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr, + dst_width); + } else { + dy = FixedDiv(src_height - 1, dst_height - 1); + y = (1 << 15) - 1; + for (i = 0; i < dst_height; ++i) { + ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width); + dst_ptr += dst_stride; + y += dy; + } + } +} + +// Scale at most 12 bit plane, up by 2 times. +// This is an optimized version for scaling up a plane to 2 times of +// its original size, using bilinear interpolation. +// stride is in count of uint16_t. +// This is used to scale U and V planes of I010 to I410 and I012 to I412. +void ScalePlaneUp2_12_Bilinear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleRowUp2_Bilinear_16_Any_C; + int x; + + // This function can only scale up by 2 times. + assert(src_width == ((dst_width + 1) / 2)); + assert(src_height == ((dst_height + 1) / 2)); + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON; + } +#endif + + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } +} + +void ScalePlaneUp2_16_Linear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + int i; + int y; + int dy; + + // This function can only scale up by 2 times horizontally. + assert(src_width == ((dst_width + 1) / 2)); + +#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON; + } +#endif + + if (dst_height == 1) { + ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr, + dst_width); + } else { + dy = FixedDiv(src_height - 1, dst_height - 1); + y = (1 << 15) - 1; + for (i = 0; i < dst_height; ++i) { + ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width); + dst_ptr += dst_stride; + y += dy; + } + } +} + +void ScalePlaneUp2_16_Bilinear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleRowUp2_Bilinear_16_Any_C; + int x; + + // This function can only scale up by 2 times. + assert(src_width == ((dst_width + 1) / 2)); + assert(src_height == ((dst_height + 1) / 2)); + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_NEON; + } +#endif + + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } +} + void ScalePlaneBilinearUp_16(int src_width, int src_height, int dst_width, @@ -1351,7 +1711,7 @@ void ScalePlaneBilinearUp_16(int src_width, #if defined(HAS_INTERPOLATEROW_16_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_16_SSE2; + InterpolateRow = InterpolateRow_16_Any_SSE2; if (IS_ALIGNED(dst_width, 16)) { InterpolateRow = InterpolateRow_16_SSE2; } @@ -1359,7 +1719,7 @@ void ScalePlaneBilinearUp_16(int src_width, #endif #if defined(HAS_INTERPOLATEROW_16_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_16_SSSE3; + InterpolateRow = InterpolateRow_16_Any_SSSE3; if (IS_ALIGNED(dst_width, 16)) { InterpolateRow = InterpolateRow_16_SSSE3; } @@ -1367,7 +1727,7 @@ void ScalePlaneBilinearUp_16(int src_width, #endif #if defined(HAS_INTERPOLATEROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_16_AVX2; + InterpolateRow = InterpolateRow_16_Any_AVX2; if (IS_ALIGNED(dst_width, 32)) { InterpolateRow = InterpolateRow_16_AVX2; } @@ -1375,7 +1735,7 @@ void ScalePlaneBilinearUp_16(int src_width, #endif #if defined(HAS_INTERPOLATEROW_16_NEON) if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_16_NEON; + InterpolateRow = InterpolateRow_16_Any_NEON; if (IS_ALIGNED(dst_width, 16)) { InterpolateRow = InterpolateRow_16_NEON; } @@ -1397,19 +1757,13 @@ void ScalePlaneBilinearUp_16(int src_width, ScaleFilterCols = ScaleColsUp2_16_SSE2; } #endif -#if defined(HAS_SCALECOLS_16_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { - ScaleFilterCols = ScaleColsUp2_16_MMI; - } -#endif } - if (y > max_y) { y = max_y; } { int yi = y >> 16; - const uint16_t* src = src_ptr + yi * src_stride; + const uint16_t* src = src_ptr + yi * (int64_t)src_stride; // Allocate 2 row buffers. const int kRowSize = (dst_width + 31) & ~31; @@ -1424,7 +1778,9 @@ void ScalePlaneBilinearUp_16(int src_width, src += src_stride; } ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); - src += src_stride; + if (src_height > 2) { + src += src_stride; + } for (j = 0; j < dst_height; ++j) { yi = y >> 16; @@ -1432,14 +1788,16 @@ void ScalePlaneBilinearUp_16(int src_width, if (y > max_y) { y = max_y; yi = y >> 16; - src = src_ptr + yi * src_stride; + src = src_ptr + yi * (int64_t)src_stride; } if (yi != lasty) { ScaleFilterCols(rowptr, src, dst_width, x, dx); rowptr += rowstride; rowstride = -rowstride; lasty = yi; - src += src_stride; + if ((y + 65536) < max_y) { + src += src_stride; + } } } if (filtering == kFilterLinear) { @@ -1487,15 +1845,11 @@ static void ScalePlaneSimple(int src_width, ScaleCols = ScaleColsUp2_SSE2; } #endif -#if defined(HAS_SCALECOLS_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { - ScaleCols = ScaleColsUp2_MMI; - } -#endif } for (i = 0; i < dst_height; ++i) { - ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx); + ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x, + dx); dst_ptr += dst_stride; y += dy; } @@ -1528,15 +1882,11 @@ static void ScalePlaneSimple_16(int src_width, ScaleCols = ScaleColsUp2_16_SSE2; } #endif -#if defined(HAS_SCALECOLS_16_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { - ScaleCols = ScaleColsUp2_16_MMI; - } -#endif } for (i = 0; i < dst_height; ++i) { - ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx); + ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x, + dx); dst_ptr += dst_stride; y += dy; } @@ -1544,7 +1894,6 @@ static void ScalePlaneSimple_16(int src_width, // Scale a plane. // This function dispatches to a specialized scaler based on scale factor. - LIBYUV_API void ScalePlane(const uint8_t* src, int src_stride, @@ -1562,10 +1911,9 @@ void ScalePlane(const uint8_t* src, // Negative height means invert the image. if (src_height < 0) { src_height = -src_height; - src = src + (src_height - 1) * src_stride; + src = src + (src_height - 1) * (int64_t)src_stride; src_stride = -src_stride; } - // Use specialized scales to improve performance for common resolutions. // For example, all the 1/2 scalings will use ScalePlaneDown2() if (dst_width == src_width && dst_height == src_height) { @@ -1574,10 +1922,19 @@ void ScalePlane(const uint8_t* src, return; } if (dst_width == src_width && filtering != kFilterBox) { - int dy = FixedDiv(src_height, dst_height); + int dy = 0; + int y = 0; + // When scaling down, use the center 2 rows to filter. + // When scaling up, last row of destination uses the last 2 source rows. + if (dst_height <= src_height) { + dy = FixedDiv(src_height, dst_height); + y = CENTERSTART(dy, -32768); // Subtract 0.5 (32768) to center filter. + } else if (src_height > 1 && dst_height > 1) { + dy = FixedDiv1(src_height, dst_height); + } // Arbitrary scale vertically, but unscaled horizontally. ScalePlaneVertical(src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, 0, 0, dy, 1, filtering); + dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering); return; } if (dst_width <= Abs(src_width) && dst_height <= src_height) { @@ -1614,6 +1971,17 @@ void ScalePlane(const uint8_t* src, dst_stride, src, dst); return; } + if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { + ScalePlaneUp2_Linear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && + (filtering == kFilterBilinear || filtering == kFilterBox)) { + ScalePlaneUp2_Bilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } if (filtering && dst_height > src_height) { ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); @@ -1645,10 +2013,9 @@ void ScalePlane_16(const uint16_t* src, // Negative height means invert the image. if (src_height < 0) { src_height = -src_height; - src = src + (src_height - 1) * src_stride; + src = src + (src_height - 1) * (int64_t)src_stride; src_stride = -src_stride; } - // Use specialized scales to improve performance for common resolutions. // For example, all the 1/2 scalings will use ScalePlaneDown2() if (dst_width == src_width && dst_height == src_height) { @@ -1657,10 +2024,22 @@ void ScalePlane_16(const uint16_t* src, return; } if (dst_width == src_width && filtering != kFilterBox) { - int dy = FixedDiv(src_height, dst_height); - // Arbitrary scale vertically, but unscaled vertically. + int dy = 0; + int y = 0; + // When scaling down, use the center 2 rows to filter. + // When scaling up, last row of destination uses the last 2 source rows. + if (dst_height <= src_height) { + dy = FixedDiv(src_height, dst_height); + y = CENTERSTART(dy, -32768); // Subtract 0.5 (32768) to center filter. + // When scaling up, ensure the last row of destination uses the last + // source. Avoid divide by zero for dst_height but will do no scaling + // later. + } else if (src_height > 1 && dst_height > 1) { + dy = FixedDiv1(src_height, dst_height); + } + // Arbitrary scale vertically, but unscaled horizontally. ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, 0, 0, dy, 1, filtering); + dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering); return; } if (dst_width <= Abs(src_width) && dst_height <= src_height) { @@ -1697,6 +2076,17 @@ void ScalePlane_16(const uint16_t* src, dst_stride, src, dst); return; } + if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { + ScalePlaneUp2_16_Linear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && + (filtering == kFilterBilinear || filtering == kFilterBox)) { + ScalePlaneUp2_16_Bilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } if (filtering && dst_height > src_height) { ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); @@ -1711,6 +2101,43 @@ void ScalePlane_16(const uint16_t* src, dst_stride, src, dst); } +LIBYUV_API +void ScalePlane_12(const uint16_t* src, + int src_stride, + int src_width, + int src_height, + uint16_t* dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering) { + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); + + // Negative height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * (int64_t)src_stride; + src_stride = -src_stride; + } + + if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { + ScalePlaneUp2_12_Linear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && + (filtering == kFilterBilinear || filtering == kFilterBox)) { + ScalePlaneUp2_12_Bilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + + ScalePlane_16(src, src_stride, src_width, src_height, dst, dst_stride, + dst_width, dst_height, filtering); +} + // Scale an I420 image. // This function in turn calls a scaling function for each plane. @@ -1736,7 +2163,8 @@ int I420Scale(const uint8_t* src_y, int src_halfheight = SUBSAMPLE(src_height, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); - if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { return -1; @@ -1773,7 +2201,8 @@ int I420Scale_16(const uint16_t* src_y, int src_halfheight = SUBSAMPLE(src_height, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); - if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { return -1; @@ -1788,6 +2217,44 @@ int I420Scale_16(const uint16_t* src_y, return 0; } +LIBYUV_API +int I420Scale_12(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane_12(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, + dst_stride_u, dst_halfwidth, dst_halfheight, filtering); + ScalePlane_12(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, + dst_stride_v, dst_halfwidth, dst_halfheight, filtering); + return 0; +} + // Scale an I444 image. // This function in turn calls a scaling function for each plane. @@ -1809,7 +2276,7 @@ int I444Scale(const uint8_t* src_y, int dst_width, int dst_height, enum FilterMode filtering) { - if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { return -1; @@ -1842,7 +2309,7 @@ int I444Scale_16(const uint16_t* src_y, int dst_width, int dst_height, enum FilterMode filtering) { - if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { return -1; @@ -1857,6 +2324,185 @@ int I444Scale_16(const uint16_t* src_y, return 0; } +LIBYUV_API +int I444Scale_12(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane_12(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u, + dst_width, dst_height, filtering); + ScalePlane_12(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v, + dst_width, dst_height, filtering); + return 0; +} + +// Scale an I422 image. +// This function in turn calls a scaling function for each plane. + +LIBYUV_API +int I422Scale(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane(src_u, src_stride_u, src_halfwidth, src_height, dst_u, + dst_stride_u, dst_halfwidth, dst_height, filtering); + ScalePlane(src_v, src_stride_v, src_halfwidth, src_height, dst_v, + dst_stride_v, dst_halfwidth, dst_height, filtering); + return 0; +} + +LIBYUV_API +int I422Scale_16(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_height, dst_u, + dst_stride_u, dst_halfwidth, dst_height, filtering); + ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_height, dst_v, + dst_stride_v, dst_halfwidth, dst_height, filtering); + return 0; +} + +LIBYUV_API +int I422Scale_12(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane_12(src_u, src_stride_u, src_halfwidth, src_height, dst_u, + dst_stride_u, dst_halfwidth, dst_height, filtering); + ScalePlane_12(src_v, src_stride_v, src_halfwidth, src_height, dst_v, + dst_stride_v, dst_halfwidth, dst_height, filtering); + return 0; +} + +// Scale an NV12 image. +// This function in turn calls a scaling function for each plane. + +LIBYUV_API +int NV12Scale(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + int src_width, + int src_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + + if (!src_y || !src_uv || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_uv || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + UVScale(src_uv, src_stride_uv, src_halfwidth, src_halfheight, dst_uv, + dst_stride_uv, dst_halfwidth, dst_halfheight, filtering); + return 0; +} + // Deprecated api LIBYUV_API int Scale(const uint8_t* src_y, |