 README.chromium            |   2
 include/libyuv/row.h       |  26
 include/libyuv/version.h   |   2
 source/row_any.cc          | 102
 source/row_common.cc       | 128
 source/row_gcc.cc          | 156
 source/row_neon64.cc       |  80
 source/scale_common.cc     |  24
 8 files changed, 339 insertions, 181 deletions
diff --git a/README.chromium b/README.chromium
index e8cc1e8d..56dd4cba 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1831
+Version: 1832
 License: BSD
 License File: LICENSE
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index f15b58fa..1a1cf4b6 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -348,6 +348,7 @@ extern "C" {
 #define HAS_AR64TOARGBROW_AVX2
 #define HAS_AB64TOARGBROW_AVX2
 #define HAS_CONVERT16TO8ROW_AVX2
+#define HAS_INTERPOLATEROW_16TO8_AVX2
 #define HAS_CONVERT8TO16ROW_AVX2
 #define HAS_DIVIDEROW_16_AVX2
 #define HAS_HALFMERGEUVROW_AVX2
@@ -539,6 +540,7 @@ extern "C" {
 
 // The following are available on AArch64 platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_INTERPOLATEROW_16TO8_NEON
 #define HAS_SCALESUMSAMPLES_NEON
 #define HAS_GAUSSROW_F32_NEON
 #define HAS_GAUSSCOL_F32_NEON
@@ -5221,6 +5223,30 @@ void InterpolateRow_16To8_C(uint8_t* dst_ptr,
                             int scale,
                             int width,
                             int source_y_fraction);
+void InterpolateRow_16To8_NEON(uint8_t* dst_ptr,
+                               const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               int scale,
+                               int width,
+                               int source_y_fraction);
+void InterpolateRow_16To8_Any_NEON(uint8_t* dst_ptr,
+                                   const uint16_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   int scale,
+                                   int width,
+                                   int source_y_fraction);
+void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
+                               const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               int scale,
+                               int width,
+                               int source_y_fraction);
+void InterpolateRow_16To8_Any_AVX2(uint8_t* dst_ptr,
+                                   const uint16_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   int scale,
+                                   int width,
+                                   int source_y_fraction);
 
 // Sobel images.
 void SobelXRow_C(const uint8_t* src_y0,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 45a200b1..8ef3493b 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1831
+#define LIBYUV_VERSION 1832
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
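For orientation (not part of the commit), here is a minimal sketch of how the new 16-to-8-bit interpolate row functions are intended to be called, using the existing C reference version whose declaration appears just above the new prototypes. The 10-bit sample values and the 3:1 blend fraction are made up for illustration:

  #include <stdint.h>
  #include "libyuv/row.h"  /* declares InterpolateRow_16To8_C and friends */

  static void BlendTwo10BitRowsTo8Bit(void) {
    /* Two adjacent 10-bit rows stored back to back: row 1 = row 0 + stride. */
    uint16_t rows[2][8] = {{0, 128, 256, 384, 512, 640, 768, 1023},
                           {1023, 768, 640, 512, 384, 256, 128, 0}};
    uint8_t out[8];
    /* scale 16384 selects 10-bit input per the 32768/16384/4096/256 table
       that appears later in the diff; source_y_fraction 64 blends
       3/4 of row 0 with 1/4 of row 1. */
    InterpolateRow_16To8_C(out, rows[0], /*src_stride=*/8, /*scale=*/16384,
                           /*width=*/8, /*source_y_fraction=*/64);
  }

Note that src_stride is counted in uint16_t elements (the second row is read from src_ptr + src_stride), which is why a stride of 8 pairs up the two rows above.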
diff --git a/source/row_any.cc b/source/row_any.cc
index 17a7cdde..3781a9f2 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -1625,47 +1625,101 @@ ANY11C(UYVYToARGBRow_Any_LSX, UYVYToARGBRow_LSX, 1, 4, 4, 7)
 #undef ANY11C
 
 // Any 1 to 1 interpolate.  Takes 2 rows of source via stride.
-#define ANY11I(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK)                          \
-  void NAMEANY(T* dst_ptr, const T* src_ptr, ptrdiff_t src_stride, int width, \
-               int source_y_fraction) {                                       \
-    SIMD_ALIGNED(T temp[64 * 3]);                                              \
-    memset(temp, 0, 64 * 2 * sizeof(T)); /* for msan */                       \
-    int r = width & MASK;                                                      \
-    int n = width & ~MASK;                                                     \
-    if (n > 0) {                                                               \
-      ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction);            \
-    }                                                                          \
-    memcpy(temp, src_ptr + n * SBPP, r * SBPP * sizeof(T));                    \
-    if (source_y_fraction) {                                                   \
-      memcpy(temp + 64, src_ptr + src_stride + n * SBPP,                       \
-             r * SBPP * sizeof(T));                                            \
-    }                                                                          \
-    ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction);               \
-    memcpy(dst_ptr + n * BPP, temp + 128, r * BPP * sizeof(T));                \
+#define ANY11I(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK)                  \
+  void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride,        \
+               int width, int source_y_fraction) {                          \
+    SIMD_ALIGNED(TS temps[64 * 2]);                                         \
+    SIMD_ALIGNED(TD tempd[64]);                                             \
+    memset(temps, 0, sizeof(temps)); /* for msan */                         \
+    int r = width & MASK;                                                   \
+    int n = width & ~MASK;                                                  \
+    if (n > 0) {                                                            \
+      ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction);         \
+    }                                                                       \
+    memcpy(temps, src_ptr + n * SBPP, r * SBPP * sizeof(TS));               \
+    if (source_y_fraction) {                                                \
+      memcpy(temps + 64, src_ptr + src_stride + n * SBPP,                   \
+             r * SBPP * sizeof(TS));                                        \
+    }                                                                       \
+    ANY_SIMD(tempd, temps, 64, MASK + 1, source_y_fraction);                \
+    memcpy(dst_ptr + n * BPP, tempd, r * BPP * sizeof(TD));                 \
   }
 
 #ifdef HAS_INTERPOLATEROW_AVX2
-ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, uint8_t, 1, 1, 31)
+ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, uint8_t, uint8_t, 1, 1, 31)
 #endif
 #ifdef HAS_INTERPOLATEROW_SSSE3
-ANY11I(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, uint8_t, 1, 1, 15)
+ANY11I(InterpolateRow_Any_SSSE3,
+       InterpolateRow_SSSE3,
+       uint8_t,
+       uint8_t,
+       1,
+       1,
+       15)
 #endif
 #ifdef HAS_INTERPOLATEROW_NEON
-ANY11I(InterpolateRow_Any_NEON, InterpolateRow_NEON, uint8_t, 1, 1, 15)
+ANY11I(InterpolateRow_Any_NEON, InterpolateRow_NEON, uint8_t, uint8_t, 1, 1, 15)
 #endif
 #ifdef HAS_INTERPOLATEROW_MSA
-ANY11I(InterpolateRow_Any_MSA, InterpolateRow_MSA, uint8_t, 1, 1, 31)
+ANY11I(InterpolateRow_Any_MSA, InterpolateRow_MSA, uint8_t, uint8_t, 1, 1, 31)
 #endif
 #ifdef HAS_INTERPOLATEROW_LSX
-ANY11I(InterpolateRow_Any_LSX, InterpolateRow_LSX, uint8_t, 1, 1, 31)
+ANY11I(InterpolateRow_Any_LSX, InterpolateRow_LSX, uint8_t, uint8_t, 1, 1, 31)
 #endif
 #ifdef HAS_INTERPOLATEROW_16_NEON
-ANY11I(InterpolateRow_16_Any_NEON, InterpolateRow_16_NEON, uint16_t, 1, 1, 7)
+ANY11I(InterpolateRow_16_Any_NEON,
+       InterpolateRow_16_NEON,
+       uint16_t,
+       uint16_t,
+       1,
+       1,
+       7)
 #endif
-
 #undef ANY11I
 
+// Any 1 to 1 interpolate with scale param
+#define ANY11IS(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK)                 \
+  void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride,        \
+               int scale, int width, int source_y_fraction) {               \
+    SIMD_ALIGNED(TS temps[64 * 2]);                                         \
+    SIMD_ALIGNED(TD tempd[64]);                                             \
+    memset(temps, 0, sizeof(temps)); /* for msan */                         \
+    int r = width & MASK;                                                   \
+    int n = width & ~MASK;                                                  \
+    if (n > 0) {                                                            \
+      ANY_SIMD(dst_ptr, src_ptr, src_stride, scale, n, source_y_fraction);  \
+    }                                                                       \
+    memcpy(temps, src_ptr + n * SBPP, r * SBPP * sizeof(TS));               \
+    if (source_y_fraction) {                                                \
+      memcpy(temps + 64, src_ptr + src_stride + n * SBPP,                   \
+             r * SBPP * sizeof(TS));                                        \
+    }                                                                       \
+    ANY_SIMD(tempd, temps, 64, scale, MASK + 1, source_y_fraction);         \
+    memcpy(dst_ptr + n * BPP, tempd, r * BPP * sizeof(TD));                 \
+  }
+
+#ifdef HAS_INTERPOLATEROW_16TO8_NEON
+ANY11IS(InterpolateRow_16To8_Any_NEON,
+        InterpolateRow_16To8_NEON,
+        uint8_t,
+        uint16_t,
+        1,
+        1,
+        7)
+#endif
+#ifdef HAS_INTERPOLATEROW_16TO8_AVX2
+ANY11IS(InterpolateRow_16To8_Any_AVX2,
+        InterpolateRow_16To8_AVX2,
+        uint8_t,
+        uint16_t,
+        1,
+        1,
+        31)
+#endif
+
+#undef ANY11IS
+
 // Any 1 to 1 mirror.
 #define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK)                                \
   void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) {      \
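ANY11IS follows the same pattern as the reworked ANY11I above, but threads the extra scale argument through and keeps separate source and destination scratch buffers so the destination type (uint8_t) can differ from the source type (uint16_t). Roughly, the NEON instantiation expands to something like the following hand-written approximation (a sketch, not the literal macro output: SIMD_ALIGNED is dropped and the kernel is simply declared extern):

  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>

  /* The AArch64 kernel from row_neon64.cc, assumed to be linked in. */
  extern void InterpolateRow_16To8_NEON(uint8_t* dst, const uint16_t* src,
                                        ptrdiff_t stride, int scale, int width,
                                        int fraction);

  void InterpolateRow_16To8_Any_NEON_sketch(uint8_t* dst_ptr,
                                            const uint16_t* src_ptr,
                                            ptrdiff_t src_stride, int scale,
                                            int width, int source_y_fraction) {
    uint16_t temps[64 * 2];  /* the real macro wraps these in SIMD_ALIGNED() */
    uint8_t tempd[64];
    memset(temps, 0, sizeof(temps)); /* for msan */
    int r = width & 7;   /* leftover pixels (MASK == 7 for the NEON kernel) */
    int n = width & ~7;  /* widest multiple of 8 */
    if (n > 0) {
      InterpolateRow_16To8_NEON(dst_ptr, src_ptr, src_stride, scale, n,
                                source_y_fraction);
    }
    /* Copy the tail of both source rows into scratch, run one full 8-pixel
       block there, then copy back only the r valid results. */
    memcpy(temps, src_ptr + n, r * sizeof(uint16_t));
    if (source_y_fraction) {
      memcpy(temps + 64, src_ptr + src_stride + n, r * sizeof(uint16_t));
    }
    InterpolateRow_16To8_NEON(tempd, temps, 64, scale, 8, source_y_fraction);
    memcpy(dst_ptr + n, tempd, r * sizeof(uint8_t));
  }

The zeroed scratch rows are what keep msan happy while the kernel processes a full 8-pixel block that extends past the real tail data.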
diff --git a/source/row_common.cc b/source/row_common.cc
index 2c9a35f4..150f48db 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -2985,6 +2985,9 @@ void DivideRow_16_C(const uint16_t* src_y,
 // 16384 = 10 bits
 // 4096 = 12 bits
 // 256 = 16 bits
+// TODO(fbarchard): change scale to bits
+#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16)
+
 void Convert16To8Row_C(const uint16_t* src_y,
                        uint8_t* dst_y,
                        int scale,
@@ -2994,7 +2997,7 @@ void Convert16To8Row_C(const uint16_t* src_y,
   assert(scale <= 32768);
 
   for (x = 0; x < width; ++x) {
-    dst_y[x] = clamp255((src_y[x] * scale) >> 16);
+    dst_y[x] = C16TO8(src_y[x], scale);
   }
 }
 
@@ -3411,8 +3414,7 @@ static void HalfRow_16To8_C(const uint16_t* src_uv,
                             int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    dst_uv[x] = clamp255(
-        (((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1) * scale) >> 16);
+    dst_uv[x] = C16TO8((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1, scale);
   }
 }
 
@@ -3426,6 +3428,9 @@ void InterpolateRow_C(uint8_t* dst_ptr,
   int y0_fraction = 256 - y1_fraction;
   const uint8_t* src_ptr1 = src_ptr + src_stride;
   int x;
+  assert(source_y_fraction >= 0);
+  assert(source_y_fraction < 256);
+
   if (y1_fraction == 0) {
     memcpy(dst_ptr, src_ptr, width);
     return;
@@ -3434,18 +3439,42 @@ void InterpolateRow_C(uint8_t* dst_ptr,
     HalfRow_C(src_ptr, src_stride, dst_ptr, width);
     return;
   }
-  for (x = 0; x < width - 1; x += 2) {
+  for (x = 0; x < width; ++x) {
     dst_ptr[0] =
         (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
-    dst_ptr[1] =
-        (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8;
-    src_ptr += 2;
-    src_ptr1 += 2;
-    dst_ptr += 2;
+    ++src_ptr;
+    ++src_ptr1;
+    ++dst_ptr;
   }
-  if (width & 1) {
+}
+
+// C version 2x2 -> 2x1.
+void InterpolateRow_16_C(uint16_t* dst_ptr,
+                         const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         int width,
+                         int source_y_fraction) {
+  int y1_fraction = source_y_fraction;
+  int y0_fraction = 256 - y1_fraction;
+  const uint16_t* src_ptr1 = src_ptr + src_stride;
+  int x;
+  assert(source_y_fraction >= 0);
+  assert(source_y_fraction < 256);
+
+  if (y1_fraction == 0) {
+    memcpy(dst_ptr, src_ptr, width * 2);
+    return;
+  }
+  if (y1_fraction == 128) {
+    HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
+    return;
+  }
+  for (x = 0; x < width; ++x) {
     dst_ptr[0] =
         (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
+    ++src_ptr;
+    ++src_ptr1;
+    ++dst_ptr;
   }
 }
 
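The new C16TO8 helper centralizes the conversion now shared by Convert16To8Row_C, HalfRow_16To8_C and the rewritten InterpolateRow_16To8_C. For samples stored in the low bits of a uint16_t, passing scale = 1 << (24 - bits) lines the most significant bits up with the 8-bit output, and clamp255 absorbs any overshoot. A small standalone check of the table in the comment (clamp255 is re-declared here only so the sketch compiles on its own):

  #include <stdio.h>

  static int clamp255(int v) { return v > 255 ? 255 : (v < 0 ? 0 : v); }
  #define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16)

  int main(void) {
    printf("%d\n", C16TO8(511, 32768));   /* 9-bit max   -> 255 */
    printf("%d\n", C16TO8(1023, 16384));  /* 10-bit max  -> 255 */
    printf("%d\n", C16TO8(4095, 4096));   /* 12-bit max  -> 255 */
    printf("%d\n", C16TO8(65535, 256));   /* 16-bit max  -> 255 */
    printf("%d\n", C16TO8(512, 16384));   /* 10-bit mid  -> 128 */
    return 0;
  }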
@@ -3455,6 +3484,8 @@ void InterpolateRow_C(uint8_t* dst_ptr,
 // 16384 = 10 bits
 // 4096 = 12 bits
 // 256 = 16 bits
+// TODO(fbarchard): change scale to bits
+
 void InterpolateRow_16To8_C(uint8_t* dst_ptr,
                             const uint16_t* src_ptr,
                             ptrdiff_t src_stride,
@@ -3465,6 +3496,9 @@ void InterpolateRow_16To8_C(uint8_t* dst_ptr,
   int y0_fraction = 256 - y1_fraction;
   const uint16_t* src_ptr1 = src_ptr + src_stride;
   int x;
+  assert(source_y_fraction >= 0);
+  assert(source_y_fraction < 256);
+
   if (source_y_fraction == 0) {
     Convert16To8Row_C(src_ptr, dst_ptr, scale, width);
     return;
@@ -3473,53 +3507,13 @@ void InterpolateRow_16To8_C(uint8_t* dst_ptr,
     HalfRow_16To8_C(src_ptr, src_stride, dst_ptr, scale, width);
     return;
   }
-  for (x = 0; x < width - 1; x += 2) {
-    dst_ptr[0] = clamp255(
-        (((src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8) *
-         scale) >>
-        16);
-    dst_ptr[1] = clamp255(
-        (((src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8) *
-         scale) >>
-        16);
-    src_ptr += 2;
-    src_ptr1 += 2;
-    dst_ptr += 2;
-  }
-  if (width & 1) {
-    dst_ptr[0] = clamp255(
-        (((src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8) *
-         scale) >>
-        16);
-  }
-}
-
-void InterpolateRow_16_C(uint16_t* dst_ptr,
-                         const uint16_t* src_ptr,
-                         ptrdiff_t src_stride,
-                         int width,
-                         int source_y_fraction) {
-  int y1_fraction = source_y_fraction;
-  int y0_fraction = 256 - y1_fraction;
-  const uint16_t* src_ptr1 = src_ptr + src_stride;
-  int x;
-  if (source_y_fraction == 0) {
-    memcpy(dst_ptr, src_ptr, width * 2);
-    return;
-  }
-  if (source_y_fraction == 128) {
-    HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
-    return;
-  }
-  for (x = 0; x < width - 1; x += 2) {
-    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
-    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
-    src_ptr += 2;
-    src_ptr1 += 2;
-    dst_ptr += 2;
-  }
-  if (width & 1) {
-    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+  for (x = 0; x < width; ++x) {
+    dst_ptr[0] = C16TO8(
+        (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8,
+        scale);
+    src_ptr += 1;
+    src_ptr1 += 1;
+    dst_ptr += 1;
   }
 }
 
@@ -4124,6 +4118,26 @@ void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
 }
 #endif  // HAS_RAWTOYJROW_SSSE3
 
+#ifdef HAS_INTERPOLATEROW_16TO8_AVX2
+void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
+                               const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               int scale,
+                               int width,
+                               int source_y_fraction) {
+  // Row buffer for intermediate 16 bit pixels.
+  SIMD_ALIGNED(uint16_t row[MAXTWIDTH]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    InterpolateRow_16_C(row, src_ptr, src_stride, twidth, source_y_fraction);
+    Convert16To8Row_AVX2(row, dst_ptr, scale, twidth);
+    src_ptr += twidth;
+    dst_ptr += twidth;
+    width -= twidth;
+  }
+}
+#endif  // HAS_INTERPOLATEROW_16TO8_AVX2
+
 float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
   float fsum = 0.f;
   int i;
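Rather than a dedicated AVX2 interpolation kernel, the wrapper added above chains the C interpolator into the existing Convert16To8Row_AVX2, walking the row at most MAXTWIDTH pixels at a time so the intermediate 16-bit buffer keeps a fixed size. The same chunking idiom reduced to its skeleton, with hypothetical stage functions standing in for the two real row functions:

  #include <stdint.h>

  enum { kMaxChunk = 2048 };  /* stand-in for libyuv's MAXTWIDTH */

  /* Hypothetical two-stage row processor: stage1 produces 16-bit
     intermediates, stage2 narrows them to 8 bits. Only the chunking
     structure mirrors InterpolateRow_16To8_AVX2. */
  static void ProcessRowInChunks(uint8_t* dst, const uint16_t* src, int width,
                                 void (*stage1)(uint16_t*, const uint16_t*, int),
                                 void (*stage2)(uint8_t*, const uint16_t*, int)) {
    uint16_t row[kMaxChunk];  /* fixed-size scratch, reused per chunk */
    while (width > 0) {
      int twidth = width > kMaxChunk ? kMaxChunk : width;
      stage1(row, src, twidth);  /* e.g. interpolate two source rows */
      stage2(dst, row, twidth);  /* e.g. Convert16To8Row_AVX2 */
      src += twidth;
      dst += twidth;
      width -= twidth;
    }
  }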
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 85376e4b..dce8c439 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -5198,37 +5198,26 @@ void Convert8To16Row_AVX2(const uint8_t* src_y,
 #endif  // HAS_CONVERT8TO16ROW_AVX2
 
 #ifdef HAS_SPLITRGBROW_SSSE3
-
 // Shuffle table for converting RGB to Planar.
-static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u,
-                                          128u, 128u, 128u, 128u, 128u, 128u,
-                                          128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
-                                          2u, 5u, 8u, 11u, 14u, 128u,
-                                          128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
-                                          128u, 128u, 128u, 128u, 128u, 1u,
-                                          4u, 7u, 10u, 13u};
-
-static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u,
-                                          128u, 128u, 128u, 128u, 128u, 128u,
-                                          128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
-                                          3u, 6u, 9u, 12u, 15u, 128u,
-                                          128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
-                                          128u, 128u, 128u, 128u, 128u, 2u,
-                                          5u, 8u, 11u, 14u};
-
-static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u,
-                                          128u, 128u, 128u, 128u, 128u, 128u,
-                                          128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
-                                          4u, 7u, 10u, 13u, 128u, 128u,
-                                          128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
-                                          128u, 128u, 128u, 128u, 0u, 3u,
-                                          6u, 9u, 12u, 15u};
+static const uvec8 kSplitRGBShuffle[9] = {
+    {0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+     128u, 128u},
+    {128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u,
+     128u, 128u},
+    {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 1u, 4u,
+     7u, 10u, 13u},
+    {1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+     128u, 128u},
+    {128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u,
+     128u, 128u},
+    {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u,
+     8u, 11u, 14u},
+    {2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+     128u, 128u},
+    {128u, 128u, 128u, 128u, 128u, 1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u,
+     128u, 128u},
+    {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u,
+     12u, 15u}};
 
 void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
                        uint8_t* dst_r,
@@ -5242,9 +5231,9 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
     "movdqu     (%0),%%xmm0                    \n"
     "movdqu     0x10(%0),%%xmm1                \n"
     "movdqu     0x20(%0),%%xmm2                \n"
-    "pshufb     %5, %%xmm0                     \n"
-    "pshufb     %6, %%xmm1                     \n"
-    "pshufb     %7, %%xmm2                     \n"
+    "pshufb     0(%5), %%xmm0                  \n"
+    "pshufb     16(%5), %%xmm1                 \n"
+    "pshufb     32(%5), %%xmm2                 \n"
     "por        %%xmm1,%%xmm0                  \n"
     "por        %%xmm2,%%xmm0                  \n"
     "movdqu     %%xmm0,(%1)                    \n"
@@ -5253,9 +5242,9 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
     "movdqu     (%0),%%xmm0                    \n"
     "movdqu     0x10(%0),%%xmm1                \n"
     "movdqu     0x20(%0),%%xmm2                \n"
-    "pshufb     %8, %%xmm0                     \n"
-    "pshufb     %9, %%xmm1                     \n"
-    "pshufb     %10, %%xmm2                    \n"
+    "pshufb     48(%5),%%xmm0                  \n"
+    "pshufb     64(%5),%%xmm1                  \n"
+    "pshufb     80(%5), %%xmm2                 \n"
     "por        %%xmm1,%%xmm0                  \n"
     "por        %%xmm2,%%xmm0                  \n"
     "movdqu     %%xmm0,(%2)                    \n"
@@ -5264,9 +5253,9 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
     "movdqu     (%0),%%xmm0                    \n"
     "movdqu     0x10(%0),%%xmm1                \n"
     "movdqu     0x20(%0),%%xmm2                \n"
-    "pshufb     %11, %%xmm0                    \n"
-    "pshufb     %12, %%xmm1                    \n"
-    "pshufb     %13, %%xmm2                    \n"
+    "pshufb     96(%5), %%xmm0                 \n"
+    "pshufb     112(%5), %%xmm1                \n"
+    "pshufb     128(%5), %%xmm2                \n"
     "por        %%xmm1,%%xmm0                  \n"
     "por        %%xmm2,%%xmm0                  \n"
     "movdqu     %%xmm0,(%3)                    \n"
@@ -5279,51 +5268,32 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
        "+r"(dst_g),    // %2
        "+r"(dst_b),    // %3
        "+r"(width)     // %4
-      : "m"(kShuffleMaskRGBToR0),  // %5
-        "m"(kShuffleMaskRGBToR1),  // %6
-        "m"(kShuffleMaskRGBToR2),  // %7
-        "m"(kShuffleMaskRGBToG0),  // %8
-        "m"(kShuffleMaskRGBToG1),  // %9
-        "m"(kShuffleMaskRGBToG2),  // %10
-        "m"(kShuffleMaskRGBToB0),  // %11
-        "m"(kShuffleMaskRGBToB1),  // %12
-        "m"(kShuffleMaskRGBToB2)   // %13
+      : "r"(&kSplitRGBShuffle[0])  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_SPLITRGBROW_SSSE3
 
 #ifdef HAS_MERGERGBROW_SSSE3
-
-// Shuffle table for converting RGB to Planar.
-static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
-                                          2u, 128u, 128u, 3u, 128u, 128u,
-                                          4u, 128u, 128u, 5u};
-static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
-                                          128u, 2u, 128u, 128u, 3u, 128u,
-                                          128u, 4u, 128u, 128u};
-static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
-                                          128u, 128u, 2u, 128u, 128u, 3u,
-                                          128u, 128u, 4u, 128u};
-
-static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
-                                          7u, 128u, 128u, 8u, 128u, 128u,
-                                          9u, 128u, 128u, 10u};
-static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
-                                          128u, 7u, 128u, 128u, 8u, 128u,
-                                          128u, 9u, 128u, 128u};
-static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u,
-                                          128u, 128u, 8u, 128u, 128u, 9u,
-                                          128u, 128u, 10u, 128u};
-
-static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
-                                          12u, 128u, 128u, 13u, 128u, 128u,
-                                          14u, 128u, 128u, 15u};
-static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
-                                          128u, 13u, 128u, 128u, 14u, 128u,
-                                          128u, 15u, 128u, 128u};
-static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
-                                          128u, 128u, 13u, 128u, 128u, 14u,
-                                          128u, 128u, 15u, 128u};
+// Shuffle table for converting Planar to RGB.
+static const uvec8 kMergeRGBShuffle[9] = {
+    {0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u, 128u,
+     128u, 5u},
+    {128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u,
+     128u, 128u},
+    {128u, 128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u,
+     4u, 128u},
+    {128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u, 128u,
+     10u, 128u},
+    {5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u,
+     128u, 10u},
+    {128u, 5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u,
+     128u, 128u},
+    {128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u, 128u,
+     15u, 128u, 128u},
+    {128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u,
+     128u, 15u, 128u},
+    {10u, 128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u,
+     128u, 128u, 15u}};
 
 void MergeRGBRow_SSSE3(const uint8_t* src_r,
                        const uint8_t* src_g,
@@ -5337,9 +5307,9 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
     "movdqu     (%0),%%xmm0                    \n"
     "movdqu     (%1),%%xmm1                    \n"
     "movdqu     (%2),%%xmm2                    \n"
-    "pshufb     %5, %%xmm0                     \n"
-    "pshufb     %6, %%xmm1                     \n"
-    "pshufb     %7, %%xmm2                     \n"
+    "pshufb     (%5), %%xmm0                   \n"
+    "pshufb     16(%5), %%xmm1                 \n"
+    "pshufb     32(%5), %%xmm2                 \n"
     "por        %%xmm1,%%xmm0                  \n"
     "por        %%xmm2,%%xmm0                  \n"
     "movdqu     %%xmm0,(%3)                    \n"
@@ -5347,9 +5317,9 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
     "movdqu     (%0),%%xmm0                    \n"
     "movdqu     (%1),%%xmm1                    \n"
     "movdqu     (%2),%%xmm2                    \n"
-    "pshufb     %8, %%xmm0                     \n"
-    "pshufb     %9, %%xmm1                     \n"
-    "pshufb     %10, %%xmm2                    \n"
+    "pshufb     48(%5), %%xmm0                 \n"
+    "pshufb     64(%5), %%xmm1                 \n"
+    "pshufb     80(%5), %%xmm2                 \n"
     "por        %%xmm1,%%xmm0                  \n"
     "por        %%xmm2,%%xmm0                  \n"
     "movdqu     %%xmm0,16(%3)                  \n"
@@ -5357,9 +5327,9 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
     "movdqu     (%0),%%xmm0                    \n"
     "movdqu     (%1),%%xmm1                    \n"
     "movdqu     (%2),%%xmm2                    \n"
-    "pshufb     %11, %%xmm0                    \n"
-    "pshufb     %12, %%xmm1                    \n"
-    "pshufb     %13, %%xmm2                    \n"
+    "pshufb     96(%5), %%xmm0                 \n"
+    "pshufb     112(%5), %%xmm1                \n"
+    "pshufb     128(%5), %%xmm2                \n"
     "por        %%xmm1,%%xmm0                  \n"
     "por        %%xmm2,%%xmm0                  \n"
     "movdqu     %%xmm0,32(%3)                  \n"
@@ -5375,15 +5345,7 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
        "+r"(src_b),    // %2
        "+r"(dst_rgb),  // %3
        "+r"(width)     // %4
-      : "m"(kShuffleMaskRToRGB0),  // %5
-        "m"(kShuffleMaskGToRGB0),  // %6
-        "m"(kShuffleMaskBToRGB0),  // %7
-        "m"(kShuffleMaskRToRGB1),  // %8
-        "m"(kShuffleMaskGToRGB1),  // %9
-        "m"(kShuffleMaskBToRGB1),  // %10
-        "m"(kShuffleMaskRToRGB2),  // %11
-        "m"(kShuffleMaskGToRGB2),  // %12
-        "m"(kShuffleMaskBToRGB2)   // %13
+      : "r"(&kMergeRGBShuffle[0])  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_MERGERGBROW_SSSE3
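Unrelated to the 16-to-8 work but part of the same commit: SplitRGBRow_SSSE3 and MergeRGBRow_SSSE3 now receive one pointer to a 9-entry mask array (an "r" operand) and address each 16-byte mask with a constant offset inside the asm, instead of passing nine separate "m" operands; the shuffle data itself appears unchanged. Expressed with SSSE3 intrinsics, the first third of the split (gathering the 16 R bytes out of 48 bytes of packed RGB) looks roughly like this hypothetical helper, where mask bytes of 128 produce zeros so the three partial results can simply be OR'd together:

  #include <stdint.h>
  #include <tmmintrin.h>  /* SSSE3: _mm_shuffle_epi8 */

  /* masks points at three 16-byte controls equivalent to kSplitRGBShuffle[0..2];
     GatherR16 is illustrative, not a libyuv API. */
  static __m128i GatherR16(const uint8_t* src_rgb, const __m128i* masks) {
    __m128i a = _mm_loadu_si128((const __m128i*)(src_rgb + 0));
    __m128i b = _mm_loadu_si128((const __m128i*)(src_rgb + 16));
    __m128i c = _mm_loadu_si128((const __m128i*)(src_rgb + 32));
    a = _mm_shuffle_epi8(a, masks[0]);  /* R0..R5   -> bytes 0..5   */
    b = _mm_shuffle_epi8(b, masks[1]);  /* R6..R10  -> bytes 6..10  */
    c = _mm_shuffle_epi8(c, masks[2]);  /* R11..R15 -> bytes 11..15 */
    return _mm_or_si128(_mm_or_si128(a, b), c);
  }

Keeping the nine masks in one array is what lets the operand list shrink from %5..%13 down to a single %5.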
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 00adfe3e..a5313b7e 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -3031,6 +3031,86 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr,
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
 }
 
+// Bilinear filter 8x2 -> 8x1
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+void InterpolateRow_16To8_NEON(uint8_t* dst_ptr,
+                               const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               int scale,
+                               int dst_width,
+                               int source_y_fraction) {
+  int y1_fraction = source_y_fraction;
+  int y0_fraction = 256 - y1_fraction;
+  const uint16_t* src_ptr1 = src_ptr + src_stride;
+  int shift = 15 - __builtin_clz((int32_t)scale);  // Negative shl is shr
+
+  asm volatile(
+      "dup        v6.8h, %w6                     \n"
+      "cmp        %w4, #0                        \n"
+      "b.eq       100f                           \n"
+      "cmp        %w4, #128                      \n"
+      "b.eq       50f                            \n"
+
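The scalar setup is the interesting part of the new NEON kernel. For the power-of-two scale values listed in the comment, shift = 15 - __builtin_clz(scale) comes out negative, and AArch64 USHL with a negative per-lane count behaves as a right shift, so a single instruction drops the interpolated 16-bit value to its top 8 bits before UQXTN saturates and narrows. A small host-side check of that equivalence (clamping is ignored; __builtin_clz is the same GCC/Clang builtin the patch uses):

  #include <stdint.h>
  #include <stdio.h>

  int main(void) {
    const int scales[] = {32768, 16384, 4096, 256};  /* 9, 10, 12, 16 bits */
    for (int i = 0; i < 4; ++i) {
      int scale = scales[i];
      int shift = 15 - __builtin_clz((uint32_t)scale);  /* negative shl = shr */
      uint16_t v = 1000;                                /* arbitrary sample   */
      unsigned by_scale = ((unsigned)v * scale) >> 16;  /* C16TO8 sans clamp  */
      unsigned by_shift = v >> -shift;                  /* USHL + UQXTN path  */
      printf("scale=%5d shift=%d %u %u\n", scale, shift, by_scale, by_shift);
    }
    return 0;
  }

For scale = 16384 (10-bit input) this gives shift = -2, i.e. a plain >> 2, matching C16TO8(v, 16384) for in-range values.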
+ "dup v5.8h, %w4 \n" + "dup v4.8h, %w5 \n" + // General purpose row blend. + "1: \n" + "ld1 {v0.8h}, [%1], #16 \n" + "ld1 {v1.8h}, [%2], #16 \n" + "subs %w3, %w3, #8 \n" + "umull v2.4s, v0.4h, v4.4h \n" + "prfm pldl1keep, [%1, 448] \n" + "umull2 v3.4s, v0.8h, v4.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "umlal v2.4s, v1.4h, v5.4h \n" + "umlal2 v3.4s, v1.8h, v5.8h \n" + "rshrn v0.4h, v2.4s, #8 \n" + "rshrn2 v0.8h, v3.4s, #8 \n" + "ushl v0.8h, v0.8h, v6.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%0], #8 \n" + "b.gt 1b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "ld1 {v0.8h}, [%1], #16 \n" + "ld1 {v1.8h}, [%2], #16 \n" + "subs %w3, %w3, #8 \n" + "prfm pldl1keep, [%1, 448] \n" + "urhadd v0.8h, v0.8h, v1.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "ushl v0.8h, v0.8h, v6.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%0], #8 \n" + "b.gt 50b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "ldr q0, [%1], #16 \n" + "ushl v0.8h, v0.8h, v2.8h \n" // shr = v2 is negative + "prfm pldl1keep, [%1, 448] \n" + "uqxtn v0.8b, v0.8h \n" + "subs %w3, %w3, #8 \n" // 8 src pixels per loop + "str d0, [%0], #8 \n" // store 8 pixels + "b.gt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(dst_width) // %3 + : "r"(y1_fraction), // %4 + "r"(y0_fraction), // %5 + "r"(shift) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); +} + // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, diff --git a/source/scale_common.cc b/source/scale_common.cc index 812d57ec..b02bdafd 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -1605,6 +1605,12 @@ void ScalePlaneVertical_16(int src_height, } } +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +// TODO(fbarchard): change scale to bits void ScalePlaneVertical_16To8(int src_height, int dst_width, int dst_height, @@ -1620,7 +1626,7 @@ void ScalePlaneVertical_16To8(int src_height, enum FilterMode filtering) { // TODO(fbarchard): Allow higher wpp. int dst_width_words = dst_width * wpp; - // TODO(https://crbug.com/libyuv/931): Add NEON and AVX2 versions. + // TODO(https://crbug.com/libyuv/931): Add NEON 32 bit and AVX2 versions. void (*InterpolateRow_16To8)(uint8_t * dst_argb, const uint16_t* src_argb, ptrdiff_t src_stride, int scale, int dst_width, int source_y_fraction) = InterpolateRow_16To8_C; @@ -1632,6 +1638,22 @@ void ScalePlaneVertical_16To8(int src_height, assert(dst_height > 0); src_argb += (x >> 16) * wpp; +#if defined(HAS_INTERPOLATEROW_16TO8_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow_16To8 = InterpolateRow_16To8_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow_16To8 = InterpolateRow_16To8_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16TO8_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow_16To8 = InterpolateRow_16To8_Any_AVX2; + if (IS_ALIGNED(dst_width, 32)) { + InterpolateRow_16To8 = InterpolateRow_16To8_AVX2; + } + } +#endif for (j = 0; j < dst_height; ++j) { int yi; int yf; |