From 6900494d90ae095d44405cd4cc3f346971fa69c9 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Tue, 28 Jun 2022 16:31:22 -0700 Subject: Merge/SplitRGB fix -mcmodel=large x86 and InterpolateRow_16To8_NEON MergeRGB and SplitRGB use a register to point to 9 shuffle tables. - fixes an out of registers error with -mcmodel=large InterpolateRow_16To8_NEON improves performance for I210ToI420: On Pixel 4 for 720p x1000 images Was I210ToI420_Opt (608 ms) Now I210ToI420_Opt (336 ms) On Skylake Xeon Was I210ToI420_Opt (259 ms) Now I210ToI420_Opt (209 ms) Bug: libyuv:931, libyuv:930 Change-Id: I20f8244803f06da511299bf1a2ffc7945eb35221 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3717054 Commit-Queue: Frank Barchard Reviewed-by: Justin Green --- source/row_common.cc | 128 ++++++++++++++++++++++++++++----------------------- 1 file changed, 71 insertions(+), 57 deletions(-) (limited to 'source/row_common.cc') diff --git a/source/row_common.cc b/source/row_common.cc index 2c9a35f4..150f48db 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -2985,6 +2985,9 @@ void DivideRow_16_C(const uint16_t* src_y, // 16384 = 10 bits // 4096 = 12 bits // 256 = 16 bits +// TODO(fbarchard): change scale to bits +#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16) + void Convert16To8Row_C(const uint16_t* src_y, uint8_t* dst_y, int scale, @@ -2994,7 +2997,7 @@ void Convert16To8Row_C(const uint16_t* src_y, assert(scale <= 32768); for (x = 0; x < width; ++x) { - dst_y[x] = clamp255((src_y[x] * scale) >> 16); + dst_y[x] = C16TO8(src_y[x], scale); } } @@ -3411,8 +3414,7 @@ static void HalfRow_16To8_C(const uint16_t* src_uv, int width) { int x; for (x = 0; x < width; ++x) { - dst_uv[x] = clamp255( - (((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1) * scale) >> 16); + dst_uv[x] = C16TO8((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1, scale); } } @@ -3426,6 +3428,9 @@ void InterpolateRow_C(uint8_t* dst_ptr, int y0_fraction = 256 - y1_fraction; const uint8_t* src_ptr1 = src_ptr + src_stride; int x; + assert(source_y_fraction >= 0); + assert(source_y_fraction < 256); + if (y1_fraction == 0) { memcpy(dst_ptr, src_ptr, width); return; @@ -3434,18 +3439,42 @@ void InterpolateRow_C(uint8_t* dst_ptr, HalfRow_C(src_ptr, src_stride, dst_ptr, width); return; } - for (x = 0; x < width - 1; x += 2) { + for (x = 0; x < width; ++x) { dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8; - dst_ptr[1] = - (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8; - src_ptr += 2; - src_ptr1 += 2; - dst_ptr += 2; + ++src_ptr; + ++src_ptr1; + ++dst_ptr; } - if (width & 1) { +} + +// C version 2x2 -> 2x1. +void InterpolateRow_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint16_t* src_ptr1 = src_ptr + src_stride; + int x; + assert(source_y_fraction >= 0); + assert(source_y_fraction < 256); + + if (y1_fraction == 0) { + memcpy(dst_ptr, src_ptr, width * 2); + return; + } + if (y1_fraction == 128) { + HalfRow_16_C(src_ptr, src_stride, dst_ptr, width); + return; + } + for (x = 0; x < width; ++x) { dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8; + ++src_ptr; + ++src_ptr1; + ++dst_ptr; } } @@ -3455,6 +3484,8 @@ void InterpolateRow_C(uint8_t* dst_ptr, // 16384 = 10 bits // 4096 = 12 bits // 256 = 16 bits +// TODO(fbarchard): change scale to bits + void InterpolateRow_16To8_C(uint8_t* dst_ptr, const uint16_t* src_ptr, ptrdiff_t src_stride, @@ -3465,6 +3496,9 @@ void InterpolateRow_16To8_C(uint8_t* dst_ptr, int y0_fraction = 256 - y1_fraction; const uint16_t* src_ptr1 = src_ptr + src_stride; int x; + assert(source_y_fraction >= 0); + assert(source_y_fraction < 256); + if (source_y_fraction == 0) { Convert16To8Row_C(src_ptr, dst_ptr, scale, width); return; @@ -3473,53 +3507,13 @@ void InterpolateRow_16To8_C(uint8_t* dst_ptr, HalfRow_16To8_C(src_ptr, src_stride, dst_ptr, scale, width); return; } - for (x = 0; x < width - 1; x += 2) { - dst_ptr[0] = clamp255( - (((src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8) * - scale) >> - 16); - dst_ptr[1] = clamp255( - (((src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8) * - scale) >> - 16); - src_ptr += 2; - src_ptr1 += 2; - dst_ptr += 2; - } - if (width & 1) { - dst_ptr[0] = clamp255( - (((src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8) * - scale) >> - 16); - } -} - -void InterpolateRow_16_C(uint16_t* dst_ptr, - const uint16_t* src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction) { - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; - const uint16_t* src_ptr1 = src_ptr + src_stride; - int x; - if (source_y_fraction == 0) { - memcpy(dst_ptr, src_ptr, width * 2); - return; - } - if (source_y_fraction == 128) { - HalfRow_16_C(src_ptr, src_stride, dst_ptr, width); - return; - } - for (x = 0; x < width - 1; x += 2) { - dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; - dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; - src_ptr += 2; - src_ptr1 += 2; - dst_ptr += 2; - } - if (width & 1) { - dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + for (x = 0; x < width; ++x) { + dst_ptr[0] = C16TO8( + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8, + scale); + src_ptr += 1; + src_ptr1 += 1; + dst_ptr += 1; } } @@ -4124,6 +4118,26 @@ void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) { } #endif // HAS_RAWTOYJROW_SSSE3 +#ifdef HAS_INTERPOLATEROW_16TO8_AVX2 +void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int scale, + int width, + int source_y_fraction) { + // Row buffer for intermediate 16 bit pixels. + SIMD_ALIGNED(uint16_t row[MAXTWIDTH]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + InterpolateRow_16_C(row, src_ptr, src_stride, twidth, source_y_fraction); + Convert16To8Row_AVX2(row, dst_ptr, scale, twidth); + src_ptr += twidth; + dst_ptr += twidth; + width -= twidth; + } +} +#endif // HAS_INTERPOLATEROW_16TO8_AVX2 + float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) { float fsum = 0.f; int i; -- cgit v1.2.3