aboutsummaryrefslogtreecommitdiff
path: root/source/row_common.cc
diff options
context:
space:
mode:
authorFrank Barchard <fbarchard@google.com>2022-06-28 16:31:22 -0700
committerlibyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com>2022-06-29 00:00:46 +0000
commit6900494d90ae095d44405cd4cc3f346971fa69c9 (patch)
tree4a77f6084ee5c3f6b6a2d1d2de7587111175f8fe /source/row_common.cc
parentfe4a50df8e2a787e2919a8321dbe1412b94b20c6 (diff)
downloadlibyuv-6900494d90ae095d44405cd4cc3f346971fa69c9.tar.gz
Merge/SplitRGB fix -mcmodel=large x86 and InterpolateRow_16To8_NEON
MergeRGB and SplitRGB use a register to point to 9 shuffle tables. - fixes an out of registers error with -mcmodel=large InterpolateRow_16To8_NEON improves performance for I210ToI420: On Pixel 4 for 720p x1000 images Was I210ToI420_Opt (608 ms) Now I210ToI420_Opt (336 ms) On Skylake Xeon Was I210ToI420_Opt (259 ms) Now I210ToI420_Opt (209 ms) Bug: libyuv:931, libyuv:930 Change-Id: I20f8244803f06da511299bf1a2ffc7945eb35221 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3717054 Commit-Queue: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Justin Green <greenjustin@google.com>
Diffstat (limited to 'source/row_common.cc')
-rw-r--r--source/row_common.cc128
1 files changed, 71 insertions, 57 deletions
diff --git a/source/row_common.cc b/source/row_common.cc
index 2c9a35f4..150f48db 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -2985,6 +2985,9 @@ void DivideRow_16_C(const uint16_t* src_y,
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
+// TODO(fbarchard): change scale to bits
+#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16)
+
void Convert16To8Row_C(const uint16_t* src_y,
uint8_t* dst_y,
int scale,
@@ -2994,7 +2997,7 @@ void Convert16To8Row_C(const uint16_t* src_y,
assert(scale <= 32768);
for (x = 0; x < width; ++x) {
- dst_y[x] = clamp255((src_y[x] * scale) >> 16);
+ dst_y[x] = C16TO8(src_y[x], scale);
}
}
@@ -3411,8 +3414,7 @@ static void HalfRow_16To8_C(const uint16_t* src_uv,
int width) {
int x;
for (x = 0; x < width; ++x) {
- dst_uv[x] = clamp255(
- (((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1) * scale) >> 16);
+ dst_uv[x] = C16TO8((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1, scale);
}
}
@@ -3426,6 +3428,9 @@ void InterpolateRow_C(uint8_t* dst_ptr,
int y0_fraction = 256 - y1_fraction;
const uint8_t* src_ptr1 = src_ptr + src_stride;
int x;
+ assert(source_y_fraction >= 0);
+ assert(source_y_fraction < 256);
+
if (y1_fraction == 0) {
memcpy(dst_ptr, src_ptr, width);
return;
@@ -3434,18 +3439,42 @@ void InterpolateRow_C(uint8_t* dst_ptr,
HalfRow_C(src_ptr, src_stride, dst_ptr, width);
return;
}
- for (x = 0; x < width - 1; x += 2) {
+ for (x = 0; x < width; ++x) {
dst_ptr[0] =
(src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
- dst_ptr[1] =
- (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8;
- src_ptr += 2;
- src_ptr1 += 2;
- dst_ptr += 2;
+ ++src_ptr;
+ ++src_ptr1;
+ ++dst_ptr;
}
- if (width & 1) {
+}
+
+// C version 2x2 -> 2x1.
+void InterpolateRow_16_C(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+ int x;
+ assert(source_y_fraction >= 0);
+ assert(source_y_fraction < 256);
+
+ if (y1_fraction == 0) {
+ memcpy(dst_ptr, src_ptr, width * 2);
+ return;
+ }
+ if (y1_fraction == 128) {
+ HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
+ return;
+ }
+ for (x = 0; x < width; ++x) {
dst_ptr[0] =
(src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
+ ++src_ptr;
+ ++src_ptr1;
+ ++dst_ptr;
}
}
@@ -3455,6 +3484,8 @@ void InterpolateRow_C(uint8_t* dst_ptr,
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
+// TODO(fbarchard): change scale to bits
+
void InterpolateRow_16To8_C(uint8_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
@@ -3465,6 +3496,9 @@ void InterpolateRow_16To8_C(uint8_t* dst_ptr,
int y0_fraction = 256 - y1_fraction;
const uint16_t* src_ptr1 = src_ptr + src_stride;
int x;
+ assert(source_y_fraction >= 0);
+ assert(source_y_fraction < 256);
+
if (source_y_fraction == 0) {
Convert16To8Row_C(src_ptr, dst_ptr, scale, width);
return;
@@ -3473,53 +3507,13 @@ void InterpolateRow_16To8_C(uint8_t* dst_ptr,
HalfRow_16To8_C(src_ptr, src_stride, dst_ptr, scale, width);
return;
}
- for (x = 0; x < width - 1; x += 2) {
- dst_ptr[0] = clamp255(
- (((src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8) *
- scale) >>
- 16);
- dst_ptr[1] = clamp255(
- (((src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8) *
- scale) >>
- 16);
- src_ptr += 2;
- src_ptr1 += 2;
- dst_ptr += 2;
- }
- if (width & 1) {
- dst_ptr[0] = clamp255(
- (((src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8) *
- scale) >>
- 16);
- }
-}
-
-void InterpolateRow_16_C(uint16_t* dst_ptr,
- const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- int width,
- int source_y_fraction) {
- int y1_fraction = source_y_fraction;
- int y0_fraction = 256 - y1_fraction;
- const uint16_t* src_ptr1 = src_ptr + src_stride;
- int x;
- if (source_y_fraction == 0) {
- memcpy(dst_ptr, src_ptr, width * 2);
- return;
- }
- if (source_y_fraction == 128) {
- HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
- return;
- }
- for (x = 0; x < width - 1; x += 2) {
- dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
- dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
- src_ptr += 2;
- src_ptr1 += 2;
- dst_ptr += 2;
- }
- if (width & 1) {
- dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+ for (x = 0; x < width; ++x) {
+ dst_ptr[0] = C16TO8(
+ (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8,
+ scale);
+ src_ptr += 1;
+ src_ptr1 += 1;
+ dst_ptr += 1;
}
}
@@ -4124,6 +4118,26 @@ void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
}
#endif // HAS_RAWTOYJROW_SSSE3
+#ifdef HAS_INTERPOLATEROW_16TO8_AVX2
+void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int scale,
+ int width,
+ int source_y_fraction) {
+ // Row buffer for intermediate 16 bit pixels.
+ SIMD_ALIGNED(uint16_t row[MAXTWIDTH]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ InterpolateRow_16_C(row, src_ptr, src_stride, twidth, source_y_fraction);
+ Convert16To8Row_AVX2(row, dst_ptr, scale, twidth);
+ src_ptr += twidth;
+ dst_ptr += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_INTERPOLATEROW_16TO8_AVX2
+
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
float fsum = 0.f;
int i;