diff options
-rw-r--r-- | include/libyuv/scale_row.h | 36 | ||||
-rw-r--r-- | source/scale.cc | 46 | ||||
-rw-r--r-- | source/scale_any.cc | 22 | ||||
-rw-r--r-- | source/scale_common.cc | 44 | ||||
-rw-r--r-- | source/scale_win.cc | 107 | ||||
-rw-r--r-- | unit_test/convert_test.cc | 30 |
6 files changed, 118 insertions, 167 deletions
diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index b78a56bc..23b2471f 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -30,13 +30,11 @@ extern "C" { #define VISUALC_HAS_AVX2 1 #endif // VisualStudio >= 2012 - // The following are available on all x86 platforms: #if !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) #define HAS_FIXEDDIV1_X86 #define HAS_FIXEDDIV_X86 -#define HAS_SCALEADDROWS_SSE2 #define HAS_SCALEARGBCOLS_SSE2 #define HAS_SCALEARGBCOLSUP2_SSE2 #define HAS_SCALEARGBFILTERCOLS_SSSE3 @@ -50,17 +48,21 @@ extern "C" { #define HAS_SCALEROWDOWN4_SSE2 #endif -// The following are available on VS2012. +// The following are available on VS2012: #if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) -#define HAS_SCALEADDROWS_AVX2 +#define HAS_SCALEADDROW_AVX2 #define HAS_SCALEROWDOWN2_AVX2 #define HAS_SCALEROWDOWN4_AVX2 #endif +// The following are available on Visual C: +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && !defined(__clang__) +#define HAS_SCALEADDROW_SSE2 +#endif + // The following are available on Neon platforms: #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) -#define HAS_SCALEADDROWS_NEON #define HAS_SCALEARGBCOLS_NEON #define HAS_SCALEARGBROWDOWN2_NEON #define HAS_SCALEARGBROWDOWNEVEN_NEON @@ -183,10 +185,8 @@ void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, uint16* dst_ptr, int dst_width); -void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height); -void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint32* dst_ptr, int src_width, int src_height); +void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void 
ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width); void ScaleARGBRowDown2_C(const uint8* src_argb, ptrdiff_t src_stride, uint8* dst_argb, int dst_width); @@ -289,14 +289,10 @@ void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); -void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height); -void ScaleAddRows_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height); -void ScaleAddRows_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height); -void ScaleAddRows_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height); +void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width); void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, int dx); @@ -442,10 +438,8 @@ void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); -void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height); -void ScaleAddRows_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height); +void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width); void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, int dx); diff --git a/source/scale.cc b/source/scale.cc index 5460cc7e..0a01304c 100644 
--- a/source/scale.cc +++ b/source/scale.cc @@ -733,7 +733,7 @@ static void ScalePlaneBox(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8* src_ptr, uint8* dst_ptr) { - int j; + int j, k; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; @@ -750,29 +750,29 @@ static void ScalePlaneBox(int src_width, int src_height, const uint16* src_ptr, uint8* dst_ptr) = (dx & 0xffff) ? ScaleAddCols2_C: ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C); - void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C; -#if defined(HAS_SCALEADDROWS_SSE2) + void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) = + ScaleAddRow_C; +#if defined(HAS_SCALEADDROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ScaleAddRows = ScaleAddRows_Any_SSE2; + ScaleAddRow = ScaleAddRow_Any_SSE2; if (IS_ALIGNED(src_width, 16)) { - ScaleAddRows = ScaleAddRows_SSE2; + ScaleAddRow = ScaleAddRow_SSE2; } } #endif -#if defined(HAS_SCALEADDROWS_AVX2) +#if defined(HAS_SCALEADDROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleAddRows = ScaleAddRows_Any_AVX2; + ScaleAddRow = ScaleAddRow_Any_AVX2; if (IS_ALIGNED(src_width, 32)) { - ScaleAddRows = ScaleAddRows_AVX2; + ScaleAddRow = ScaleAddRow_AVX2; } } #endif -#if defined(HAS_SCALEADDROWS_NEON) +#if defined(HAS_SCALEADDROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleAddRows = ScaleAddRows_Any_NEON; + ScaleAddRow = ScaleAddRow_Any_NEON; if (IS_ALIGNED(src_width, 16)) { - ScaleAddRows = ScaleAddRows_NEON; + ScaleAddRow = ScaleAddRow_NEON; } } #endif @@ -786,7 +786,11 @@ static void ScalePlaneBox(int src_width, int src_height, y = max_y; } boxheight = MIN1((y >> 16) - iy); - ScaleAddRows(src, src_stride, (uint16*)(row16), src_width, boxheight); + memset(row16, 0, src_width * 2); + for (k = 0; k < boxheight; ++k) { + ScaleAddRow(src, (uint16 *)(row16), src_width); + src += 
src_stride; + } ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr); dst_ptr += dst_stride; } @@ -798,7 +802,7 @@ static void ScalePlaneBox_16(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint16* src_ptr, uint16* dst_ptr) { - int j; + int j, k; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; @@ -814,12 +818,12 @@ static void ScalePlaneBox_16(int src_width, int src_height, void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, const uint32* src_ptr, uint16* dst_ptr) = (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C; - void (*ScaleAddRows)(const uint16* src_ptr, ptrdiff_t src_stride, - uint32* dst_ptr, int src_width, int src_height) = ScaleAddRows_16_C; + void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) = + ScaleAddRow_16_C; -#if defined(HAS_SCALEADDROWS_16_SSE2) +#if defined(HAS_SCALEADDROW_16_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) { - ScaleAddRows = ScaleAddRows_16_SSE2; + ScaleAddRow = ScaleAddRow_16_SSE2; } #endif @@ -832,7 +836,11 @@ static void ScalePlaneBox_16(int src_width, int src_height, y = max_y; } boxheight = MIN1((y >> 16) - iy); - ScaleAddRows(src, src_stride, (uint32*)(row32), src_width, boxheight); + memset(row32, 0, src_width * 4); + for (k = 0; k < boxheight; ++k) { + ScaleAddRow(src, (uint32 *)(row32), src_width); + src += src_stride; + } ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr); dst_ptr += dst_stride; } diff --git a/source/scale_any.cc b/source/scale_any.cc index b3f2ecf2..2f6a2c8b 100644 --- a/source/scale_any.cc +++ b/source/scale_any.cc @@ -169,25 +169,23 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON, #endif // Add rows box filter scale down. 
-#define SAANY(NAMEANY, SCALEADDROWS_SIMD, SCALEADDROWS_C, MASK) \ - void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \ - uint16* dst_ptr, int src_width, int src_height) { \ +#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ + void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \ int n = src_width & ~MASK; \ if (n > 0) { \ - SCALEADDROWS_SIMD(src_ptr, src_stride, dst_ptr, n, src_height); \ + SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ } \ - SCALEADDROWS_C(src_ptr + n, src_stride, \ - dst_ptr + n, src_width & MASK, src_height); \ + SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ } -#ifdef HAS_SCALEADDROWS_SSE2 -SAANY(ScaleAddRows_Any_SSE2, ScaleAddRows_SSE2, ScaleAddRows_C, 15) +#ifdef HAS_SCALEADDROW_SSE2 +SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15) #endif -#ifdef HAS_SCALEADDROWS_AVX2 -SAANY(ScaleAddRows_Any_AVX2, ScaleAddRows_AVX2, ScaleAddRows_C, 31) +#ifdef HAS_SCALEADDROW_AVX2 +SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31) #endif -#ifdef HAS_SCALEADDROWS_NEON -SAANY(ScaleAddRows_Any_NEON, ScaleAddRows_NEON, ScaleAddRows_C, 15) +#ifdef HAS_SCALEADDROW_NEON +SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15) #endif #undef SAANY diff --git a/source/scale_common.cc b/source/scale_common.cc index 014d9566..1711f3d5 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -621,39 +621,31 @@ void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } } -void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height) { +void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) { int x; assert(src_width > 0); - assert(src_height > 0); - for (x = 0; x < src_width; ++x) { - const uint8* s = src_ptr + x; - unsigned int sum = 0u; - int y; - for (y = 0; y < src_height; ++y) { - sum += s[0]; - s += src_stride; - } - // TODO(fbarchard): Consider limiting height to 256 to avoid 
overflow. - dst_ptr[x] = sum < 65535u ? sum : 65535u; + for (x = 0; x < src_width - 1; x += 2) { + dst_ptr[0] += src_ptr[0]; + dst_ptr[1] += src_ptr[1]; + src_ptr += 2; + dst_ptr += 2; + } + if (src_width & 1) { + dst_ptr[0] += src_ptr[0]; } } -void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint32* dst_ptr, int src_width, int src_height) { +void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) { int x; assert(src_width > 0); - assert(src_height > 0); - for (x = 0; x < src_width; ++x) { - const uint16* s = src_ptr + x; - unsigned int sum = 0u; - int y; - for (y = 0; y < src_height; ++y) { - sum += s[0]; - s += src_stride; - } - // No risk of overflow here now - dst_ptr[x] = sum; + for (x = 0; x < src_width - 1; x += 2) { + dst_ptr[0] += src_ptr[0]; + dst_ptr[1] += src_ptr[1]; + src_ptr += 2; + dst_ptr += 2; + } + if (src_width & 1) { + dst_ptr[0] += src_ptr[0]; } } diff --git a/source/scale_win.cc b/source/scale_win.cc index 4246f717..01a81635 100644 --- a/source/scale_win.cc +++ b/source/scale_win.cc @@ -800,104 +800,61 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, } } -// Reads 16xN bytes and produces 16 shorts at a time. +// Reads 16 bytes and accumulates to 16 shorts at a time. 
__declspec(naked) -void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height) { +void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { __asm { - push esi - push edi - push ebx - push ebp - mov esi, [esp + 16 + 4] // src_ptr - mov edx, [esp + 16 + 8] // src_stride - mov edi, [esp + 16 + 12] // dst_ptr - mov ecx, [esp + 16 + 16] // dst_width - mov ebx, [esp + 16 + 20] // height - mov eax, esi // row pointer - mov ebp, ebx // height - pxor xmm0, xmm0 // clear accumulators - pxor xmm1, xmm1 - pxor xmm4, xmm4 + mov eax, [esp + 4] // src_ptr + mov edx, [esp + 8] // dst_ptr + mov ecx, [esp + 12] // src_width + pxor xmm5, xmm5 // sum rows xloop: - movdqu xmm2, [eax] // read 16 pixels - lea eax, [eax + edx] // advance to next row - movdqa xmm3, xmm2 - punpcklbw xmm2, xmm4 - punpckhbw xmm3, xmm4 + movdqu xmm3, [eax] // read 16 bytes + lea eax, [eax + 16] + movdqu xmm0, [edx] // read 16 words from destination + movdqu xmm1, [edx + 16] + movdqa xmm2, xmm3 + punpcklbw xmm2, xmm5 + punpckhbw xmm3, xmm5 paddusw xmm0, xmm2 // sum 16 words paddusw xmm1, xmm3 - sub ebp, 1 - jg xloop - - movdqu [edi], xmm0 - movdqu [edi + 16], xmm1 - lea edi, [edi + 32] // dst_ptr += 16 - lea esi, [esi + 16] // src_ptr += 16 - mov eax, esi // row pointer - mov ebp, ebx // height - pxor xmm0, xmm0 // clear accumulators - pxor xmm1, xmm1 + movdqu [edx], xmm0 // write 16 words to destination + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] sub ecx, 16 jg xloop - - pop ebp - pop ebx - pop edi - pop esi ret } } -// Reads 32xN bytes and produces 32 shorts at a time. +// Reads 32 bytes and accumulates to 32 shorts at a time. 
__declspec(naked) -void ScaleAddRows_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height) { +void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { __asm { - push esi - push edi - push ebx - push ebp - mov esi, [esp + 16 + 4] // src_ptr - mov edx, [esp + 16 + 8] // src_stride - mov edi, [esp + 16 + 12] // dst_ptr - mov ecx, [esp + 16 + 16] // dst_width - mov ebx, [esp + 16 + 20] // height - mov eax, esi // row pointer - mov ebp, ebx // height - vpxor ymm0, ymm0, ymm0 // clear accumulators - vpxor ymm1, ymm1, ymm1 - vpxor ymm4, ymm4, ymm4 + mov eax, [esp + 4] // src_ptr + mov edx, [esp + 8] // dst_ptr + mov ecx, [esp + 12] // src_width + vpxor ymm5, ymm5, ymm5 // sum rows xloop: - vmovdqu ymm2, [eax] // read 16 pixels - vpermq ymm2, ymm2, 0xd8 // unmutate for vpunpck - lea eax, [eax + edx] // advance to next row - vpunpckhbw ymm3, ymm2, ymm4 - vpunpcklbw ymm2, ymm2, ymm4 + vmovdqu ymm3, [eax] // read 32 bytes + vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck + lea eax, [eax + 32] + vmovdqu ymm0, [edx] // read 32 words from destination + vmovdqu ymm1, [edx + 32] + vpunpcklbw ymm2, ymm3, ymm5 + vpunpckhbw ymm3, ymm3, ymm5 vpaddusw ymm0, ymm0, ymm2 // sum 16 words vpaddusw ymm1, ymm1, ymm3 - sub ebp, 1 - jg xloop - - vmovdqu [edi], ymm0 - vmovdqu [edi + 32], ymm1 - lea edi, [edi + 64] // dst_ptr - lea esi, [esi + 32] // src_ptr - mov eax, esi // row pointer - mov ebp, ebx // height - vpxor ymm0, ymm0, ymm0 // clear accumulators - vpxor ymm1, ymm1, ymm1 + vmovdqu [edx], ymm0 // write 32 words to destination + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] sub ecx, 32 jg xloop - pop ebp - pop ebx - pop edi - pop esi vzeroupper ret } } diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index cfffcf8b..54822f06 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -78,7 +78,7 @@ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ 
memset(dst_v_opt, 103, SUBSAMPLE(kWidth, SUBSAMP_X) * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - MaskCpuFlags(disable_cpu_flags_); \ + MaskCpuFlags(disable_cpu_flags_); \ SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ src_u + OFF, \ SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ @@ -211,7 +211,7 @@ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ memset(dst_y_opt, 101, kWidth * kHeight); \ memset(dst_uv_opt, 102, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - MaskCpuFlags(disable_cpu_flags_); \ + MaskCpuFlags(disable_cpu_flags_); \ SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ src_u + OFF, \ SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ @@ -326,7 +326,7 @@ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ memset(dst_v_opt, 103, SUBSAMPLE(kWidth, SUBSAMP_X) * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - MaskCpuFlags(disable_cpu_flags_); \ + MaskCpuFlags(disable_cpu_flags_); \ SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ src_uv + OFF, \ 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ @@ -435,7 +435,7 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ } \ memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \ memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \ - MaskCpuFlags(disable_cpu_flags_); \ + MaskCpuFlags(disable_cpu_flags_); \ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \ src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ @@ -538,7 +538,7 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ } \ memset(dst_argb_c, 1, kStrideB * kHeight); \ memset(dst_argb_opt, 101, kStrideB * kHeight); \ - MaskCpuFlags(disable_cpu_flags_); \ + MaskCpuFlags(disable_cpu_flags_); \ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \ src_uv + OFF, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \ dst_argb_c, kWidth * BPP_B, \ @@ -632,7 +632,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \ for (int i = 0; i < kHeight; ++i) \ for (int j = 0; j < kStride; ++j) \ src_argb[(i * kStride) + j + OFF] = 
(random() & 0xff); \ - MaskCpuFlags(disable_cpu_flags_); \ + MaskCpuFlags(disable_cpu_flags_); \ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \ dst_y_c, kWidth, \ dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \ @@ -690,6 +690,8 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \ TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4) #if defined(__arm__) || defined (__aarch64__) +// arm version subsamples by summing 4 pixels then multiplying by matrix with +// 4x smaller coefficients which are rounded to nearest integer. TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 4) #else TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 0) @@ -738,7 +740,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \ memset(dst_y_opt, 101, kWidth * kHeight); \ memset(dst_uv_opt, 102, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - MaskCpuFlags(disable_cpu_flags_); \ + MaskCpuFlags(disable_cpu_flags_); \ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \ dst_y_c, kWidth, \ dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \ @@ -814,7 +816,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##N) { \ } \ memset(dst_argb_c, 1, kStrideB * kHeightB); \ memset(dst_argb_opt, 101, kStrideB * kHeightB); \ - MaskCpuFlags(disable_cpu_flags_); \ + MaskCpuFlags(disable_cpu_flags_); \ FMT_A##To##FMT_B(src_argb + OFF, kStrideA, \ dst_argb_c, kStrideB, \ kWidth, NEG kHeight); \ @@ -858,7 +860,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) { \ } \ memset(dst_argb_c, 123, kStrideB * kHeightB); \ memset(dst_argb_opt, 123, kStrideB * kHeightB); \ - MaskCpuFlags(disable_cpu_flags_); \ + MaskCpuFlags(disable_cpu_flags_); \ FMT_A##To##FMT_B(src_argb, kStrideA, \ dst_argb_c, kStrideB, \ kWidth, kHeight); \ @@ -948,7 +950,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##Dither##N) { \ } \ memset(dst_argb_c, 1, kStrideB * kHeightB); \ memset(dst_argb_opt, 101, kStrideB * kHeightB); \ - MaskCpuFlags(disable_cpu_flags_); \ + MaskCpuFlags(disable_cpu_flags_); \ FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, \ dst_argb_c, kStrideB, \ NULL, kWidth, NEG 
kHeight); \ @@ -992,7 +994,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##Dither_Random) { \ } \ memset(dst_argb_c, 123, kStrideB * kHeightB); \ memset(dst_argb_opt, 123, kStrideB * kHeightB); \ - MaskCpuFlags(disable_cpu_flags_); \ + MaskCpuFlags(disable_cpu_flags_); \ FMT_A##To##FMT_B##Dither(src_argb, kStrideA, \ dst_argb_c, kStrideB, \ NULL, kWidth, kHeight); \ @@ -1051,7 +1053,7 @@ TEST_F(libyuvTest, FMT_ATOB##_Symetric##N) { \ } \ memset(dst_argb_c, 1, kStrideA * kHeightA); \ memset(dst_argb_opt, 101, kStrideA * kHeightA); \ - MaskCpuFlags(disable_cpu_flags_); \ + MaskCpuFlags(disable_cpu_flags_); \ FMT_ATOB(src_argb + OFF, kStrideA, \ dst_argb_c, kStrideA, \ kWidth, NEG kHeight); \ @@ -1061,7 +1063,7 @@ TEST_F(libyuvTest, FMT_ATOB##_Symetric##N) { \ dst_argb_opt, kStrideA, \ kWidth, NEG kHeight); \ } \ - MaskCpuFlags(disable_cpu_flags_); \ + MaskCpuFlags(disable_cpu_flags_); \ FMT_ATOB(dst_argb_c, kStrideA, \ dst_argb_c, kStrideA, \ kWidth, NEG kHeight); \ @@ -1470,7 +1472,7 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##Dither##N) { \ } \ memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \ memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \ - MaskCpuFlags(disable_cpu_flags_); \ + MaskCpuFlags(disable_cpu_flags_); \ FMT_PLANAR##To##FMT_B##Dither(src_y + OFF, kWidth, \ src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ |