diff options
author | Frank Barchard <fbarchard@google.com> | 2021-10-15 12:12:02 -0700 |
---|---|---|
committer | libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2021-10-15 19:46:02 +0000 |
commit | 55b97cb48f027d2af417ce1f895cefad2ed1ce23 (patch) | |
tree | b1091fee47160f8027abc07ceb0ef5701eb5af86 | |
parent | 11cbf8f976a41ccb279dc67489832ea9f12d56d7 (diff) | |
download | libyuv-55b97cb48f027d2af417ce1f895cefad2ed1ce23.tar.gz |
BIT_EXACT for unattenuate and attenuate.
- reenable Intel SIMD unaffected by BIT_EXACT
- add bit exact version of ARGBAttenuate, which uses ARM version of formula.
- add bit exact version of ARGBUnattenuate, which mimics the AVX code.
Apply clang-format to clean up code.
Bug: libyuv:908, b/202888439
Change-Id: Ie842b1b3956b48f4190858e61c02998caedc2897
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3224702
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
-rw-r--r-- | README.chromium | 2 | ||||
-rw-r--r-- | include/libyuv/compare_row.h | 2 | ||||
-rw-r--r-- | include/libyuv/planar_functions.h | 2 | ||||
-rw-r--r-- | include/libyuv/rotate_row.h | 2 | ||||
-rw-r--r-- | include/libyuv/row.h | 32 | ||||
-rw-r--r-- | include/libyuv/scale_row.h | 2 | ||||
-rw-r--r-- | include/libyuv/version.h | 2 | ||||
-rw-r--r-- | source/convert.cc | 24 | ||||
-rw-r--r-- | source/convert_from_argb.cc | 24 | ||||
-rw-r--r-- | source/row_any.cc | 2 | ||||
-rw-r--r-- | source/row_common.cc | 36 | ||||
-rw-r--r-- | source/row_gcc.cc | 5 | ||||
-rw-r--r-- | source/row_neon64.cc | 7 | ||||
-rw-r--r-- | source/scale.cc | 6 | ||||
-rw-r--r-- | source/scale_argb.cc | 8 | ||||
-rw-r--r-- | source/scale_uv.cc | 22 | ||||
-rw-r--r-- | unit_test/color_test.cc | 3 | ||||
-rw-r--r-- | unit_test/convert_test.cc | 22 | ||||
-rw-r--r-- | unit_test/planar_test.cc | 29 | ||||
-rw-r--r-- | unit_test/scale_argb_test.cc | 30 | ||||
-rw-r--r-- | unit_test/scale_test.cc | 10 | ||||
-rw-r--r-- | unit_test/scale_uv_test.cc | 14 |
22 files changed, 179 insertions, 107 deletions
diff --git a/README.chromium b/README.chromium index 6ecadd7b..1195fe5d 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1798 +Version: 1799 License: BSD License File: LICENSE diff --git a/include/libyuv/compare_row.h b/include/libyuv/compare_row.h index 7df7acc6..64115b3a 100644 --- a/include/libyuv/compare_row.h +++ b/include/libyuv/compare_row.h @@ -18,7 +18,7 @@ namespace libyuv { extern "C" { #endif -#if defined(LIBYUV_BIT_EXACT) || defined(__pnacl__) || defined(__CLR_VER) || \ +#if defined(__pnacl__) || defined(__CLR_VER) || \ (defined(__native_client__) && defined(__x86_64__)) || \ (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 1efd651a..def773cb 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -23,7 +23,7 @@ extern "C" { #endif // TODO(fbarchard): Move cpu macros to row.h -#if defined(LIBYUV_BIT_EXACT) || defined(__pnacl__) || defined(__CLR_VER) || \ +#if defined(__pnacl__) || defined(__CLR_VER) || \ (defined(__native_client__) && defined(__x86_64__)) || \ (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 diff --git a/include/libyuv/rotate_row.h b/include/libyuv/rotate_row.h index a60f8eac..f4c701fb 100644 --- a/include/libyuv/rotate_row.h +++ b/include/libyuv/rotate_row.h @@ -18,7 +18,7 @@ namespace libyuv { extern "C" { #endif -#if defined(LIBYUV_BIT_EXACT) || defined(__pnacl__) || defined(__CLR_VER) || \ +#if defined(__pnacl__) || defined(__CLR_VER) || \ (defined(__native_client__) && defined(__x86_64__)) || \ (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 53ab0335..2f61a581 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -20,7 +20,7 @@ 
namespace libyuv { extern "C" { #endif -#if defined(LIBYUV_BIT_EXACT) || defined(__pnacl__) || defined(__CLR_VER) || \ +#if defined(__pnacl__) || defined(__CLR_VER) || \ (defined(__native_client__) && defined(__x86_64__)) || \ (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 @@ -74,8 +74,10 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) // Conversions: +#if !defined(LIBYUV_BIT_EXACT) #define HAS_ABGRTOUVROW_SSSE3 #define HAS_ABGRTOYROW_SSSE3 +#endif #define HAS_ARGB1555TOARGBROW_SSE2 #define HAS_ARGB4444TOARGBROW_SSE2 #define HAS_ARGBEXTRACTALPHAROW_SSE2 @@ -87,13 +89,15 @@ extern "C" { #define HAS_ARGBTORGB24ROW_SSSE3 #define HAS_ARGBTORGB565DITHERROW_SSE2 #define HAS_ARGBTORGB565ROW_SSE2 +#define HAS_ARGBTOYJROW_SSSE3 +#if !defined(LIBYUV_BIT_EXACT) #define HAS_ARGBTOUV444ROW_SSSE3 #define HAS_ARGBTOUVJROW_SSSE3 #define HAS_ARGBTOUVROW_SSSE3 -#define HAS_ARGBTOYJROW_SSSE3 #define HAS_ARGBTOYROW_SSSE3 #define HAS_BGRATOUVROW_SSSE3 #define HAS_BGRATOYROW_SSSE3 +#endif #define HAS_COPYROW_ERMS #define HAS_COPYROW_SSE2 #define HAS_H422TOARGBROW_SSSE3 @@ -119,14 +123,16 @@ extern "C" { #define HAS_NV21TORGB24ROW_SSSE3 #define HAS_RAWTOARGBROW_SSSE3 #define HAS_RAWTORGB24ROW_SSSE3 -#define HAS_RAWTOYROW_SSSE3 #define HAS_RGB24TOARGBROW_SSSE3 +#define HAS_RGB565TOARGBROW_SSE2 +#if !defined(LIBYUV_BIT_EXACT) +#define HAS_RAWTOYROW_SSSE3 #define HAS_RGB24TOYROW_SSSE3 #define HAS_RGB24TOYJROW_SSSE3 #define HAS_RAWTOYJROW_SSSE3 -#define HAS_RGB565TOARGBROW_SSE2 #define HAS_RGBATOUVROW_SSSE3 #define HAS_RGBATOYROW_SSSE3 +#endif #define HAS_SETROW_ERMS #define HAS_SETROW_X86 #define HAS_SPLITUVROW_SSE2 @@ -142,7 +148,9 @@ extern "C" { // Effects: #define HAS_ARGBADDROW_SSE2 #define HAS_ARGBAFFINEROW_SSE2 +#if !defined(LIBYUV_BIT_EXACT) #define HAS_ARGBATTENUATEROW_SSSE3 +#endif #define HAS_ARGBBLENDROW_SSSE3 #define HAS_ARGBCOLORMATRIXROW_SSSE3 #define 
HAS_ARGBCOLORTABLEROW_X86 @@ -192,12 +200,14 @@ extern "C" { #define HAS_ARGBPOLYNOMIALROW_AVX2 #define HAS_ARGBSHUFFLEROW_AVX2 #define HAS_ARGBTORGB565DITHERROW_AVX2 +#define HAS_ARGBTOYJROW_AVX2 +#define HAS_RAWTOYJROW_AVX2 +#define HAS_RGB24TOYJROW_AVX2 +#if !defined(LIBYUV_BIT_EXACT) #define HAS_ARGBTOUVJROW_AVX2 #define HAS_ARGBTOUVROW_AVX2 -#define HAS_ARGBTOYJROW_AVX2 #define HAS_ARGBTOYROW_AVX2 -#define HAS_RGB24TOYJROW_AVX2 -#define HAS_RAWTOYJROW_AVX2 +#endif #define HAS_COPYROW_AVX #define HAS_H422TOARGBROW_AVX2 #define HAS_HALFFLOATROW_AVX2 @@ -230,7 +240,9 @@ extern "C" { // Effects: #define HAS_ARGBADDROW_AVX2 +#if !defined(LIBYUV_BIT_EXACT) #define HAS_ARGBATTENUATEROW_AVX2 +#endif #define HAS_ARGBMULTIPLYROW_AVX2 #define HAS_ARGBSUBTRACTROW_AVX2 #define HAS_ARGBUNATTENUATEROW_AVX2 @@ -297,7 +309,9 @@ extern "C" { #define HAS_P410TOARGBROW_SSSE3 #define HAS_RAWTORGBAROW_SSSE3 #define HAS_RGB24MIRRORROW_SSSE3 +#if !defined(LIBYUV_BIT_EXACT) #define HAS_RGBATOYJROW_SSSE3 +#endif #define HAS_SPLITARGBROW_SSE2 #define HAS_SPLITARGBROW_SSSE3 #define HAS_SPLITXRGBROW_SSE2 @@ -319,8 +333,10 @@ extern "C" { (defined(__x86_64__) || defined(__i386__)) && \ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) #define HAS_ABGRTOAR30ROW_AVX2 +#if !defined(LIBYUV_BIT_EXACT) #define HAS_ABGRTOUVROW_AVX2 #define HAS_ABGRTOYROW_AVX2 +#endif #define HAS_ARGBTOAR30ROW_AVX2 #define HAS_ARGBTORAWROW_AVX2 #define HAS_ARGBTORGB24ROW_AVX2 @@ -356,7 +372,9 @@ extern "C" { #define HAS_MERGEUVROW_16_AVX2 #define HAS_MIRRORUVROW_AVX2 #define HAS_MULTIPLYROW_16_AVX2 +#if !defined(LIBYUV_BIT_EXACT) #define HAS_RGBATOYJROW_AVX2 +#endif #define HAS_SPLITARGBROW_AVX2 #define HAS_SPLITXRGBROW_AVX2 #define HAS_SPLITUVROW_16_AVX2 diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index fb668ff0..461ac36f 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -19,7 +19,7 @@ namespace libyuv { extern "C" { #endif -#if defined(LIBYUV_BIT_EXACT) || 
defined(__pnacl__) || defined(__CLR_VER) || \ +#if defined(__pnacl__) || defined(__CLR_VER) || \ (defined(__native_client__) && defined(__x86_64__)) || \ (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 2775c27a..5fd85355 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1798 +#define LIBYUV_VERSION 1799 #endif // INCLUDE_LIBYUV_VERSION_H_
\ No newline at end of file diff --git a/source/convert.cc b/source/convert.cc index 69f7fb6e..2bffaa43 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -2004,16 +2004,22 @@ int RGB24ToJ420(const uint8_t* src_rgb24, } } #endif -#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) +#if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif +#if defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + } + } +#endif #if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; @@ -2347,16 +2353,22 @@ int RAWToJ420(const uint8_t* src_raw, } } #endif -#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) +#if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif +#if defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + } + } +#endif #if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 8285b6c3..9a015583 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -1830,16 +1830,22 @@ int ARGBToJ420(const uint8_t* src_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } -#if defined(HAS_ARGBTOYJROW_SSSE3) 
&& defined(HAS_ARGBTOUVJROW_SSSE3) +#if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif +#if defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + } + } +#endif #if defined(HAS_ARGBTOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToYJRow = ARGBToYJRow_Any_AVX2; @@ -1939,16 +1945,22 @@ int ARGBToJ422(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0; } -#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) +#if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif +#if defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + } + } +#endif #if defined(HAS_ARGBTOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToYJRow = ARGBToYJRow_Any_AVX2; diff --git a/source/row_any.cc b/source/row_any.cc index 5b113fb4..1b8176bd 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -974,6 +974,8 @@ ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15) ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15) ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15) ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15) +#endif +#ifdef HAS_YUY2TOYROW_SSE2 ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15) ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15) #endif diff --git a/source/row_common.cc b/source/row_common.cc index a5ab81f2..092e538e 100644 --- 
a/source/row_common.cc +++ b/source/row_common.cc @@ -28,14 +28,20 @@ extern "C" { // The following macro from row_win makes the C code match the row_win code, // which is 7 bit fixed point for ARGBToI420: -#if !defined(LIBYUV_BIT_EXACT) && !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) +#if !defined(LIBYUV_BIT_EXACT) && !defined(LIBYUV_DISABLE_X86) && \ + defined(_MSC_VER) && !defined(__clang__) && \ + (defined(_M_IX86) || defined(_M_X64)) #define LIBYUV_RGB7 1 #endif -#if !defined(LIBYUV_BIT_EXACT) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)) +#if !defined(LIBYUV_BIT_EXACT) && (defined(__x86_64__) || defined(_M_X64) || \ + defined(__i386__) || defined(_M_IX86)) #define LIBYUV_ARGBTOUV_PAVGB 1 #define LIBYUV_RGBTOU_TRUNCATE 1 +#define LIBYUV_ATTENUATE_DUP 1 +#endif +#if defined(LIBYUV_BIT_EXACT) +#define LIBYUV_UNATTENUATE_DUP 1 #endif // llvm x86 is poor at ternary operator, so use branchless min/max. @@ -3151,11 +3157,11 @@ void BlendPlaneRow_C(const uint8_t* src0, } #undef UBLEND -#if defined(__aarch64__) || defined(__arm__) -#define ATTENUATE(f, a) (f * a + 128) >> 8 -#else +#if LIBYUV_ATTENUATE_DUP // This code mimics the SSSE3 version for better testability. #define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24 +#else +#define ATTENUATE(f, a) (f * a + 128) >> 8 #endif // Multiply source RGB by alpha and store to destination. @@ -3242,6 +3248,14 @@ const uint32_t fixed_invtbl8[256] = { T(0xfc), T(0xfd), T(0xfe), 0x01000100}; #undef T +#if LIBYUV_UNATTENUATE_DUP +// This code mimics the Intel SIMD version for better testability. +#define UNATTENUATE(f, ia) clamp255(((f | (f << 8)) * ia) >> 16) +#else +#define UNATTENUATE(f, ia) clamp255((f * ia) >> 8) +#endif + +// mimics the Intel SIMD code for exactness. 
void ARGBUnattenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { @@ -3252,13 +3266,11 @@ void ARGBUnattenuateRow_C(const uint8_t* src_argb, uint32_t r = src_argb[2]; const uint32_t a = src_argb[3]; const uint32_t ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point - b = (b * ia) >> 8; - g = (g * ia) >> 8; - r = (r * ia) >> 8; + // Clamping should not be necessary but is free in assembly. - dst_argb[0] = clamp255(b); - dst_argb[1] = clamp255(g); - dst_argb[2] = clamp255(r); + dst_argb[0] = UNATTENUATE(b, ia); + dst_argb[1] = UNATTENUATE(g, ia); + dst_argb[2] = UNATTENUATE(r, ia); dst_argb[3] = a; src_argb += 4; dst_argb += 4; diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 001c353d..a5f73989 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -1417,9 +1417,12 @@ void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { } #endif // HAS_RGBATOYJROW_SSSE3 -#ifdef HAS_ARGBTOYROW_AVX2 +#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ARGBEXTRACTALPHAROW_AVX2) // vpermd for vphaddw + vpackuswb vpermd. static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; +#endif + +#ifdef HAS_ARGBTOYROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 919f7f22..034b3117 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -1373,10 +1373,10 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64\n" // load 16 ARGB - "subs %w2, %w2, #16 \n" // 16 pixels per loop. + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB + "subs %w2, %w2, #16 \n" // 16 pixels per loop. 
"prfm pldl1keep, [%0, 448] \n" - "st3 {v0.16b,v1.16b,v2.16b}, [%1], #48\n" // store 8 RGB24 + "st3 {v0.16b,v1.16b,v2.16b}, [%1], #48 \n" // store 8 RGB24 "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb24), // %1 @@ -1683,7 +1683,6 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v23"); } - void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { diff --git a/source/scale.cc b/source/scale.cc index cda10e2b..94a7b89d 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -1829,7 +1829,8 @@ static void ScalePlaneSimple(int src_width, } for (i = 0; i < dst_height; ++i) { - ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x, dx); + ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x, + dx); dst_ptr += dst_stride; y += dy; } @@ -1870,7 +1871,8 @@ static void ScalePlaneSimple_16(int src_width, } for (i = 0; i < dst_height; ++i) { - ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x, dx); + ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x, + dx); dst_ptr += dst_stride; y += dy; } diff --git a/source/scale_argb.cc b/source/scale_argb.cc index 073df1ae..66b69d8f 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -888,8 +888,8 @@ static void ScaleARGBSimple(int src_width, } for (j = 0; j < dst_height; ++j) { - ScaleARGBCols(dst_argb, src_argb + (y >> 16) * (int64_t)src_stride, dst_width, x, - dx); + ScaleARGBCols(dst_argb, src_argb + (y >> 16) * (int64_t)src_stride, + dst_width, x, dx); dst_argb += dst_stride; y += dy; } @@ -973,8 +973,8 @@ static void ScaleARGB(const uint8_t* src, filtering = kFilterNone; if (dx == 0x10000 && dy == 0x10000) { // Straight copy. 
- ARGBCopy(src + (y >> 16) * (int64_t)src_stride + (x >> 16) * 4, src_stride, - dst, dst_stride, clip_width, clip_height); + ARGBCopy(src + (y >> 16) * (int64_t)src_stride + (x >> 16) * 4, + src_stride, dst, dst_stride, clip_width, clip_height); return; } } diff --git a/source/scale_uv.cc b/source/scale_uv.cc index 8a7f2abb..c90d62c4 100644 --- a/source/scale_uv.cc +++ b/source/scale_uv.cc @@ -690,7 +690,8 @@ void ScaleUVLinearUp2(int src_width, #endif if (dst_height == 1) { - ScaleRowUp(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride, dst_uv, dst_width); + ScaleRowUp(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride, dst_uv, + dst_width); } else { dy = FixedDiv(src_height - 1, dst_height - 1); y = (1 << 15) - 1; @@ -796,7 +797,8 @@ void ScaleUVLinearUp2_16(int src_width, #endif if (dst_height == 1) { - ScaleRowUp(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride, dst_uv, dst_width); + ScaleRowUp(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride, dst_uv, + dst_width); } else { dy = FixedDiv(src_height - 1, dst_height - 1); y = (1 << 15) - 1; @@ -927,7 +929,8 @@ static void ScaleUVSimple(int src_width, } for (j = 0; j < dst_height; ++j) { - ScaleUVCols(dst_uv, src_uv + (y >> 16) * (int64_t)src_stride, dst_width, x, dx); + ScaleUVCols(dst_uv, src_uv + (y >> 16) * (int64_t)src_stride, dst_width, x, + dx); dst_uv += dst_stride; y += dy; } @@ -1061,8 +1064,8 @@ static void ScaleUV(const uint8_t* src, #ifdef HAS_UVCOPY if (dx == 0x10000 && dy == 0x10000) { // Straight copy. 
- UVCopy(src + (y >> 16) * (int64_t)src_stride + (x >> 16) * 2, src_stride, dst, - dst_stride, clip_width, clip_height); + UVCopy(src + (y >> 16) * (int64_t)src_stride + (x >> 16) * 2, + src_stride, dst, dst_stride, clip_width, clip_height); return; } #endif @@ -1163,12 +1166,13 @@ int UVScale_16(const uint16_t* src_uv, #ifdef HAS_UVCOPY if (!filtering && src_width == dst_width && (src_height % dst_height == 0)) { if (dst_height == 1) { - UVCopy_16(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride_uv, src_stride_uv, - dst_uv, dst_stride_uv, dst_width, dst_height); + UVCopy_16(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride_uv, + src_stride_uv, dst_uv, dst_stride_uv, dst_width, dst_height); } else { dy = src_height / dst_height; - UVCopy_16(src_uv + ((dy - 1) / 2) * (int64_t)src_stride_uv, dy * (int64_t)src_stride_uv, - dst_uv, dst_stride_uv, dst_width, dst_height); + UVCopy_16(src_uv + ((dy - 1) / 2) * (int64_t)src_stride_uv, + dy * (int64_t)src_stride_uv, dst_uv, dst_stride_uv, dst_width, + dst_height); } return 0; diff --git a/unit_test/color_test.cc b/unit_test/color_test.cc index 1aa03a41..a3f23ac4 100644 --- a/unit_test/color_test.cc +++ b/unit_test/color_test.cc @@ -22,7 +22,8 @@ namespace libyuv { // TODO(fbarchard): clang x86 has a higher accuracy YUV to RGB. // Port to Visual C and other CPUs -#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) +#if !defined(LIBYUV_BIT_EXACT) && !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__)) #define ERROR_FULL 5 #define ERROR_J420 4 #else diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 9cb76663..a5cc9feb 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -1532,7 +1532,7 @@ TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1) // e.g. endian swap twice. 
#define TESTENDI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, W1280, N, NEG, \ OFF) \ - TEST_F(LibYUVConvertTest, FMT_ATOB##_Endswap##N) { \ + TEST_F(LibYUVConvertTest, FMT_ATOB##_Endswap##N) { \ const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ @@ -1580,8 +1580,8 @@ TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1) TESTENDI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, \ _Opt, +, 0) #else -#define TESTEND(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A) \ - TESTENDI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, \ +#define TESTEND(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A) \ + TESTENDI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, \ _Opt, +, 0) #endif @@ -3509,9 +3509,9 @@ TESTPLANAR16TOB(I012, 2, 2, 0xfff, AR30, 4, 4, 1) TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Premult, +, 0, 1, S_DEPTH) #else -#define TESTQPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ - ALIGN, YALIGN, S_DEPTH) \ - TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ +#define TESTQPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + ALIGN, YALIGN, S_DEPTH) \ + TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Opt, +, 0, 0, S_DEPTH) #endif @@ -4140,17 +4140,17 @@ TEST_F(LibYUVConvertTest, TestRGB24ToJ420) { const int kSize = 256; align_buffer_page_end(orig_rgb24, kSize * 3 * 2); // 2 rows of RGB24 align_buffer_page_end(dest_j420, kSize * 3 / 2 * 2); - int iterations256 = (benchmark_width_ * benchmark_height_ + (kSize * 2 - 1)) / (kSize * 2) * benchmark_iterations_; + int iterations256 = (benchmark_width_ * benchmark_height_ + (kSize * 2 - 1)) / + (kSize * 2) * benchmark_iterations_; for (int i = 0; i < kSize * 3 * 2; ++i) { orig_rgb24[i] = i; } for (int i = 0; i < iterations256; ++i) { - RGB24ToJ420(orig_rgb24, 
kSize * 3, - dest_j420, kSize, // Y plane - dest_j420 + kSize * 2, kSize / 2, // U plane - dest_j420 + kSize * 5 / 2, kSize / 2, // V plane + RGB24ToJ420(orig_rgb24, kSize * 3, dest_j420, kSize, // Y plane + dest_j420 + kSize * 2, kSize / 2, // U plane + dest_j420 + kSize * 5 / 2, kSize / 2, // V plane kSize, 2); } diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 50fa1461..4ea79e01 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -29,6 +29,12 @@ #include "libyuv/row.h" /* For ScaleSumSamples_Neon */ #endif +#if defined(LIBYUV_BIT_EXACT) +#define EXPECTED_ATTENUATE_DIFF 0 +#else +#define EXPECTED_ATTENUATE_DIFF 2 +#endif + namespace libyuv { TEST_F(LibYUVPlanarTest, TestAttenuate) { @@ -100,9 +106,9 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) { EXPECT_EQ(32, atten_pixels[128 * 4 + 1]); EXPECT_EQ(21, atten_pixels[128 * 4 + 2]); EXPECT_EQ(128, atten_pixels[128 * 4 + 3]); - EXPECT_NEAR(255, atten_pixels[255 * 4 + 0], 1); - EXPECT_NEAR(127, atten_pixels[255 * 4 + 1], 1); - EXPECT_NEAR(85, atten_pixels[255 * 4 + 2], 1); + EXPECT_NEAR(254, atten_pixels[255 * 4 + 0], EXPECTED_ATTENUATE_DIFF); + EXPECT_NEAR(127, atten_pixels[255 * 4 + 1], EXPECTED_ATTENUATE_DIFF); + EXPECT_NEAR(85, atten_pixels[255 * 4 + 2], EXPECTED_ATTENUATE_DIFF); EXPECT_EQ(255, atten_pixels[255 * 4 + 3]); free_aligned_buffer_page_end(atten2_pixels); @@ -158,28 +164,29 @@ TEST_F(LibYUVPlanarTest, ARGBAttenuate_Any) { int max_diff = TestAttenuateI(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); - EXPECT_LE(max_diff, 2); + + EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); } TEST_F(LibYUVPlanarTest, ARGBAttenuate_Unaligned) { int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1); - EXPECT_LE(max_diff, 2); + EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); } TEST_F(LibYUVPlanarTest, ARGBAttenuate_Invert) { int max_diff = 
TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0); - EXPECT_LE(max_diff, 2); + EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); } TEST_F(LibYUVPlanarTest, ARGBAttenuate_Opt) { int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); - EXPECT_LE(max_diff, 2); + EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); } static int TestUnattenuateI(int width, @@ -231,28 +238,28 @@ TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Any) { int max_diff = TestUnattenuateI(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); - EXPECT_LE(max_diff, 2); + EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); } TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Unaligned) { int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1); - EXPECT_LE(max_diff, 2); + EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); } TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Invert) { int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0); - EXPECT_LE(max_diff, 2); + EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); } TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Opt) { int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); - EXPECT_LE(max_diff, 2); + EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); } TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) { diff --git a/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc index f6ca1f54..9bd61414 100644 --- a/unit_test/scale_argb_test.cc +++ b/unit_test/scale_argb_test.cc @@ -258,20 +258,20 @@ static int ARGBClipTestFilter(int src_width, // Test a scale factor with all 4 filters. 
Expect unfiltered to be exact, but // filtering is different fixed point implementations for SSSE3, Neon and C. #ifndef DISABLE_SLOW_TESTS -#define TEST_FACTOR(name, nom, denom) \ +#define TEST_FACTOR(name, nom, denom) \ TEST_FACTOR1(, name, None, nom, denom, 0) \ TEST_FACTOR1(, name, Linear, nom, denom, 3) \ TEST_FACTOR1(, name, Bilinear, nom, denom, 3) \ TEST_FACTOR1(, name, Box, nom, denom, 3) #else #if defined(ENABLE_FULL_TESTS) -#define TEST_FACTOR(name, nom, denom) \ +#define TEST_FACTOR(name, nom, denom) \ TEST_FACTOR1(DISABLED_, name, None, nom, denom, 0) \ TEST_FACTOR1(DISABLED_, name, Linear, nom, denom, 3) \ TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3) \ TEST_FACTOR1(DISABLED_, name, Box, nom, denom, 3) #else -#define TEST_FACTOR(name, nom, denom) \ +#define TEST_FACTOR(name, nom, denom) \ TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3) #endif #endif @@ -320,24 +320,24 @@ TEST_FACTOR(3, 1, 3) #ifndef DISABLE_SLOW_TESTS // Test scale to a specified size with all 4 filters. 
-#define TEST_SCALETO(name, width, height) \ - TEST_SCALETO1(, name, width, height, None, 0) \ - TEST_SCALETO1(, name, width, height, Linear, 3) \ +#define TEST_SCALETO(name, width, height) \ + TEST_SCALETO1(, name, width, height, None, 0) \ + TEST_SCALETO1(, name, width, height, Linear, 3) \ TEST_SCALETO1(, name, width, height, Bilinear, 3) #else #if defined(ENABLE_FULL_TESTS) -#define TEST_SCALETO(name, width, height) \ - TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \ - TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \ +#define TEST_SCALETO(name, width, height) \ + TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \ + TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \ TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3) #else -#define TEST_SCALETO(name, width, height) \ +#define TEST_SCALETO(name, width, height) \ TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3) #endif #endif TEST_SCALETO(ARGBScale, 1, 1) -//TEST_SCALETO(ARGBScale, 256, 144) /* 128x72 * 2 */ +// TEST_SCALETO(ARGBScale, 256, 144) /* 128x72 * 2 */ TEST_SCALETO(ARGBScale, 320, 240) TEST_SCALETO(ARGBScale, 569, 480) TEST_SCALETO(ARGBScale, 640, 360) @@ -524,8 +524,8 @@ TEST_F(LibYUVScaleTest, ARGBTest3x) { } align_buffer_page_end(dest_pixels, kDstStride); - int iterations160 = - (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 * benchmark_iterations_; + int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 * + benchmark_iterations_; for (int i = 0; i < iterations160; ++i) { ARGBScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1, kFilterBilinear); @@ -561,8 +561,8 @@ TEST_F(LibYUVScaleTest, ARGBTest4x) { } align_buffer_page_end(dest_pixels, kDstStride); - int iterations160 = - (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 * benchmark_iterations_; + int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 * + benchmark_iterations_; for (int i = 0; i < iterations160; ++i) { 
ARGBScale(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1, kFilterBilinear); diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc index ba4e0943..8cba0420 100644 --- a/unit_test/scale_test.cc +++ b/unit_test/scale_test.cc @@ -1044,7 +1044,7 @@ TEST_FACTOR(3, 1, 3, 0) #endif TEST_SCALETO(Scale, 1, 1) -//TEST_SCALETO(Scale, 256, 144) /* 128x72 * 2 */ +// TEST_SCALETO(Scale, 256, 144) /* 128x72 * 2 */ TEST_SCALETO(Scale, 320, 240) TEST_SCALETO(Scale, 569, 480) TEST_SCALETO(Scale, 640, 360) @@ -1418,8 +1418,8 @@ TEST_F(LibYUVScaleTest, PlaneTest3x) { } align_buffer_page_end(dest_pixels, kDstStride); - int iterations160 = - (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 * benchmark_iterations_; + int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 * + benchmark_iterations_; for (int i = 0; i < iterations160; ++i) { ScalePlane(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1, kFilterBilinear); @@ -1446,8 +1446,8 @@ TEST_F(LibYUVScaleTest, PlaneTest4x) { } align_buffer_page_end(dest_pixels, kDstStride); - int iterations160 = - (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 * benchmark_iterations_; + int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 * + benchmark_iterations_; for (int i = 0; i < iterations160; ++i) { ScalePlane(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1, kFilterBilinear); diff --git a/unit_test/scale_uv_test.cc b/unit_test/scale_uv_test.cc index b53411bc..a185fb63 100644 --- a/unit_test/scale_uv_test.cc +++ b/unit_test/scale_uv_test.cc @@ -141,7 +141,7 @@ static int UVTestFilter(int src_width, TEST_FACTOR1(name, Box, nom, denom, 3) #else // Test a scale factor with Bilinear. 
-#define TEST_FACTOR(name, nom, denom) \ +#define TEST_FACTOR(name, nom, denom) \ TEST_FACTOR1(name, Bilinear, nom, denom, 3) #endif @@ -178,12 +178,12 @@ TEST_FACTOR(3, 1, 3) TEST_SCALETO1(name, width, height, Linear, 3) \ TEST_SCALETO1(name, width, height, Bilinear, 3) #else -#define TEST_SCALETO(name, width, height) \ +#define TEST_SCALETO(name, width, height) \ TEST_SCALETO1(name, width, height, Bilinear, 3) #endif TEST_SCALETO(UVScale, 1, 1) -//TEST_SCALETO(UVScale, 256, 144) /* 128x72 * 2 */ +// TEST_SCALETO(UVScale, 256, 144) /* 128x72 * 2 */ TEST_SCALETO(UVScale, 320, 240) TEST_SCALETO(UVScale, 569, 480) TEST_SCALETO(UVScale, 640, 360) @@ -224,8 +224,8 @@ TEST_F(LibYUVScaleTest, UVTest3x) { } align_buffer_page_end(dest_pixels, kDstStride); - int iterations160 = - (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 * benchmark_iterations_; + int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 * + benchmark_iterations_; for (int i = 0; i < iterations160; ++i) { UVScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1, kFilterBilinear); @@ -255,8 +255,8 @@ TEST_F(LibYUVScaleTest, UVTest4x) { } align_buffer_page_end(dest_pixels, kDstStride); - int iterations160 = - (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 * benchmark_iterations_; + int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 * + benchmark_iterations_; for (int i = 0; i < iterations160; ++i) { UVScale(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1, kFilterBilinear); |