author     Yuan Tong <tongyuan200097@gmail.com>     2021-04-27 22:47:36 +0800
committer  Frank Barchard <fbarchard@chromium.org>  2021-04-27 20:35:27 +0000
commit     c9843de02a1e67fdc7560f89c35a7a879804260e (patch)
tree       19fb9e6345701e66cdf610ea9e5ac3655804bf14 /source
parent     5e05f26a2bcacc5d34ee5786e25efee2493003cc (diff)
Optimize unlimited data for Intel
Use unsigned coefficient and signed UV value in YUVTORGB.
R=fbarchard@chromium.org
Bug: libyuv:862, libyuv:863
Change-Id: I32e58b2cee383fb98104c055beb0867a7ad05bfe
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2850016
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
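
The change in one sentence: pmaddubsw multiplies an unsigned byte by a signed byte, and this commit swaps which operand is which. Before, the U/V samples were the unsigned operand, so every coefficient had to fit in a signed byte; now the coefficients are the unsigned operand and the U/V samples are biased by 128 into signed range, with the Y bias folded into kYBiasToRgb. A minimal scalar model of one pmaddubsw lane under the new scheme (the names below are illustrative, not libyuv API):

// One pmaddubsw lane: (unsigned a)*(signed b) + (unsigned c)*(signed d),
// saturated to int16.
#include <cstdint>

static int16_t MaddLane(uint8_t a, int8_t b, uint8_t c, int8_t d) {
  int32_t sum = a * b + c * d;
  if (sum > 32767) sum = 32767;
  if (sum < -32768) sum = -32768;
  return (int16_t)sum;
}

int main() {
  uint8_t u = 64, v = 160;         // raw chroma samples, 0..255
  uint8_t ug = 25, vg = 52;        // unsigned coefficients (e.g. green), scaled by 64
  int8_t ui = (int8_t)(u - 0x80);  // bias U/V into signed range
  int8_t vi = (int8_t)(v - 0x80);
  int16_t g_uv = MaddLane(ug, ui, vg, vi);  // U/V contribution to G
  (void)g_uv;
  return 0;
}

Keeping the coefficients unsigned means a value above 127 (such as a 2.018 * 64 = 129 blue coefficient) no longer overflows a signed byte, which is what limited the old scheme on full-range ("unlimited") data.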
Diffstat (limited to 'source')
-rw-r--r--  source/row_common.cc  181
-rw-r--r--  source/row_gcc.cc     295
-rw-r--r--  source/row_win.cc     148
3 files changed, 299 insertions(+), 325 deletions(-)
diff --git a/source/row_common.cc b/source/row_common.cc
index f9e9a01b..4d0dce2d 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -55,8 +55,8 @@ static __inline int32_t clamp1023(int32_t v) {
   return (-(v >= 1023) | v) & 1023;
 }
 
-// clamp to 2^n - 1
-static __inline int32_t clamp2nm1(int32_t v, int32_t max) {
+// clamp to max
+static __inline int32_t ClampMax(int32_t v, int32_t max) {
   return (-(v >= max) | v) & max;
 }
 
@@ -77,7 +77,7 @@ static __inline int32_t clamp1023(int32_t v) {
   return (v > 1023) ? 1023 : v;
 }
 
-static __inline int32_t clamp2nm1(int32_t v, int32_t max) {
+static __inline int32_t ClampMax(int32_t v, int32_t max) {
   return (v > max) ? max : v;
 }
 
@@ -1422,46 +1422,37 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
 // clang-format off
 
 #if defined(__aarch64__) || defined(__arm__)
-#define YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR, BB, BG, BR) \
-  {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \
+// Bias values to round, and subtract 128 from U and V.
+// For B and R this is negative. For G this is positive.
+#define BB (UB * 128 - YB)
+#define BG (UG * 128 + VG * 128 + YB)
+#define BR (VR * 128 - YB)
+
+#define YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \
+  {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \
   {YG, BB, BG, BR, YB, 0, 0, 0}}
 #else
-#define UVMASK(C) ((C) > 127 ? 0xff : 0)
-
-#define YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR, BB, BG, BR) \
+#define YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \
   {{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, \
     UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, \
   {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \
    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
   {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, \
    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, \
-  {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
-  {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
-  {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
   {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
-  {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}, \
-  {0, UVMASK(UB), 0, UVMASK(UB), 0, UVMASK(UB), 0, UVMASK(UB), \
-   0, UVMASK(UB), 0, UVMASK(UB), 0, UVMASK(UB), 0, UVMASK(UB), \
-   0, UVMASK(VR), 0, UVMASK(VR), 0, UVMASK(VR), 0, UVMASK(VR), \
-   0, UVMASK(VR), 0, UVMASK(VR), 0, UVMASK(VR), 0, UVMASK(VR)}}
+  {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}
 #endif
 
 // clang-format on
 
-#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \
-  const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \
-      YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR, BB, BG, BR); \
-  const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \
-      YUBCONSTANTSBODY(YG, YB, VR, VG, UG, UB, BR, BG, BB);
+#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR) \
+  const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \
+      YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR); \
+  const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \
+      YUBCONSTANTSBODY(YG, YB, VR, VG, UG, UB);
 
 // TODO(fbarchard): Generate SIMD structures from float matrix.
 
-// Bias values to round, and subtract 128 from U and V.
-// For B and R this is negative. For G this is positive.
-#define BB (UB * 128 - YB)
-#define BG (UG * 128 + VG * 128 + YB)
-#define BR (VR * 128 - YB)
-
 // BT.601 limited range YUV to RGB reference
 //  R = (Y - 16) * 1.164 + V * 1.596
 //  G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
@@ -1482,7 +1473,7 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
 #define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
 
-MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR)
 
 #undef YG
 #undef YB
@@ -1507,7 +1498,7 @@ MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR, BB, BG, BR)
 #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
 #define YB 32 /* 64 / 2 */
 
-MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR)
 
 #undef YG
 #undef YB
@@ -1536,7 +1527,7 @@ MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR)
 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
 #define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
 
-MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR)
 
 #undef YG
 #undef YB
@@ -1561,7 +1552,7 @@ MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
 #define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
 #define YB 32 /* 64 / 2 */
 
-MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR)
 
 #undef YG
 #undef YB
@@ -1590,7 +1581,7 @@ MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
 #define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
 #define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
 
-MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR)
 
 #undef YG
 #undef YB
@@ -1614,7 +1605,7 @@ MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
 #define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
 #define YB 32 /* 64 / 2 */
 
-MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR)
 
 #undef YG
 #undef YB
@@ -1630,25 +1621,39 @@ MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
 #undef MAKEYUVCONSTANTS
 
 #if defined(__aarch64__) || defined(__arm__)
-#define LOAD_YUV_CONSTANTS \
-  int ub = -yuvconstants->kUVCoeff[0]; \
-  int vr = -yuvconstants->kUVCoeff[1]; \
-  int ug = yuvconstants->kUVCoeff[2]; \
-  int vg = yuvconstants->kUVCoeff[3]; \
-  int yg = yuvconstants->kRGBCoeffBias[0]; \
-  int bb = -yuvconstants->kRGBCoeffBias[1]; \
-  int bg = yuvconstants->kRGBCoeffBias[2]; \
-  int br = -yuvconstants->kRGBCoeffBias[3]
+#define LOAD_YUV_CONSTANTS \
+  int ub = yuvconstants->kUVCoeff[0]; \
+  int vr = yuvconstants->kUVCoeff[1]; \
+  int ug = yuvconstants->kUVCoeff[2]; \
+  int vg = yuvconstants->kUVCoeff[3]; \
+  int yg = yuvconstants->kRGBCoeffBias[0]; \
+  int bb = yuvconstants->kRGBCoeffBias[1]; \
+  int bg = yuvconstants->kRGBCoeffBias[2]; \
+  int br = yuvconstants->kRGBCoeffBias[3]
+
+#define CALC_RGB16 \
+  int32_t y1 = (uint32_t)(y32 * yg) >> 16; \
+  int b16 = y1 + (u * ub) - bb; \
+  int g16 = y1 + bg - (u * ug + v * vg); \
+  int r16 = y1 + (v * vr) - br
 #else
-#define LOAD_YUV_CONSTANTS \
-  int ub = -yuvconstants->kUVToB[0]; \
-  int ug = yuvconstants->kUVToG[0]; \
-  int vg = yuvconstants->kUVToG[1]; \
-  int vr = -yuvconstants->kUVToR[1]; \
-  int bb = -yuvconstants->kUVBiasB[0]; \
-  int bg = yuvconstants->kUVBiasG[0]; \
-  int br = -yuvconstants->kUVBiasR[0]; \
-  int yg = yuvconstants->kYToRgb[0]
+#define LOAD_YUV_CONSTANTS \
+  int ub = yuvconstants->kUVToB[0]; \
+  int ug = yuvconstants->kUVToG[0]; \
+  int vg = yuvconstants->kUVToG[1]; \
+  int vr = yuvconstants->kUVToR[1]; \
+  int yg = yuvconstants->kYToRgb[0]; \
+  int yb = yuvconstants->kYBiasToRgb[0]
+
+#define CALC_RGB16 \
+  int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \
+  int8_t ui = u; \
+  int8_t vi = v; \
+  ui -= 0x80; \
+  vi -= 0x80; \
+  int b16 = y1 + (ui * ub); \
+  int g16 = y1 - (ui * ug + vi * vg); \
+  int r16 = y1 + (vi * vr)
 #endif
 
 // C reference code that mimics the YUV assembly.
@@ -1661,11 +1666,11 @@ static __inline void YuvPixel(uint8_t y,
                               uint8_t* r,
                               const struct YuvConstants* yuvconstants) {
   LOAD_YUV_CONSTANTS;
-
-  uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
-  *b = Clamp((int32_t)(y1 - (u * ub) + bb) >> 6);
-  *g = Clamp((int32_t)(y1 - (u * ug + v * vg) + bg) >> 6);
-  *r = Clamp((int32_t)(y1 - (v * vr) + br) >> 6);
+  uint32_t y32 = y * 0x0101;
+  CALC_RGB16;
+  *b = Clamp((int32_t)(b16) >> 6);
+  *g = Clamp((int32_t)(g16) >> 6);
+  *r = Clamp((int32_t)(r16) >> 6);
 }
 
 // Reads 8 bit YUV and leaves result as 16 bit.
@@ -1677,11 +1682,11 @@ static __inline void YuvPixel8_16(uint8_t y,
                                   int* r,
                                   const struct YuvConstants* yuvconstants) {
   LOAD_YUV_CONSTANTS;
-
-  uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
-  *b = (int)(y1 - (u * ub) + bb);
-  *g = (int)(y1 - (u * ug + v * vg) + bg);
-  *r = (int)(y1 - (v * vr) + br);
+  uint32_t y32 = y * 0x0101;
+  CALC_RGB16;
+  *b = b16;
+  *g = g16;
+  *r = r16;
 }
 
 // C reference code that mimics the YUV 16 bit assembly.
@@ -1694,13 +1699,13 @@ static __inline void YuvPixel10_16(uint16_t y,
                                    int* r,
                                    const struct YuvConstants* yuvconstants) {
   LOAD_YUV_CONSTANTS;
-
-  uint32_t y1 = (uint32_t)((y << 6) * yg) >> 16;
+  uint32_t y32 = y << 6;
   u = clamp255(u >> 2);
   v = clamp255(v >> 2);
-  *b = (int)(-(u * ub) + y1 + bb);
-  *g = (int)(-(u * ug + v * vg) + y1 + bg);
-  *r = (int)(-(v * vr) + y1 + br);
+  CALC_RGB16;
+  *b = b16;
+  *g = g16;
+  *r = r16;
 }
 
 // C reference code that mimics the YUV 16 bit assembly.
@@ -1713,13 +1718,13 @@ static __inline void YuvPixel12_16(int16_t y,
                                    int* r,
                                    const struct YuvConstants* yuvconstants) {
   LOAD_YUV_CONSTANTS;
-
-  uint32_t y1 = (uint32_t)((y << 4) * yg) >> 16;
+  uint32_t y32 = y << 4;
   u = clamp255(u >> 4);
   v = clamp255(v >> 4);
-  *b = (int)(-(u * ub) + y1 + bb);
-  *g = (int)(-(u * ug + v * vg) + y1 + bg);
-  *r = (int)(-(v * vr) + y1 + br);
+  CALC_RGB16;
+  *b = b16;
+  *g = g16;
+  *r = r16;
 }
 
 // C reference code that mimics the YUV 10 bit assembly.
@@ -1768,13 +1773,13 @@ static __inline void YuvPixel16_8(uint16_t y,
                                   uint8_t* r,
                                   const struct YuvConstants* yuvconstants) {
   LOAD_YUV_CONSTANTS;
-
-  uint32_t y1 = (uint32_t)(y * yg) >> 16;
+  uint32_t y32 = y;
   u = clamp255(u >> 8);
   v = clamp255(v >> 8);
-  *b = Clamp((int32_t)(y1 + -(u * ub) + bb) >> 6);
-  *g = Clamp((int32_t)(y1 + -(u * ug + v * vg) + bg) >> 6);
-  *r = Clamp((int32_t)(y1 + -(v * vr) + br) >> 6);
+  CALC_RGB16;
+  *b = Clamp((int32_t)(b16) >> 6);
+  *g = Clamp((int32_t)(g16) >> 6);
+  *r = Clamp((int32_t)(r16) >> 6);
 }
 
 // C reference code that mimics the YUV 16 bit assembly.
@@ -1787,13 +1792,13 @@ static __inline void YuvPixel16_16(uint16_t y,
                                    int* r,
                                    const struct YuvConstants* yuvconstants) {
   LOAD_YUV_CONSTANTS;
-
-  uint32_t y1 = (uint32_t)(y * yg) >> 16;
+  uint32_t y32 = y;
   u = clamp255(u >> 8);
   v = clamp255(v >> 8);
-  *b = (int)(y1 + -(u * ub) + bb);
-  *g = (int)(y1 + -(u * ug + v * vg) + bg);
-  *r = (int)(y1 + -(v * vr) + br);
+  CALC_RGB16;
+  *b = b16;
+  *g = g16;
+  *r = r16;
 }
 
 // C reference code that mimics the YUV assembly.
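
Worked through for one 8-bit pixel, the new non-ARM CALC_RGB16 path reads as below. YG and YB are the BT.601 limited-range values from the hunks above; the UB/UG/VG/VR coefficients are not visible in this diff, so the values here are assumptions for illustration:

// Scalar model of the new CALC_RGB16 path (non-ARM variant) for one pixel.
#include <cstdint>
#include <cstdio>

static uint8_t Clamp(int32_t v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

int main() {
  const int UB = 129, UG = 25, VG = 52, VR = 102;  // assumed: round(coef * 64)
  const int YG = 18997, YB = -1160;                // from the I601 hunk above
  uint8_t y = 150, u = 100, v = 200;
  uint32_t y32 = y * 0x0101;  // replicate 8-bit Y to 16 bits
  int32_t y1 = (int32_t)((uint32_t)(y32 * YG) >> 16) + YB;
  int8_t ui = (int8_t)(u - 0x80);  // signed chroma, as in the new macros
  int8_t vi = (int8_t)(v - 0x80);
  int b16 = y1 + ui * UB;
  int g16 = y1 - (ui * UG + vi * VG);
  int r16 = y1 + vi * VR;
  printf("B=%d G=%d R=%d\n", Clamp(b16 >> 6), Clamp(g16 >> 6), Clamp(r16 >> 6));
  return 0;
}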
@@ -2779,10 +2784,10 @@ void MergeAR64Row_C(const uint16_t* src_r,
   int shift = 16 - depth;
   int max = (1 << depth) - 1;
   for (x = 0; x < width; ++x) {
-    dst_ar64[0] = clamp2nm1(src_b[x], max) << shift;
-    dst_ar64[1] = clamp2nm1(src_g[x], max) << shift;
-    dst_ar64[2] = clamp2nm1(src_r[x], max) << shift;
-    dst_ar64[3] = clamp2nm1(src_a[x], max) << shift;
+    dst_ar64[0] = ClampMax(src_b[x], max) << shift;
+    dst_ar64[1] = ClampMax(src_g[x], max) << shift;
+    dst_ar64[2] = ClampMax(src_r[x], max) << shift;
+    dst_ar64[3] = ClampMax(src_a[x], max) << shift;
     dst_ar64 += 4;
   }
 }
@@ -2819,9 +2824,9 @@ void MergeXR64Row_C(const uint16_t* src_r,
   int shift = 16 - depth;
   int max = (1 << depth) - 1;
   for (x = 0; x < width; ++x) {
-    dst_ar64[0] = clamp2nm1(src_b[x], max) << shift;
-    dst_ar64[1] = clamp2nm1(src_g[x], max) << shift;
-    dst_ar64[2] = clamp2nm1(src_r[x], max) << shift;
+    dst_ar64[0] = ClampMax(src_b[x], max) << shift;
+    dst_ar64[1] = ClampMax(src_g[x], max) << shift;
+    dst_ar64[2] = ClampMax(src_r[x], max) << shift;
     dst_ar64[3] = 0xffff;
     dst_ar64 += 4;
   }
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index d6c28535..43e4c710 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -2312,78 +2312,65 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
 
 #if defined(__x86_64__)
 #define YUVTORGB_SETUP(yuvconstants) \
+  "pcmpeqb %%xmm13,%%xmm13 \n" \
   "movdqa (%[yuvconstants]),%%xmm8 \n" \
+  "pxor %%xmm12,%%xmm12 \n" \
   "movdqa 32(%[yuvconstants]),%%xmm9 \n" \
+  "psllw $7,%%xmm13 \n" \
   "movdqa 64(%[yuvconstants]),%%xmm10 \n" \
+  "pshufb %%xmm12,%%xmm13 \n" \
   "movdqa 96(%[yuvconstants]),%%xmm11 \n" \
-  "movdqa 128(%[yuvconstants]),%%xmm12 \n" \
-  "movdqa 160(%[yuvconstants]),%%xmm13 \n" \
-  "movdqa 192(%[yuvconstants]),%%xmm14 \n" \
-  "movdqa 256(%[yuvconstants]),%%xmm15 \n" \
-  "movdqa 272(%[yuvconstants]),%%xmm7 \n"
+  "movdqa 128(%[yuvconstants]),%%xmm12 \n"
 
 // Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB16(yuvconstants) \
-  "movdqa %%xmm3,%%xmm0 \n" \
-  "movdqa %%xmm3,%%xmm1 \n" \
-  "movdqa %%xmm3,%%xmm2 \n" \
-  "pmaddubsw %%xmm8,%%xmm0 \n" \
-  "pmaddubsw %%xmm10,%%xmm2 \n" \
-  "psllw $8,%%xmm1 \n" \
-  "pand %%xmm15,%%xmm1 \n" \
-  "paddw %%xmm1,%%xmm0 \n" \
-  "movdqa %%xmm3,%%xmm1 \n" \
-  "pmaddubsw %%xmm9,%%xmm1 \n" \
-  "pmulhuw %%xmm14,%%xmm4 \n" \
-  "pand %%xmm7,%%xmm3 \n" \
-  "paddw %%xmm3,%%xmm2 \n" \
-  "paddw %%xmm4,%%xmm0 \n" \
-  "paddw %%xmm4,%%xmm2 \n" \
+#define YUVTORGB16(yuvconstants) \
+  "psubb %%xmm13,%%xmm3 \n" \
+  "pmulhuw %%xmm11,%%xmm4 \n" \
+  "movdqa %%xmm8,%%xmm0 \n" \
+  "movdqa %%xmm9,%%xmm1 \n" \
+  "movdqa %%xmm10,%%xmm2 \n" \
   "paddw %%xmm12,%%xmm4 \n" \
-  "psubusw %%xmm11,%%xmm0 \n" \
-  "psubusw %%xmm1,%%xmm4 \n" \
-  "psubusw %%xmm13,%%xmm2 \n" \
+  "pmaddubsw %%xmm3,%%xmm0 \n" \
+  "pmaddubsw %%xmm3,%%xmm1 \n" \
+  "pmaddubsw %%xmm3,%%xmm2 \n" \
+  "paddsw %%xmm4,%%xmm0 \n" \
+  "paddsw %%xmm4,%%xmm2 \n" \
+  "psubsw %%xmm1,%%xmm4 \n" \
   "movdqa %%xmm4,%%xmm1 \n"
 
-#define YUVTORGB_REGS \
-  "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
+#define YUVTORGB_REGS "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
 
 #else
 #define YUVTORGB_SETUP(yuvconstants)
 // Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB16(yuvconstants) \
-  "movdqa %%xmm3,%%xmm0 \n" \
-  "movdqa %%xmm3,%%xmm1 \n" \
-  "movdqa %%xmm3,%%xmm2 \n" \
-  "pmaddubsw (%[yuvconstants]),%%xmm0 \n" \
-  "pmaddubsw 64(%[yuvconstants]),%%xmm2 \n" \
-  "psllw $8,%%xmm1 \n" \
-  "pand 256(%[yuvconstants]),%%xmm1 \n" \
-  "paddw %%xmm1,%%xmm0 \n" \
-  "movdqa %%xmm3,%%xmm1 \n" \
-  "pmaddubsw 32(%[yuvconstants]),%%xmm1 \n" \
-  "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \
-  "pand 272(%[yuvconstants]),%%xmm3 \n" \
-  "paddw %%xmm3,%%xmm2 \n" \
-  "movdqa 128(%[yuvconstants]),%%xmm7 \n" \
-  "paddw %%xmm4,%%xmm0 \n" \
-  "paddw %%xmm4,%%xmm2 \n" \
-  "paddw %%xmm7,%%xmm4 \n" \
-  "movdqa 96(%[yuvconstants]),%%xmm7 \n" \
-  "psubusw %%xmm7,%%xmm0 \n" \
-  "psubusw %%xmm1,%%xmm4 \n" \
-  "movdqa 160(%[yuvconstants]),%%xmm7 \n" \
-  "psubusw %%xmm7,%%xmm2 \n" \
-  "movdqa %%xmm4,%%xmm1 \n" \
-
-#define YUVTORGB_REGS "xmm7",
+#define YUVTORGB16(yuvconstants) \
+  "pcmpeqb %%xmm0,%%xmm0 \n" \
+  "pxor %%xmm1,%%xmm1 \n" \
+  "psllw $7,%%xmm0 \n" \
+  "pshufb %%xmm1,%%xmm0 \n" \
+  "psubb %%xmm0,%%xmm3 \n" \
+  "pmulhuw 96(%[yuvconstants]),%%xmm4 \n" \
+  "movdqa (%[yuvconstants]),%%xmm0 \n" \
+  "movdqa 32(%[yuvconstants]),%%xmm1 \n" \
+  "movdqa 64(%[yuvconstants]),%%xmm2 \n" \
+  "pmaddubsw %%xmm3,%%xmm0 \n" \
+  "pmaddubsw %%xmm3,%%xmm1 \n" \
+  "pmaddubsw %%xmm3,%%xmm2 \n" \
+  "movdqa 128(%[yuvconstants]),%%xmm3 \n" \
+  "paddw %%xmm3,%%xmm4 \n" \
+  "paddsw %%xmm4,%%xmm0 \n" \
+  "paddsw %%xmm4,%%xmm2 \n" \
+  "psubsw %%xmm1,%%xmm4 \n" \
+  "movdqa %%xmm4,%%xmm1 \n"
+
+#define YUVTORGB_REGS
 #endif
 
 #define YUVTORGB(yuvconstants) \
   YUVTORGB16(yuvconstants) \
-  "psrlw $0x6,%%xmm0 \n" \
-  "psrlw $0x6,%%xmm1 \n" \
-  "psrlw $0x6,%%xmm2 \n" \
+  "psraw $0x6,%%xmm0 \n" \
+  "psraw $0x6,%%xmm1 \n" \
+  "psraw $0x6,%%xmm2 \n" \
   "packuswb %%xmm0,%%xmm0 \n" \
   "packuswb %%xmm1,%%xmm1 \n" \
   "packuswb %%xmm2,%%xmm2 \n"
@@ -2416,9 +2403,12 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
   "psraw $0x4,%%xmm0 \n" \
   "psraw $0x4,%%xmm1 \n" \
   "psraw $0x4,%%xmm2 \n" \
-  "pminuw %%xmm6,%%xmm0 \n" \
-  "pminuw %%xmm6,%%xmm1 \n" \
-  "pminuw %%xmm6,%%xmm2 \n" \
+  "pminsw %%xmm7,%%xmm0 \n" \
+  "pminsw %%xmm7,%%xmm1 \n" \
+  "pminsw %%xmm7,%%xmm2 \n" \
+  "pmaxsw %%xmm6,%%xmm0 \n" \
+  "pmaxsw %%xmm6,%%xmm1 \n" \
+  "pmaxsw %%xmm6,%%xmm2 \n" \
   "psllw $0x4,%%xmm2 \n" \
   "movdqa %%xmm0,%%xmm3 \n" \
   "punpcklwd %%xmm2,%%xmm0 \n" \
@@ -2588,8 +2578,9 @@ void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
   "pcmpeqb %%xmm5,%%xmm5 \n"  // AR30 constants
   "psrlw $14,%%xmm5 \n"
   "psllw $4,%%xmm5 \n"  // 2 alpha bits
-  "pcmpeqb %%xmm6,%%xmm6 \n"
-  "psrlw $6,%%xmm6 \n"  // 1023 for max
+  "pxor %%xmm6,%%xmm6 \n"  // 0 for min
+  "pcmpeqb %%xmm7,%%xmm7 \n"
+  "psrlw $6,%%xmm7 \n"  // 1023 for max
 
   LABELALIGN
   "1: \n"
@@ -2605,7 +2596,7 @@ void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
    [width]"+rm"(width)  // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
-   "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+   "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
 }
 
@@ -2682,8 +2673,9 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
   "pcmpeqb %%xmm5,%%xmm5 \n"
   "psrlw $14,%%xmm5 \n"
   "psllw $4,%%xmm5 \n"  // 2 alpha bits
-  "pcmpeqb %%xmm6,%%xmm6 \n"
-  "psrlw $6,%%xmm6 \n"  // 1023 for max
+  "pxor %%xmm6,%%xmm6 \n"  // 0 for min
+  "pcmpeqb %%xmm7,%%xmm7 \n"
+  "psrlw $6,%%xmm7 \n"  // 1023 for max
 
   LABELALIGN
   "1: \n"
@@ -2699,7 +2691,7 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
    [width]"+rm"(width)  // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
-   "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+   "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
 }
 
@@ -2716,8 +2708,9 @@ void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf,
   "pcmpeqb %%xmm5,%%xmm5 \n"
   "psrlw $14,%%xmm5 \n"
   "psllw $4,%%xmm5 \n"  // 2 alpha bits
-  "pcmpeqb %%xmm6,%%xmm6 \n"
-  "psrlw $6,%%xmm6 \n"  // 1023 for max
+  "pxor %%xmm6,%%xmm6 \n"  // 0 for min
+  "pcmpeqb %%xmm7,%%xmm7 \n"
+  "psrlw $6,%%xmm7 \n"  // 1023 for max
 
   LABELALIGN
   "1: \n"
@@ -2733,7 +2726,7 @@ void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf,
    [width]"+rm"(width)  // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
-   "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+   "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
 }
 
@@ -2850,8 +2843,9 @@ void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf,
   "pcmpeqb %%xmm5,%%xmm5 \n"
   "psrlw $14,%%xmm5 \n"
   "psllw $4,%%xmm5 \n"  // 2 alpha bits
-  "pcmpeqb %%xmm6,%%xmm6 \n"
-  "psrlw $6,%%xmm6 \n"  // 1023 for max
+  "pxor %%xmm6,%%xmm6 \n"  // 0 for min
+  "pcmpeqb %%xmm7,%%xmm7 \n"
+  "psrlw $6,%%xmm7 \n"  // 1023 for max
 
   LABELALIGN
   "1: \n"
@@ -2867,7 +2861,7 @@ void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf,
    [width]"+rm"(width)  // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
-   "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+   "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
 }
 
@@ -3076,8 +3070,9 @@ void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
   "pcmpeqb %%xmm5,%%xmm5 \n"
   "psrlw $14,%%xmm5 \n"
   "psllw $4,%%xmm5 \n"  // 2 alpha bits
-  "pcmpeqb %%xmm6,%%xmm6 \n"
-  "psrlw $6,%%xmm6 \n"  // 1023 for max
+  "pxor %%xmm6,%%xmm6 \n"  // 0 for min
+  "pcmpeqb %%xmm7,%%xmm7 \n"
+  "psrlw $6,%%xmm7 \n"  // 1023 for max
 
   LABELALIGN
   "1: \n"
@@ -3092,7 +3087,7 @@ void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
    [width]"+rm"(width)  // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
-   "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+   "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
 }
 
@@ -3106,8 +3101,9 @@ void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf,
   "pcmpeqb %%xmm5,%%xmm5 \n"
   "psrlw $14,%%xmm5 \n"
   "psllw $4,%%xmm5 \n"  // 2 alpha bits
-  "pcmpeqb %%xmm6,%%xmm6 \n"
-  "psrlw $6,%%xmm6 \n"  // 1023 for max
+  "pxor %%xmm6,%%xmm6 \n"  // 0 for min
+  "pcmpeqb %%xmm7,%%xmm7 \n"
+  "psrlw $6,%%xmm7 \n"  // 1023 for max
 
   LABELALIGN
   "1: \n"
@@ -3122,7 +3118,7 @@ void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf,
    [width]"+rm"(width)  // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
-   "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+   "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
 }
 
@@ -3360,70 +3356,58 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
 
 #if defined(__x86_64__)
 #define YUVTORGB_SETUP_AVX2(yuvconstants) \
+  "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
   "vmovdqa (%[yuvconstants]),%%ymm8 \n" \
+  "vpsllw $7,%%xmm13,%%xmm13 \n" \
   "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \
+  "vpbroadcastb %%xmm13,%%ymm13 \n" \
   "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \
   "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
-  "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \
-  "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \
-  "vmovdqa 192(%[yuvconstants]),%%ymm14 \n" \
-  "vbroadcastf128 256(%[yuvconstants]),%%ymm15 \n" \
-  "vbroadcastf128 272(%[yuvconstants]),%%ymm7 \n"
+  "vmovdqa 128(%[yuvconstants]),%%ymm12 \n"
 
-// TODO(yuan): Consider signed UV and unsigned coefficient for vpmaddubsw.
 #define YUVTORGB16_AVX2(yuvconstants) \
-  "vpmaddubsw %%ymm8,%%ymm3,%%ymm0 \n" \
-  "vpmaddubsw %%ymm10,%%ymm3,%%ymm2 \n" \
-  "vpsllw $8,%%ymm3,%%ymm1 \n" \
-  "vpand %%ymm1,%%ymm15,%%ymm1 \n" \
-  "vpaddw %%ymm1,%%ymm0,%%ymm0 \n" \
-  "vpmaddubsw %%ymm9,%%ymm3,%%ymm1 \n" \
-  "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
-  "vpand %%ymm3,%%ymm7,%%ymm3 \n" \
-  "vpaddw %%ymm3,%%ymm2,%%ymm2 \n" \
-  "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" \
-  "vpaddw %%ymm4,%%ymm12,%%ymm3 \n" \
-  "vpaddw %%ymm4,%%ymm2,%%ymm2 \n" \
-  "vpsubusw %%ymm11,%%ymm0,%%ymm0 \n" \
-  "vpsubusw %%ymm1,%%ymm3,%%ymm1 \n" \
-  "vpsubusw %%ymm13,%%ymm2,%%ymm2 \n"
+  "vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \
+  "vpmulhuw %%ymm11,%%ymm4,%%ymm4 \n" \
+  "vpmaddubsw %%ymm3,%%ymm8,%%ymm0 \n" \
+  "vpmaddubsw %%ymm3,%%ymm9,%%ymm1 \n" \
+  "vpmaddubsw %%ymm3,%%ymm10,%%ymm2 \n" \
+  "vpaddw %%ymm4,%%ymm12,%%ymm4 \n" \
+  "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
+  "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
+  "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
 
 #define YUVTORGB_REGS_AVX2 \
-  "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
+  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
 
 #else
 // Convert 16 pixels: 16 UV and 16 Y.
 #define YUVTORGB_SETUP_AVX2(yuvconstants)
-#define YUVTORGB16_AVX2(yuvconstants) \
-  "vpmaddubsw (%[yuvconstants]),%%ymm3,%%ymm0 \n" \
-  "vpmaddubsw 64(%[yuvconstants]),%%ymm3,%%ymm2 \n" \
-  "vpsllw $8,%%ymm3,%%ymm1 \n" \
-  "vbroadcastf128 256(%[yuvconstants]),%%ymm7 \n" \
-  "vpand %%ymm7,%%ymm1,%%ymm1 \n" \
-  "vpaddw %%ymm1,%%ymm0,%%ymm0 \n" \
-  "vpmaddubsw 32(%[yuvconstants]),%%ymm3,%%ymm1 \n" \
-  "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
-  "vbroadcastf128 272(%[yuvconstants]),%%ymm7 \n" \
-  "vpand %%ymm7,%%ymm3,%%ymm3 \n" \
-  "vpaddw %%ymm3,%%ymm2,%%ymm2 \n" \
-  "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" \
-  "vmovdqu 128(%[yuvconstants]),%%ymm7 \n" \
-  "vpaddw %%ymm4,%%ymm7,%%ymm3 \n" \
-  "vpaddw %%ymm4,%%ymm2,%%ymm2 \n" \
-  "vmovdqu 96(%[yuvconstants]),%%ymm7 \n" \
-  "vpsubusw %%ymm7,%%ymm0,%%ymm0 \n" \
-  "vpsubusw %%ymm1,%%ymm3,%%ymm1 \n" \
-  "vmovdqu 160(%[yuvconstants]),%%ymm7 \n" \
-  "vpsubusw %%ymm7,%%ymm2,%%ymm2 \n"
-
-#define YUVTORGB_REGS_AVX2 "xmm7",
+#define YUVTORGB16_AVX2(yuvconstants) \
+  "vpcmpeqb %%xmm0,%%xmm0,%%xmm0 \n" \
+  "vpsllw $7,%%xmm0,%%xmm0 \n" \
+  "vpbroadcastb %%xmm0,%%ymm0 \n" \
+  "vpsubb %%ymm0,%%ymm3,%%ymm3 \n" \
+  "vpmulhuw 96(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
+  "vmovdqa (%[yuvconstants]),%%ymm0 \n" \
+  "vmovdqa 32(%[yuvconstants]),%%ymm1 \n" \
+  "vmovdqa 64(%[yuvconstants]),%%ymm2 \n" \
+  "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" \
+  "vpmaddubsw %%ymm3,%%ymm1,%%ymm1 \n" \
+  "vpmaddubsw %%ymm3,%%ymm2,%%ymm2 \n" \
+  "vmovdqa 128(%[yuvconstants]),%%ymm3 \n" \
+  "vpaddw %%ymm4,%%ymm3,%%ymm4 \n" \
+  "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
+  "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
+  "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
+
+#define YUVTORGB_REGS_AVX2
 #endif
 
 #define YUVTORGB_AVX2(yuvconstants) \
   YUVTORGB16_AVX2(yuvconstants) \
-  "vpsrlw $0x6,%%ymm0,%%ymm0 \n" \
-  "vpsrlw $0x6,%%ymm1,%%ymm1 \n" \
-  "vpsrlw $0x6,%%ymm2,%%ymm2 \n" \
+  "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
+  "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
+  "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
   "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
   "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
   "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
@@ -3438,16 +3422,19 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
   "vmovdqu %%ymm1,(%[dst_argb]) \n" \
   "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \
-  "lea 0x40(%[dst_argb]), %[dst_argb] \n"
+  "lea 0x40(%[dst_argb]), %[dst_argb] \n"
 
 // Store 16 AR30 values.
 #define STOREAR30_AVX2 \
-  "vpsrlw $0x4,%%ymm0,%%ymm0 \n" \
-  "vpsrlw $0x4,%%ymm1,%%ymm1 \n" \
-  "vpsrlw $0x4,%%ymm2,%%ymm2 \n" \
-  "vpminuw %%ymm6,%%ymm0,%%ymm0 \n" \
-  "vpminuw %%ymm6,%%ymm1,%%ymm1 \n" \
-  "vpminuw %%ymm6,%%ymm2,%%ymm2 \n" \
+  "vpsraw $0x4,%%ymm0,%%ymm0 \n" \
+  "vpsraw $0x4,%%ymm1,%%ymm1 \n" \
+  "vpsraw $0x4,%%ymm2,%%ymm2 \n" \
+  "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \
+  "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \
+  "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \
+  "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \
+  "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \
+  "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \
   "vpsllw $0x4,%%ymm2,%%ymm2 \n" \
   "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
   "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
@@ -3548,8 +3535,9 @@ void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
   "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
   "vpsrlw $14,%%ymm5,%%ymm5 \n"
   "vpsllw $4,%%ymm5,%%ymm5 \n"  // 2 alpha bits
-  "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n"  // 1023 for max
-  "vpsrlw $6,%%ymm6,%%ymm6 \n"
+  "vpxor %%ymm6,%%ymm6,%%ymm6 \n"  // 0 for min
+  "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"  // 1023 for max
+  "vpsrlw $6,%%ymm7,%%ymm7 \n"
 
   LABELALIGN
   "1: \n"
@@ -3567,7 +3555,7 @@ void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
    [width]"+rm"(width)  // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
-   "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+   "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
 }
 #endif  // HAS_I422TOAR30ROW_AVX2
@@ -3657,8 +3645,9 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
   "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
   "vpsrlw $14,%%ymm5,%%ymm5 \n"
   "vpsllw $4,%%ymm5,%%ymm5 \n"  // 2 alpha bits
-  "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n"  // 1023 for max
-  "vpsrlw $6,%%ymm6,%%ymm6 \n"
+  "vpxor %%ymm6,%%ymm6,%%ymm6 \n"  // 0 for min
+  "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"  // 1023 for max
+  "vpsrlw $6,%%ymm7,%%ymm7 \n"
 
   LABELALIGN
   "1: \n"
@@ -3676,7 +3665,7 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
    [width]"+rm"(width)  // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
-   "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+   "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
 }
 #endif  // HAS_I210TOAR30ROW_AVX2
@@ -3696,8 +3685,9 @@ void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf,
   "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
   "vpsrlw $14,%%ymm5,%%ymm5 \n"
   "vpsllw $4,%%ymm5,%%ymm5 \n"  // 2 alpha bits
-  "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n"  // 1023 for max
-  "vpsrlw $6,%%ymm6,%%ymm6 \n"
+  "vpxor %%ymm6,%%ymm6,%%ymm6 \n"  // 0 for min
+  "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"  // 1023 for max
+  "vpsrlw $6,%%ymm7,%%ymm7 \n"
 
   LABELALIGN
   "1: \n"
@@ -3715,7 +3705,7 @@ void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf,
    [width]"+rm"(width)  // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
-   "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+   "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
 }
 #endif  // HAS_I212TOAR30ROW_AVX2
@@ -3842,8 +3832,9 @@ void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf,
   "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
   "vpsrlw $14,%%ymm5,%%ymm5 \n"
   "vpsllw $4,%%ymm5,%%ymm5 \n"  // 2 alpha bits
-  "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n"  // 1023 for max
-  "vpsrlw $6,%%ymm6,%%ymm6 \n"
+  "vpxor %%ymm6,%%ymm6,%%ymm6 \n"  // 0 for min
+  "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"  // 1023 for max
+  "vpsrlw $6,%%ymm7,%%ymm7 \n"
 
   LABELALIGN
   "1: \n"
@@ -3861,7 +3852,7 @@ void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf,
    [width]"+rm"(width)  // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
-   "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+   "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
 }
 #endif  // HAS_I410TOAR30ROW_AVX2
@@ -4204,8 +4195,9 @@ void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf,
   "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
   "vpsrlw $14,%%ymm5,%%ymm5 \n"
   "vpsllw $4,%%ymm5,%%ymm5 \n"  // 2 alpha bits
-  "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n"  // 1023 for max
-  "vpsrlw $6,%%ymm6,%%ymm6 \n"
+  "vpxor %%ymm6,%%ymm6,%%ymm6 \n"  // 0 for min
+  "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"  // 1023 for max
+  "vpsrlw $6,%%ymm7,%%ymm7 \n"
 
   LABELALIGN
   "1: \n"
@@ -4240,8 +4232,9 @@ void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf,
   "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
   "vpsrlw $14,%%ymm5,%%ymm5 \n"
   "vpsllw $4,%%ymm5,%%ymm5 \n"  // 2 alpha bits
-  "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n"  // 1023 for max
-  "vpsrlw $6,%%ymm6,%%ymm6 \n"
+  "vpxor %%ymm6,%%ymm6,%%ymm6 \n"  // 0 for min
+  "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"  // 1023 for max
+  "vpsrlw $6,%%ymm7,%%ymm7 \n"
 
   LABELALIGN
   "1: \n"
@@ -4269,8 +4262,8 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf,
                         const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile(
-      "movdqa 192(%3),%%xmm2 \n"  // yg = 18997 = 1.164
-      "movdqa 224(%3),%%xmm3 \n"  // ygb = 1160 = 1.164 * 16
+      "movdqa 96(%3),%%xmm2 \n"  // yg = 18997 = 1.164
+      "movdqa 128(%3),%%xmm3 \n"  // ygb = 1160 = 1.164 * 16
       "pcmpeqb %%xmm4,%%xmm4 \n"  // 0xff000000
      "pslld $0x18,%%xmm4 \n"
@@ -4314,8 +4307,8 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf,
                         const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile(
-      "vmovdqa 192(%3),%%ymm2 \n"  // yg = 18997 = 1.164
-      "vmovdqa 224(%3),%%ymm3 \n"  // ygb = -1160 = 1.164*16
+      "vmovdqa 96(%3),%%ymm2 \n"  // yg = 18997 = 1.164
+      "vmovdqa 128(%3),%%ymm3 \n"  // ygb = -1160 = 1.164*16
       "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"  // 0xff000000
       "vpslld $0x18,%%ymm4,%%ymm4 \n"
diff --git a/source/row_win.cc b/source/row_win.cc
index 8d633c3f..2c3241cf 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -75,28 +75,18 @@ extern "C" {
 
 // Convert 8 pixels: 8 UV and 8 Y.
 #define YUVTORGB(yuvconstants) \
-  xmm0 = _mm_loadu_si128(&xmm3); \
-  xmm1 = _mm_loadu_si128(&xmm3); \
-  xmm2 = _mm_loadu_si128(&xmm3); \
-  xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
-  xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
-  xmm1 = _mm_slli_epi16(xmm1, 8); \
-  xmm1 = _mm_and_si128(xmm1, *(__m128i*)yuvconstants->kUVMaskBR); \
-  xmm0 = _mm_add_epi16(xmm0, xmm1); \
-  xmm1 = _mm_loadu_si128(&xmm3); \
-  xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
+  xmm3 = _mm_sub_epi8(xmm3, _mm_set1_epi8(0x80)); \
   xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
-  xmm3 = _mm_and_si128(xmm3, *((__m128i*)(yuvconstants->kUVMaskBR) + 1)); \
-  xmm2 = _mm_add_epi16(xmm2, xmm3); \
-  xmm0 = _mm_add_epi16(xmm0, xmm4); \
-  xmm2 = _mm_add_epi16(xmm2, xmm4); \
-  xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kUVBiasG); \
-  xmm0 = _mm_subs_epu16(xmm0, *(__m128i*)yuvconstants->kUVBiasB); \
-  xmm1 = _mm_subs_epu16(xmm4, xmm1); \
-  xmm2 = _mm_subs_epu16(xmm2, *(__m128i*)yuvconstants->kUVBiasR); \
-  xmm0 = _mm_srli_epi16(xmm0, 6); \
-  xmm1 = _mm_srli_epi16(xmm1, 6); \
-  xmm2 = _mm_srli_epi16(xmm2, 6); \
+  xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kYBiasToRgb); \
+  xmm0 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToB, xmm3); \
+  xmm1 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToG, xmm3); \
+  xmm2 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToR, xmm3); \
+  xmm0 = _mm_adds_epi16(xmm4, xmm0); \
+  xmm1 = _mm_subs_epi16(xmm4, xmm1); \
+  xmm2 = _mm_adds_epi16(xmm4, xmm2); \
+  xmm0 = _mm_srai_epi16(xmm0, 6); \
+  xmm1 = _mm_srai_epi16(xmm1, 6); \
+  xmm2 = _mm_srai_epi16(xmm2, 6); \
   xmm0 = _mm_packus_epi16(xmm0, xmm0); \
   xmm1 = _mm_packus_epi16(xmm1, xmm1); \
   xmm2 = _mm_packus_epi16(xmm2, xmm2);
@@ -254,11 +244,11 @@ static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
 // 7 bit fixed point 0.5.
 static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
 
-static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
-                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-
-static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
-                                  0x8080u, 0x8080u, 0x8080u, 0x8080u};
+// 8 bit fixed point 0.5, for bias of UV.
+static const ulvec8 kBiasUV128 = {
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
 
 // Shuffle table for converting RGB24 to ARGB.
 static const uvec8 kShuffleMaskRGB24ToARGB = {
@@ -1447,7 +1437,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
     mov edx, [esp + 8 + 12]  // dst_u
     mov edi, [esp + 8 + 16]  // dst_v
     mov ecx, [esp + 8 + 20]  // width
-    movdqa xmm5, xmmword ptr kAddUV128
+    movdqa xmm5, xmmword ptr kBiasUV128
     movdqa xmm6, xmmword ptr kARGBToV
     movdqa xmm7, xmmword ptr kARGBToU
     sub edi, edx  // stride from u to v
@@ -1519,7 +1509,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
     mov edx, [esp + 8 + 12]  // dst_u
     mov edi, [esp + 8 + 16]  // dst_v
     mov ecx, [esp + 8 + 20]  // width
-    movdqa xmm5, xmmword ptr kAddUVJ128
+    movdqa xmm5, xmmword ptr kBiasUV128
     movdqa xmm6, xmmword ptr kARGBToVJ
     movdqa xmm7, xmmword ptr kARGBToUJ
     sub edi, edx  // stride from u to v
@@ -1593,7 +1583,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb,
     mov edx, [esp + 8 + 12]  // dst_u
     mov edi, [esp + 8 + 16]  // dst_v
     mov ecx, [esp + 8 + 20]  // width
-    vbroadcastf128 ymm5, xmmword ptr kAddUV128
+    vbroadcastf128 ymm5, xmmword ptr kBiasUV128
     vbroadcastf128 ymm6, xmmword ptr kARGBToV
     vbroadcastf128 ymm7, xmmword ptr kARGBToU
     sub edi, edx  // stride from u to v
@@ -1661,7 +1651,7 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
     mov edx, [esp + 8 + 12]  // dst_u
     mov edi, [esp + 8 + 16]  // dst_v
     mov ecx, [esp + 8 + 20]  // width
-    vbroadcastf128 ymm5, xmmword ptr kAddUVJ128
+    vbroadcastf128 ymm5, xmmword ptr kBiasUV128
     vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
     vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
     sub edi, edx  // stride from u to v
@@ -1726,7 +1716,7 @@ __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
     mov edx, [esp + 4 + 8]  // dst_u
     mov edi, [esp + 4 + 12]  // dst_v
     mov ecx, [esp + 4 + 16]  // width
-    movdqa xmm5, xmmword ptr kAddUV128
+    movdqa xmm5, xmmword ptr kBiasUV128
     movdqa xmm6, xmmword ptr kARGBToV
     movdqa xmm7, xmmword ptr kARGBToU
     sub edi, edx  // stride from u to v
@@ -1787,7 +1777,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb,
     mov edx, [esp + 8 + 12]  // dst_u
     mov edi, [esp + 8 + 16]  // dst_v
     mov ecx, [esp + 8 + 20]  // width
-    movdqa xmm5, xmmword ptr kAddUV128
+    movdqa xmm5, xmmword ptr kBiasUV128
     movdqa xmm6, xmmword ptr kBGRAToV
     movdqa xmm7, xmmword ptr kBGRAToU
     sub edi, edx  // stride from u to v
@@ -1859,7 +1849,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb,
     mov edx, [esp + 8 + 12]  // dst_u
     mov edi, [esp + 8 + 16]  // dst_v
     mov ecx, [esp + 8 + 20]  // width
-    movdqa xmm5, xmmword ptr kAddUV128
+    movdqa xmm5, xmmword ptr kBiasUV128
     movdqa xmm6, xmmword ptr kABGRToV
     movdqa xmm7, xmmword ptr kABGRToU
     sub edi, edx  // stride from u to v
@@ -1931,7 +1921,7 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb,
     mov edx, [esp + 8 + 12]  // dst_u
     mov edi, [esp + 8 + 16]  // dst_v
     mov ecx, [esp + 8 + 20]  // width
-    movdqa xmm5, xmmword ptr kAddUV128
+    movdqa xmm5, xmmword ptr kBiasUV128
     movdqa xmm6, xmmword ptr kRGBAToV
     movdqa xmm7, xmmword ptr kRGBAToU
     sub edi, edx  // stride from u to v
@@ -2097,33 +2087,26 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb,
 
 // Convert 16 pixels: 16 UV and 16 Y.
 #define YUVTORGB_AVX2(YuvConstants) \
-  __asm { \
-    __asm vpmaddubsw ymm0, ymm3, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
-    __asm vpmaddubsw ymm2, ymm3, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
-    __asm vpsllw ymm1, ymm3, 8 \
-    __asm vbroadcastf128 ymm6, xmmword ptr [YuvConstants + KUMASKB] \
-    __asm vpand ymm1, ymm1, ymm6 \
-    __asm vpaddw ymm0, ymm0, ymm1 \
-    __asm vpmaddubsw ymm1, ymm3, ymmword ptr [YuvConstants + KUVTOG] /* B UV */\
+  __asm { \
+    __asm vpsubb ymm3, ymm3, ymmword ptr kBiasUV128 \
     __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
-    __asm vbroadcastf128 ymm6, xmmword ptr [YuvConstants + KVMASKR] \
-    __asm vpand ymm3, ymm3, ymm6 \
-    __asm vpaddw ymm2, ymm2, ymm3 \
-    __asm vpaddw ymm0, ymm0, ymm4 \
-    __asm vmovdqu ymm6, ymmword ptr [YuvConstants + KUVBIASG] \
-    __asm vpaddw ymm3, ymm4, ymm6 \
-    __asm vpaddw ymm2, ymm2, ymm4 \
-    __asm vmovdqu ymm6, ymmword ptr [YuvConstants + KUVBIASB] \
-    __asm vpsubusw ymm0, ymm0, ymm6 \
-    __asm vpsubusw ymm1, ymm3, ymm1 \
-    __asm vmovdqu ymm6, ymmword ptr [YuvConstants + KUVBIASR] \
-    __asm vpsubusw ymm2, ymm2, ymm6 \
-    __asm vpsrlw ymm0, ymm0, 6 \
-    __asm vpsrlw ymm1, ymm1, 6 \
-    __asm vpsrlw ymm2, ymm2, 6 \
-    __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
-    __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
-    __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
+    __asm vmovdqa ymm0, ymmword ptr [YuvConstants + KUVTOB] \
+    __asm vmovdqa ymm1, ymmword ptr [YuvConstants + KUVTOG] \
+    __asm vmovdqa ymm2, ymmword ptr [YuvConstants + KUVTOR] \
+    __asm vpmaddubsw ymm0, ymm0, ymm3 /* B UV */ \
+    __asm vpmaddubsw ymm1, ymm1, ymm3 /* G UV */ \
+    __asm vpmaddubsw ymm2, ymm2, ymm3 /* R UV */ \
+    __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KYBIASTORGB] \
+    __asm vpaddw ymm4, ymm3, ymm4 \
+    __asm vpaddsw ymm0, ymm0, ymm4 \
+    __asm vpsubsw ymm1, ymm4, ymm1 \
+    __asm vpaddsw ymm2, ymm2, ymm4 \
+    __asm vpsraw ymm0, ymm0, 6 \
+    __asm vpsraw ymm1, ymm1, 6 \
+    __asm vpsraw ymm2, ymm2, 6 \
+    __asm vpackuswb ymm0, ymm0, ymm0 \
+    __asm vpackuswb ymm1, ymm1, ymm1 \
+    __asm vpackuswb ymm2, ymm2, ymm2 \
   }
 
 // Store 16 ARGB values.
@@ -2582,34 +2565,27 @@ __declspec(naked) void I422ToRGBARow_AVX2(
 
 // Convert 8 pixels: 8 UV and 8 Y.
 #define YUVTORGB(YuvConstants) \
-  __asm { \
-    __asm movdqa xmm0, xmm3 \
-    __asm movdqa xmm1, xmm3 \
-    __asm movdqa xmm2, xmm3 \
-    __asm pmaddubsw xmm0, xmmword ptr [YuvConstants + KUVTOB] \
-    __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOR] \
-    __asm psllw xmm1, 8 \
-    __asm pand xmm1, xmmword ptr [YuvConstants + KUMASKB] \
-    __asm paddw xmm0, xmm1 \
-    __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOG] \
+  __asm { \
+    __asm psubb xmm3, xmmword ptr kBiasUV128 \
     __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
-    __asm pand xmm3, xmmword ptr [YuvConstants + KVMASKR] \
-    __asm paddw xmm0, xmm4 \
-    __asm movdqa xmm6, xmmword ptr [YuvConstants + KUVBIASG] \
-    __asm paddw xmm2, xmm4 \
-    __asm paddw xmm4, xmm6 \
-    __asm movdqa xmm6, xmmword ptr [YuvConstants + KUVBIASG] \
-    __asm psubusw xmm0, xmm6 \
-    __asm psubusw xmm4, xmm1 \
-    __asm movdqa xmm6, xmmword ptr [YuvConstants + KUVBIASG] \
-    __asm psubusw xmm2, xmm6 \
+    __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVTOB] \
+    __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVTOG] \
+    __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVTOR] \
+    __asm pmaddubsw xmm0, xmm3 \
+    __asm pmaddubsw xmm1, xmm3 \
+    __asm pmaddubsw xmm2, xmm3 \
+    __asm movdqa xmm3, xmmword ptr [YuvConstants + KYBIASTORGB] \
+    __asm paddw xmm4, xmm3 \
+    __asm paddsw xmm0, xmm4 \
+    __asm paddsw xmm2, xmm4 \
+    __asm psubsw xmm4, xmm1 \
     __asm movdqa xmm1, xmm4 \
-    __asm psrlw xmm0, 6 \
-    __asm psrlw xmm1, 6 \
-    __asm psrlw xmm2, 6 \
-    __asm packuswb xmm0, xmm0 /* B */ \
-    __asm packuswb xmm1, xmm1 /* G */ \
-    __asm packuswb xmm2, xmm2 /* R */ \
+    __asm psraw xmm0, 6 \
+    __asm psraw xmm1, 6 \
+    __asm psraw xmm2, 6 \
+    __asm packuswb xmm0, xmm0 /* B */ \
+    __asm packuswb xmm1, xmm1 /* G */ \
+    __asm packuswb xmm2, xmm2 /* R */ \
  }
 
 // Store 8 ARGB values.
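
Restated with intrinsics, the converted kernel is compact. This sketch mirrors the new YUVTORGB macro above; the YuvCoeffs struct here is a stand-in for the relevant YuvConstants fields, not the library's actual type:

#include <emmintrin.h>  // SSE2
#include <tmmintrin.h>  // SSSE3: _mm_maddubs_epi16

struct YuvCoeffs {  // stand-in for the trimmed YuvConstants layout
  __m128i kUVToB, kUVToG, kUVToR, kYToRgb, kYBiasToRgb;
};

// uv holds 8 interleaved U,V byte pairs; y16 holds 8 Y samples widened to
// 16 bits. Outputs are 8 bytes of B, G and R, each splatted twice.
static inline void YuvToRgb8(__m128i uv, __m128i y16, const YuvCoeffs& c,
                             __m128i* b, __m128i* g, __m128i* r) {
  uv = _mm_sub_epi8(uv, _mm_set1_epi8((char)0x80));  // bias U/V to signed
  __m128i y = _mm_mulhi_epu16(y16, c.kYToRgb);       // (y * yg) >> 16
  y = _mm_add_epi16(y, c.kYBiasToRgb);               // + yb
  __m128i b16 = _mm_adds_epi16(y, _mm_maddubs_epi16(c.kUVToB, uv));
  __m128i g16 = _mm_subs_epi16(y, _mm_maddubs_epi16(c.kUVToG, uv));
  __m128i r16 = _mm_adds_epi16(y, _mm_maddubs_epi16(c.kUVToR, uv));
  b16 = _mm_srai_epi16(b16, 6);  // arithmetic shift keeps negatives negative
  g16 = _mm_srai_epi16(g16, 6);
  r16 = _mm_srai_epi16(r16, 6);
  *b = _mm_packus_epi16(b16, b16);  // signed->unsigned pack clamps to 0..255
  *g = _mm_packus_epi16(g16, g16);
  *r = _mm_packus_epi16(r16, r16);
}

Because the intermediates are now signed, the arithmetic shift plus unsigned pack does the clamping that the old code approximated with psubusw on biased unsigned values.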
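
The same signedness is why the AR30 stores grew a second clamp: after psraw an out-of-range pixel can be negative, so the 10-bit range needs a signed max against 0 (the new xmm6/ymm6 zero register) in addition to the signed min against 1023 (xmm7/ymm7). A sketch of that step:

#include <emmintrin.h>

// Clamp 16-bit channel values to the 10-bit AR30 range [0, 1023].
static inline __m128i ClampTo10Bit(__m128i v) {
  v = _mm_min_epi16(v, _mm_set1_epi16(1023));  // asm builds 1023 via pcmpeqb/psrlw
  v = _mm_max_epi16(v, _mm_setzero_si128());   // negatives become 0
  return v;
}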