Diffstat (limited to 'files/source/row_win.cc')
-rw-r--r--  files/source/row_win.cc  983
1 file changed, 438 insertions(+), 545 deletions(-)
diff --git a/files/source/row_win.cc b/files/source/row_win.cc
index 202f2b8d..27e3da7b 100644
--- a/files/source/row_win.cc
+++ b/files/source/row_win.cc
@@ -28,27 +28,27 @@ extern "C" {
#if defined(_M_X64)
// Read 4 UV from 422, upsample to 8 UV.
-#define READYUV422 \
- xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
- xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
- u_buf += 4; \
- xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
- xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
+#define READYUV422 \
+ xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
+ xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
+ u_buf += 4; \
+ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
+ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
y_buf += 8;
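For reference, a scalar sketch of what READYUV422 computes (a hypothetical equivalent, not part of this patch): each of the 4 UV pairs is duplicated to cover two pixels, and each Y byte is widened to 16 bits by unpacking with itself.

    // Assumes uv[16] and y16[8] scratch arrays; offset == v_buf - u_buf.
    for (int i = 0; i < 4; ++i) {
      uv[4 * i + 0] = uv[4 * i + 2] = u_buf[i];           // U covers 2 pixels
      uv[4 * i + 1] = uv[4 * i + 3] = u_buf[i + offset];  // V covers 2 pixels
    }
    for (int i = 0; i < 8; ++i) {
      y16[i] = y_buf[i] * 0x0101;  // unpacklo_epi8(y, y): Y in both bytes
    }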
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
-#define READYUVA422 \
- xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
- xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
- u_buf += 4; \
- xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
- xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
- y_buf += 8; \
- xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
+#define READYUVA422 \
+ xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
+ xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
+ u_buf += 4; \
+ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
+ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
+ y_buf += 8; \
+ xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
a_buf += 8;
// Convert 8 pixels: 8 UV and 8 Y.
@@ -84,15 +84,15 @@ extern "C" {
dst_argb += 32;
#if defined(HAS_I422TOARGBROW_SSSE3)
-void I422ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
+void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__m128i xmm0, xmm1, xmm2, xmm4;
const __m128i xmm5 = _mm_set1_epi8(-1);
- const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+ const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
while (width > 0) {
READYUV422
YUVTORGB(yuvconstants)
@@ -103,15 +103,15 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
#endif
#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
-void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
+void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__m128i xmm0, xmm1, xmm2, xmm4, xmm5;
- const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+ const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
while (width > 0) {
READYUVA422
YUVTORGB(yuvconstants)
@@ -255,8 +255,8 @@ static const lvec8 kShuffleNV21 = {
};
// Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked) void J400ToARGBRow_SSE2(const uint8* src_y,
- uint8* dst_argb,
+__declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, [esp + 4] // src_y
@@ -285,8 +285,8 @@ __declspec(naked) void J400ToARGBRow_SSE2(const uint8* src_y,
#ifdef HAS_J400TOARGBROW_AVX2
// Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked) void J400ToARGBRow_AVX2(const uint8* src_y,
- uint8* dst_argb,
+__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, [esp + 4] // src_y
@@ -316,8 +316,8 @@ __declspec(naked) void J400ToARGBRow_AVX2(const uint8* src_y,
}
#endif // HAS_J400TOARGBROW_AVX2
-__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24,
- uint8* dst_argb,
+__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, [esp + 4] // src_rgb24
@@ -355,8 +355,8 @@ __declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24,
}
}
-__declspec(naked) void RAWToARGBRow_SSSE3(const uint8* src_raw,
- uint8* dst_argb,
+__declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, [esp + 4] // src_raw
@@ -394,8 +394,8 @@ __declspec(naked) void RAWToARGBRow_SSSE3(const uint8* src_raw,
}
}
-__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8* src_raw,
- uint8* dst_rgb24,
+__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
+ uint8_t* dst_rgb24,
int width) {
__asm {
mov eax, [esp + 4] // src_raw
@@ -430,8 +430,8 @@ __declspec(naked) void RAWToRGB24Row_SSSE3(const uint8* src_raw,
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
-__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8* src_rgb565,
- uint8* dst_argb,
+__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, 0x01080108 // generate multiplier to repeat 5 bits
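The multiplier trick the comment describes: for a 5-bit channel v, v * 0x0108 = (v << 8) | (v << 3), and shifting the product right by 5 yields (v << 3) | (v >> 2), the standard bit-replicating expansion to 8 bits. A scalar sketch (hypothetical, not part of this patch):

    uint8_t Expand5To8(uint8_t v) {         // v in [0, 31]
      return (uint8_t)((v * 0x0108) >> 5);  // == (v << 3) | (v >> 2)
    }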
@@ -486,8 +486,8 @@ __declspec(naked) void RGB565ToARGBRow_SSE2(const uint8* src_rgb565,
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
-__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8* src_rgb565,
- uint8* dst_argb,
+__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, 0x01080108 // generate multiplier to repeat 5 bits
@@ -537,8 +537,8 @@ __declspec(naked) void RGB565ToARGBRow_AVX2(const uint8* src_rgb565,
#endif // HAS_RGB565TOARGBROW_AVX2
#ifdef HAS_ARGB1555TOARGBROW_AVX2
-__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555,
- uint8* dst_argb,
+__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, 0x01080108 // generate multiplier to repeat 5 bits
@@ -589,8 +589,8 @@ __declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555,
#endif // HAS_ARGB1555TOARGBROW_AVX2
#ifdef HAS_ARGB4444TOARGBROW_AVX2
-__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444,
- uint8* dst_argb,
+__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
@@ -627,8 +627,8 @@ __declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444,
#endif // HAS_ARGB4444TOARGBROW_AVX2
// 24 instructions
-__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555,
- uint8* dst_argb,
+__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, 0x01080108 // generate multiplier to repeat 5 bits
@@ -680,8 +680,8 @@ __declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555,
}
// 18 instructions.
-__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444,
- uint8* dst_argb,
+__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
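The 4-bit case is simpler: after masking with 0x0f0f0f0f, each nibble v expands to 8 bits as v | (v << 4), i.e. v * 17, so 0x0 maps to 0x00 and 0xf to 0xff. A scalar sketch (hypothetical, not part of this patch):

    uint8_t Expand4To8(uint8_t v) {  // v in [0, 15]
      return (uint8_t)(v * 17);      // == v | (v << 4)
    }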
@@ -718,8 +718,8 @@ __declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444,
}
}
-__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8* src_argb,
- uint8* dst_rgb,
+__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -757,8 +757,8 @@ __declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8* src_argb,
}
}
-__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8* src_argb,
- uint8* dst_rgb,
+__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -796,8 +796,8 @@ __declspec(naked) void ARGBToRAWRow_SSSE3(const uint8* src_argb,
}
}
-__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8* src_argb,
- uint8* dst_rgb,
+__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -834,9 +834,9 @@ __declspec(naked) void ARGBToRGB565Row_SSE2(const uint8* src_argb,
}
}
-__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb,
- uint8* dst_rgb,
- const uint32 dither4,
+__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
int width) {
__asm {
@@ -881,9 +881,9 @@ __declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb,
}
#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
-__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb,
- uint8* dst_rgb,
- const uint32 dither4,
+__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -925,8 +925,8 @@ __declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb,
#endif // HAS_ARGBTORGB565DITHERROW_AVX2
// TODO(fbarchard): Improve sign extension/packing.
-__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8* src_argb,
- uint8* dst_rgb,
+__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -967,8 +967,8 @@ __declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8* src_argb,
}
}
-__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8* src_argb,
- uint8* dst_rgb,
+__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -998,8 +998,8 @@ __declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8* src_argb,
}
#ifdef HAS_ARGBTORGB565ROW_AVX2
-__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8* src_argb,
- uint8* dst_rgb,
+__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -1036,8 +1036,8 @@ __declspec(naked) void ARGBToRGB565Row_AVX2(const uint8* src_argb,
#endif // HAS_ARGBTORGB565ROW_AVX2
#ifdef HAS_ARGBTOARGB1555ROW_AVX2
-__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8* src_argb,
- uint8* dst_rgb,
+__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -1077,8 +1077,8 @@ __declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8* src_argb,
#endif // HAS_ARGBTOARGB1555ROW_AVX2
#ifdef HAS_ARGBTOARGB4444ROW_AVX2
-__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8* src_argb,
- uint8* dst_rgb,
+__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -1109,8 +1109,8 @@ __declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8* src_argb,
#endif // HAS_ARGBTOARGB4444ROW_AVX2
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-__declspec(naked) void ARGBToYRow_SSSE3(const uint8* src_argb,
- uint8* dst_y,
+__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_y,
int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -1145,8 +1145,8 @@ __declspec(naked) void ARGBToYRow_SSSE3(const uint8* src_argb,
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
-__declspec(naked) void ARGBToYJRow_SSSE3(const uint8* src_argb,
- uint8* dst_y,
+__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_y,
int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
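For reference, scalar forms of the two luma variants (constants as used in libyuv's C reference code; treat the exact values here as an assumption, not part of this patch):

    static uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);  // +16 bias
    }
    static uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)((77 * r + 150 * g + 29 * b + 128) >> 8);  // full range
    }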
@@ -1185,8 +1185,8 @@ __declspec(naked) void ARGBToYJRow_SSSE3(const uint8* src_argb,
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked) void ARGBToYRow_AVX2(const uint8* src_argb,
- uint8* dst_y,
+__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_y,
int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -1225,8 +1225,8 @@ __declspec(naked) void ARGBToYRow_AVX2(const uint8* src_argb,
#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked) void ARGBToYJRow_AVX2(const uint8* src_argb,
- uint8* dst_y,
+__declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_y,
int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -1265,8 +1265,8 @@ __declspec(naked) void ARGBToYJRow_AVX2(const uint8* src_argb,
}
#endif // HAS_ARGBTOYJROW_AVX2
-__declspec(naked) void BGRAToYRow_SSSE3(const uint8* src_argb,
- uint8* dst_y,
+__declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_y,
int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -1299,8 +1299,8 @@ __declspec(naked) void BGRAToYRow_SSSE3(const uint8* src_argb,
}
}
-__declspec(naked) void ABGRToYRow_SSSE3(const uint8* src_argb,
- uint8* dst_y,
+__declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_y,
int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -1333,8 +1333,8 @@ __declspec(naked) void ABGRToYRow_SSSE3(const uint8* src_argb,
}
}
-__declspec(naked) void RGBAToYRow_SSSE3(const uint8* src_argb,
- uint8* dst_y,
+__declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_y,
int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -1367,10 +1367,10 @@ __declspec(naked) void RGBAToYRow_SSSE3(const uint8* src_argb,
}
}
-__declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0,
+__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push esi
@@ -1410,9 +1410,9 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0,
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
@@ -1426,7 +1426,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0,
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
- // step 3 - store 8 U and 8 V values
+ // step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
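For reference, a scalar sketch of steps 1-3 for one U sample (a hypothetical helper, not part of this patch; BT.601-style constants assumed, and pavgb's two rounded halvings approximated by one rounded divide by 4):

    static uint8_t BlockToU(const uint8_t* row0, const uint8_t* row1) {
      // step 1: average a 2x2 block per channel (ARGB is B,G,R,A in memory)
      int b = (row0[0] + row0[4] + row1[0] + row1[4] + 2) >> 2;
      int g = (row0[1] + row0[5] + row1[1] + row1[5] + 2) >> 2;
      int r = (row0[2] + row0[6] + row1[2] + row1[6] + 2) >> 2;
      // steps 2-3: weighted sum, 0x8080 adds the 0x80 bias to make it unsigned
      return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
    }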
@@ -1439,10 +1439,10 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0,
}
}
-__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
+__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push esi
@@ -1482,9 +1482,9 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
@@ -1499,7 +1499,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
psraw xmm1, 8
packsswb xmm0, xmm1
- // step 3 - store 8 U and 8 V values
+ // step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
@@ -1513,10 +1513,10 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
}
#ifdef HAS_ARGBTOUVROW_AVX2
-__declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0,
+__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push esi
@@ -1549,9 +1549,9 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0,
vshufps ymm2, ymm2, ymm3, 0xdd
vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 32 different pixels, its 16 pixels of U and 16 of V
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 32 different pixels, it's 16 pixels of U and 16 of V
vpmaddubsw ymm1, ymm0, ymm7 // U
vpmaddubsw ymm3, ymm2, ymm7
vpmaddubsw ymm0, ymm0, ymm6 // V
@@ -1565,7 +1565,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0,
vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
vpaddb ymm0, ymm0, ymm5 // -> unsigned
- // step 3 - store 16 U and 16 V values
+ // step 3 - store 16 U and 16 V values
vextractf128 [edx], ymm0, 0 // U
vextractf128 [edx + edi], ymm0, 1 // V
lea edx, [edx + 16]
@@ -1581,10 +1581,10 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0,
#endif // HAS_ARGBTOUVROW_AVX2
#ifdef HAS_ARGBTOUVJROW_AVX2
-__declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0,
+__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push esi
@@ -1617,9 +1617,9 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0,
vshufps ymm2, ymm2, ymm3, 0xdd
vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 32 different pixels, its 16 pixels of U and 16 of V
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 32 different pixels, it's 16 pixels of U and 16 of V
vpmaddubsw ymm1, ymm0, ymm7 // U
vpmaddubsw ymm3, ymm2, ymm7
vpmaddubsw ymm0, ymm0, ymm6 // V
@@ -1634,7 +1634,7 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0,
vpermq ymm0, ymm0, 0xd8 // For vpacksswb
vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
- // step 3 - store 16 U and 16 V values
+ // step 3 - store 16 U and 16 V values
vextractf128 [edx], ymm0, 0 // U
vextractf128 [edx + edi], ymm0, 1 // V
lea edx, [edx + 16]
@@ -1649,9 +1649,9 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0,
}
#endif // HAS_ARGBTOUVJROW_AVX2
-__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
- uint8* dst_u,
- uint8* dst_v,
+__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push edi
@@ -1707,10 +1707,10 @@ __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
}
}
-__declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0,
+__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push esi
@@ -1750,9 +1750,9 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0,
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
@@ -1766,7 +1766,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0,
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
- // step 3 - store 8 U and 8 V values
+ // step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
@@ -1779,10 +1779,10 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0,
}
}
-__declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0,
+__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push esi
@@ -1822,9 +1822,9 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0,
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
@@ -1838,7 +1838,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0,
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
- // step 3 - store 8 U and 8 V values
+ // step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
@@ -1851,10 +1851,10 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0,
}
}
-__declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0,
+__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push esi
@@ -1894,9 +1894,9 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0,
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
@@ -1910,7 +1910,7 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0,
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
- // step 3 - store 8 U and 8 V values
+ // step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
@@ -2065,10 +2065,10 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0,
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) void I422ToARGBRow_AVX2(
- const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2105,11 +2105,11 @@ __declspec(naked) void I422ToARGBRow_AVX2(
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
__declspec(naked) void I422AlphaToARGBRow_AVX2(
- const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2148,10 +2148,10 @@ __declspec(naked) void I422AlphaToARGBRow_AVX2(
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) void I444ToARGBRow_AVX2(
- const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2187,9 +2187,9 @@ __declspec(naked) void I444ToARGBRow_AVX2(
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) void NV12ToARGBRow_AVX2(
- const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
+ const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2222,9 +2222,9 @@ __declspec(naked) void NV12ToARGBRow_AVX2(
// 16 pixels.
// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) void NV21ToARGBRow_AVX2(
- const uint8* y_buf,
- const uint8* vu_buf,
- uint8* dst_argb,
+ const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2257,8 +2257,8 @@ __declspec(naked) void NV21ToARGBRow_AVX2(
// 16 pixels.
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
__declspec(naked) void YUY2ToARGBRow_AVX2(
- const uint8* src_yuy2,
- uint8* dst_argb,
+ const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2288,8 +2288,8 @@ __declspec(naked) void YUY2ToARGBRow_AVX2(
// 16 pixels.
// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
__declspec(naked) void UYVYToARGBRow_AVX2(
- const uint8* src_uyvy,
- uint8* dst_argb,
+ const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2319,10 +2319,10 @@ __declspec(naked) void UYVYToARGBRow_AVX2(
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
__declspec(naked) void I422ToRGBARow_AVX2(
- const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2551,10 +2551,10 @@ __declspec(naked) void I422ToRGBARow_AVX2(
// 8 pixels.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) void I444ToARGBRow_SSSE3(
- const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2588,10 +2588,10 @@ __declspec(naked) void I444ToARGBRow_SSSE3(
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
__declspec(naked) void I422ToRGB24Row_SSSE3(
- const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_rgb24,
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2626,10 +2626,10 @@ __declspec(naked) void I422ToRGB24Row_SSSE3(
// 8 pixels
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
__declspec(naked) void I422ToRGB565Row_SSSE3(
- const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb565_buf,
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb565_buf,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2669,10 +2669,10 @@ __declspec(naked) void I422ToRGB565Row_SSSE3(
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) void I422ToARGBRow_SSSE3(
- const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2706,11 +2706,11 @@ __declspec(naked) void I422ToARGBRow_SSSE3(
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
__declspec(naked) void I422AlphaToARGBRow_SSSE3(
- const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2746,9 +2746,9 @@ __declspec(naked) void I422AlphaToARGBRow_SSSE3(
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) void NV12ToARGBRow_SSSE3(
- const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
+ const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2778,9 +2778,9 @@ __declspec(naked) void NV12ToARGBRow_SSSE3(
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) void NV21ToARGBRow_SSSE3(
- const uint8* y_buf,
- const uint8* vu_buf,
- uint8* dst_argb,
+ const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2810,8 +2810,8 @@ __declspec(naked) void NV21ToARGBRow_SSSE3(
// 8 pixels.
// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
__declspec(naked) void YUY2ToARGBRow_SSSE3(
- const uint8* src_yuy2,
- uint8* dst_argb,
+ const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2838,8 +2838,8 @@ __declspec(naked) void YUY2ToARGBRow_SSSE3(
// 8 pixels.
// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
__declspec(naked) void UYVYToARGBRow_SSSE3(
- const uint8* src_uyvy,
- uint8* dst_argb,
+ const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2864,10 +2864,10 @@ __declspec(naked) void UYVYToARGBRow_SSSE3(
}
__declspec(naked) void I422ToRGBARow_SSSE3(
- const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_rgba,
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgba,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2900,8 +2900,8 @@ __declspec(naked) void I422ToRGBARow_SSSE3(
#ifdef HAS_I400TOARGBROW_SSE2
// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
-__declspec(naked) void I400ToARGBRow_SSE2(const uint8* y_buf,
- uint8* rgb_buf,
+__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
+ uint8_t* rgb_buf,
int width) {
__asm {
mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
@@ -2927,7 +2927,7 @@ __declspec(naked) void I400ToARGBRow_SSE2(const uint8* y_buf,
psrlw xmm0, 6
packuswb xmm0, xmm0 // G
- // Step 2: Weave into ARGB
+ // Step 2: Weave into ARGB
punpcklbw xmm0, xmm0 // GG
movdqa xmm1, xmm0
punpcklwd xmm0, xmm0 // BGRA first 4 pixels
@@ -2947,8 +2947,8 @@ __declspec(naked) void I400ToARGBRow_SSE2(const uint8* y_buf,
#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
-__declspec(naked) void I400ToARGBRow_AVX2(const uint8* y_buf,
- uint8* rgb_buf,
+__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
+ uint8_t* rgb_buf,
int width) {
__asm {
mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
@@ -2975,8 +2975,8 @@ __declspec(naked) void I400ToARGBRow_AVX2(const uint8* y_buf,
vpsrlw ymm0, ymm0, 6
vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
- // TODO(fbarchard): Weave alpha with unpack.
- // Step 2: Weave into ARGB
+ // TODO(fbarchard): Weave alpha with unpack.
+ // Step 2: Weave into ARGB
vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
vpermq ymm1, ymm1, 0xd8
vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
@@ -3000,8 +3000,8 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
// TODO(fbarchard): Replace lea with -16 offset.
-__declspec(naked) void MirrorRow_SSSE3(const uint8* src,
- uint8* dst,
+__declspec(naked) void MirrorRow_SSSE3(const uint8_t* src,
+ uint8_t* dst,
int width) {
__asm {
mov eax, [esp + 4] // src
@@ -3022,7 +3022,9 @@ __declspec(naked) void MirrorRow_SSSE3(const uint8* src,
#endif // HAS_MIRRORROW_SSSE3
#ifdef HAS_MIRRORROW_AVX2
-__declspec(naked) void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
+ uint8_t* dst,
+ int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
@@ -3048,9 +3050,9 @@ __declspec(naked) void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
-__declspec(naked) void MirrorUVRow_SSSE3(const uint8* src,
- uint8* dst_u,
- uint8* dst_v,
+__declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push edi
@@ -3079,8 +3081,8 @@ __declspec(naked) void MirrorUVRow_SSSE3(const uint8* src,
#endif // HAS_MIRRORUVROW_SSSE3
#ifdef HAS_ARGBMIRRORROW_SSE2
-__declspec(naked) void ARGBMirrorRow_SSE2(const uint8* src,
- uint8* dst,
+__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
+ uint8_t* dst,
int width) {
__asm {
mov eax, [esp + 4] // src
@@ -3105,8 +3107,8 @@ __declspec(naked) void ARGBMirrorRow_SSE2(const uint8* src,
// Shuffle table for reversing the bytes.
static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
-__declspec(naked) void ARGBMirrorRow_AVX2(const uint8* src,
- uint8* dst,
+__declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src,
+ uint8_t* dst,
int width) {
__asm {
mov eax, [esp + 4] // src
@@ -3127,9 +3129,9 @@ __declspec(naked) void ARGBMirrorRow_AVX2(const uint8* src,
#endif // HAS_ARGBMIRRORROW_AVX2
#ifdef HAS_SPLITUVROW_SSE2
-__declspec(naked) void SplitUVRow_SSE2(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
+__declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push edi
@@ -3167,9 +3169,9 @@ __declspec(naked) void SplitUVRow_SSE2(const uint8* src_uv,
#endif // HAS_SPLITUVROW_SSE2
#ifdef HAS_SPLITUVROW_AVX2
-__declspec(naked) void SplitUVRow_AVX2(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
+__declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push edi
@@ -3207,9 +3209,9 @@ __declspec(naked) void SplitUVRow_AVX2(const uint8* src_uv,
#endif // HAS_SPLITUVROW_AVX2
#ifdef HAS_MERGEUVROW_SSE2
-__declspec(naked) void MergeUVRow_SSE2(const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uv,
+__declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
int width) {
__asm {
push edi
@@ -3239,9 +3241,9 @@ __declspec(naked) void MergeUVRow_SSE2(const uint8* src_u,
#endif // HAS_MERGEUVROW_SSE2
#ifdef HAS_MERGEUVROW_AVX2
-__declspec(naked) void MergeUVRow_AVX2(const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uv,
+__declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
int width) {
__asm {
push edi
@@ -3273,12 +3275,14 @@ __declspec(naked) void MergeUVRow_AVX2(const uint8* src_u,
#endif // HAS_MERGEUVROW_AVX2
#ifdef HAS_COPYROW_SSE2
-// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
-__declspec(naked) void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+// CopyRow copies 'width' bytes using a 16 byte load/store, 32 bytes at a time.
+__declspec(naked) void CopyRow_SSE2(const uint8_t* src,
+ uint8_t* dst,
+ int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // count
+ mov ecx, [esp + 12] // width
test eax, 15
jne convertloopu
test edx, 15
@@ -3310,12 +3314,14 @@ __declspec(naked) void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
#endif // HAS_COPYROW_SSE2
#ifdef HAS_COPYROW_AVX
-// CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time.
-__declspec(naked) void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+// CopyRow copies 'width' bytes using a 32 byte load/store, 64 bytes at a time.
+__declspec(naked) void CopyRow_AVX(const uint8_t* src,
+ uint8_t* dst,
+ int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // count
+ mov ecx, [esp + 12] // width
convertloop:
vmovdqu ymm0, [eax]
@@ -3334,13 +3340,15 @@ __declspec(naked) void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
#endif // HAS_COPYROW_AVX
// Multiple of 1.
-__declspec(naked) void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
+__declspec(naked) void CopyRow_ERMS(const uint8_t* src,
+ uint8_t* dst,
+ int width) {
__asm {
mov eax, esi
mov edx, edi
mov esi, [esp + 4] // src
mov edi, [esp + 8] // dst
- mov ecx, [esp + 12] // count
+ mov ecx, [esp + 12] // width
rep movsb
mov edi, edx
mov esi, eax
@@ -3350,13 +3358,13 @@ __declspec(naked) void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
-__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8* src,
- uint8* dst,
+__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src,
+ uint8_t* dst,
int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // count
+ mov ecx, [esp + 12] // width
pcmpeqb xmm0, xmm0 // generate mask 0xff000000
pslld xmm0, 24
pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
@@ -3387,13 +3395,13 @@ __declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8* src,
#ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
-__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8* src,
- uint8* dst,
+__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src,
+ uint8_t* dst,
int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // count
+ mov ecx, [esp + 12] // width
vpcmpeqb ymm0, ymm0, ymm0
vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
@@ -3417,8 +3425,8 @@ __declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8* src,
#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
// width in pixels
-__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8* src_argb,
- uint8* dst_a,
+__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_a,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -3445,8 +3453,8 @@ __declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8* src_argb,
#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
// width in pixels
-__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8* src_argb,
- uint8* dst_a,
+__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_a,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -3481,13 +3489,13 @@ __declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8* src_argb,
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
-__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8* src,
- uint8* dst,
+__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src,
+ uint8_t* dst,
int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // count
+ mov ecx, [esp + 12] // width
pcmpeqb xmm0, xmm0 // generate mask 0xff000000
pslld xmm0, 24
pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
@@ -3520,13 +3528,13 @@ __declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8* src,
#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels
-__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8* src,
- uint8* dst,
+__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src,
+ uint8_t* dst,
int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // count
+ mov ecx, [esp + 12] // width
vpcmpeqb ymm0, ymm0, ymm0
vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
@@ -3551,16 +3559,16 @@ __declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8* src,
#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
#ifdef HAS_SETROW_X86
-// Write 'count' bytes using an 8 bit value repeated.
-// Count should be multiple of 4.
-__declspec(naked) void SetRow_X86(uint8* dst, uint8 v8, int count) {
+// Write 'width' bytes using an 8 bit value repeated.
+// width should be a multiple of 4.
+__declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
__asm {
movzx eax, byte ptr [esp + 8] // v8
mov edx, 0x01010101 // Duplicate byte to all bytes.
mul edx // overwrites edx with upper part of result.
mov edx, edi
mov edi, [esp + 4] // dst
- mov ecx, [esp + 12] // count
+ mov ecx, [esp + 12] // width
shr ecx, 2
rep stosd
mov edi, edx
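The mul by 0x01010101 broadcasts the byte into all four lanes of eax (e.g. 0x5A becomes 0x5A5A5A5A), which is why rep stosd can store 4 bytes per count and why width should be a multiple of 4.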
@@ -3568,26 +3576,28 @@ __declspec(naked) void SetRow_X86(uint8* dst, uint8 v8, int count) {
}
}
-// Write 'count' bytes using an 8 bit value repeated.
-__declspec(naked) void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
+// Write 'width' bytes using an 8 bit value repeated.
+__declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
__asm {
mov edx, edi
mov edi, [esp + 4] // dst
mov eax, [esp + 8] // v8
- mov ecx, [esp + 12] // count
+ mov ecx, [esp + 12] // width
rep stosb
mov edi, edx
ret
}
}
-// Write 'count' 32 bit values.
-__declspec(naked) void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
+// Write 'width' 32 bit values.
+__declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb,
+ uint32_t v32,
+ int width) {
__asm {
mov edx, edi
mov edi, [esp + 4] // dst
mov eax, [esp + 8] // v32
- mov ecx, [esp + 12] // count
+ mov ecx, [esp + 12] // width
rep stosd
mov edi, edx
ret
@@ -3596,8 +3606,8 @@ __declspec(naked) void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
#endif // HAS_SETROW_X86
#ifdef HAS_YUY2TOYROW_AVX2
-__declspec(naked) void YUY2ToYRow_AVX2(const uint8* src_yuy2,
- uint8* dst_y,
+__declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2,
+ uint8_t* dst_y,
int width) {
__asm {
mov eax, [esp + 4] // src_yuy2
@@ -3623,10 +3633,10 @@ __declspec(naked) void YUY2ToYRow_AVX2(const uint8* src_yuy2,
}
}
-__declspec(naked) void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
+__declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
int stride_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push esi
@@ -3669,9 +3679,9 @@ __declspec(naked) void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
}
}
-__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push edi
@@ -3709,8 +3719,8 @@ __declspec(naked) void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
}
}
-__declspec(naked) void UYVYToYRow_AVX2(const uint8* src_uyvy,
- uint8* dst_y,
+__declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy,
+ uint8_t* dst_y,
int width) {
__asm {
mov eax, [esp + 4] // src_uyvy
@@ -3734,10 +3744,10 @@ __declspec(naked) void UYVYToYRow_AVX2(const uint8* src_uyvy,
}
}
-__declspec(naked) void UYVYToUVRow_AVX2(const uint8* src_uyvy,
+__declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
int stride_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push esi
@@ -3780,9 +3790,9 @@ __declspec(naked) void UYVYToUVRow_AVX2(const uint8* src_uyvy,
}
}
-__declspec(naked) void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+__declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push edi
@@ -3822,8 +3832,8 @@ __declspec(naked) void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
#endif // HAS_YUY2TOYROW_AVX2
#ifdef HAS_YUY2TOYROW_SSE2
-__declspec(naked) void YUY2ToYRow_SSE2(const uint8* src_yuy2,
- uint8* dst_y,
+__declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2,
+ uint8_t* dst_y,
int width) {
__asm {
mov eax, [esp + 4] // src_yuy2
@@ -3847,10 +3857,10 @@ __declspec(naked) void YUY2ToYRow_SSE2(const uint8* src_yuy2,
}
}
-__declspec(naked) void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
+__declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
int stride_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push esi
@@ -3892,9 +3902,9 @@ __declspec(naked) void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
}
}
-__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push edi
@@ -3929,8 +3939,8 @@ __declspec(naked) void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
}
}
-__declspec(naked) void UYVYToYRow_SSE2(const uint8* src_uyvy,
- uint8* dst_y,
+__declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy,
+ uint8_t* dst_y,
int width) {
__asm {
mov eax, [esp + 4] // src_uyvy
@@ -3952,10 +3962,10 @@ __declspec(naked) void UYVYToYRow_SSE2(const uint8* src_uyvy,
}
}
-__declspec(naked) void UYVYToUVRow_SSE2(const uint8* src_uyvy,
+__declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
int stride_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push esi
@@ -3997,9 +4007,9 @@ __declspec(naked) void UYVYToUVRow_SSE2(const uint8* src_uyvy,
}
}
-__declspec(naked) void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+__declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push edi
@@ -4041,10 +4051,10 @@ __declspec(naked) void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-__declspec(naked) void BlendPlaneRow_SSSE3(const uint8* src0,
- const uint8* src1,
- const uint8* alpha,
- uint8* dst,
+__declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
int width) {
__asm {
push esi
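The two formulas above agree: expanding the signed form gives (A2-128)*C2 + (B2-128)*(255-C2) = A2*C2 + B2*(255-C2) - 128*255, and the added constant 32768 + 127 = 128*255 + 255 restores exactly the bias plus the +255 rounding term of the unsigned form.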
@@ -4067,7 +4077,7 @@ __declspec(naked) void BlendPlaneRow_SSSE3(const uint8* src0,
sub edx, esi
sub edi, esi
- // 8 pixel loop.
+ // 8 pixel loop.
convertloop8:
movq xmm0, qword ptr [esi] // alpha
punpcklbw xmm0, xmm0
@@ -4098,10 +4108,10 @@ __declspec(naked) void BlendPlaneRow_SSSE3(const uint8* src0,
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-__declspec(naked) void BlendPlaneRow_AVX2(const uint8* src0,
- const uint8* src1,
- const uint8* alpha,
- uint8* dst,
+__declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
int width) {
__asm {
push esi
@@ -4123,7 +4133,7 @@ __declspec(naked) void BlendPlaneRow_AVX2(const uint8* src0,
sub edx, esi
sub edi, esi
- // 32 pixel loop.
+ // 32 pixel loop.
convertloop32:
vmovdqu ymm0, [esi] // alpha
vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
@@ -4162,9 +4172,9 @@ static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
// Blend 8 pixels at a time.
-__declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
__asm {
push esi
@@ -4183,7 +4193,7 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0,
sub ecx, 4
jl convertloop4b // less than 4 pixels?
- // 4 pixel loop.
+ // 4 pixel loop.
convertloop4:
movdqu xmm3, [eax] // src argb
lea eax, [eax + 16]
@@ -4212,7 +4222,7 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0,
add ecx, 4 - 1
jl convertloop1b
- // 1 pixel loop.
+ // 1 pixel loop.
convertloop1:
movd xmm3, [eax] // src argb
lea eax, [eax + 4]
@@ -4253,8 +4263,8 @@ static const uvec8 kShuffleAlpha1 = {
11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
-__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8* src_argb,
- uint8* dst_argb,
+__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb0
@@ -4298,8 +4308,8 @@ __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8* src_argb,
static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
128u, 128u, 14u, 15u, 14u, 15u,
14u, 15u, 128u, 128u};
-__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8* src_argb,
- uint8* dst_argb,
+__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb0
@@ -4336,8 +4346,8 @@ __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8* src_argb,
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
-__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8* src_argb,
- uint8* dst_argb,
+__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width) {
__asm {
push ebx
@@ -4392,8 +4402,8 @@ static const uvec8 kUnattenShuffleAlpha_AVX2 = {
// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
// USE_GATHER is not on by default, due to being a slow instruction.
#ifdef USE_GATHER
-__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
- uint8* dst_argb,
+__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb0
@@ -4426,8 +4436,8 @@ __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
}
}
#else // USE_GATHER
-__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
- uint8* dst_argb,
+__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width) {
__asm {
@@ -4495,8 +4505,8 @@ __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
-__declspec(naked) void ARGBGrayRow_SSSE3(const uint8* src_argb,
- uint8* dst_argb,
+__declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -4552,7 +4562,7 @@ static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
24, 98, 50, 0, 24, 98, 50, 0};
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-__declspec(naked) void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+__declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
__asm {
mov eax, [esp + 4] /* dst_argb */
mov ecx, [esp + 8] /* width */
@@ -4608,9 +4618,9 @@ __declspec(naked) void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
// Same as Sepia except matrix is provided.
// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
-__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
- uint8* dst_argb,
- const int8* matrix_argb,
+__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -4670,7 +4680,7 @@ __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
-__declspec(naked) void ARGBQuantizeRow_SSE2(uint8* dst_argb,
+__declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
int scale,
int interval_size,
int interval_offset,
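For reference, a scalar form of the quantization applied per color channel (a sketch inferred from the parameter names, not part of this patch; alpha is left opaque):

    uint8_t Quantize(uint8_t v, int scale, int interval_size,
                     int interval_offset) {
      return (uint8_t)(((v * scale) >> 16) * interval_size + interval_offset);
    }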
@@ -4717,10 +4727,10 @@ __declspec(naked) void ARGBQuantizeRow_SSE2(uint8* dst_argb,
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
-__declspec(naked) void ARGBShadeRow_SSE2(const uint8* src_argb,
- uint8* dst_argb,
+__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width,
- uint32 value) {
+ uint32_t value) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
@@ -4752,9 +4762,9 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8* src_argb,
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
__asm {
push esi
@@ -4792,9 +4802,9 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8* src_argb0,
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// TODO(fbarchard): Port this to posix, neon and other math functions.
-__declspec(naked) void ARGBAddRow_SSE2(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
__asm {
push esi
@@ -4841,9 +4851,9 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8* src_argb0,
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked) void ARGBSubtractRow_SSE2(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
__asm {
push esi
@@ -4871,9 +4881,9 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8* src_argb0,
#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
__asm {
push esi
@@ -4909,9 +4919,9 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8* src_argb0,
#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) void ARGBAddRow_AVX2(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
__asm {
push esi
@@ -4939,9 +4949,9 @@ __declspec(naked) void ARGBAddRow_AVX2(const uint8* src_argb0,
#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) void ARGBSubtractRow_AVX2(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
__asm {
push esi
@@ -4972,10 +4982,10 @@ __declspec(naked) void ARGBSubtractRow_AVX2(const uint8* src_argb0,
// -1 0 1
// -2 0 2
// -1 0 1
-__declspec(naked) void SobelXRow_SSE2(const uint8* src_y0,
- const uint8* src_y1,
- const uint8* src_y2,
- uint8* dst_sobelx,
+__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
int width) {
__asm {
push esi
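For reference, the kernel above applied to one output byte, as a scalar sketch (hypothetical; the SSE2 version computes 8 at a time and takes the absolute value via saturating subtracts in both directions):

    int gx = -src_y0[0] + src_y0[2]
             - 2 * src_y1[0] + 2 * src_y1[2]
             - src_y2[0] + src_y2[2];
    int v = gx < 0 ? -gx : gx;
    dst_sobelx[0] = (uint8_t)(v > 255 ? 255 : v);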
@@ -5030,9 +5040,9 @@ __declspec(naked) void SobelXRow_SSE2(const uint8* src_y0,
// -1 -2 -1
// 0 0 0
// 1 2 1
-__declspec(naked) void SobelYRow_SSE2(const uint8* src_y0,
- const uint8* src_y1,
- uint8* dst_sobely,
+__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
int width) {
__asm {
push esi
@@ -5084,9 +5094,9 @@ __declspec(naked) void SobelYRow_SSE2(const uint8* src_y0,
// R = Sobel
// G = Sobel
// B = Sobel
-__declspec(naked) void SobelRow_SSE2(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+__declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
int width) {
__asm {
push esi
@@ -5132,9 +5142,9 @@ __declspec(naked) void SobelRow_SSE2(const uint8* src_sobelx,
#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
-__declspec(naked) void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_y,
+__declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
int width) {
__asm {
push esi
@@ -5166,9 +5176,9 @@ __declspec(naked) void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
// R = Sobel X
// G = Sobel
// B = Sobel Y
-__declspec(naked) void SobelXYRow_SSE2(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+__declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
int width) {
__asm {
push esi
@@ -5225,11 +5235,11 @@ __declspec(naked) void SobelXYRow_SSE2(const uint8* src_sobelx,
// count is number of averaged pixels to produce.
// Does 4 pixels at a time.
// This function requires alignment on accumulation buffer pointers.
-void CumulativeSumToAverageRow_SSE2(const int32* topleft,
- const int32* botleft,
+void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
+ const int32_t* botleft,
int width,
int area,
- uint8* dst,
+ uint8_t* dst,
int count) {
__asm {
mov eax, topleft // eax topleft
@@ -5256,7 +5266,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft,
cvtps2dq xmm5, xmm5 // 0.16 fixed point
packssdw xmm5, xmm5 // 16 bit shorts
- // 4 pixel loop small blocks.
+ // 4 pixel loop small blocks.
s4:
// top left
movdqu xmm0, [eax]
@@ -5298,7 +5308,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft,
jmp l4b
- // 4 pixel loop
+ // 4 pixel loop
l4:
// top left
movdqu xmm0, [eax]
@@ -5350,7 +5360,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft,
add ecx, 4 - 1
jl l1b
- // 1 pixel loop
+ // 1 pixel loop
l1:
movdqu xmm0, [eax]
psubd xmm0, [eax + edx * 4]
@@ -5375,9 +5385,9 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft,
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value.
-void ComputeCumulativeSumRow_SSE2(const uint8* row,
- int32* cumsum,
- const int32* previous_cumsum,
+void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
+ int32_t* cumsum,
+ const int32_t* previous_cumsum,
int width) {
__asm {
mov eax, row
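
The recurrence is simple in scalar form: keep a running sum of this row's pixels and add the table entry from the row above. A sketch:

#include <stdint.h>

static void ComputeCumulativeSumRow_C_sketch(const uint8_t* row,
                                             int32_t* cumsum,
                                             const int32_t* previous_cumsum,
                                             int width) {
  int32_t sum[4] = {0, 0, 0, 0};  // running B,G,R,A totals for this row
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}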
@@ -5392,7 +5402,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row,
test edx, 15
jne l4b
- // 4 pixel loop
+ // 4 pixel loop
l4:
movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
lea eax, [eax + 16]
@@ -5438,9 +5448,9 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row,
add ecx, 4 - 1
jl l1b
- // 1 pixel loop
+ // 1 pixel loop
l1:
- movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
+ movd xmm2, dword ptr [eax] // 1 argb pixel, 4 bytes.
lea eax, [eax + 4]
punpcklbw xmm2, xmm1
punpcklwd xmm2, xmm1
@@ -5460,9 +5470,9 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row,
#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from the source image, stepping along a sloped (affine)
// path, to a row of the destination.
-__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
+__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
int src_argb_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
const float* uv_dudv,
int width) {
__asm {
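
Conceptually the row walks a line through the source at a fixed step: uv_dudv holds the start point (u, v) and the per-pixel step (du, dv), and each destination pixel copies the 4 bytes at the truncated source coordinate. A sketch of that walk (hypothetical helper):

#include <stdint.h>
#include <string.h>

static void ARGBAffineRow_C_sketch(const uint8_t* src_argb,
                                   int src_argb_stride, uint8_t* dst_argb,
                                   const float* uv_dudv, int width) {
  float u = uv_dudv[0], v = uv_dudv[1];
  const float du = uv_dudv[2], dv = uv_dudv[3];
  for (int i = 0; i < width; ++i) {
    int x = (int)u;  // truncation, matching cvttps2dq
    int y = (int)v;
    memcpy(dst_argb + i * 4, src_argb + y * src_argb_stride + x * 4, 4);
    u += du;
    v += dv;
  }
}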
@@ -5481,7 +5491,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
sub ecx, 4
jl l4b
- // setup for 4 pixel loop
+ // setup for 4 pixel loop
pshufd xmm7, xmm7, 0x44 // dup dudv
pshufd xmm5, xmm5, 0 // dup 4, stride
movdqa xmm0, xmm2 // x0, y0, x1, y1
@@ -5493,7 +5503,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
addps xmm3, xmm4
addps xmm4, xmm4 // dudv *= 4
- // 4 pixel loop
+ // 4 pixel loop
l4:
cvttps2dq xmm0, xmm2 // x, y float to int first 2
cvttps2dq xmm1, xmm3 // x, y float to int next 2
@@ -5524,7 +5534,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
add ecx, 4 - 1
jl l1b
- // 1 pixel loop
+ // 1 pixel loop
l1:
cvttps2dq xmm0, xmm2 // x, y float to int
packssdw xmm0, xmm0 // x, y as shorts
@@ -5546,8 +5556,8 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
-__declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr,
- const uint8* src_ptr,
+__declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
ptrdiff_t src_stride,
int dst_width,
int source_y_fraction) {
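
Per byte this is a fixed-point lerp between the row and the row one stride below; fractions 0, 128 and 256 are dispatched to the copy and pavgb fast paths instead. A scalar sketch (fraction assumed 0..256):

#include <stddef.h>
#include <stdint.h>

static void InterpolateRow_C_sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                    ptrdiff_t src_stride, int dst_width,
                                    int source_y_fraction) {
  int f1 = source_y_fraction;  // weight of the lower row
  int f0 = 256 - f1;           // weight of the upper row
  for (int i = 0; i < dst_width; ++i) {
    dst_ptr[i] = (uint8_t)(
        (src_ptr[i] * f0 + src_ptr[i + src_stride] * f1 + 128) >> 8);
  }
}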
@@ -5598,7 +5608,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr,
jg xloop
jmp xloop99
- // Blend 50 / 50.
+ // Blend 50 / 50.
xloop50:
vmovdqu ymm0, [esi]
vpavgb ymm0, ymm0, [esi + edx]
@@ -5608,7 +5618,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr,
jg xloop50
jmp xloop99
- // Blend 100 / 0 - Copy row unchanged.
+ // Blend 100 / 0 - Copy row unchanged.
xloop100:
rep movsb
@@ -5623,8 +5633,8 @@ __declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr,
// Bilinear filter 16x2 -> 16x1
// TODO(fbarchard): Consider allowing 256 using memcpy.
-__declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
- const uint8* src_ptr,
+__declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
ptrdiff_t src_stride,
int dst_width,
int source_y_fraction) {
@@ -5638,7 +5648,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
sub edi, esi
- // Dispatch to specialized filters if applicable.
+ // Dispatch to specialized filters if applicable.
cmp eax, 0
je xloop100 // 0 /256. Blend 100 / 0.
cmp eax, 128
@@ -5678,7 +5688,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
jg xloop
jmp xloop99
- // Blend 50 / 50.
+ // Blend 50 / 50.
xloop50:
movdqu xmm0, [esi]
movdqu xmm1, [esi + edx]
@@ -5689,7 +5699,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
jg xloop50
jmp xloop99
- // Blend 100 / 0 - Copy row unchanged.
+ // Blend 100 / 0 - Copy row unchanged.
xloop100:
movdqu xmm0, [esi]
movdqu [esi + edi], xmm0
@@ -5705,9 +5715,9 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
+__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
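
The shuffler is a pshufb control block: output byte j of each 16-byte group is source byte shuffler[j], which is how one table drives all four conversions named above. In scalar terms, assuming the full 16-byte control the SSSE3 path loads:

#include <stdint.h>

static void ARGBShuffleRow_C_sketch(const uint8_t* src_argb,
                                    uint8_t* dst_argb,
                                    const uint8_t* shuffler, int width) {
  for (int i = 0; i < width; i += 4) {  // 4 ARGB pixels = one 16-byte group
    for (int j = 0; j < 16; ++j) {
      dst_argb[i * 4 + j] = src_argb[i * 4 + shuffler[j]];
    }
  }
}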
@@ -5732,9 +5742,9 @@ __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8* src_argb,
}
#ifdef HAS_ARGBSHUFFLEROW_AVX2
-__declspec(naked) void ARGBShuffleRow_AVX2(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
+__declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -5761,133 +5771,16 @@ __declspec(naked) void ARGBShuffleRow_AVX2(const uint8* src_argb,
}
#endif // HAS_ARGBSHUFFLEROW_AVX2
-__declspec(naked) void ARGBShuffleRow_SSE2(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
- int width) {
- __asm {
- push ebx
- push esi
- mov eax, [esp + 8 + 4] // src_argb
- mov edx, [esp + 8 + 8] // dst_argb
- mov esi, [esp + 8 + 12] // shuffler
- mov ecx, [esp + 8 + 16] // width
- pxor xmm5, xmm5
-
- mov ebx, [esi] // shuffler
- cmp ebx, 0x03000102
- je shuf_3012
- cmp ebx, 0x00010203
- je shuf_0123
- cmp ebx, 0x00030201
- je shuf_0321
- cmp ebx, 0x02010003
- je shuf_2103
-
- // TODO(fbarchard): Use one source pointer and 3 offsets.
- shuf_any1:
- movzx ebx, byte ptr [esi]
- movzx ebx, byte ptr [eax + ebx]
- mov [edx], bl
- movzx ebx, byte ptr [esi + 1]
- movzx ebx, byte ptr [eax + ebx]
- mov [edx + 1], bl
- movzx ebx, byte ptr [esi + 2]
- movzx ebx, byte ptr [eax + ebx]
- mov [edx + 2], bl
- movzx ebx, byte ptr [esi + 3]
- movzx ebx, byte ptr [eax + ebx]
- mov [edx + 3], bl
- lea eax, [eax + 4]
- lea edx, [edx + 4]
- sub ecx, 1
- jg shuf_any1
- jmp shuf99
-
- shuf_0123:
- movdqu xmm0, [eax]
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm5
- punpckhbw xmm1, xmm5
- pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB
- pshuflw xmm0, xmm0, 01Bh
- pshufhw xmm1, xmm1, 01Bh
- pshuflw xmm1, xmm1, 01Bh
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg shuf_0123
- jmp shuf99
-
- shuf_0321:
- movdqu xmm0, [eax]
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm5
- punpckhbw xmm1, xmm5
- pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB
- pshuflw xmm0, xmm0, 039h
- pshufhw xmm1, xmm1, 039h
- pshuflw xmm1, xmm1, 039h
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg shuf_0321
- jmp shuf99
-
- shuf_2103:
- movdqu xmm0, [eax]
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm5
- punpckhbw xmm1, xmm5
- pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA
- pshuflw xmm0, xmm0, 093h
- pshufhw xmm1, xmm1, 093h
- pshuflw xmm1, xmm1, 093h
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg shuf_2103
- jmp shuf99
-
- shuf_3012:
- movdqu xmm0, [eax]
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm5
- punpckhbw xmm1, xmm5
- pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB
- pshuflw xmm0, xmm0, 0C6h
- pshufhw xmm1, xmm1, 0C6h
- pshuflw xmm1, xmm1, 0C6h
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg shuf_3012
-
- shuf99:
- pop esi
- pop ebx
- ret
- }
-}
-
// YUY2 - Macro-pixel = 2 image pixels
// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
// UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1
-__declspec(naked) void I422ToYUY2Row_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame,
+__declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_frame,
int width) {
__asm {
push esi
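
The packing in scalar form, for reference: two Y samples share one U and one V per macro-pixel, in the order Y0 U Y1 V (the UYVY variant below emits U Y0 V Y1). A sketch assuming even width:

#include <stdint.h>

static void I422ToYUY2Row_C_sketch(const uint8_t* src_y, const uint8_t* src_u,
                                   const uint8_t* src_v, uint8_t* dst_frame,
                                   int width) {
  for (int x = 0; x < width; x += 2) {  // width assumed even
    dst_frame[0] = src_y[0];
    dst_frame[1] = *src_u++;
    dst_frame[2] = src_y[1];
    dst_frame[3] = *src_v++;
    src_y += 2;
    dst_frame += 4;
  }
}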
@@ -5921,10 +5814,10 @@ __declspec(naked) void I422ToYUY2Row_SSE2(const uint8* src_y,
}
}
-__declspec(naked) void I422ToUYVYRow_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame,
+__declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_frame,
int width) {
__asm {
push esi
@@ -5959,8 +5852,8 @@ __declspec(naked) void I422ToUYVYRow_SSE2(const uint8* src_y,
}
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb,
- uint8* dst_argb,
+__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
const float* poly,
int width) {
__asm {
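
Each channel is pushed through a cubic polynomial in float. A scalar sketch, assuming the coefficient layout of libyuv's C reference (poly[0..3] are the constant terms for B, G, R, A, followed by the linear, quadratic and cubic sets of four):

#include <stdint.h>

static void ARGBPolynomialRow_C_sketch(const uint8_t* src_argb,
                                       uint8_t* dst_argb, const float* poly,
                                       int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {  // B, G, R, A
      float x = (float)src_argb[i * 4 + c];
      float f = poly[c] + poly[4 + c] * x + poly[8 + c] * x * x +
                poly[12 + c] * x * x * x;
      if (f < 0.f) f = 0.f;  // clamp back into byte range
      if (f > 255.f) f = 255.f;
      dst_argb[i * 4 + c] = (uint8_t)f;
    }
  }
}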
@@ -5971,7 +5864,7 @@ __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb,
mov ecx, [esp + 4 + 16] /* width */
pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
- // 2 pixel loop.
+ // 2 pixel loop.
convertloop:
// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
@@ -6018,8 +5911,8 @@ __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb,
#endif // HAS_ARGBPOLYNOMIALROW_SSE2
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8* src_argb,
- uint8* dst_argb,
+__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
const float* poly,
int width) {
__asm {
@@ -6058,8 +5951,8 @@ __declspec(naked) void ARGBPolynomialRow_AVX2(const uint8* src_argb,
#ifdef HAS_HALFFLOATROW_SSE2
static float kExpBias = 1.9259299444e-34f;
-__declspec(naked) void HalfFloatRow_SSE2(const uint16* src,
- uint16* dst,
+__declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
+ uint16_t* dst,
float scale,
int width) {
__asm {
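
kExpBias is 2^-112. Multiplying by it subtracts 112 from the float32 exponent, turning the float32 bias of 127 into the float16 bias of 15, so after scaling, the half-float bit pattern is simply the float32 bits shifted right by 13 (for values in the normal half range, ignoring rounding). A scalar model of the trick:

#include <stdint.h>
#include <string.h>

static uint16_t HalfFloat_sketch(uint16_t v, float scale) {
  const float kExpBias = 1.9259299444e-34f;  // 2^-112
  float f = (float)v * scale * kExpBias;     // rebias exponent for fp16
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));           // type-pun safely
  return (uint16_t)(bits >> 13);             // exponent+mantissa line up
}

For example, v = 1 with scale = 1.0f gives float bits 0x07800000, and 0x07800000 >> 13 is 0x3C00, the half-float encoding of 1.0.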
@@ -6072,7 +5965,7 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16* src,
pxor xmm5, xmm5
sub edx, eax
- // 8 pixel loop.
+ // 8 pixel loop.
convertloop:
movdqu xmm2, xmmword ptr [eax] // 8 shorts
add eax, 16
@@ -6095,8 +5988,8 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16* src,
#endif // HAS_HALFFLOATROW_SSE2
#ifdef HAS_HALFFLOATROW_AVX2
-__declspec(naked) void HalfFloatRow_AVX2(const uint16* src,
- uint16* dst,
+__declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
+ uint16_t* dst,
float scale,
int width) {
__asm {
@@ -6110,7 +6003,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16* src,
vpxor ymm5, ymm5, ymm5
sub edx, eax
- // 16 pixel loop.
+ // 16 pixel loop.
convertloop:
vmovdqu ymm2, [eax] // 16 shorts
add eax, 32
@@ -6133,8 +6026,8 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16* src,
#endif // HAS_HALFFLOATROW_AVX2
#ifdef HAS_HALFFLOATROW_F16C
-__declspec(naked) void HalfFloatRow_F16C(const uint16* src,
- uint16* dst,
+__declspec(naked) void HalfFloatRow_F16C(const uint16_t* src,
+ uint16_t* dst,
float scale,
int width) {
__asm {
@@ -6144,7 +6037,7 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16* src,
mov ecx, [esp + 16] /* width */
sub edx, eax
- // 16 pixel loop.
+ // 16 pixel loop.
convertloop:
vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
@@ -6167,8 +6060,8 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16* src,
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
-__declspec(naked) void ARGBColorTableRow_X86(uint8* dst_argb,
- const uint8* table_argb,
+__declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb,
+ const uint8_t* table_argb,
int width) {
__asm {
push esi
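
In scalar terms each channel indexes its own column of a 256-entry, 4-bytes-per-entry table, in place:

#include <stdint.h>

static void ARGBColorTableRow_C_sketch(uint8_t* dst_argb,
                                       const uint8_t* table_argb, int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[i * 4 + 0] = table_argb[dst_argb[i * 4 + 0] * 4 + 0];  // B
    dst_argb[i * 4 + 1] = table_argb[dst_argb[i * 4 + 1] * 4 + 1];  // G
    dst_argb[i * 4 + 2] = table_argb[dst_argb[i * 4 + 2] * 4 + 2];  // R
    dst_argb[i * 4 + 3] = table_argb[dst_argb[i * 4 + 3] * 4 + 3];  // A
  }
}

RGBColorTableRow below is the same lookup with the alpha byte left untouched.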
@@ -6201,8 +6094,8 @@ __declspec(naked) void ARGBColorTableRow_X86(uint8* dst_argb,
#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
-__declspec(naked) void RGBColorTableRow_X86(uint8* dst_argb,
- const uint8* table_argb,
+__declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
+ const uint8_t* table_argb,
int width) {
__asm {
push esi
@@ -6233,11 +6126,11 @@ __declspec(naked) void RGBColorTableRow_X86(uint8* dst_argb,
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
-__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
- uint8* dst_argb,
+__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width,
- const uint8* luma,
- uint32 lumacoeff) {
+ const uint8_t* luma,
+ uint32_t lumacoeff) {
__asm {
push esi
push edi
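
A scalar sketch of what the loop computes, assuming the layout of libyuv's C reference: luma points at 128 rows of 256 bytes, lumacoeff packs the B, G, R weights one per byte, and the weighted sum masked to 0x7F00 selects the row each color channel is looked up in:

#include <stdint.h>

static void ARGBLumaColorTableRow_C_sketch(const uint8_t* src_argb,
                                           uint8_t* dst_argb, int width,
                                           const uint8_t* luma,
                                           uint32_t lumacoeff) {
  const uint32_t bc = lumacoeff & 0xff;
  const uint32_t gc = (lumacoeff >> 8) & 0xff;
  const uint32_t rc = (lumacoeff >> 16) & 0xff;
  for (int i = 0; i < width; ++i) {
    const uint8_t* row = luma + ((src_argb[0] * bc + src_argb[1] * gc +
                                  src_argb[2] * rc) & 0x7F00u);
    dst_argb[0] = row[src_argb[0]];
    dst_argb[1] = row[src_argb[1]];
    dst_argb[2] = row[src_argb[2]];
    dst_argb[3] = src_argb[3];  // alpha passes through
    src_argb += 4;
    dst_argb += 4;
  }
}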
@@ -6252,7 +6145,7 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
psllw xmm4, 8
pxor xmm5, xmm5
- // 4 pixel loop.
+ // 4 pixel loop.
convertloop:
movdqu xmm0, xmmword ptr [eax] // generate luma ptr
pmaddubsw xmm0, xmm3