Diffstat (limited to 'files/source')
48 files changed, 30484 insertions, 16567 deletions
diff --git a/files/source/compare.cc b/files/source/compare.cc index 1facd27b..5aa3a4db 100644 --- a/files/source/compare.cc +++ b/files/source/compare.cc @@ -29,10 +29,11 @@ extern "C" { // hash seed of 5381 recommended. LIBYUV_API -uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { +uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) { const int kBlockSize = 1 << 15; // 32768; int remainder; - uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C; + uint32_t (*HashDjb2_SSE)(const uint8_t* src, int count, uint32_t seed) = + HashDjb2_C; #if defined(HAS_HASHDJB2_SSE41) if (TestCpuFlag(kCpuHasSSE41)) { HashDjb2_SSE = HashDjb2_SSE41; @@ -44,7 +45,7 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { } #endif - while (count >= (uint64)(kBlockSize)) { + while (count >= (uint64_t)(kBlockSize)) { seed = HashDjb2_SSE(src, kBlockSize, seed); src += kBlockSize; count -= kBlockSize; @@ -62,7 +63,7 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { return seed; } -static uint32 ARGBDetectRow_C(const uint8* argb, int width) { +static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) { int x; for (x = 0; x < width - 1; x += 2) { if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB. @@ -93,8 +94,11 @@ static uint32 ARGBDetectRow_C(const uint8* argb, int width) { // Scan an opaque argb image and return fourcc based on alpha offset. // Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown. LIBYUV_API -uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) { - uint32 fourcc = 0; +uint32_t ARGBDetect(const uint8_t* argb, + int stride_argb, + int width, + int height) { + uint32_t fourcc = 0; int h; // Coalesce rows. @@ -110,20 +114,86 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) { return fourcc; } +// NEON version accumulates in 16 bit shorts which overflow at 65536 bytes. +// So actual maximum is 1 less loop, which is 64436 - 32 bytes. 
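The comment above explains why each SIMD call is capped: the NEON row accumulates bit counts in 16-bit lanes, so ComputeHammingDistance in the next hunk feeds it at most kBlockSize bytes at a time and mops up the tail in two stages. A minimal standalone sketch of that split arithmetic, using the kBlockSize and kSimdSize values from the function below and an arbitrary example length (not part of the diff):

#include <stdio.h>

int main(void) {
  const int kBlockSize = 1 << 15;  // 32768: max bytes per SIMD call
  const int kSimdSize = 64;        // SIMD rows handle multiples of 64 bytes
  int count = 100000;              // example buffer length

  int full_blocks = count & ~(kBlockSize - 1);                  // 98304
  int simd_tail = count & (kBlockSize - 1) & ~(kSimdSize - 1);  // 1664
  int scalar_tail = count & (kSimdSize - 1);                    // 32

  // 98304 + 1664 + 32 == 100000: every byte is handled exactly once.
  printf("%d %d %d\n", full_blocks, simd_tail, scalar_tail);
  return 0;
}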
+ +LIBYUV_API +uint64_t ComputeHammingDistance(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + const int kBlockSize = 1 << 15; // 32768; + const int kSimdSize = 64; + // SIMD for multiple of 64, and C for remainder + int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1); + uint64_t diff = 0; + int i; + uint32_t (*HammingDistance)(const uint8_t* src_a, const uint8_t* src_b, + int count) = HammingDistance_C; +#if defined(HAS_HAMMINGDISTANCE_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + HammingDistance = HammingDistance_NEON; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + HammingDistance = HammingDistance_SSSE3; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_SSE42) + if (TestCpuFlag(kCpuHasSSE42)) { + HammingDistance = HammingDistance_SSE42; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HammingDistance = HammingDistance_AVX2; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + HammingDistance = HammingDistance_MSA; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + HammingDistance = HammingDistance_MMI; + } +#endif + +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : diff) +#endif + for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { + diff += HammingDistance(src_a + i, src_b + i, kBlockSize); + } + src_a += count & ~(kBlockSize - 1); + src_b += count & ~(kBlockSize - 1); + if (remainder) { + diff += HammingDistance(src_a, src_b, remainder); + src_a += remainder; + src_b += remainder; + } + remainder = count & (kSimdSize - 1); + if (remainder) { + diff += HammingDistance_C(src_a, src_b, remainder); + } + return diff; +} + // TODO(fbarchard): Refactor into row function. LIBYUV_API -uint64 ComputeSumSquareError(const uint8* src_a, - const uint8* src_b, - int count) { +uint64_t ComputeSumSquareError(const uint8_t* src_a, + const uint8_t* src_b, + int count) { // SumSquareError returns values 0 to 65535 for each squared difference. - // Up to 65536 of those can be summed and remain within a uint32. - // After each block of 65536 pixels, accumulate into a uint64. + // Up to 65536 of those can be summed and remain within a uint32_t. + // After each block of 65536 pixels, accumulate into a uint64_t. 
const int kBlockSize = 65536; int remainder = count & (kBlockSize - 1) & ~31; - uint64 sse = 0; + uint64_t sse = 0; int i; - uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) = - SumSquareError_C; + uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b, + int count) = SumSquareError_C; #if defined(HAS_SUMSQUAREERROR_NEON) if (TestCpuFlag(kCpuHasNEON)) { SumSquareError = SumSquareError_NEON; @@ -141,6 +211,16 @@ uint64 ComputeSumSquareError(const uint8* src_a, SumSquareError = SumSquareError_AVX2; } #endif +#if defined(HAS_SUMSQUAREERROR_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SumSquareError = SumSquareError_MSA; + } +#endif +#if defined(HAS_SUMSQUAREERROR_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + SumSquareError = SumSquareError_MMI; + } +#endif #ifdef _OPENMP #pragma omp parallel for reduction(+ : sse) #endif @@ -162,13 +242,13 @@ uint64 ComputeSumSquareError(const uint8* src_a, } LIBYUV_API -uint64 ComputeSumSquareErrorPlane(const uint8* src_a, - int stride_a, - const uint8* src_b, - int stride_b, - int width, - int height) { - uint64 sse = 0; +uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height) { + uint64_t sse = 0; int h; // Coalesce rows. if (stride_a == width && stride_b == width) { @@ -185,7 +265,7 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, } LIBYUV_API -double SumSquareErrorToPsnr(uint64 sse, uint64 count) { +double SumSquareErrorToPsnr(uint64_t sse, uint64_t count) { double psnr; if (sse > 0) { double mse = (double)count / (double)sse; @@ -194,65 +274,67 @@ double SumSquareErrorToPsnr(uint64 sse, uint64 count) { psnr = kMaxPsnr; // Limit to prevent divide by 0 } - if (psnr > kMaxPsnr) + if (psnr > kMaxPsnr) { psnr = kMaxPsnr; + } return psnr; } LIBYUV_API -double CalcFramePsnr(const uint8* src_a, +double CalcFramePsnr(const uint8_t* src_a, int stride_a, - const uint8* src_b, + const uint8_t* src_b, int stride_b, int width, int height) { - const uint64 samples = width * height; - const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b, - stride_b, width, height); + const uint64_t samples = (uint64_t)width * (uint64_t)height; + const uint64_t sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b, + stride_b, width, height); return SumSquareErrorToPsnr(sse, samples); } LIBYUV_API -double I420Psnr(const uint8* src_y_a, +double I420Psnr(const uint8_t* src_y_a, int stride_y_a, - const uint8* src_u_a, + const uint8_t* src_u_a, int stride_u_a, - const uint8* src_v_a, + const uint8_t* src_v_a, int stride_v_a, - const uint8* src_y_b, + const uint8_t* src_y_b, int stride_y_b, - const uint8* src_u_b, + const uint8_t* src_u_b, int stride_u_b, - const uint8* src_v_b, + const uint8_t* src_v_b, int stride_v_b, int width, int height) { - const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a, src_y_b, - stride_y_b, width, height); + const uint64_t sse_y = ComputeSumSquareErrorPlane( + src_y_a, stride_y_a, src_y_b, stride_y_b, width, height); const int width_uv = (width + 1) >> 1; const int height_uv = (height + 1) >> 1; - const uint64 sse_u = ComputeSumSquareErrorPlane( + const uint64_t sse_u = ComputeSumSquareErrorPlane( src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv); - const uint64 sse_v = ComputeSumSquareErrorPlane( + const uint64_t sse_v = ComputeSumSquareErrorPlane( src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv); - const uint64 samples = width * height + 2 * (width_uv * height_uv); - const 
uint64 sse = sse_y + sse_u + sse_v; + const uint64_t samples = (uint64_t)width * (uint64_t)height + + 2 * ((uint64_t)width_uv * (uint64_t)height_uv); + const uint64_t sse = sse_y + sse_u + sse_v; return SumSquareErrorToPsnr(sse, samples); } -static const int64 cc1 = 26634; // (64^2*(.01*255)^2 -static const int64 cc2 = 239708; // (64^2*(.03*255)^2 +static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 +static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 -static double Ssim8x8_C(const uint8* src_a, +static double Ssim8x8_C(const uint8_t* src_a, int stride_a, - const uint8* src_b, + const uint8_t* src_b, int stride_b) { - int64 sum_a = 0; - int64 sum_b = 0; - int64 sum_sq_a = 0; - int64 sum_sq_b = 0; - int64 sum_axb = 0; + int64_t sum_a = 0; + int64_t sum_b = 0; + int64_t sum_sq_a = 0; + int64_t sum_sq_b = 0; + int64_t sum_axb = 0; int i; for (i = 0; i < 8; ++i) { @@ -270,20 +352,20 @@ static double Ssim8x8_C(const uint8* src_a, } { - const int64 count = 64; + const int64_t count = 64; // scale the constants by number of pixels - const int64 c1 = (cc1 * count * count) >> 12; - const int64 c2 = (cc2 * count * count) >> 12; + const int64_t c1 = (cc1 * count * count) >> 12; + const int64_t c2 = (cc2 * count * count) >> 12; - const int64 sum_a_x_sum_b = sum_a * sum_b; + const int64_t sum_a_x_sum_b = sum_a * sum_b; - const int64 ssim_n = (2 * sum_a_x_sum_b + c1) * - (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2); + const int64_t ssim_n = (2 * sum_a_x_sum_b + c1) * + (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2); - const int64 sum_a_sq = sum_a * sum_a; - const int64 sum_b_sq = sum_b * sum_b; + const int64_t sum_a_sq = sum_a * sum_a; + const int64_t sum_b_sq = sum_b * sum_b; - const int64 ssim_d = + const int64_t ssim_d = (sum_a_sq + sum_b_sq + c1) * (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2); @@ -298,15 +380,15 @@ static double Ssim8x8_C(const uint8* src_a, // on the 4x4 pixel grid. Such arrangement allows the windows to overlap // block boundaries to penalize blocking artifacts. 
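Ssim8x8_C above evaluates the standard SSIM ratio entirely in integer sums; cc1 and cc2 are (0.01*255)^2 and (0.03*255)^2 pre-scaled by 64^2, so c1 and c2 come out multiplied by count^2 and both sides of the ratio end up scaled by count^4. A floating-point restatement of the same formula, for reference only (not part of the diff; it should agree with the integer form up to rounding):

// Reference-only restatement of the SSIM formula evaluated by Ssim8x8_C.
// n is the window pixel count (64 for 8x8); C1/C2 follow the usual SSIM
// definition for 8-bit data.
static double SsimFromSums(double sum_a, double sum_b, double sum_sq_a,
                           double sum_sq_b, double sum_axb, double n) {
  const double c1 = 6.5025;   // (0.01 * 255)^2
  const double c2 = 58.5225;  // (0.03 * 255)^2
  const double mean_a = sum_a / n;
  const double mean_b = sum_b / n;
  const double var_a = sum_sq_a / n - mean_a * mean_a;
  const double var_b = sum_sq_b / n - mean_b * mean_b;
  const double cov_ab = sum_axb / n - mean_a * mean_b;
  return ((2.0 * mean_a * mean_b + c1) * (2.0 * cov_ab + c2)) /
         ((mean_a * mean_a + mean_b * mean_b + c1) * (var_a + var_b + c2));
}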
LIBYUV_API -double CalcFrameSsim(const uint8* src_a, +double CalcFrameSsim(const uint8_t* src_a, int stride_a, - const uint8* src_b, + const uint8_t* src_b, int stride_b, int width, int height) { int samples = 0; double ssim_total = 0; - double (*Ssim8x8)(const uint8* src_a, int stride_a, const uint8* src_b, + double (*Ssim8x8)(const uint8_t* src_a, int stride_a, const uint8_t* src_b, int stride_b) = Ssim8x8_C; // sample point start with each 4x4 location @@ -327,17 +409,17 @@ double CalcFrameSsim(const uint8* src_a, } LIBYUV_API -double I420Ssim(const uint8* src_y_a, +double I420Ssim(const uint8_t* src_y_a, int stride_y_a, - const uint8* src_u_a, + const uint8_t* src_u_a, int stride_u_a, - const uint8* src_v_a, + const uint8_t* src_v_a, int stride_v_a, - const uint8* src_y_b, + const uint8_t* src_y_b, int stride_y_b, - const uint8* src_u_b, + const uint8_t* src_u_b, int stride_u_b, - const uint8* src_v_b, + const uint8_t* src_v_b, int stride_v_b, int width, int height) { diff --git a/files/source/compare_common.cc b/files/source/compare_common.cc index 42fc5893..d4b170ad 100644 --- a/files/source/compare_common.cc +++ b/files/source/compare_common.cc @@ -17,20 +17,80 @@ namespace libyuv { extern "C" { #endif -uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) { - uint32 sse = 0u; +#if ORIGINAL_OPT +uint32_t HammingDistance_C1(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + int i; + for (i = 0; i < count; ++i) { + int x = src_a[i] ^ src_b[i]; + if (x & 1) + ++diff; + if (x & 2) + ++diff; + if (x & 4) + ++diff; + if (x & 8) + ++diff; + if (x & 16) + ++diff; + if (x & 32) + ++diff; + if (x & 64) + ++diff; + if (x & 128) + ++diff; + } + return diff; +} +#endif + +// Hakmem method for hamming distance. +uint32_t HammingDistance_C(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + int i; + for (i = 0; i < count - 3; i += 4) { + uint32_t x = *((const uint32_t*)src_a) ^ *((const uint32_t*)src_b); + uint32_t u = x - ((x >> 1) & 0x55555555); + u = ((u >> 2) & 0x33333333) + (u & 0x33333333); + diff += ((((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24); + src_a += 4; + src_b += 4; + } + + for (; i < count; ++i) { + uint32_t x = *src_a ^ *src_b; + uint32_t u = x - ((x >> 1) & 0x55); + u = ((u >> 2) & 0x33) + (u & 0x33); + diff += (u + (u >> 4)) & 0x0f; + src_a += 1; + src_b += 1; + } + + return diff; +} + +uint32_t SumSquareError_C(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse = 0u; int i; for (i = 0; i < count; ++i) { int diff = src_a[i] - src_b[i]; - sse += (uint32)(diff * diff); + sse += (uint32_t)(diff * diff); } return sse; } // hash seed of 5381 recommended. // Internal C version of HashDjb2 with int sized count for efficiency. 
-uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) { - uint32 hash = seed; +uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed) { + uint32_t hash = seed; int i; for (i = 0; i < count; ++i) { hash += (hash << 5) + src[i]; diff --git a/files/source/compare_gcc.cc b/files/source/compare_gcc.cc index 64522aaa..676527c1 100644 --- a/files/source/compare_gcc.cc +++ b/files/source/compare_gcc.cc @@ -22,124 +22,334 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) -uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { - uint32 sse; - asm volatile ( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "lea " MEMLEA(0x10, 0) ",%0 \n" - "movdqu " MEMACCESS(1) ",%%xmm2 \n" - "lea " MEMLEA(0x10, 1) ",%1 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psubusb %%xmm2,%%xmm1 \n" - "psubusb %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm2 \n" - "pmaddwd %%xmm1,%%xmm1 \n" - "pmaddwd %%xmm2,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" - "paddd %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - - "pshufd $0xee,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "pshufd $0x1,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0,%3 \n" - - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "=g"(sse) // %3 - :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); +#if defined(__x86_64__) +uint32_t HammingDistance_SSE42(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint64_t diff = 0u; + + asm volatile( + "xor %3,%3 \n" + "xor %%r8,%%r8 \n" + "xor %%r9,%%r9 \n" + "xor %%r10,%%r10 \n" + + // Process 32 bytes per loop. + LABELALIGN + "1: \n" + "mov (%0),%%rcx \n" + "mov 0x8(%0),%%rdx \n" + "xor (%1),%%rcx \n" + "xor 0x8(%1),%%rdx \n" + "popcnt %%rcx,%%rcx \n" + "popcnt %%rdx,%%rdx \n" + "mov 0x10(%0),%%rsi \n" + "mov 0x18(%0),%%rdi \n" + "xor 0x10(%1),%%rsi \n" + "xor 0x18(%1),%%rdi \n" + "popcnt %%rsi,%%rsi \n" + "popcnt %%rdi,%%rdi \n" + "add $0x20,%0 \n" + "add $0x20,%1 \n" + "add %%rcx,%3 \n" + "add %%rdx,%%r8 \n" + "add %%rsi,%%r9 \n" + "add %%rdi,%%r10 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + "add %%r8, %3 \n" + "add %%r9, %3 \n" + "add %%r10, %3 \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : + : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10"); + + return static_cast<uint32_t>(diff); +} +#else +uint32_t HammingDistance_SSE42(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + asm volatile( + // Process 16 bytes per loop. 
+ LABELALIGN + "1: \n" + "mov (%0),%%ecx \n" + "mov 0x4(%0),%%edx \n" + "xor (%1),%%ecx \n" + "xor 0x4(%1),%%edx \n" + "popcnt %%ecx,%%ecx \n" + "add %%ecx,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "mov 0x8(%0),%%ecx \n" + "mov 0xc(%0),%%edx \n" + "xor 0x8(%1),%%ecx \n" + "xor 0xc(%1),%%edx \n" + "popcnt %%ecx,%%ecx \n" + "add %%ecx,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "add $0x10,%0 \n" + "add $0x10,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "+r"(diff) // %3 + : + : "memory", "cc", "ecx", "edx"); + + return diff; +} +#endif + +static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15}; +static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; + +uint32_t HammingDistance_SSSE3(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + asm volatile( + "movdqa %4,%%xmm2 \n" + "movdqa %5,%%xmm3 \n" + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqa (%0),%%xmm4 \n" + "movdqa 0x10(%0), %%xmm5 \n" + "pxor (%0,%1), %%xmm4 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pand %%xmm2,%%xmm6 \n" + "psrlw $0x4,%%xmm4 \n" + "movdqa %%xmm3,%%xmm7 \n" + "pshufb %%xmm6,%%xmm7 \n" + "pand %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "paddb %%xmm7,%%xmm6 \n" + "pxor 0x10(%0,%1),%%xmm5 \n" + "add $0x20,%0 \n" + "movdqa %%xmm5,%%xmm4 \n" + "pand %%xmm2,%%xmm5 \n" + "psrlw $0x4,%%xmm4 \n" + "movdqa %%xmm3,%%xmm7 \n" + "pshufb %%xmm5,%%xmm7 \n" + "pand %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufb %%xmm4,%%xmm5 \n" + "paddb %%xmm7,%%xmm5 \n" + "paddb %%xmm5,%%xmm6 \n" + "psadbw %%xmm1,%%xmm6 \n" + "paddd %%xmm6,%%xmm0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + "pshufd $0xaa,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0, %3 \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : "m"(kNibbleMask), // %4 + "m"(kBitCount) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); + + return diff; +} + +#ifdef HAS_HAMMINGDISTANCE_AVX2 +uint32_t HammingDistance_AVX2(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + asm volatile( + "vbroadcastf128 %4,%%ymm2 \n" + "vbroadcastf128 %5,%%ymm3 \n" + "vpxor %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm1,%%ymm1,%%ymm1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "vmovdqa (%0),%%ymm4 \n" + "vmovdqa 0x20(%0), %%ymm5 \n" + "vpxor (%0,%1), %%ymm4, %%ymm4 \n" + "vpand %%ymm2,%%ymm4,%%ymm6 \n" + "vpsrlw $0x4,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm6 \n" + "vpand %%ymm2,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" + "vpaddb %%ymm4,%%ymm6,%%ymm6 \n" + "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n" + "add $0x40,%0 \n" + "vpand %%ymm2,%%ymm4,%%ymm5 \n" + "vpsrlw $0x4,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm5 \n" + "vpand %%ymm2,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" + "vpaddb %%ymm5,%%ymm4,%%ymm4 \n" + "vpaddb %%ymm6,%%ymm4,%%ymm4 \n" + "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n" + "vpaddd %%ymm0,%%ymm4,%%ymm0 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + + "vpermq $0xb1,%%ymm0,%%ymm1 \n" + "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xaa,%%ymm0,%%ymm1 \n" + "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" + "vmovd %%xmm0, %3 \n" + "vzeroupper \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : "m"(kNibbleMask), // %4 + "m"(kBitCount) // %5 + : "memory", "cc", "xmm0", 
"xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + + return diff; +} +#endif // HAS_HAMMINGDISTANCE_AVX2 + +uint32_t SumSquareError_SSE2(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse; + asm volatile( + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psubusb %%xmm2,%%xmm1 \n" + "psubusb %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm2 \n" + "pmaddwd %%xmm1,%%xmm1 \n" + "pmaddwd %%xmm2,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" + "paddd %%xmm2,%%xmm0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + + "pshufd $0xee,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "pshufd $0x1,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0,%3 \n" + + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=g"(sse) // %3 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); return sse; } -static uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 -static uvec32 kHashMul0 = { +static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 +static const uvec32 kHashMul0 = { 0x0c3525e1, // 33 ^ 15 0xa3476dc1, // 33 ^ 14 0x3b4039a1, // 33 ^ 13 0x4f5f0981, // 33 ^ 12 }; -static uvec32 kHashMul1 = { +static const uvec32 kHashMul1 = { 0x30f35d61, // 33 ^ 11 0x855cb541, // 33 ^ 10 0x040a9121, // 33 ^ 9 0x747c7101, // 33 ^ 8 }; -static uvec32 kHashMul2 = { +static const uvec32 kHashMul2 = { 0xec41d4e1, // 33 ^ 7 0x4cfa3cc1, // 33 ^ 6 0x025528a1, // 33 ^ 5 0x00121881, // 33 ^ 4 }; -static uvec32 kHashMul3 = { +static const uvec32 kHashMul3 = { 0x00008c61, // 33 ^ 3 0x00000441, // 33 ^ 2 0x00000021, // 33 ^ 1 0x00000001, // 33 ^ 0 }; -uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { - uint32 hash; - asm volatile ( - "movd %2,%%xmm0 \n" - "pxor %%xmm7,%%xmm7 \n" - "movdqa %4,%%xmm6 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "lea " MEMLEA(0x10, 0) ",%0 \n" - "pmulld %%xmm6,%%xmm0 \n" - "movdqa %5,%%xmm5 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm7,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm7,%%xmm3 \n" - "pmulld %%xmm5,%%xmm3 \n" - "movdqa %6,%%xmm5 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpckhwd %%xmm7,%%xmm4 \n" - "pmulld %%xmm5,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "punpckhbw %%xmm7,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm7,%%xmm2 \n" - "pmulld %%xmm5,%%xmm2 \n" - "movdqa %8,%%xmm5 \n" - "punpckhwd %%xmm7,%%xmm1 \n" - "pmulld %%xmm5,%%xmm1 \n" - "paddd %%xmm4,%%xmm3 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm1 \n" - "pshufd $0xe,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "pshufd $0x1,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "sub $0x10,%1 \n" - "jg 1b \n" - "movd %%xmm0,%3 \n" - : "+r"(src), // %0 - "+r"(count), // %1 - "+rm"(seed), // %2 - "=g"(hash) // %3 - : "m"(kHash16x33), // %4 - "m"(kHashMul0), // %5 - "m"(kHashMul1), // %6 - "m"(kHashMul2), // %7 - "m"(kHashMul3) // %8 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { + uint32_t hash; + asm volatile( + "movd %2,%%xmm0 \n" + "pxor %%xmm7,%%xmm7 \n" + "movdqa %4,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pmulld %%xmm6,%%xmm0 \n" + "movdqa %5,%%xmm5 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm2 \n" 
+ "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "pmulld %%xmm5,%%xmm3 \n" + "movdqa %6,%%xmm5 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpckhwd %%xmm7,%%xmm4 \n" + "pmulld %%xmm5,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "pmulld %%xmm5,%%xmm2 \n" + "movdqa %8,%%xmm5 \n" + "punpckhwd %%xmm7,%%xmm1 \n" + "pmulld %%xmm5,%%xmm1 \n" + "paddd %%xmm4,%%xmm3 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm1 \n" + "pshufd $0xe,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "pshufd $0x1,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "sub $0x10,%1 \n" + "jg 1b \n" + "movd %%xmm0,%3 \n" + : "+r"(src), // %0 + "+r"(count), // %1 + "+rm"(seed), // %2 + "=g"(hash) // %3 + : "m"(kHash16x33), // %4 + "m"(kHashMul0), // %5 + "m"(kHashMul1), // %6 + "m"(kHashMul2), // %7 + "m"(kHashMul3) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); return hash; } #endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) diff --git a/files/source/compare_mmi.cc b/files/source/compare_mmi.cc new file mode 100644 index 00000000..7640d946 --- /dev/null +++ b/files/source/compare_mmi.cc @@ -0,0 +1,123 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for Mips MMI. +#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) + +// Hakmem method for hamming distance. 
+uint32_t HammingDistance_MMI(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + uint64_t temp = 0, temp1 = 0, ta = 0, tb = 0; + uint64_t c1 = 0x5555555555555555; + uint64_t c2 = 0x3333333333333333; + uint64_t c3 = 0x0f0f0f0f0f0f0f0f; + uint32_t c4 = 0x01010101; + uint64_t s1 = 1, s2 = 2, s3 = 4; + __asm__ volatile( + "1: \n\t" + "ldc1 %[ta], 0(%[src_a]) \n\t" + "ldc1 %[tb], 0(%[src_b]) \n\t" + "xor %[temp], %[ta], %[tb] \n\t" + "psrlw %[temp1], %[temp], %[s1] \n\t" // temp1=x>>1 + "and %[temp1], %[temp1], %[c1] \n\t" // temp1&=c1 + "psubw %[temp1], %[temp], %[temp1] \n\t" // x-temp1 + "and %[temp], %[temp1], %[c2] \n\t" // t = (u&c2) + "psrlw %[temp1], %[temp1], %[s2] \n\t" // u>>2 + "and %[temp1], %[temp1], %[c2] \n\t" // u>>2 & c2 + "paddw %[temp1], %[temp1], %[temp] \n\t" // t1 = t1+t + "psrlw %[temp], %[temp1], %[s3] \n\t" // u>>4 + "paddw %[temp1], %[temp1], %[temp] \n\t" // u+(u>>4) + "and %[temp1], %[temp1], %[c3] \n\t" //&c3 + "dmfc1 $t0, %[temp1] \n\t" + "dsrl32 $t0, $t0, 0 \n\t " + "mul $t0, $t0, %[c4] \n\t" + "dsrl $t0, $t0, 24 \n\t" + "dadd %[diff], %[diff], $t0 \n\t" + "dmfc1 $t0, %[temp1] \n\t" + "mul $t0, $t0, %[c4] \n\t" + "dsrl $t0, $t0, 24 \n\t" + "dadd %[diff], %[diff], $t0 \n\t" + "daddiu %[src_a], %[src_a], 8 \n\t" + "daddiu %[src_b], %[src_b], 8 \n\t" + "addiu %[count], %[count], -8 \n\t" + "bgtz %[count], 1b \n\t" + "nop \n\t" + : [diff] "+r"(diff), [src_a] "+r"(src_a), [src_b] "+r"(src_b), + [count] "+r"(count), [ta] "+f"(ta), [tb] "+f"(tb), [temp] "+f"(temp), + [temp1] "+f"(temp1) + : [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [c4] "r"(c4), [s1] "f"(s1), + [s2] "f"(s2), [s3] "f"(s3) + : "memory"); + return diff; +} + +uint32_t SumSquareError_MMI(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse = 0u; + uint32_t sse_hi = 0u, sse_lo = 0u; + + uint64_t src1, src2; + uint64_t diff, diff_hi, diff_lo; + uint64_t sse_sum, sse_tmp; + + const uint64_t mask = 0x0ULL; + + __asm__ volatile( + "xor %[sse_sum], %[sse_sum], %[sse_sum] \n\t" + + "1: \n\t" + "ldc1 %[src1], 0x00(%[src_a]) \n\t" + "ldc1 %[src2], 0x00(%[src_b]) \n\t" + "pasubub %[diff], %[src1], %[src2] \n\t" + "punpcklbh %[diff_lo], %[diff], %[mask] \n\t" + "punpckhbh %[diff_hi], %[diff], %[mask] \n\t" + "pmaddhw %[sse_tmp], %[diff_lo], %[diff_lo] \n\t" + "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t" + "pmaddhw %[sse_tmp], %[diff_hi], %[diff_hi] \n\t" + "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t" + + "daddiu %[src_a], %[src_a], 0x08 \n\t" + "daddiu %[src_b], %[src_b], 0x08 \n\t" + "daddiu %[count], %[count], -0x08 \n\t" + "bnez %[count], 1b \n\t" + + "mfc1 %[sse_lo], %[sse_sum] \n\t" + "mfhc1 %[sse_hi], %[sse_sum] \n\t" + "daddu %[sse], %[sse_hi], %[sse_lo] \n\t" + : [sse] "+&r"(sse), [diff] "=&f"(diff), [src1] "=&f"(src1), + [src2] "=&f"(src2), [diff_lo] "=&f"(diff_lo), [diff_hi] "=&f"(diff_hi), + [sse_sum] "=&f"(sse_sum), [sse_tmp] "=&f"(sse_tmp), + [sse_hi] "+&r"(sse_hi), [sse_lo] "+&r"(sse_lo) + : [src_a] "r"(src_a), [src_b] "r"(src_b), [count] "r"(count), + [mask] "f"(mask) + : "memory"); + + return sse; +} + +#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/files/source/compare_msa.cc b/files/source/compare_msa.cc new file mode 100644 index 00000000..0b807d37 --- /dev/null +++ b/files/source/compare_msa.cc @@ -0,0 +1,97 @@ +/* + * Copyright 2017 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" +#include "libyuv/row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +uint32_t HammingDistance_MSA(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + int i; + v16u8 src0, src1, src2, src3; + v2i64 vec0 = {0}, vec1 = {0}; + + for (i = 0; i < count; i += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16); + src0 ^= src2; + src1 ^= src3; + vec0 += __msa_pcnt_d((v2i64)src0); + vec1 += __msa_pcnt_d((v2i64)src1); + src_a += 32; + src_b += 32; + } + + vec0 += vec1; + diff = (uint32_t)__msa_copy_u_w((v4i32)vec0, 0); + diff += (uint32_t)__msa_copy_u_w((v4i32)vec0, 2); + return diff; +} + +uint32_t SumSquareError_MSA(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse = 0u; + int i; + v16u8 src0, src1, src2, src3; + v8i16 vec0, vec1, vec2, vec3; + v4i32 reg0 = {0}, reg1 = {0}, reg2 = {0}, reg3 = {0}; + v2i64 tmp0; + + for (i = 0; i < count; i += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16); + vec0 = (v8i16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8i16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec0 = __msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = __msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); + reg0 = __msa_dpadd_s_w(reg0, vec0, vec0); + reg1 = __msa_dpadd_s_w(reg1, vec1, vec1); + reg2 = __msa_dpadd_s_w(reg2, vec2, vec2); + reg3 = __msa_dpadd_s_w(reg3, vec3, vec3); + src_a += 32; + src_b += 32; + } + + reg0 += reg1; + reg2 += reg3; + reg0 += reg2; + tmp0 = __msa_hadd_s_d(reg0, reg0); + sse = (uint32_t)__msa_copy_u_w((v4i32)tmp0, 0); + sse += (uint32_t)__msa_copy_u_w((v4i32)tmp0, 2); + return sse; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/files/source/compare_neon.cc b/files/source/compare_neon.cc index 49aa3b4e..2a2181e0 100644 --- a/files/source/compare_neon.cc +++ b/files/source/compare_neon.cc @@ -21,40 +21,70 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ !defined(__aarch64__) -uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { - volatile uint32 sse; - asm volatile ( - "vmov.u8 q8, #0 \n" - "vmov.u8 q10, #0 \n" - "vmov.u8 q9, #0 \n" - "vmov.u8 q11, #0 \n" +// 256 bits at a time +// uses short accumulator which restricts count to 131 KB +uint32_t HammingDistance_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff; - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" - MEMACCESS(1) - "vld1.8 {q1}, [%1]! 
\n" - "subs %2, %2, #16 \n" - "vsubl.u8 q2, d0, d2 \n" - "vsubl.u8 q3, d1, d3 \n" - "vmlal.s16 q8, d4, d4 \n" - "vmlal.s16 q9, d6, d6 \n" - "vmlal.s16 q10, d5, d5 \n" - "vmlal.s16 q11, d7, d7 \n" - "bgt 1b \n" + asm volatile( + "vmov.u16 q4, #0 \n" // accumulator - "vadd.u32 q8, q8, q9 \n" - "vadd.u32 q10, q10, q11 \n" - "vadd.u32 q11, q8, q10 \n" - "vpaddl.u32 q1, q11 \n" - "vadd.u64 d0, d2, d3 \n" - "vmov.32 %3, d0[0] \n" - : "+r"(src_a), - "+r"(src_b), - "+r"(count), - "=r"(sse) - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); + "1: \n" + "vld1.8 {q0, q1}, [%0]! \n" + "vld1.8 {q2, q3}, [%1]! \n" + "veor.32 q0, q0, q2 \n" + "veor.32 q1, q1, q3 \n" + "vcnt.i8 q0, q0 \n" + "vcnt.i8 q1, q1 \n" + "subs %2, %2, #32 \n" + "vadd.u8 q0, q0, q1 \n" // 16 byte counts + "vpadal.u8 q4, q0 \n" // 8 shorts + "bgt 1b \n" + + "vpaddl.u16 q0, q4 \n" // 4 ints + "vpadd.u32 d0, d0, d1 \n" + "vpadd.u32 d0, d0, d0 \n" + "vmov.32 %3, d0[0] \n" + + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) + : + : "cc", "q0", "q1", "q2", "q3", "q4"); + return diff; +} + +uint32_t SumSquareError_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse; + asm volatile( + "vmov.u8 q8, #0 \n" + "vmov.u8 q10, #0 \n" + "vmov.u8 q9, #0 \n" + "vmov.u8 q11, #0 \n" + + "1: \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q1}, [%1]! \n" + "subs %2, %2, #16 \n" + "vsubl.u8 q2, d0, d2 \n" + "vsubl.u8 q3, d1, d3 \n" + "vmlal.s16 q8, d4, d4 \n" + "vmlal.s16 q9, d6, d6 \n" + "vmlal.s16 q10, d5, d5 \n" + "vmlal.s16 q11, d7, d7 \n" + "bgt 1b \n" + + "vadd.u32 q8, q8, q9 \n" + "vadd.u32 q10, q10, q11 \n" + "vadd.u32 q11, q8, q10 \n" + "vpaddl.u32 q1, q11 \n" + "vadd.u64 d0, d2, d3 \n" + "vmov.32 %3, d0[0] \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); return sse; } diff --git a/files/source/compare_neon64.cc b/files/source/compare_neon64.cc index f9c7df98..6e8f672a 100644 --- a/files/source/compare_neon64.cc +++ b/files/source/compare_neon64.cc @@ -20,39 +20,65 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) -uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { - volatile uint32 sse; - asm volatile ( - "eor v16.16b, v16.16b, v16.16b \n" - "eor v18.16b, v18.16b, v18.16b \n" - "eor v17.16b, v17.16b, v17.16b \n" - "eor v19.16b, v19.16b, v19.16b \n" +// 256 bits at a time +// uses short accumulator which restricts count to 131 KB +uint32_t HammingDistance_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff; + asm volatile( + "movi v4.8h, #0 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" - MEMACCESS(1) - "ld1 {v1.16b}, [%1], #16 \n" - "subs %w2, %w2, #16 \n" - "usubl v2.8h, v0.8b, v1.8b \n" - "usubl2 v3.8h, v0.16b, v1.16b \n" - "smlal v16.4s, v2.4h, v2.4h \n" - "smlal v17.4s, v3.4h, v3.4h \n" - "smlal2 v18.4s, v2.8h, v2.8h \n" - "smlal2 v19.4s, v3.8h, v3.8h \n" - "b.gt 1b \n" + "1: \n" + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" + "eor v0.16b, v0.16b, v2.16b \n" + "eor v1.16b, v1.16b, v3.16b \n" + "cnt v0.16b, v0.16b \n" + "cnt v1.16b, v1.16b \n" + "subs %w2, %w2, #32 \n" + "add v0.16b, v0.16b, v1.16b \n" + "uadalp v4.8h, v0.16b \n" + "b.gt 1b \n" - "add v16.4s, v16.4s, v17.4s \n" - "add v18.4s, v18.4s, v19.4s \n" - "add v19.4s, v16.4s, v18.4s \n" - "addv s0, v19.4s \n" - "fmov %w3, s0 \n" - : "+r"(src_a), - "+r"(src_b), - "+r"(count), - "=r"(sse) - : - : "cc", "v0", "v1", "v2", "v3", 
"v16", "v17", "v18", "v19"); + "uaddlv s4, v4.8h \n" + "fmov %w3, s4 \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) + : + : "cc", "v0", "v1", "v2", "v3", "v4"); + return diff; +} + +uint32_t SumSquareError_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse; + asm volatile( + "eor v16.16b, v16.16b, v16.16b \n" + "eor v18.16b, v18.16b, v18.16b \n" + "eor v17.16b, v17.16b, v17.16b \n" + "eor v19.16b, v19.16b, v19.16b \n" + + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" + "ld1 {v1.16b}, [%1], #16 \n" + "subs %w2, %w2, #16 \n" + "usubl v2.8h, v0.8b, v1.8b \n" + "usubl2 v3.8h, v0.16b, v1.16b \n" + "smlal v16.4s, v2.4h, v2.4h \n" + "smlal v17.4s, v3.4h, v3.4h \n" + "smlal2 v18.4s, v2.8h, v2.8h \n" + "smlal2 v19.4s, v3.8h, v3.8h \n" + "b.gt 1b \n" + + "add v16.4s, v16.4s, v17.4s \n" + "add v18.4s, v18.4s, v19.4s \n" + "add v19.4s, v16.4s, v18.4s \n" + "addv s0, v19.4s \n" + "fmov %w3, s0 \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) + : + : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); return sse; } diff --git a/files/source/compare_win.cc b/files/source/compare_win.cc index b17fc8e1..d57d3d9d 100644 --- a/files/source/compare_win.cc +++ b/files/source/compare_win.cc @@ -13,16 +13,35 @@ #include "libyuv/compare_row.h" #include "libyuv/row.h" +#if defined(_MSC_VER) +#include <intrin.h> // For __popcnt +#endif + #ifdef __cplusplus namespace libyuv { extern "C" { #endif // This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +uint32_t HammingDistance_SSE42(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + int i; + for (i = 0; i < count - 3; i += 4) { + uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b); // NOLINT + src_a += 4; + src_b += 4; + diff += __popcnt(x); + } + return diff; +} -__declspec(naked) uint32 - SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { +__declspec(naked) uint32_t + SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) { __asm { mov eax, [esp + 4] // src_a mov edx, [esp + 8] // src_b @@ -62,8 +81,8 @@ __declspec(naked) uint32 #if _MSC_VER >= 1700 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. #pragma warning(disable : 4752) -__declspec(naked) uint32 - SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { +__declspec(naked) uint32_t + SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) { __asm { mov eax, [esp + 4] // src_a mov edx, [esp + 8] // src_b @@ -127,8 +146,8 @@ uvec32 kHashMul3 = { 0x00000001, // 33 ^ 0 }; -__declspec(naked) uint32 - HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { +__declspec(naked) uint32_t + HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { __asm { mov eax, [esp + 4] // src mov ecx, [esp + 8] // count @@ -178,8 +197,8 @@ __declspec(naked) uint32 // Visual C 2012 required for AVX2. #if _MSC_VER >= 1700 -__declspec(naked) uint32 - HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { +__declspec(naked) uint32_t + HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) { __asm { mov eax, [esp + 4] // src mov ecx, [esp + 8] // count diff --git a/files/source/convert.cc b/files/source/convert.cc index dfa83a5a..614fa482 100644 --- a/files/source/convert.cc +++ b/files/source/convert.cc @@ -28,17 +28,17 @@ static __inline int Abs(int v) { } // Any I4xx To I420 format with mirroring. 
-static int I4xxToI420(const uint8* src_y, +static int I4xxToI420(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int src_y_width, int src_y_height, @@ -62,21 +62,21 @@ static int I4xxToI420(const uint8* src_y, return 0; } -// Copy I420 with optional flipping +// Copy I420 with optional flipping. // TODO(fbarchard): Use Scale plane which supports mirroring, but ensure // is does row coalescing. LIBYUV_API -int I420Copy(const uint8* src_y, +int I420Copy(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { @@ -106,20 +106,106 @@ int I420Copy(const uint8* src_y, return 0; } +// Copy I010 with optional flipping. +LIBYUV_API +int I010Copy(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + // Copy UV planes. + CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; +} + +// Convert 10 bit YUV to 8 bit. +LIBYUV_API +int I010ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. + Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, 16384, width, + height); + // Convert UV planes. 
+ Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, 16384, halfwidth, + halfheight); + Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, 16384, halfwidth, + halfheight); + return 0; +} + // 422 chroma is 1/2 width, 1x height // 420 chroma is 1/2 width, 1/2 height LIBYUV_API -int I422ToI420(const uint8* src_y, +int I422ToI420(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { @@ -129,20 +215,209 @@ int I422ToI420(const uint8* src_y, dst_v, dst_stride_v, width, height, src_uv_width, height); } +// TODO(fbarchard): Implement row conversion. +LIBYUV_API +int I422ToNV21(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Allocate u and v buffers + align_buffer_64(plane_u, halfwidth * halfheight * 2); + uint8_t* plane_v = plane_u + halfwidth * halfheight; + + I422ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width, + height); + MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu, + halfwidth, halfheight); + free_aligned_buffer_64(plane_u); + return 0; +} + +#ifdef I422TONV21_ROW_VERSION +// Unittest fails for this version. +// 422 chroma is 1/2 width, 1x height +// 420 chroma is 1/2 width, 1/2 height +// Swap src_u and src_v to implement I422ToNV12 +LIBYUV_API +int I422ToNV21(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + int y; + void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_uv, int width) = MergeUVRow_C; + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_vu || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeUVRow = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow = MergeUVRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow = MergeUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow = MergeUVRow_NEON; + } + } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow = MergeUVRow_MSA; + } + } +#endif +#if defined(HAS_MERGEUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + MergeUVRow = MergeUVRow_Any_MMI; + if (IS_ALIGNED(halfwidth, 8)) { + MergeUVRow = MergeUVRow_MMI; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + InterpolateRow = InterpolateRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + InterpolateRow = InterpolateRow_MMI; + } + } +#endif + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, halfwidth, height); + } + { + // Allocate 2 rows of vu. 
+ int awidth = halfwidth * 2; + align_buffer_64(row_vu_0, awidth * 2); + uint8_t* row_vu_1 = row_vu_0 + awidth; + + for (y = 0; y < height - 1; y += 2) { + MergeUVRow(src_v, src_u, row_vu_0, halfwidth); + MergeUVRow(src_v + src_stride_v, src_u + src_stride_u, row_vu_1, + halfwidth); + InterpolateRow(dst_vu, row_vu_0, awidth, awidth, 128); + src_u += src_stride_u * 2; + src_v += src_stride_v * 2; + dst_vu += dst_stride_vu; + } + if (height & 1) { + MergeUVRow(src_v, src_u, dst_vu, halfwidth); + } + free_aligned_buffer_64(row_vu_0); + } + return 0; +} +#endif // I422TONV21_ROW_VERSION + // 444 chroma is 1x width, 1x height // 420 chroma is 1/2 width, 1/2 height LIBYUV_API -int I444ToI420(const uint8* src_y, +int I444ToI420(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { @@ -151,15 +426,55 @@ int I444ToI420(const uint8* src_y, dst_v, dst_stride_v, width, height, width, height); } +// TODO(fbarchard): Implement row conversion. +LIBYUV_API +int I444ToNV21(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + // Allocate u and v buffers + align_buffer_64(plane_u, halfwidth * halfheight * 2); + uint8_t* plane_v = plane_u + halfwidth * halfheight; + + I444ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width, + height); + MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu, + halfwidth, halfheight); + free_aligned_buffer_64(plane_u); + return 0; +} + // I400 is greyscale typically used in MJPG LIBYUV_API -int I400ToI420(const uint8* src_y, +int I400ToI420(const uint8_t* src_y, int src_stride_y, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { @@ -183,15 +498,44 @@ int I400ToI420(const uint8* src_y, return 0; } -static void CopyPlane2(const uint8* src, +// I400 is greyscale typically used in MJPG +LIBYUV_API +int I400ToNV21(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!dst_vu || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + SetPlane(dst_vu, dst_stride_vu, halfwidth * 2, halfheight, 128); + return 0; +} + +static void CopyPlane2(const uint8_t* src, int src_stride_0, int src_stride_1, - uint8* dst, + uint8_t* dst, int dst_stride, int width, int height) { int y; - void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; + void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; @@ -212,11 +556,6 @@ static void CopyPlane2(const uint8* src, CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif -#if defined(HAS_COPYROW_MIPS) - if (TestCpuFlag(kCpuHasMIPS)) { - CopyRow = CopyRow_MIPS; - } -#endif // Copy plane for (y = 0; y < height - 1; y += 2) { @@ -239,16 +578,16 @@ static void CopyPlane2(const uint8* src, // src_stride_m420 is row planar. Normally this will be the width in pixels. // The UV plane is half width, but 2 values, so src_stride_m420 applies to // this as well as the two Y planes. -static int X420ToI420(const uint8* src_y, +static int X420ToI420(const uint8_t* src_y, int src_stride_y0, int src_stride_y1, - const uint8* src_uv, + const uint8_t* src_uv, int src_stride_uv, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { @@ -303,15 +642,15 @@ static int X420ToI420(const uint8* src_y, // Convert NV12 to I420. LIBYUV_API -int NV12ToI420(const uint8* src_y, +int NV12ToI420(const uint8_t* src_y, int src_stride_y, - const uint8* src_uv, + const uint8_t* src_uv, int src_stride_uv, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { @@ -322,15 +661,15 @@ int NV12ToI420(const uint8* src_y, // Convert NV21 to I420. Same as NV12 but u and v pointers swapped. LIBYUV_API -int NV21ToI420(const uint8* src_y, +int NV21ToI420(const uint8_t* src_y, int src_stride_y, - const uint8* src_vu, + const uint8_t* src_vu, int src_stride_vu, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { @@ -341,13 +680,13 @@ int NV21ToI420(const uint8* src_y, // Convert M420 to I420. LIBYUV_API -int M420ToI420(const uint8* src_m420, +int M420ToI420(const uint8_t* src_m420, int src_stride_m420, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { @@ -359,20 +698,21 @@ int M420ToI420(const uint8* src_m420, // Convert YUY2 to I420. 
LIBYUV_API -int YUY2ToI420(const uint8* src_yuy2, +int YUY2ToI420(const uint8_t* src_yuy2, int src_stride_yuy2, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; - void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2, uint8* dst_u, - uint8* dst_v, int width) = YUY2ToUVRow_C; - void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) = + void (*YUY2ToUVRow)(const uint8_t* src_yuy2, int src_stride_yuy2, + uint8_t* dst_u, uint8_t* dst_v, int width) = + YUY2ToUVRow_C; + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = YUY2ToYRow_C; // Negative height means invert the image. if (height < 0) { @@ -420,6 +760,18 @@ int YUY2ToI420(const uint8* src_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + YUY2ToYRow = YUY2ToYRow_Any_MMI; + YUY2ToUVRow = YUY2ToUVRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + YUY2ToYRow = YUY2ToYRow_MMI; + if (IS_ALIGNED(width, 16)) { + YUY2ToUVRow = YUY2ToUVRow_MMI; + } + } + } +#endif for (y = 0; y < height - 1; y += 2) { YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); @@ -439,20 +791,21 @@ int YUY2ToI420(const uint8* src_yuy2, // Convert UYVY to I420. LIBYUV_API -int UYVYToI420(const uint8* src_uyvy, +int UYVYToI420(const uint8_t* src_uyvy, int src_stride_uyvy, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; - void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy, uint8* dst_u, - uint8* dst_v, int width) = UYVYToUVRow_C; - void (*UYVYToYRow)(const uint8* src_uyvy, uint8* dst_y, int width) = + void (*UYVYToUVRow)(const uint8_t* src_uyvy, int src_stride_uyvy, + uint8_t* dst_u, uint8_t* dst_v, int width) = + UYVYToUVRow_C; + void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = UYVYToYRow_C; // Negative height means invert the image. if (height < 0) { @@ -500,6 +853,16 @@ int UYVYToI420(const uint8* src_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + UYVYToYRow = UYVYToYRow_Any_MMI; + UYVYToUVRow = UYVYToUVRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_MMI; + UYVYToUVRow = UYVYToUVRow_MMI; + } + } +#endif for (y = 0; y < height - 1; y += 2) { UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); @@ -517,22 +880,161 @@ int UYVYToI420(const uint8* src_uyvy, return 0; } +// Convert AYUV to NV12. +LIBYUV_API +int AYUVToNV12(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + void (*AYUVToUVRow)(const uint8_t* src_ayuv, int src_stride_ayuv, + uint8_t* dst_uv, int width) = AYUVToUVRow_C; + void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) = + AYUVToYRow_C; + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv; + src_stride_ayuv = -src_stride_ayuv; + } +// place holders for future intel code +#if defined(HAS_AYUVTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + AYUVToUVRow = AYUVToUVRow_Any_SSE2; + AYUVToYRow = AYUVToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + AYUVToUVRow = AYUVToUVRow_SSE2; + AYUVToYRow = AYUVToYRow_SSE2; + } + } +#endif +#if defined(HAS_AYUVTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AYUVToUVRow = AYUVToUVRow_Any_AVX2; + AYUVToYRow = AYUVToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + AYUVToUVRow = AYUVToUVRow_AVX2; + AYUVToYRow = AYUVToYRow_AVX2; + } + } +#endif + +#if defined(HAS_AYUVTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AYUVToYRow = AYUVToYRow_Any_NEON; + AYUVToUVRow = AYUVToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + AYUVToYRow = AYUVToYRow_NEON; + AYUVToUVRow = AYUVToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + AYUVToUVRow(src_ayuv, src_stride_ayuv, dst_uv, width); + AYUVToYRow(src_ayuv, dst_y, width); + AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width); + src_ayuv += src_stride_ayuv * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + AYUVToUVRow(src_ayuv, 0, dst_uv, width); + AYUVToYRow(src_ayuv, dst_y, width); + } + return 0; +} + +// Convert AYUV to NV21. +LIBYUV_API +int AYUVToNV21(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + int y; + void (*AYUVToVURow)(const uint8_t* src_ayuv, int src_stride_ayuv, + uint8_t* dst_vu, int width) = AYUVToVURow_C; + void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) = + AYUVToYRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv; + src_stride_ayuv = -src_stride_ayuv; + } +// place holders for future intel code +#if defined(HAS_AYUVTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + AYUVToVURow = AYUVToVURow_Any_SSE2; + AYUVToYRow = AYUVToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + AYUVToVURow = AYUVToVURow_SSE2; + AYUVToYRow = AYUVToYRow_SSE2; + } + } +#endif +#if defined(HAS_AYUVTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AYUVToVURow = AYUVToVURow_Any_AVX2; + AYUVToYRow = AYUVToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + AYUVToVURow = AYUVToVURow_AVX2; + AYUVToYRow = AYUVToYRow_AVX2; + } + } +#endif + +#if defined(HAS_AYUVTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AYUVToYRow = AYUVToYRow_Any_NEON; + AYUVToVURow = AYUVToVURow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + AYUVToYRow = AYUVToYRow_NEON; + AYUVToVURow = AYUVToVURow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + AYUVToVURow(src_ayuv, src_stride_ayuv, dst_vu, width); + AYUVToYRow(src_ayuv, dst_y, width); + AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width); + src_ayuv += src_stride_ayuv * 2; + dst_y += dst_stride_y * 2; + dst_vu += dst_stride_vu; + } + if (height & 1) { + AYUVToVURow(src_ayuv, 0, dst_vu, width); + AYUVToYRow(src_ayuv, dst_y, width); + } + return 0; +} + // Convert ARGB to I420. 
LIBYUV_API -int ARGBToI420(const uint8* src_argb, +int ARGBToI420(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, - uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; @@ -579,14 +1081,6 @@ int ARGBToI420(const uint8* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - ARGBToYRow = ARGBToYRow_Any_DSPR2; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_DSPR2; - } - } -#endif #if defined(HAS_ARGBTOYROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYRow = ARGBToYRow_Any_MSA; @@ -595,14 +1089,6 @@ int ARGBToI420(const uint8* src_argb, } } #endif -#if defined(HAS_ARGBTOUVROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - ARGBToUVRow = ARGBToUVRow_Any_DSPR2; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_DSPR2; - } - } -#endif #if defined(HAS_ARGBTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToUVRow = ARGBToUVRow_Any_MSA; @@ -611,6 +1097,22 @@ int ARGBToI420(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYRow = ARGBToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_MMI; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToUVRow = ARGBToUVRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_MMI; + } + } +#endif for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); @@ -630,20 +1132,21 @@ int ARGBToI420(const uint8* src_argb, // Convert BGRA to I420. 
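Every converter in this file repeats the dispatch idiom visible in the ARGBToI420 hunk above: start from the portable _C row function, take the _Any_ SIMD variant once the CPU flag is present, and take the fully unrolled variant only when the width meets its alignment requirement (the _Any_ wrappers accept arbitrary widths and handle the leftover pixels themselves). A condensed restatement of that idiom, using the MMI names from the hunk:

  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
      ARGBToYRow_C;                     // portable fallback, always valid
  #if defined(HAS_ARGBTOYROW_MMI)
    if (TestCpuFlag(kCpuHasMMI)) {
      ARGBToYRow = ARGBToYRow_Any_MMI;  // SIMD body plus a tail, any width
      if (IS_ALIGNED(width, 8)) {
        ARGBToYRow = ARGBToYRow_MMI;    // pure SIMD path, width % 8 == 0
      }
    }
  #endif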
LIBYUV_API -int BGRAToI420(const uint8* src_bgra, +int BGRAToI420(const uint8_t* src_bgra, int src_stride_bgra, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; - void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra, uint8* dst_u, - uint8* dst_v, int width) = BGRAToUVRow_C; - void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int width) = + void (*BGRAToUVRow)(const uint8_t* src_bgra0, int src_stride_bgra, + uint8_t* dst_u, uint8_t* dst_v, int width) = + BGRAToUVRow_C; + void (*BGRAToYRow)(const uint8_t* src_bgra, uint8_t* dst_y, int width) = BGRAToYRow_C; if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; @@ -680,22 +1183,6 @@ int BGRAToI420(const uint8* src_bgra, } } #endif -#if defined(HAS_BGRATOYROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - BGRAToYRow = BGRAToYRow_Any_DSPR2; - if (IS_ALIGNED(width, 8)) { - BGRAToYRow = BGRAToYRow_DSPR2; - } - } -#endif -#if defined(HAS_BGRATOUVROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - BGRAToUVRow = BGRAToUVRow_Any_DSPR2; - if (IS_ALIGNED(width, 16)) { - BGRAToUVRow = BGRAToUVRow_DSPR2; - } - } -#endif #if defined(HAS_BGRATOYROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { BGRAToYRow = BGRAToYRow_Any_MSA; @@ -712,6 +1199,22 @@ int BGRAToI420(const uint8* src_bgra, } } #endif +#if defined(HAS_BGRATOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + BGRAToYRow = BGRAToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + BGRAToYRow = BGRAToYRow_MMI; + } + } +#endif +#if defined(HAS_BGRATOUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + BGRAToUVRow = BGRAToUVRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_MMI; + } + } +#endif for (y = 0; y < height - 1; y += 2) { BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width); @@ -731,20 +1234,21 @@ int BGRAToI420(const uint8* src_bgra, // Convert ABGR to I420. 
LIBYUV_API -int ABGRToI420(const uint8* src_abgr, +int ABGRToI420(const uint8_t* src_abgr, int src_stride_abgr, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; - void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr, uint8* dst_u, - uint8* dst_v, int width) = ABGRToUVRow_C; - void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int width) = + void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ABGRToUVRow_C; + void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) = ABGRToYRow_C; if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; @@ -781,22 +1285,6 @@ int ABGRToI420(const uint8* src_abgr, } } #endif -#if defined(HAS_ABGRTOYROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - ABGRToYRow = ABGRToYRow_Any_DSPR2; - if (IS_ALIGNED(width, 8)) { - ABGRToYRow = ABGRToYRow_DSPR2; - } - } -#endif -#if defined(HAS_ABGRTOUVROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - ABGRToUVRow = ABGRToUVRow_Any_DSPR2; - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_DSPR2; - } - } -#endif #if defined(HAS_ABGRTOYROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ABGRToYRow = ABGRToYRow_Any_MSA; @@ -813,6 +1301,22 @@ int ABGRToI420(const uint8* src_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ABGRToYRow = ABGRToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ABGRToYRow = ABGRToYRow_MMI; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ABGRToUVRow = ABGRToUVRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_MMI; + } + } +#endif for (y = 0; y < height - 1; y += 2) { ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); @@ -832,20 +1336,21 @@ int ABGRToI420(const uint8* src_abgr, // Convert RGBA to I420. 
LIBYUV_API -int RGBAToI420(const uint8* src_rgba, +int RGBAToI420(const uint8_t* src_rgba, int src_stride_rgba, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; - void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba, uint8* dst_u, - uint8* dst_v, int width) = RGBAToUVRow_C; - void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int width) = + void (*RGBAToUVRow)(const uint8_t* src_rgba0, int src_stride_rgba, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGBAToUVRow_C; + void (*RGBAToYRow)(const uint8_t* src_rgba, uint8_t* dst_y, int width) = RGBAToYRow_C; if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; @@ -882,22 +1387,6 @@ int RGBAToI420(const uint8* src_rgba, } } #endif -#if defined(HAS_RGBATOYROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - RGBAToYRow = RGBAToYRow_Any_DSPR2; - if (IS_ALIGNED(width, 8)) { - RGBAToYRow = RGBAToYRow_DSPR2; - } - } -#endif -#if defined(HAS_RGBATOUVROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - RGBAToUVRow = RGBAToUVRow_Any_DSPR2; - if (IS_ALIGNED(width, 16)) { - RGBAToUVRow = RGBAToUVRow_DSPR2; - } - } -#endif #if defined(HAS_RGBATOYROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RGBAToYRow = RGBAToYRow_Any_MSA; @@ -914,6 +1403,22 @@ int RGBAToI420(const uint8* src_rgba, } } #endif +#if defined(HAS_RGBATOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RGBAToYRow = RGBAToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + RGBAToYRow = RGBAToYRow_MMI; + } + } +#endif +#if defined(HAS_RGBATOUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RGBAToUVRow = RGBAToUVRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_MMI; + } + } +#endif for (y = 0; y < height - 1; y += 2) { RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width); @@ -933,28 +1438,31 @@ int RGBAToI420(const uint8* src_rgba, // Convert RGB24 to I420. 
LIBYUV_API -int RGB24ToI420(const uint8* src_rgb24, +int RGB24ToI420(const uint8_t* src_rgb24, int src_stride_rgb24, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; -#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) - void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C; - void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int width) = +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ + defined(HAS_RGB24TOYROW_MMI)) + void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGB24ToUVRow_C; + void (*RGB24ToYRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) = RGB24ToYRow_C; #else - void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = + void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RGB24ToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, - uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -988,6 +1496,17 @@ int RGB24ToI420(const uint8* src_rgb24, RGB24ToUVRow = RGB24ToUVRow_MSA; } } +#elif defined(HAS_RGB24TOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RGB24ToUVRow = RGB24ToUVRow_Any_MMI; + RGB24ToYRow = RGB24ToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + RGB24ToYRow = RGB24ToYRow_MMI; + if (IS_ALIGNED(width, 16)) { + RGB24ToUVRow = RGB24ToUVRow_MMI; + } + } + } // Other platforms do intermediate conversion from RGB24 to ARGB. #else #if defined(HAS_RGB24TOARGBROW_SSSE3) @@ -1021,14 +1540,16 @@ int RGB24ToI420(const uint8* src_rgb24, #endif { -#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) +#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ + defined(HAS_RGB24TOYROW_MMI)) // Allocate 2 rows of ARGB. 
const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ + defined(HAS_RGB24TOYROW_MMI)) RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); RGB24ToYRow(src_rgb24, dst_y, width); RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); @@ -1045,7 +1566,8 @@ int RGB24ToI420(const uint8* src_rgb24, dst_v += dst_stride_v; } if (height & 1) { -#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ + defined(HAS_RGB24TOYROW_MMI)) RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); RGB24ToYRow(src_rgb24, dst_y, width); #else @@ -1054,7 +1576,157 @@ int RGB24ToI420(const uint8* src_rgb24, ARGBToYRow(row, dst_y, width); #endif } -#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) +#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ + defined(HAS_RGB24TOYROW_MMI)) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + +// TODO(fbarchard): Use Matrix version to implement I420 and J420. +// Convert RGB24 to J420. +LIBYUV_API +int RGB24ToJ420(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; +#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_MMI)) + void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGB24ToUVJRow_C; + void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) = + RGB24ToYJRow_C; +#else + void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = + RGB24ToARGBRow_C; + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYJRow_C; +#endif + if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } + +// Neon version does direct RGB24 to YUV. +#if defined(HAS_RGB24TOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON; + RGB24ToYJRow = RGB24ToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToYJRow = RGB24ToYJRow_NEON; + if (IS_ALIGNED(width, 16)) { + RGB24ToUVJRow = RGB24ToUVJRow_NEON; + } + } + } +#elif defined(HAS_RGB24TOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToUVJRow = RGB24ToUVJRow_Any_MSA; + RGB24ToYJRow = RGB24ToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB24ToYJRow = RGB24ToYJRow_MSA; + RGB24ToUVJRow = RGB24ToUVJRow_MSA; + } + } +#elif defined(HAS_RGB24TOYJROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RGB24ToUVJRow = RGB24ToUVJRow_Any_MMI; + RGB24ToYJRow = RGB24ToYJRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + RGB24ToYJRow = RGB24ToYJRow_MMI; + if (IS_ALIGNED(width, 16)) { + RGB24ToUVJRow = RGB24ToUVJRow_MMI; + } + } + } +// Other platforms do intermediate conversion from RGB24 to ARGB. 
+#else +#if defined(HAS_RGB24TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#endif + + { +#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_MMI)) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_MMI)) + RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); + RGB24ToYJRow(src_rgb24, dst_y, width); + RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); + ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYJRow(row, dst_y, width); + ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_rgb24 += src_stride_rgb24 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_MMI)) + RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width); + RGB24ToYJRow(src_rgb24, dst_y, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + ARGBToUVJRow(row, 0, dst_u, dst_v, width); + ARGBToYJRow(row, dst_y, width); +#endif + } +#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_MMI)) free_aligned_buffer_64(row); #endif } @@ -1063,28 +1735,30 @@ int RGB24ToI420(const uint8* src_rgb24, // Convert RAW to I420. 
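When no direct RGB24 row functions exist for the platform, the fallback path above converts two source rows into a scratch ARGB buffer so ARGBToUVJRow can sample a 2x2 block across both rows. The kRowSize expression rounds one ARGB row up to a 32-byte multiple so the second row of the scratch buffer stays aligned; for example:

  // width = 100 pixels -> 400 ARGB bytes -> rounded up to 416 (13 * 32).
  const int kRowSize = (width * 4 + 31) & ~31;  // one ARGB row, 32-byte multiple
  align_buffer_64(row, kRowSize * 2);           // space for rows y and y + 1
  // ... row functions write into `row` and `row + kRowSize` ...
  free_aligned_buffer_64(row);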
LIBYUV_API -int RAWToI420(const uint8* src_raw, +int RAWToI420(const uint8_t* src_raw, int src_stride_raw, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; -#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) - void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw, uint8* dst_u, - uint8* dst_v, int width) = RAWToUVRow_C; - void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int width) = +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ + defined(HAS_RAWTOYROW_MMI)) + void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, + uint8_t* dst_v, int width) = RAWToUVRow_C; + void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = RAWToYRow_C; #else - void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = + void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RAWToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, - uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -1118,6 +1792,17 @@ int RAWToI420(const uint8* src_raw, RAWToUVRow = RAWToUVRow_MSA; } } +#elif defined(HAS_RAWTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RAWToUVRow = RAWToUVRow_Any_MMI; + RAWToYRow = RAWToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + RAWToYRow = RAWToYRow_MMI; + if (IS_ALIGNED(width, 16)) { + RAWToUVRow = RAWToUVRow_MMI; + } + } + } // Other platforms do intermediate conversion from RAW to ARGB. #else #if defined(HAS_RAWTOARGBROW_SSSE3) @@ -1151,14 +1836,16 @@ int RAWToI420(const uint8* src_raw, #endif { -#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) +#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ + defined(HAS_RAWTOYROW_MMI)) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ + defined(HAS_RAWTOYROW_MMI)) RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); RAWToYRow(src_raw, dst_y, width); RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); @@ -1175,7 +1862,8 @@ int RAWToI420(const uint8* src_raw, dst_v += dst_stride_v; } if (height & 1) { -#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ + defined(HAS_RAWTOYROW_MMI)) RAWToUVRow(src_raw, 0, dst_u, dst_v, width); RAWToYRow(src_raw, dst_y, width); #else @@ -1184,7 +1872,8 @@ int RAWToI420(const uint8* src_raw, ARGBToYRow(row, dst_y, width); #endif } -#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) +#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ + defined(HAS_RAWTOYROW_MMI)) free_aligned_buffer_64(row); #endif } @@ -1193,29 +1882,31 @@ int RAWToI420(const uint8* src_raw, // Convert RGB565 to I420. 
LIBYUV_API -int RGB565ToI420(const uint8* src_rgb565, +int RGB565ToI420(const uint8_t* src_rgb565, int src_stride_rgb565, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; -#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) - void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width) = +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ + defined(HAS_RGB565TOYROW_MMI)) + void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565, + uint8_t* dst_u, uint8_t* dst_v, int width) = RGB565ToUVRow_C; - void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int width) = + void (*RGB565ToYRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width) = RGB565ToYRow_C; #else - void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = - RGB565ToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, - uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*RGB565ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = RGB565ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -1249,6 +1940,17 @@ int RGB565ToI420(const uint8* src_rgb565, RGB565ToUVRow = RGB565ToUVRow_MSA; } } +#elif defined(HAS_RGB565TOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RGB565ToUVRow = RGB565ToUVRow_Any_MMI; + RGB565ToYRow = RGB565ToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + RGB565ToYRow = RGB565ToYRow_MMI; + if (IS_ALIGNED(width, 16)) { + RGB565ToUVRow = RGB565ToUVRow_MMI; + } + } + } // Other platforms do intermediate conversion from RGB565 to ARGB. #else #if defined(HAS_RGB565TOARGBROW_SSE2) @@ -1287,23 +1989,17 @@ int RGB565ToI420(const uint8* src_rgb565, } } #endif -#if defined(HAS_RGB565TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_DSPR2; - if (IS_ALIGNED(width, 8)) { - RGB565ToARGBRow = RGB565ToARGBRow_DSPR2; - } - } -#endif #endif { -#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) +#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ + defined(HAS_RGB565TOYROW_MMI)) // Allocate 2 rows of ARGB. 
const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ + defined(HAS_RGB565TOYROW_MMI)) RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); RGB565ToYRow(src_rgb565, dst_y, width); RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); @@ -1320,7 +2016,8 @@ int RGB565ToI420(const uint8* src_rgb565, dst_v += dst_stride_v; } if (height & 1) { -#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ + defined(HAS_RGB565TOYROW_MMI)) RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); RGB565ToYRow(src_rgb565, dst_y, width); #else @@ -1329,7 +2026,8 @@ int RGB565ToI420(const uint8* src_rgb565, ARGBToYRow(row, dst_y, width); #endif } -#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) +#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ + defined(HAS_RGB565TOYROW_MMI)) free_aligned_buffer_64(row); #endif } @@ -1338,29 +2036,31 @@ int RGB565ToI420(const uint8* src_rgb565, // Convert ARGB1555 to I420. LIBYUV_API -int ARGB1555ToI420(const uint8* src_argb1555, +int ARGB1555ToI420(const uint8_t* src_argb1555, int src_stride_argb1555, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; -#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) - void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width) = +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ + defined(HAS_ARGB1555TOYROW_MMI)) + void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555, + uint8_t* dst_u, uint8_t* dst_v, int width) = ARGB1555ToUVRow_C; - void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int width) = - ARGB1555ToYRow_C; + void (*ARGB1555ToYRow)(const uint8_t* src_argb1555, uint8_t* dst_y, + int width) = ARGB1555ToYRow_C; #else - void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = - ARGB1555ToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, - uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGB1555ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = ARGB1555ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 || @@ -1395,6 +2095,17 @@ int ARGB1555ToI420(const uint8* src_argb1555, ARGB1555ToUVRow = ARGB1555ToUVRow_MSA; } } +#elif defined(HAS_ARGB1555TOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MMI; + ARGB1555ToYRow = ARGB1555ToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToYRow = ARGB1555ToYRow_MMI; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_MMI; + } + } + } // Other platforms do intermediate conversion from ARGB1555 to ARGB. 
#else #if defined(HAS_ARGB1555TOARGBROW_SSE2) @@ -1435,14 +2146,16 @@ int ARGB1555ToI420(const uint8* src_argb1555, #endif #endif { -#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) +#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ + defined(HAS_ARGB1555TOYROW_MMI)) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ + defined(HAS_ARGB1555TOYROW_MMI)) ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); ARGB1555ToYRow(src_argb1555, dst_y, width); ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, @@ -1461,7 +2174,8 @@ int ARGB1555ToI420(const uint8* src_argb1555, dst_v += dst_stride_v; } if (height & 1) { -#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ + defined(HAS_ARGB1555TOYROW_MMI)) ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); ARGB1555ToYRow(src_argb1555, dst_y, width); #else @@ -1470,7 +2184,8 @@ int ARGB1555ToI420(const uint8* src_argb1555, ARGBToYRow(row, dst_y, width); #endif } -#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) +#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ + defined(HAS_ARGB1555TOYROW_MMI)) free_aligned_buffer_64(row); #endif } @@ -1479,29 +2194,30 @@ int ARGB1555ToI420(const uint8* src_argb1555, // Convert ARGB4444 to I420. LIBYUV_API -int ARGB4444ToI420(const uint8* src_argb4444, +int ARGB4444ToI420(const uint8_t* src_argb4444, int src_stride_argb4444, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; -#if defined(HAS_ARGB4444TOYROW_NEON) - void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width) = +#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) + void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444, + uint8_t* dst_u, uint8_t* dst_v, int width) = ARGB4444ToUVRow_C; - void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int width) = - ARGB4444ToYRow_C; + void (*ARGB4444ToYRow)(const uint8_t* src_argb4444, uint8_t* dst_y, + int width) = ARGB4444ToYRow_C; #else - void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = - ARGB4444ToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, - uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGB4444ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = ARGB4444ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 || @@ -1527,6 +2243,17 @@ int ARGB4444ToI420(const uint8* src_argb4444, } } } +#elif defined(HAS_ARGB4444TOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_Any_MMI; + ARGB4444ToYRow = ARGB4444ToYRow_Any_MMI; + if 
(IS_ALIGNED(width, 8)) { + ARGB4444ToYRow = ARGB4444ToYRow_MMI; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_MMI; + } + } + } // Other platforms do intermediate conversion from ARGB4444 to ARGB. #else #if defined(HAS_ARGB4444TOARGBROW_SSE2) @@ -1585,17 +2312,29 @@ int ARGB4444ToI420(const uint8* src_argb4444, } } #endif +#if defined(HAS_ARGBTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToUVRow = ARGBToUVRow_Any_MMI; + ARGBToYRow = ARGBToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_MMI; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_MMI; + } + } + } +#endif #endif { -#if !defined(HAS_ARGB4444TOYROW_NEON) +#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if defined(HAS_ARGB4444TOYROW_NEON) +#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width); ARGB4444ToYRow(src_argb4444, dst_y, width); ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, @@ -1614,7 +2353,7 @@ int ARGB4444ToI420(const uint8* src_argb4444, dst_v += dst_stride_v; } if (height & 1) { -#if defined(HAS_ARGB4444TOYROW_NEON) +#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width); ARGB4444ToYRow(src_argb4444, dst_y, width); #else @@ -1623,16 +2362,134 @@ int ARGB4444ToI420(const uint8* src_argb4444, ARGBToYRow(row, dst_y, width); #endif } -#if !defined(HAS_ARGB4444TOYROW_NEON) +#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) free_aligned_buffer_64(row); #endif } return 0; } -static void SplitPixels(const uint8* src_u, +// Convert RGB24 to J400. +LIBYUV_API +int RGB24ToJ400(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height) { + int y; +#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_MMI)) + void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) = + RGB24ToYJRow_C; +#else + void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = + RGB24ToARGBRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = + ARGBToYJRow_C; +#endif + if (!src_rgb24 || !dst_yj || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } + +// Neon version does direct RGB24 to YUV. +#if defined(HAS_RGB24TOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToYJRow = RGB24ToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToYJRow = RGB24ToYJRow_NEON; + } + } +#elif defined(HAS_RGB24TOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToYJRow = RGB24ToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB24ToYJRow = RGB24ToYJRow_MSA; + } + } +#elif defined(HAS_RGB24TOYJROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RGB24ToYJRow = RGB24ToYJRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + RGB24ToYJRow = RGB24ToYJRow_MMI; + } + } +// Other platforms do intermediate conversion from RGB24 to ARGB. 
+#else +#if defined(HAS_RGB24TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#endif + + { +#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_MMI)) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_MMI)) + RGB24ToYJRow(src_rgb24, dst_yj, width); + RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_yj + dst_stride_yj, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); + ARGBToYJRow(row, dst_yj, width); + ARGBToYJRow(row + kRowSize, dst_yj + dst_stride_yj, width); +#endif + src_rgb24 += src_stride_rgb24 * 2; + dst_yj += dst_stride_yj * 2; + } + if (height & 1) { +#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_MMI)) + RGB24ToYJRow(src_rgb24, dst_yj, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + ARGBToYJRow(row, dst_yj, width); +#endif + } +#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_MMI)) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + +static void SplitPixels(const uint8_t* src_u, int src_pixel_stride_uv, - uint8* dst_u, + uint8_t* dst_u, int width) { int i; for (i = 0; i < width; ++i) { @@ -1644,18 +2501,18 @@ static void SplitPixels(const uint8* src_u, // Convert Android420 to I420. 
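SplitPixels is only opened in the hunk above; its loop body is cut off by the diff context. Based on its signature and its role for Android chroma planes, a sketch of the gather it performs (an assumption, not the verbatim body): copy one byte from every src_pixel_stride_uv-th position into a densely packed plane.

  for (i = 0; i < width; ++i) {
    *dst_u = *src_u;               // take one chroma sample
    ++dst_u;                       // packed destination advances by 1
    src_u += src_pixel_stride_uv;  // sparse source advances by the pixel stride
  }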
LIBYUV_API -int Android420ToI420(const uint8* src_y, +int Android420ToI420(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, int src_pixel_stride_uv, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { @@ -1688,14 +2545,15 @@ int Android420ToI420(const uint8* src_y, CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); return 0; // Split UV planes - NV21 - } else if (src_pixel_stride_uv == 2 && vu_off == -1 && - src_stride_u == src_stride_v) { + } + if (src_pixel_stride_uv == 2 && vu_off == -1 && + src_stride_u == src_stride_v) { SplitUVPlane(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u, halfwidth, halfheight); return 0; // Split UV planes - NV12 - } else if (src_pixel_stride_uv == 2 && vu_off == 1 && - src_stride_u == src_stride_v) { + } + if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) { SplitUVPlane(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v, halfwidth, halfheight); return 0; diff --git a/files/source/convert_argb.cc b/files/source/convert_argb.cc index 983be578..54050333 100644 --- a/files/source/convert_argb.cc +++ b/files/source/convert_argb.cc @@ -26,9 +26,9 @@ extern "C" { // Copy ARGB with optional flipping LIBYUV_API -int ARGBCopy(const uint8* src_argb, +int ARGBCopy(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { @@ -47,21 +47,21 @@ int ARGBCopy(const uint8* src_argb, return 0; } -// Convert I422 to ARGB with matrix -static int I420ToARGBMatrix(const uint8* src_y, +// Convert I420 to ARGB with matrix +static int I420ToARGBMatrix(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height) { int y; - void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, - const uint8* v_buf, uint8* rgb_buf, + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGBRow_C; if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { @@ -97,15 +97,6 @@ static int I420ToARGBMatrix(const uint8* src_y, } } #endif -#if defined(HAS_I422TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - I422ToARGBRow = I422ToARGBRow_DSPR2; - } -#endif #if defined(HAS_I422TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToARGBRow = I422ToARGBRow_Any_MSA; @@ -129,13 +120,13 @@ static int I420ToARGBMatrix(const uint8* src_y, // Convert I420 to ARGB. 
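The Android420ToI420 hunk above replaces the else-if chain with early returns; the fast paths key off src_pixel_stride_uv and the signed distance vu_off between the V and U pointers (a pixel stride of 1 is already I420 and is just copied; 2 with vu_off == -1 is NV21; 2 with vu_off == 1 is NV12). A wrapper sketch for the NV21 case (the wrapper name and its parameters are mine; the Android420ToI420 argument order matches the hunk):

  // NV21 exposes one interleaved VU plane at half resolution: V is byte 0,
  // U is byte 1, both advancing 2 bytes per chroma sample, so
  // src_v - src_u == -1 and the SplitUVPlane fast path above is taken.
  static int Nv21ToI420ViaAndroid420(const uint8_t* src_y, int y_stride,
                                     const uint8_t* src_vu, int vu_stride,
                                     uint8_t* dst_y, int dst_stride_y,
                                     uint8_t* dst_u, int dst_stride_u,
                                     uint8_t* dst_v, int dst_stride_v,
                                     int width, int height) {
    return Android420ToI420(src_y, y_stride, src_vu + 1, vu_stride, src_vu,
                            vu_stride, /*src_pixel_stride_uv=*/2, dst_y,
                            dst_stride_y, dst_u, dst_stride_u, dst_v,
                            dst_stride_v, width, height);
  }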
LIBYUV_API -int I420ToARGB(const uint8* src_y, +int I420ToARGB(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { @@ -146,13 +137,13 @@ int I420ToARGB(const uint8* src_y, // Convert I420 to ABGR. LIBYUV_API -int I420ToABGR(const uint8* src_y, +int I420ToABGR(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_abgr, + uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { @@ -165,13 +156,13 @@ int I420ToABGR(const uint8* src_y, // Convert J420 to ARGB. LIBYUV_API -int J420ToARGB(const uint8* src_y, +int J420ToARGB(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { @@ -182,13 +173,13 @@ int J420ToARGB(const uint8* src_y, // Convert J420 to ABGR. LIBYUV_API -int J420ToABGR(const uint8* src_y, +int J420ToABGR(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_abgr, + uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { @@ -201,13 +192,13 @@ int J420ToABGR(const uint8* src_y, // Convert H420 to ARGB. LIBYUV_API -int H420ToARGB(const uint8* src_y, +int H420ToARGB(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { @@ -218,13 +209,13 @@ int H420ToARGB(const uint8* src_y, // Convert H420 to ABGR. 
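I420ToARGB above is a thin wrapper that binds the BT.601 matrix to I420ToARGBMatrix; the J420 and H420 variants only swap in the JPEG and BT.709 matrices. A one-call usage sketch (the function name, buffer parameters and even-size assumption are mine):

  // Convert one I420 frame to 32-bit ARGB with the BT.601 matrix.
  static void DecodeI420ToArgb(const uint8_t* y, const uint8_t* u,
                               const uint8_t* v, uint8_t* argb, int width,
                               int height) {
    const int half = (width + 1) / 2;  // chroma planes are subsampled 2x2
    I420ToARGB(y, width, u, half, v, half, argb, width * 4, width, height);
  }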
LIBYUV_API -int H420ToABGR(const uint8* src_y, +int H420ToABGR(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_abgr, + uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { @@ -236,20 +227,20 @@ int H420ToABGR(const uint8* src_y, } // Convert I422 to ARGB with matrix -static int I422ToARGBMatrix(const uint8* src_y, +static int I422ToARGBMatrix(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height) { int y; - void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, - const uint8* v_buf, uint8* rgb_buf, + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGBRow_C; if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { @@ -292,15 +283,6 @@ static int I422ToARGBMatrix(const uint8* src_y, } } #endif -#if defined(HAS_I422TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - I422ToARGBRow = I422ToARGBRow_DSPR2; - } -#endif #if defined(HAS_I422TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToARGBRow = I422ToARGBRow_Any_MSA; @@ -322,13 +304,13 @@ static int I422ToARGBMatrix(const uint8* src_y, // Convert I422 to ARGB. LIBYUV_API -int I422ToARGB(const uint8* src_y, +int I422ToARGB(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { @@ -339,13 +321,13 @@ int I422ToARGB(const uint8* src_y, // Convert I422 to ABGR. LIBYUV_API -int I422ToABGR(const uint8* src_y, +int I422ToABGR(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_abgr, + uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { @@ -358,13 +340,13 @@ int I422ToABGR(const uint8* src_y, // Convert J422 to ARGB. LIBYUV_API -int J422ToARGB(const uint8* src_y, +int J422ToARGB(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { @@ -375,13 +357,13 @@ int J422ToARGB(const uint8* src_y, // Convert J422 to ABGR. LIBYUV_API -int J422ToABGR(const uint8* src_y, +int J422ToABGR(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_abgr, + uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { @@ -394,13 +376,13 @@ int J422ToABGR(const uint8* src_y, // Convert H422 to ARGB. 
LIBYUV_API -int H422ToARGB(const uint8* src_y, +int H422ToARGB(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { @@ -411,13 +393,13 @@ int H422ToARGB(const uint8* src_y, // Convert H422 to ABGR. LIBYUV_API -int H422ToABGR(const uint8* src_y, +int H422ToABGR(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_abgr, + uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { @@ -428,21 +410,271 @@ int H422ToABGR(const uint8* src_y, width, height); } +// Convert 10 bit YUV to ARGB with matrix +// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to +// multiply 10 bit yuv into high bits to allow any number of bits. +static int I010ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I210ToAR30Row_C; + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_I210TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210ToAR30Row = I210ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210ToAR30Row = I210ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I210TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210ToAR30Row = I210ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210ToAR30Row = I210ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I010 to AR30. +LIBYUV_API +int I010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvI601Constants, width, height); +} + +// Convert H010 to AR30. +LIBYUV_API +int H010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvH709Constants, width, height); +} + +// Convert I010 to AB30. 
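These 10-bit entry points take uint16_t planes with the 10-bit samples in the low bits of each 16-bit word; note from the row loop above that the 16-bit source strides advance in uint16_t elements, while the packed AR30 destination stride stays in bytes. A small wrapper sketch under those assumptions (the wrapper name, header include and caller-owned buffers are mine):

  #include "libyuv/convert_argb.h"  // assumed header declaring I010ToAR30

  static int ConvertI010ToAR30_601(const uint16_t* y, const uint16_t* u,
                                   const uint16_t* v, uint8_t* ar30, int width,
                                   int height) {
    const int half = (width + 1) / 2;    // 4:2:0 chroma, stride in elements
    return I010ToAR30(y, width,          // luma stride in uint16_t elements
                      u, half, v, half,  // chroma strides in elements
                      ar30, width * 4,   // AR30 stride in bytes, 4 bytes/pixel
                      width, height);
  }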
+LIBYUV_API +int I010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuI601Constants, width, height); +} + +// Convert H010 to AB30. +LIBYUV_API +int H010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuH709Constants, width, height); +} + +// Convert 10 bit YUV to ARGB with matrix +static int I010ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I210ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I210TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210ToARGBRow = I210ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210ToARGBRow = I210ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I210TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210ToARGBRow = I210ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210ToARGBRow = I210ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I010 to ARGB. +LIBYUV_API +int I010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); +} + +// Convert I010 to ABGR. +LIBYUV_API +int I010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert H010 to ARGB. 
+LIBYUV_API +int H010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); +} + +// Convert H010 to ABGR. +LIBYUV_API +int H010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + // Convert I444 to ARGB with matrix -static int I444ToARGBMatrix(const uint8* src_y, +static int I444ToARGBMatrix(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height) { int y; - void (*I444ToARGBRow)(const uint8* y_buf, const uint8* u_buf, - const uint8* v_buf, uint8* rgb_buf, + void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I444ToARGBRow_C; if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { @@ -485,14 +717,6 @@ static int I444ToARGBMatrix(const uint8* src_y, } } #endif -#if defined(HAS_I444TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - I444ToARGBRow = I444ToARGBRow_Any_DSPR2; - if (IS_ALIGNED(width, 8)) { - I444ToARGBRow = I444ToARGBRow_DSPR2; - } - } -#endif #if defined(HAS_I444TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I444ToARGBRow = I444ToARGBRow_Any_MSA; @@ -514,13 +738,13 @@ static int I444ToARGBMatrix(const uint8* src_y, // Convert I444 to ARGB. LIBYUV_API -int I444ToARGB(const uint8* src_y, +int I444ToARGB(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { @@ -531,13 +755,13 @@ int I444ToARGB(const uint8* src_y, // Convert I444 to ABGR. LIBYUV_API -int I444ToABGR(const uint8* src_y, +int I444ToABGR(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_abgr, + uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { @@ -550,13 +774,13 @@ int I444ToABGR(const uint8* src_y, // Convert J444 to ARGB. LIBYUV_API -int J444ToARGB(const uint8* src_y, +int J444ToARGB(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { @@ -566,28 +790,28 @@ int J444ToARGB(const uint8* src_y, } // Convert I420 with Alpha to preattenuated ARGB. 
-static int I420AlphaToARGBMatrix(const uint8* src_y, +static int I420AlphaToARGBMatrix(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - const uint8* src_a, + const uint8_t* src_a, int src_stride_a, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height, int attenuate) { int y; - void (*I422AlphaToARGBRow)(const uint8* y_buf, const uint8* u_buf, - const uint8* v_buf, const uint8* a_buf, - uint8* dst_argb, + void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, const uint8_t* a_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) = I422AlphaToARGBRow_C; - void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, int width) = - ARGBAttenuateRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -621,15 +845,6 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, } } #endif -#if defined(HAS_I422ALPHATOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2; - } -#endif #if defined(HAS_I422ALPHATOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA; @@ -670,6 +885,14 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBAttenuateRow = ARGBAttenuateRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -690,15 +913,15 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, // Convert I420 with Alpha to ARGB. LIBYUV_API -int I420AlphaToARGB(const uint8* src_y, +int I420AlphaToARGB(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - const uint8* src_a, + const uint8_t* src_a, int src_stride_a, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height, @@ -711,15 +934,15 @@ int I420AlphaToARGB(const uint8* src_y, // Convert I420 with Alpha to ABGR. LIBYUV_API -int I420AlphaToABGR(const uint8* src_y, +int I420AlphaToABGR(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - const uint8* src_a, + const uint8_t* src_a, int src_stride_a, - uint8* dst_abgr, + uint8_t* dst_abgr, int dst_stride_abgr, int width, int height, @@ -733,14 +956,14 @@ int I420AlphaToABGR(const uint8* src_y, // Convert I400 to ARGB. 
LIBYUV_API -int I400ToARGB(const uint8* src_y, +int I400ToARGB(const uint8_t* src_y, int src_stride_y, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*I400ToARGBRow)(const uint8* y_buf, uint8* rgb_buf, int width) = + void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, int width) = I400ToARGBRow_C; if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; @@ -789,6 +1012,14 @@ int I400ToARGB(const uint8* src_y, } } #endif +#if defined(HAS_I400TOARGBROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + I400ToARGBRow = I400ToARGBRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + I400ToARGBRow = I400ToARGBRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { I400ToARGBRow(src_y, dst_argb, width); @@ -800,14 +1031,14 @@ int I400ToARGB(const uint8* src_y, // Convert J400 to ARGB. LIBYUV_API -int J400ToARGB(const uint8* src_y, +int J400ToARGB(const uint8_t* src_y, int src_stride_y, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int width) = + void (*J400ToARGBRow)(const uint8_t* src_y, uint8_t* dst_argb, int width) = J400ToARGBRow_C; if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; @@ -856,6 +1087,14 @@ int J400ToARGB(const uint8* src_y, } } #endif +#if defined(HAS_J400TOARGBROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + J400ToARGBRow = J400ToARGBRow_Any_MMI; + if (IS_ALIGNED(width, 4)) { + J400ToARGBRow = J400ToARGBRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { J400ToARGBRow(src_y, dst_argb, width); src_y += src_stride_y; @@ -865,87 +1104,87 @@ int J400ToARGB(const uint8* src_y, } // Shuffle table for converting BGRA to ARGB. -static uvec8 kShuffleMaskBGRAToARGB = {3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, - 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u}; +static const uvec8 kShuffleMaskBGRAToARGB = { + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u}; // Shuffle table for converting ABGR to ARGB. -static uvec8 kShuffleMaskABGRToARGB = {2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, - 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u}; +static const uvec8 kShuffleMaskABGRToARGB = { + 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u}; // Shuffle table for converting RGBA to ARGB. -static uvec8 kShuffleMaskRGBAToARGB = {1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, - 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u}; +static const uvec8 kShuffleMaskRGBAToARGB = { + 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u}; // Convert BGRA to ARGB. LIBYUV_API -int BGRAToARGB(const uint8* src_bgra, +int BGRAToARGB(const uint8_t* src_bgra, int src_stride_bgra, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskBGRAToARGB), width, height); + (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height); } // Convert ARGB to BGRA (same as BGRAToARGB). LIBYUV_API -int ARGBToBGRA(const uint8* src_bgra, +int ARGBToBGRA(const uint8_t* src_bgra, int src_stride_bgra, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskBGRAToARGB), width, height); + (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height); } // Convert ABGR to ARGB. 
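The three shuffle tables above are 16 source-byte indices covering four ARGB pixels, and each of these channel-reordering wrappers is just ARGBShuffle applied with the right table. A sketch of the same mechanism with a caller-defined mask (the mask constant and buffer names are mine; it happens to match the ABGR table below, i.e. it swaps the R and B channels in one pass):

  // For every 4-byte pixel, emit bytes 2,1,0,3 of the source pixel.
  static const uint8_t kSwapRB[16] = {2,  1, 0, 3,  6,  5,  4,  7,
                                      10, 9, 8, 11, 14, 13, 12, 15};
  ARGBShuffle(src_argb, src_stride_argb, dst_argb, dst_stride_argb, kSwapRB,
              width, height);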
LIBYUV_API -int ABGRToARGB(const uint8* src_abgr, +int ABGRToARGB(const uint8_t* src_abgr, int src_stride_abgr, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskABGRToARGB), width, height); + (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height); } // Convert ARGB to ABGR to (same as ABGRToARGB). LIBYUV_API -int ARGBToABGR(const uint8* src_abgr, +int ARGBToABGR(const uint8_t* src_abgr, int src_stride_abgr, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskABGRToARGB), width, height); + (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height); } // Convert RGBA to ARGB. LIBYUV_API -int RGBAToARGB(const uint8* src_rgba, +int RGBAToARGB(const uint8_t* src_rgba, int src_stride_rgba, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskRGBAToARGB), width, height); + (const uint8_t*)(&kShuffleMaskRGBAToARGB), width, height); } // Convert RGB24 to ARGB. LIBYUV_API -int RGB24ToARGB(const uint8* src_rgb24, +int RGB24ToARGB(const uint8_t* src_rgb24, int src_stride_rgb24, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = + void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RGB24ToARGBRow_C; if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) { return -1; @@ -978,14 +1217,6 @@ int RGB24ToARGB(const uint8* src_rgb24, } } #endif -#if defined(HAS_RGB24TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_DSPR2; - if (IS_ALIGNED(width, 8)) { - RGB24ToARGBRow = RGB24ToARGBRow_DSPR2; - } - } -#endif #if defined(HAS_RGB24TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA; @@ -994,6 +1225,14 @@ int RGB24ToARGB(const uint8* src_rgb24, } } #endif +#if defined(HAS_RGB24TOARGBROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_MMI; + if (IS_ALIGNED(width, 4)) { + RGB24ToARGBRow = RGB24ToARGBRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { RGB24ToARGBRow(src_rgb24, dst_argb, width); @@ -1005,14 +1244,14 @@ int RGB24ToARGB(const uint8* src_rgb24, // Convert RAW to ARGB. 
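RGB24ToARGB above and RAWToARGB below differ only in source byte order: in libyuv naming, RGB24 stores B,G,R in memory and RAW stores R,G,B. A scalar sketch of the RAW case under that assumption (helper name is illustrative):

#include <stdint.h>

// Expand one row of 3-byte R,G,B (RAW) pixels to 4-byte B,G,R,A (ARGB).
static void RawRowToARGB(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = src_raw[2];  // B
    dst_argb[1] = src_raw[1];  // G
    dst_argb[2] = src_raw[0];  // R
    dst_argb[3] = 255;         // A, opaque
    src_raw += 3;
    dst_argb += 4;
  }
}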
LIBYUV_API -int RAWToARGB(const uint8* src_raw, +int RAWToARGB(const uint8_t* src_raw, int src_stride_raw, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = + void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RAWToARGBRow_C; if (!src_raw || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1045,14 +1284,6 @@ int RAWToARGB(const uint8* src_raw, } } #endif -#if defined(HAS_RAWTOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - RAWToARGBRow = RAWToARGBRow_Any_DSPR2; - if (IS_ALIGNED(width, 8)) { - RAWToARGBRow = RAWToARGBRow_DSPR2; - } - } -#endif #if defined(HAS_RAWTOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RAWToARGBRow = RAWToARGBRow_Any_MSA; @@ -1061,6 +1292,14 @@ int RAWToARGB(const uint8* src_raw, } } #endif +#if defined(HAS_RAWTOARGBROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RAWToARGBRow = RAWToARGBRow_Any_MMI; + if (IS_ALIGNED(width, 4)) { + RAWToARGBRow = RAWToARGBRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { RAWToARGBRow(src_raw, dst_argb, width); @@ -1072,15 +1311,15 @@ int RAWToARGB(const uint8* src_raw, // Convert RGB565 to ARGB. LIBYUV_API -int RGB565ToARGB(const uint8* src_rgb565, +int RGB565ToARGB(const uint8_t* src_rgb565, int src_stride_rgb565, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int width) = - RGB565ToARGBRow_C; + void (*RGB565ToARGBRow)(const uint8_t* src_rgb565, uint8_t* dst_argb, + int width) = RGB565ToARGBRow_C; if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1120,14 +1359,6 @@ int RGB565ToARGB(const uint8* src_rgb565, } } #endif -#if defined(HAS_RGB565TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_DSPR2; - if (IS_ALIGNED(width, 8)) { - RGB565ToARGBRow = RGB565ToARGBRow_DSPR2; - } - } -#endif #if defined(HAS_RGB565TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA; @@ -1136,6 +1367,14 @@ int RGB565ToARGB(const uint8* src_rgb565, } } #endif +#if defined(HAS_RGB565TOARGBROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_MMI; + if (IS_ALIGNED(width, 4)) { + RGB565ToARGBRow = RGB565ToARGBRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { RGB565ToARGBRow(src_rgb565, dst_argb, width); @@ -1147,14 +1386,14 @@ int RGB565ToARGB(const uint8* src_rgb565, // Convert ARGB1555 to ARGB. 
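RGB565ToARGB above and the ARGB1555/ARGB4444 converters that follow widen 4-, 5- and 6-bit channels back to 8 bits. The reference rows typically do this by bit replication rather than a plain shift, so full intensity maps to 255 instead of 248; a sketch for one RGB565 pixel, assuming the usual little-endian packing with blue in the low 5 bits:

#include <stdint.h>

// Unpack one RGB565 pixel to 8-bit B, G, R by bit replication:
// a 5-bit value v expands as (v << 3) | (v >> 2), so 31 -> 255.
static void Unpack565(uint16_t p, uint8_t* b, uint8_t* g, uint8_t* r) {
  const uint8_t b5 = p & 0x1f;
  const uint8_t g6 = (p >> 5) & 0x3f;
  const uint8_t r5 = (p >> 11) & 0x1f;
  *b = (uint8_t)((b5 << 3) | (b5 >> 2));
  *g = (uint8_t)((g6 << 2) | (g6 >> 4));
  *r = (uint8_t)((r5 << 3) | (r5 >> 2));
}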
LIBYUV_API -int ARGB1555ToARGB(const uint8* src_argb1555, +int ARGB1555ToARGB(const uint8_t* src_argb1555, int src_stride_argb1555, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb, + void (*ARGB1555ToARGBRow)(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) = ARGB1555ToARGBRow_C; if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1195,14 +1434,6 @@ int ARGB1555ToARGB(const uint8* src_argb1555, } } #endif -#if defined(HAS_ARGB1555TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_DSPR2; - if (IS_ALIGNED(width, 4)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_DSPR2; - } - } -#endif #if defined(HAS_ARGB1555TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA; @@ -1211,6 +1442,14 @@ int ARGB1555ToARGB(const uint8* src_argb1555, } } #endif +#if defined(HAS_ARGB1555TOARGBROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MMI; + if (IS_ALIGNED(width, 4)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGB1555ToARGBRow(src_argb1555, dst_argb, width); @@ -1222,14 +1461,14 @@ int ARGB1555ToARGB(const uint8* src_argb1555, // Convert ARGB4444 to ARGB. LIBYUV_API -int ARGB4444ToARGB(const uint8* src_argb4444, +int ARGB4444ToARGB(const uint8_t* src_argb4444, int src_stride_argb4444, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb, + void (*ARGB4444ToARGBRow)(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) = ARGB4444ToARGBRow_C; if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1270,14 +1509,6 @@ int ARGB4444ToARGB(const uint8* src_argb4444, } } #endif -#if defined(HAS_ARGB4444TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_DSPR2; - if (IS_ALIGNED(width, 4)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_DSPR2; - } - } -#endif #if defined(HAS_ARGB4444TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; @@ -1286,6 +1517,14 @@ int ARGB4444ToARGB(const uint8* src_argb4444, } } #endif +#if defined(HAS_ARGB4444TOARGBROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MMI; + if (IS_ALIGNED(width, 4)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGB4444ToARGBRow(src_argb4444, dst_argb, width); @@ -1295,20 +1534,116 @@ int ARGB4444ToARGB(const uint8* src_argb4444, return 0; } +// Convert AR30 to ARGB. +LIBYUV_API +int AR30ToARGB(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + if (!src_ar30 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; + src_stride_ar30 = -src_stride_ar30; + } + // Coalesce rows. + if (src_stride_ar30 == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_ar30 = dst_stride_argb = 0; + } + for (y = 0; y < height; ++y) { + AR30ToARGBRow_C(src_ar30, dst_argb, width); + src_ar30 += src_stride_ar30; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert AR30 to ABGR. 
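The AR30 converters are new in this change and currently route every row through the C kernels. A sketch of the unpack they imply, assuming libyuv's AR30 layout of a little-endian 32-bit word with 10-bit blue in the low bits, then green, then red, and 2-bit alpha on top (treat the bit positions as an assumption of this sketch, not a quotation of row_common.cc); narrowing to 8-bit ARGB keeps only the top 8 bits of each channel:

#include <stdint.h>

// Unpack one AR30 (2:10:10:10, blue in the low bits) word to 8-bit B,G,R,A.
static void UnpackAR30(uint32_t v, uint8_t* b, uint8_t* g, uint8_t* r,
                       uint8_t* a) {
  *b = (uint8_t)((v >> 2) & 0xff);         // top 8 of 10-bit blue
  *g = (uint8_t)((v >> 12) & 0xff);        // top 8 of 10-bit green
  *r = (uint8_t)((v >> 22) & 0xff);        // top 8 of 10-bit red
  *a = (uint8_t)(((v >> 30) & 3) * 0x55);  // 2-bit alpha replicated to 8 bits
}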
+LIBYUV_API +int AR30ToABGR(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + int y; + if (!src_ar30 || !dst_abgr || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; + src_stride_ar30 = -src_stride_ar30; + } + // Coalesce rows. + if (src_stride_ar30 == width * 4 && dst_stride_abgr == width * 4) { + width *= height; + height = 1; + src_stride_ar30 = dst_stride_abgr = 0; + } + for (y = 0; y < height; ++y) { + AR30ToABGRRow_C(src_ar30, dst_abgr, width); + src_ar30 += src_stride_ar30; + dst_abgr += dst_stride_abgr; + } + return 0; +} + +// Convert AR30 to AB30. +LIBYUV_API +int AR30ToAB30(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + int y; + if (!src_ar30 || !dst_ab30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; + src_stride_ar30 = -src_stride_ar30; + } + // Coalesce rows. + if (src_stride_ar30 == width * 4 && dst_stride_ab30 == width * 4) { + width *= height; + height = 1; + src_stride_ar30 = dst_stride_ab30 = 0; + } + for (y = 0; y < height; ++y) { + AR30ToAB30Row_C(src_ar30, dst_ab30, width); + src_ar30 += src_stride_ar30; + dst_ab30 += dst_stride_ab30; + } + return 0; +} + // Convert NV12 to ARGB with matrix -static int NV12ToARGBMatrix(const uint8* src_y, +static int NV12ToARGBMatrix(const uint8_t* src_y, int src_stride_y, - const uint8* src_uv, + const uint8_t* src_uv, int src_stride_uv, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height) { int y; - void (*NV12ToARGBRow)(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - NV12ToARGBRow_C; + void (*NV12ToARGBRow)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1342,14 +1677,6 @@ static int NV12ToARGBMatrix(const uint8* src_y, } } #endif -#if defined(HAS_NV12TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - NV12ToARGBRow = NV12ToARGBRow_Any_DSPR2; - if (IS_ALIGNED(width, 8)) { - NV12ToARGBRow = NV12ToARGBRow_DSPR2; - } - } -#endif #if defined(HAS_NV12TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { NV12ToARGBRow = NV12ToARGBRow_Any_MSA; @@ -1371,20 +1698,20 @@ static int NV12ToARGBMatrix(const uint8* src_y, } // Convert NV21 to ARGB with matrix -static int NV21ToARGBMatrix(const uint8* src_y, +static int NV21ToARGBMatrix(const uint8_t* src_y, int src_stride_y, - const uint8* src_uv, - int src_stride_uv, - uint8* dst_argb, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height) { int y; - void (*NV21ToARGBRow)(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - NV21ToARGBRow_C; - if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { + void (*NV21ToARGBRow)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C; + if (!src_y || !src_vu || !dst_argb || width <= 
0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1427,11 +1754,11 @@ static int NV21ToARGBMatrix(const uint8* src_y, #endif for (y = 0; y < height; ++y) { - NV21ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); + NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; if (y & 1) { - src_uv += src_stride_uv; + src_vu += src_stride_vu; } } return 0; @@ -1439,11 +1766,11 @@ static int NV21ToARGBMatrix(const uint8* src_y, // Convert NV12 to ARGB. LIBYUV_API -int NV12ToARGB(const uint8* src_y, +int NV12ToARGB(const uint8_t* src_y, int src_stride_y, - const uint8* src_uv, + const uint8_t* src_uv, int src_stride_uv, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { @@ -1453,15 +1780,15 @@ int NV12ToARGB(const uint8* src_y, // Convert NV21 to ARGB. LIBYUV_API -int NV21ToARGB(const uint8* src_y, +int NV21ToARGB(const uint8_t* src_y, int src_stride_y, - const uint8* src_uv, - int src_stride_uv, - uint8* dst_argb, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { - return NV21ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb, + return NV21ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_argb, dst_stride_argb, &kYuvI601Constants, width, height); } @@ -1469,11 +1796,11 @@ int NV21ToARGB(const uint8* src_y, // To output ABGR instead of ARGB swap the UV and use a mirrored yuv matrix. // To swap the UV use NV12 instead of NV21.LIBYUV_API LIBYUV_API -int NV12ToABGR(const uint8* src_y, +int NV12ToABGR(const uint8_t* src_y, int src_stride_y, - const uint8* src_uv, + const uint8_t* src_uv, int src_stride_uv, - uint8* dst_abgr, + uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { @@ -1483,11 +1810,11 @@ int NV12ToABGR(const uint8* src_y, // Convert NV21 to ABGR. LIBYUV_API -int NV21ToABGR(const uint8* src_y, +int NV21ToABGR(const uint8_t* src_y, int src_stride_y, - const uint8* src_vu, + const uint8_t* src_vu, int src_stride_vu, - uint8* dst_abgr, + uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { @@ -1495,18 +1822,243 @@ int NV21ToABGR(const uint8* src_y, dst_stride_abgr, &kYvuI601Constants, width, height); } +// TODO(fbarchard): Consider SSSE3 2 step conversion. +// Convert NV12 to RGB24 with matrix +static int NV12ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*NV12ToRGB24Row)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToRGB24Row_C; + if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_NV12TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV12ToRGB24Row = NV12ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB24Row = NV12ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_NV12TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV12ToRGB24Row = NV12ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + NV12ToRGB24Row = NV12ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_NV12TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToRGB24Row = NV12ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + NV12ToRGB24Row = NV12ToRGB24Row_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert NV21 to RGB24 with matrix +static int NV21ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*NV21ToRGB24Row)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV21ToRGB24Row_C; + if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_NV21TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV21ToRGB24Row = NV21ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV21ToRGB24Row = NV21ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_NV21TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV21ToRGB24Row = NV21ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + NV21ToRGB24Row = NV21ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_NV21TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV21ToRGB24Row = NV21ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + NV21ToRGB24Row = NV21ToRGB24Row_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + if (y & 1) { + src_vu += src_stride_vu; + } + } + return 0; +} + +// Convert NV12 to RGB24. +LIBYUV_API +int NV12ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return NV12ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, + dst_rgb24, dst_stride_rgb24, &kYuvI601Constants, + width, height); +} + +// Convert NV21 to RGB24. +LIBYUV_API +int NV21ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return NV21ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, + dst_rgb24, dst_stride_rgb24, &kYuvI601Constants, + width, height); +} + +// Convert NV12 to RAW. 
+LIBYUV_API +int NV12ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_raw, + dst_stride_raw, &kYvuI601Constants, width, height); +} + +// Convert NV21 to RAW. +LIBYUV_API +int NV21ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return NV12ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_raw, + dst_stride_raw, &kYvuI601Constants, width, height); +} + +// Convert NV21 to YUV24 +int NV21ToYUV24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_yuv24, + int dst_stride_yuv24, + int width, + int height) { + int y; + void (*NV21ToYUV24Row)(const uint8_t* src_y, const uint8_t* src_vu, + uint8_t* dst_yuv24, int width) = NV21ToYUV24Row_C; + if (!src_y || !src_vu || !dst_yuv24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_yuv24 = dst_yuv24 + (height - 1) * dst_stride_yuv24; + dst_stride_yuv24 = -dst_stride_yuv24; + } +#if defined(HAS_NV21TOYUV24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV21ToYUV24Row = NV21ToYUV24Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + NV21ToYUV24Row = NV21ToYUV24Row_NEON; + } + } +#endif +#if defined(HAS_NV21TOYUV24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV21ToYUV24Row = NV21ToYUV24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + NV21ToYUV24Row = NV21ToYUV24Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + NV21ToYUV24Row(src_y, src_vu, dst_yuv24, width); + dst_yuv24 += dst_stride_yuv24; + src_y += src_stride_y; + if (y & 1) { + src_vu += src_stride_vu; + } + } + return 0; +} + // Convert M420 to ARGB. LIBYUV_API -int M420ToARGB(const uint8* src_m420, +int M420ToARGB(const uint8_t* src_m420, int src_stride_m420, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*NV12ToARGBRow)(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - NV12ToARGBRow_C; + void (*NV12ToARGBRow)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; if (!src_m420 || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1540,14 +2092,6 @@ int M420ToARGB(const uint8* src_m420, } } #endif -#if defined(HAS_NV12TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - NV12ToARGBRow = NV12ToARGBRow_Any_DSPR2; - if (IS_ALIGNED(width, 8)) { - NV12ToARGBRow = NV12ToARGBRow_DSPR2; - } - } -#endif #if defined(HAS_NV12TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { NV12ToARGBRow = NV12ToARGBRow_Any_MSA; @@ -1574,14 +2118,14 @@ int M420ToARGB(const uint8* src_m420, // Convert YUY2 to ARGB. LIBYUV_API -int YUY2ToARGB(const uint8* src_yuy2, +int YUY2ToARGB(const uint8_t* src_yuy2, int src_stride_yuy2, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, + void (*YUY2ToARGBRow)(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) = YUY2ToARGBRow_C; if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) { @@ -1641,14 +2185,14 @@ int YUY2ToARGB(const uint8* src_yuy2, // Convert UYVY to ARGB. 
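YUY2ToARGB above and UYVYToARGB below handle the two common packed 4:2:2 byte orders; each 4-byte group carries two luma samples and one shared chroma pair. A sketch of splitting one group, with the orderings as commonly defined (YUY2 = Y0 U Y1 V, UYVY = U Y0 V Y1):

#include <stdint.h>

// Pull two pixels' worth of Y plus the shared U/V out of one packed group.
static void SplitYUY2(const uint8_t p[4], uint8_t* y0, uint8_t* y1,
                      uint8_t* u, uint8_t* v) {
  *y0 = p[0];
  *u = p[1];
  *y1 = p[2];
  *v = p[3];
}

static void SplitUYVY(const uint8_t p[4], uint8_t* y0, uint8_t* y1,
                      uint8_t* u, uint8_t* v) {
  *u = p[0];
  *y0 = p[1];
  *v = p[2];
  *y1 = p[3];
}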
LIBYUV_API -int UYVYToARGB(const uint8* src_uyvy, +int UYVYToARGB(const uint8_t* src_uyvy, int src_stride_uyvy, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, + void (*UYVYToARGBRow)(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) = UYVYToARGBRow_C; if (!src_uyvy || !dst_argb || width <= 0 || height == 0) { @@ -1705,10 +2249,10 @@ int UYVYToARGB(const uint8* src_uyvy, } return 0; } -static void WeavePixels(const uint8* src_u, - const uint8* src_v, +static void WeavePixels(const uint8_t* src_u, + const uint8_t* src_v, int src_pixel_stride_uv, - uint8* dst_uv, + uint8_t* dst_uv, int width) { int i; for (i = 0; i < width; ++i) { @@ -1722,20 +2266,20 @@ static void WeavePixels(const uint8* src_u, // Convert Android420 to ARGB. LIBYUV_API -int Android420ToARGBMatrix(const uint8* src_y, +int Android420ToARGBMatrix(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, int src_pixel_stride_uv, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height) { int y; - uint8* dst_uv; + uint8_t* dst_uv; const ptrdiff_t vu_off = src_v - src_u; int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; @@ -1756,13 +2300,14 @@ int Android420ToARGBMatrix(const uint8* src_y, src_stride_v, dst_argb, dst_stride_argb, yuvconstants, width, height); // NV21 - } else if (src_pixel_stride_uv == 2 && vu_off == -1 && - src_stride_u == src_stride_v) { + } + if (src_pixel_stride_uv == 2 && vu_off == -1 && + src_stride_u == src_stride_v) { return NV21ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, dst_argb, dst_stride_argb, yuvconstants, width, height); // NV12 - } else if (src_pixel_stride_uv == 2 && vu_off == 1 && - src_stride_u == src_stride_v) { + } + if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) { return NV12ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, dst_argb, dst_stride_argb, yuvconstants, width, height); } @@ -1784,14 +2329,14 @@ int Android420ToARGBMatrix(const uint8* src_y, // Convert Android420 to ARGB. LIBYUV_API -int Android420ToARGB(const uint8* src_y, +int Android420ToARGB(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, int src_pixel_stride_uv, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { @@ -1803,14 +2348,14 @@ int Android420ToARGB(const uint8* src_y, // Convert Android420 to ABGR. LIBYUV_API -int Android420ToABGR(const uint8* src_y, +int Android420ToABGR(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, int src_pixel_stride_uv, - uint8* dst_abgr, + uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { diff --git a/files/source/convert_from.cc b/files/source/convert_from.cc index e0ebfb08..60140cb4 100644 --- a/files/source/convert_from.cc +++ b/files/source/convert_from.cc @@ -30,17 +30,17 @@ static __inline int Abs(int v) { } // I420 To any I4xx YUV format with mirroring. 
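I420ToI4xx below only rescales the chroma planes (half width and half height for 420, half width and full height for 422, full size for 444) and copies Y. The new I420ToI010 that follows keeps the I420 geometry and widens every 8-bit sample to a 10-bit range; assuming the 1024 scale passed to Convert8To16Plane amounts to a multiply by 4, the per-sample step is just a two-bit left shift, sketched here:

#include <stdint.h>

// Widen one row of 8-bit samples to a 10-bit range (0..255 -> 0..1020).
static void WidenRow8To10(const uint8_t* src, uint16_t* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = (uint16_t)(src[x] << 2);
  }
}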
-static int I420ToI4xx(const uint8* src_y, +static int I420ToI4xx(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int src_y_width, int src_y_height, @@ -65,20 +65,64 @@ static int I420ToI4xx(const uint8* src_y, return 0; } +// Convert 8 bit YUV to 10 bit. +LIBYUV_API +int I420ToI010(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. + Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width, + height); + // Convert UV planes. + Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth, + halfheight); + Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth, + halfheight); + return 0; +} + // 420 chroma is 1/2 width, 1/2 height // 422 chroma is 1/2 width, 1x height LIBYUV_API -int I420ToI422(const uint8* src_y, +int I420ToI422(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { @@ -93,17 +137,17 @@ int I420ToI422(const uint8* src_y, // 420 chroma is 1/2 width, 1/2 height // 444 chroma is 1x width, 1x height LIBYUV_API -int I420ToI444(const uint8* src_y, +int I420ToI444(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { @@ -117,9 +161,9 @@ int I420ToI444(const uint8* src_y, // Copy to I400. 
Source can be I420,422,444,400,NV12,NV21 LIBYUV_API -int I400Copy(const uint8* src_y, +int I400Copy(const uint8_t* src_y, int src_stride_y, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, int width, int height) { @@ -137,19 +181,19 @@ int I400Copy(const uint8* src_y, } LIBYUV_API -int I422ToYUY2(const uint8* src_y, +int I422ToYUY2(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_yuy2, + uint8_t* dst_yuy2, int dst_stride_yuy2, int width, int height) { int y; - void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_yuy2, int width) = + void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_yuy2, int width) = I422ToYUY2Row_C; if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) { return -1; @@ -175,6 +219,14 @@ int I422ToYUY2(const uint8* src_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToYUY2Row = I422ToYUY2Row_Any_NEON; @@ -195,19 +247,19 @@ int I422ToYUY2(const uint8* src_y, } LIBYUV_API -int I420ToYUY2(const uint8* src_y, +int I420ToYUY2(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_yuy2, + uint8_t* dst_yuy2, int dst_stride_yuy2, int width, int height) { int y; - void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_yuy2, int width) = + void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_yuy2, int width) = I422ToYUY2Row_C; if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) { return -1; @@ -226,6 +278,14 @@ int I420ToYUY2(const uint8* src_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToYUY2Row = I422ToYUY2Row_Any_NEON; @@ -242,6 +302,14 @@ int I420ToYUY2(const uint8* src_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + I422ToYUY2Row = I422ToYUY2Row_Any_MMI; + if (IS_ALIGNED(width, 8)) { + I422ToYUY2Row = I422ToYUY2Row_MMI; + } + } +#endif for (y = 0; y < height - 1; y += 2) { I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); @@ -259,19 +327,19 @@ int I420ToYUY2(const uint8* src_y, } LIBYUV_API -int I422ToUYVY(const uint8* src_y, +int I422ToUYVY(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_uyvy, + uint8_t* dst_uyvy, int dst_stride_uyvy, int width, int height) { int y; - void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_uyvy, int width) = + void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_uyvy, int width) = I422ToUYVYRow_C; if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) { return -1; @@ -297,6 +365,14 @@ int I422ToUYVY(const uint8* src_y, } } 
#endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif #if defined(HAS_I422TOUYVYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToUYVYRow = I422ToUYVYRow_Any_NEON; @@ -313,6 +389,14 @@ int I422ToUYVY(const uint8* src_y, } } #endif +#if defined(HAS_I422TOUYVYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + I422ToUYVYRow = I422ToUYVYRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); @@ -325,19 +409,19 @@ int I422ToUYVY(const uint8* src_y, } LIBYUV_API -int I420ToUYVY(const uint8* src_y, +int I420ToUYVY(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_uyvy, + uint8_t* dst_uyvy, int dst_stride_uyvy, int width, int height) { int y; - void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_uyvy, int width) = + void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_uyvy, int width) = I422ToUYVYRow_C; if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) { return -1; @@ -356,6 +440,14 @@ int I420ToUYVY(const uint8* src_y, } } #endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif #if defined(HAS_I422TOUYVYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToUYVYRow = I422ToUYVYRow_Any_NEON; @@ -372,6 +464,14 @@ int I420ToUYVY(const uint8* src_y, } } #endif +#if defined(HAS_I422TOUYVYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + I422ToUYVYRow = I422ToUYVYRow_MMI; + } + } +#endif for (y = 0; y < height - 1; y += 2) { I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); @@ -390,15 +490,15 @@ int I420ToUYVY(const uint8* src_y, // TODO(fbarchard): test negative height for invert. 
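I420ToNV12 and I420ToNV21 below copy the Y plane unchanged and interleave the two quarter-size chroma planes into a single packed plane, which is what the MergeUVRow variants selected elsewhere in this change implement. A scalar sketch of that interleave (for NV21, pass the V plane as src_u and the U plane as src_v):

#include <stdint.h>

// Interleave one row of U and V into the packed UV plane used by NV12.
static void MergeUVRowScalar(const uint8_t* src_u, const uint8_t* src_v,
                             uint8_t* dst_uv, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[0] = src_u[x];
    dst_uv[1] = src_v[x];
    dst_uv += 2;
  }
}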
LIBYUV_API -int I420ToNV12(const uint8* src_y, +int I420ToNV12(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_uv, + uint8_t* dst_uv, int dst_stride_uv, int width, int height) { @@ -417,15 +517,15 @@ int I420ToNV12(const uint8* src_y, } LIBYUV_API -int I420ToNV21(const uint8* src_y, +int I420ToNV21(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_vu, + uint8_t* dst_vu, int dst_stride_vu, int width, int height) { @@ -435,20 +535,20 @@ int I420ToNV21(const uint8* src_y, } // Convert I422 to RGBA with matrix -static int I420ToRGBAMatrix(const uint8* src_y, +static int I420ToRGBAMatrix(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_rgba, + uint8_t* dst_rgba, int dst_stride_rgba, const struct YuvConstants* yuvconstants, int width, int height) { int y; - void (*I422ToRGBARow)(const uint8* y_buf, const uint8* u_buf, - const uint8* v_buf, uint8* rgb_buf, + void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGBARow_C; if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { @@ -484,15 +584,6 @@ static int I420ToRGBAMatrix(const uint8* src_y, } } #endif -#if defined(HAS_I422TORGBAROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) { - I422ToRGBARow = I422ToRGBARow_DSPR2; - } -#endif #if defined(HAS_I422TORGBAROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToRGBARow = I422ToRGBARow_Any_MSA; @@ -516,13 +607,13 @@ static int I420ToRGBAMatrix(const uint8* src_y, // Convert I420 to RGBA. LIBYUV_API -int I420ToRGBA(const uint8* src_y, +int I420ToRGBA(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_rgba, + uint8_t* dst_rgba, int dst_stride_rgba, int width, int height) { @@ -533,13 +624,13 @@ int I420ToRGBA(const uint8* src_y, // Convert I420 to BGRA. 
LIBYUV_API -int I420ToBGRA(const uint8* src_y, +int I420ToBGRA(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_bgra, + uint8_t* dst_bgra, int dst_stride_bgra, int width, int height) { @@ -551,20 +642,20 @@ int I420ToBGRA(const uint8* src_y, } // Convert I420 to RGB24 with matrix -static int I420ToRGB24Matrix(const uint8* src_y, +static int I420ToRGB24Matrix(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_rgb24, + uint8_t* dst_rgb24, int dst_stride_rgb24, const struct YuvConstants* yuvconstants, int width, int height) { int y; - void (*I422ToRGB24Row)(const uint8* y_buf, const uint8* u_buf, - const uint8* v_buf, uint8* rgb_buf, + void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGB24Row_C; if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { @@ -579,7 +670,7 @@ static int I420ToRGB24Matrix(const uint8* src_y, #if defined(HAS_I422TORGB24ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { I422ToRGB24Row = I422ToRGB24Row_SSSE3; } } @@ -587,7 +678,7 @@ static int I420ToRGB24Matrix(const uint8* src_y, #if defined(HAS_I422TORGB24ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422ToRGB24Row = I422ToRGB24Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { + if (IS_ALIGNED(width, 32)) { I422ToRGB24Row = I422ToRGB24Row_AVX2; } } @@ -623,13 +714,13 @@ static int I420ToRGB24Matrix(const uint8* src_y, // Convert I420 to RGB24. LIBYUV_API -int I420ToRGB24(const uint8* src_y, +int I420ToRGB24(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_rgb24, + uint8_t* dst_rgb24, int dst_stride_rgb24, int width, int height) { @@ -640,13 +731,13 @@ int I420ToRGB24(const uint8* src_y, // Convert I420 to RAW. LIBYUV_API -int I420ToRAW(const uint8* src_y, +int I420ToRAW(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_raw, + uint8_t* dst_raw, int dst_stride_raw, int width, int height) { @@ -657,21 +748,57 @@ int I420ToRAW(const uint8* src_y, width, height); } +// Convert H420 to RGB24. +LIBYUV_API +int H420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvH709Constants, width, height); +} + +// Convert H420 to RAW. +LIBYUV_API +int H420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + // Convert I420 to ARGB1555. 
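H420ToRAW above reuses the RGB24 path by swapping the U and V pointers and passing the YVU-mirrored matrix, which byte-reverses the output order; the ARGB1555 and ARGB4444 converters that follow instead convert to full 8-bit values in the row kernel and truncate on store. A sketch of that final packing, assuming the usual little-endian layouts (1-5-5-5 with alpha in the top bit, and 4-4-4-4):

#include <stdint.h>

// Pack 8-bit A,R,G,B down to ARGB1555 and ARGB4444 by truncation.
static uint16_t PackARGB1555(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
  return (uint16_t)(((a >> 7) << 15) | ((r >> 3) << 10) | ((g >> 3) << 5) |
                    (b >> 3));
}

static uint16_t PackARGB4444(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
  return (uint16_t)(((a >> 4) << 12) | ((r >> 4) << 8) | ((g >> 4) << 4) |
                    (b >> 4));
}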
LIBYUV_API -int I420ToARGB1555(const uint8* src_y, +int I420ToARGB1555(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_argb1555, + uint8_t* dst_argb1555, int dst_stride_argb1555, int width, int height) { int y; - void (*I422ToARGB1555Row)(const uint8* y_buf, const uint8* u_buf, - const uint8* v_buf, uint8* rgb_buf, + void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGB1555Row_C; if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 || @@ -708,14 +835,6 @@ int I420ToARGB1555(const uint8* src_y, } } #endif -#if defined(HAS_I422TOARGB1555ROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_DSPR2; - if (IS_ALIGNED(width, 4)) { - I422ToARGB1555Row = I422ToARGB1555Row_DSPR2; - } - } -#endif #if defined(HAS_I422TOARGB1555ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA; @@ -740,19 +859,19 @@ int I420ToARGB1555(const uint8* src_y, // Convert I420 to ARGB4444. LIBYUV_API -int I420ToARGB4444(const uint8* src_y, +int I420ToARGB4444(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_argb4444, + uint8_t* dst_argb4444, int dst_stride_argb4444, int width, int height) { int y; - void (*I422ToARGB4444Row)(const uint8* y_buf, const uint8* u_buf, - const uint8* v_buf, uint8* rgb_buf, + void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGB4444Row_C; if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 || @@ -789,14 +908,6 @@ int I420ToARGB4444(const uint8* src_y, } } #endif -#if defined(HAS_I422TOARGB4444ROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - I422ToARGB4444Row = I422ToARGB4444Row_Any_DSPR2; - if (IS_ALIGNED(width, 4)) { - I422ToARGB4444Row = I422ToARGB4444Row_DSPR2; - } - } -#endif #if defined(HAS_I422TOARGB4444ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA; @@ -821,20 +932,20 @@ int I420ToARGB4444(const uint8* src_y, // Convert I420 to RGB565 with specified color matrix. LIBYUV_API -int I420ToRGB565Matrix(const uint8* src_y, - int src_stride_y, - const uint8* src_u, - int src_stride_u, - const uint8* src_v, - int src_stride_v, - uint8* dst_rgb565, - int dst_stride_rgb565, - const struct YuvConstants* yuvconstants, - int width, - int height) { +int I420ToRGB565Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; - void (*I422ToRGB565Row)(const uint8* y_buf, const uint8* u_buf, - const uint8* v_buf, uint8* rgb_buf, + void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGB565Row_C; if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { @@ -893,94 +1004,70 @@ int I420ToRGB565Matrix(const uint8* src_y, // Convert I420 to RGB565. 
LIBYUV_API -int I420ToRGB565(const uint8* src_y, +int I420ToRGB565(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_rgb565, + uint8_t* dst_rgb565, int dst_stride_rgb565, int width, int height) { - return I420ToRGB565Matrix(src_y, - src_stride_y, - src_u, - src_stride_u, - src_v, - src_stride_v, - dst_rgb565, - dst_stride_rgb565, - &kYuvI601Constants, - width, - height); + return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb565, dst_stride_rgb565, + &kYuvI601Constants, width, height); } // Convert J420 to RGB565. LIBYUV_API -int J420ToRGB565(const uint8* src_y, +int J420ToRGB565(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_rgb565, + uint8_t* dst_rgb565, int dst_stride_rgb565, int width, int height) { - return I420ToRGB565Matrix(src_y, - src_stride_y, - src_u, - src_stride_u, - src_v, - src_stride_v, - dst_rgb565, - dst_stride_rgb565, - &kYuvJPEGConstants, - width, - height); + return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb565, dst_stride_rgb565, + &kYuvJPEGConstants, width, height); } // Convert H420 to RGB565. LIBYUV_API -int H420ToRGB565(const uint8* src_y, +int H420ToRGB565(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_rgb565, + uint8_t* dst_rgb565, int dst_stride_rgb565, int width, int height) { - return I420ToRGB565Matrix(src_y, - src_stride_y, - src_u, - src_stride_u, - src_v, - src_stride_v, - dst_rgb565, - dst_stride_rgb565, - &kYuvH709Constants, - width, - height); + return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb565, dst_stride_rgb565, + &kYuvH709Constants, width, height); } // Convert I422 to RGB565. LIBYUV_API -int I422ToRGB565(const uint8* src_y, +int I422ToRGB565(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_rgb565, + uint8_t* dst_rgb565, int dst_stride_rgb565, int width, int height) { int y; - void (*I422ToRGB565Row)(const uint8* y_buf, const uint8* u_buf, - const uint8* v_buf, uint8* rgb_buf, + void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGB565Row_C; if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { @@ -1036,30 +1123,30 @@ int I422ToRGB565(const uint8* src_y, } // Ordered 8x8 dither for 888 to 565. Values from 0 to 7. -static const uint8 kDither565_4x4[16] = { +static const uint8_t kDither565_4x4[16] = { 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, }; // Convert I420 to RGB565 with dithering. 
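I420ToRGB565Dither below converts each row to ARGB and then packs it through ARGBToRGB565DitherRow, pulling one 4-byte row of the kDither565_4x4 table above per output line (the (y & 3) << 2 index in the loop). A sketch of the per-pixel step, assuming the usual add-clamp-truncate formulation where the same 0..7 bias is added to every channel before the 565 pack:

#include <stdint.h>

static uint8_t Clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// Pack one 8-bit B,G,R pixel to RGB565 with ordered-dither bias d (0..7)
// taken from the dither table entry for this pixel's (x & 3, y & 3) position.
static uint16_t DitherTo565(uint8_t b, uint8_t g, uint8_t r, int d) {
  const int b5 = Clamp255(b + d) >> 3;
  const int g6 = Clamp255(g + d) >> 2;
  const int r5 = Clamp255(r + d) >> 3;
  return (uint16_t)((r5 << 11) | (g6 << 5) | b5);
}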
LIBYUV_API -int I420ToRGB565Dither(const uint8* src_y, +int I420ToRGB565Dither(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_rgb565, + uint8_t* dst_rgb565, int dst_stride_rgb565, - const uint8* dither4x4, + const uint8_t* dither4x4, int width, int height) { int y; - void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, - const uint8* v_buf, uint8* rgb_buf, + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGBRow_C; - void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) = + void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, + const uint32_t dither4, int width) = ARGBToRGB565DitherRow_C; if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; @@ -1097,14 +1184,6 @@ int I420ToRGB565Dither(const uint8* src_y, } } #endif -#if defined(HAS_I422TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) { - I422ToARGBRow = I422ToARGBRow_DSPR2; - } -#endif #if defined(HAS_I422TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToARGBRow = I422ToARGBRow_Any_MSA; @@ -1151,8 +1230,8 @@ int I420ToRGB565Dither(const uint8* src_y, for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width); ARGBToRGB565DitherRow(row_argb, dst_rgb565, - *(uint32*)(dither4x4 + ((y & 3) << 2)), - width); // NOLINT + *(const uint32_t*)(dither4x4 + ((y & 3) << 2)), + width); dst_rgb565 += dst_stride_rgb565; src_y += src_stride_y; if (y & 1) { @@ -1165,20 +1244,111 @@ int I420ToRGB565Dither(const uint8* src_y, return 0; } +// Convert I420 to AR30 with matrix +static int I420ToAR30Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToAR30Row_C; + + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } + +#if defined(HAS_I422TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToAR30Row = I422ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToAR30Row = I422ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToAR30Row = I422ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToAR30Row = I422ToAR30Row_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to AR30. 
+LIBYUV_API +int I420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvI601Constants, width, height); +} + +// Convert H420 to AR30. +LIBYUV_API +int H420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYvuH709Constants, width, height); +} + // Convert I420 to specified format LIBYUV_API -int ConvertFromI420(const uint8* y, +int ConvertFromI420(const uint8_t* y, int y_stride, - const uint8* u, + const uint8_t* u, int u_stride, - const uint8* v, + const uint8_t* v, int v_stride, - uint8* dst_sample, + uint8_t* dst_sample, int dst_sample_stride, int width, int height, - uint32 fourcc) { - uint32 format = CanonicalFourCC(fourcc); + uint32_t fourcc) { + uint32_t format = CanonicalFourCC(fourcc); int r = 0; if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) { return -1; @@ -1240,13 +1410,18 @@ int ConvertFromI420(const uint8* y, dst_sample_stride ? dst_sample_stride : width * 4, width, height); break; + case FOURCC_AR30: + r = I420ToAR30(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); + break; case FOURCC_I400: r = I400Copy(y, y_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width, width, height); break; case FOURCC_NV12: { - uint8* dst_uv = dst_sample + width * height; + uint8_t* dst_uv = dst_sample + width * height; r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width, dst_uv, dst_sample_stride ? dst_sample_stride : width, width, @@ -1254,7 +1429,7 @@ int ConvertFromI420(const uint8* y, break; } case FOURCC_NV21: { - uint8* dst_vu = dst_sample + width * height; + uint8_t* dst_vu = dst_sample + width * height; r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width, dst_vu, dst_sample_stride ? dst_sample_stride : width, width, @@ -1268,8 +1443,8 @@ int ConvertFromI420(const uint8* y, dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; int halfstride = (dst_sample_stride + 1) / 2; int halfheight = (height + 1) / 2; - uint8* dst_u; - uint8* dst_v; + uint8_t* dst_u; + uint8_t* dst_v; if (format == FOURCC_YV12) { dst_v = dst_sample + dst_sample_stride * height; dst_u = dst_v + halfstride * halfheight; @@ -1286,8 +1461,8 @@ int ConvertFromI420(const uint8* y, case FOURCC_YV16: { dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; int halfstride = (dst_sample_stride + 1) / 2; - uint8* dst_u; - uint8* dst_v; + uint8_t* dst_u; + uint8_t* dst_v; if (format == FOURCC_YV16) { dst_v = dst_sample + dst_sample_stride * height; dst_u = dst_v + halfstride * height; @@ -1303,8 +1478,8 @@ int ConvertFromI420(const uint8* y, case FOURCC_I444: case FOURCC_YV24: { dst_sample_stride = dst_sample_stride ? 
dst_sample_stride : width; - uint8* dst_u; - uint8* dst_v; + uint8_t* dst_u; + uint8_t* dst_v; if (format == FOURCC_YV24) { dst_v = dst_sample + dst_sample_stride * height; dst_u = dst_v + dst_sample_stride * height; diff --git a/files/source/convert_from_argb.cc b/files/source/convert_from_argb.cc index 88f38279..fbcd039d 100644 --- a/files/source/convert_from_argb.cc +++ b/files/source/convert_from_argb.cc @@ -22,21 +22,21 @@ extern "C" { // ARGB little endian (bgra in memory) to I444 LIBYUV_API -int ARGBToI444(const uint8* src_argb, +int ARGBToI444(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width) = ARGBToUV444Row_C; + void (*ARGBToUV444Row)(const uint8_t* src_argb, uint8_t* dst_u, + uint8_t* dst_v, int width) = ARGBToUV444Row_C; if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } @@ -76,6 +76,14 @@ int ARGBToI444(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTOUV444ROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToUV444Row = ARGBToUV444Row_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToUV444Row = ARGBToUV444Row_MMI; + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -100,14 +108,6 @@ int ARGBToI444(const uint8* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - ARGBToYRow = ARGBToYRow_Any_DSPR2; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_DSPR2; - } - } -#endif #if defined(HAS_ARGBTOYROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYRow = ARGBToYRow_Any_MSA; @@ -116,6 +116,14 @@ int ARGBToI444(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYRow = ARGBToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToUV444Row(src_argb, dst_u, dst_v, width); @@ -130,20 +138,21 @@ int ARGBToI444(const uint8* src_argb, // ARGB little endian (bgra in memory) to I422 LIBYUV_API -int ARGBToI422(const uint8* src_argb, +int ARGBToI422(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, - uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; @@ -197,22 +206,6 @@ int ARGBToI422(const uint8* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - ARGBToYRow = ARGBToYRow_Any_DSPR2; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_DSPR2; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - 
ARGBToUVRow = ARGBToUVRow_Any_DSPR2; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_DSPR2; - } - } -#endif #if defined(HAS_ARGBTOYROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { @@ -231,6 +224,23 @@ int ARGBToI422(const uint8* src_argb, } #endif +#if defined(HAS_ARGBTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYRow = ARGBToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_MMI; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToUVRow = ARGBToUVRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_MMI; + } + } +#endif + for (y = 0; y < height; ++y) { ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); ARGBToYRow(src_argb, dst_y, width); @@ -243,22 +253,23 @@ int ARGBToI422(const uint8* src_argb, } LIBYUV_API -int ARGBToNV12(const uint8* src_argb, +int ARGBToNV12(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_uv, + uint8_t* dst_uv, int dst_stride_uv, int width, int height) { int y; int halfwidth = (width + 1) >> 1; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, - uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) = MergeUVRow_C; + void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_uv, int width) = MergeUVRow_C; if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } @@ -320,6 +331,22 @@ int ARGBToNV12(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYRow = ARGBToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_MMI; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToUVRow = ARGBToUVRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_MMI; + } + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -344,22 +371,6 @@ int ARGBToNV12(const uint8* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - ARGBToYRow = ARGBToYRow_Any_DSPR2; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_DSPR2; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - ARGBToUVRow = ARGBToUVRow_Any_DSPR2; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_DSPR2; - } - } -#endif #if defined(HAS_MERGEUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { MergeUVRow_ = MergeUVRow_Any_MSA; @@ -368,10 +379,18 @@ int ARGBToNV12(const uint8* src_argb, } } #endif +#if defined(HAS_MERGEUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + MergeUVRow_ = MergeUVRow_Any_MMI; + if (IS_ALIGNED(halfwidth, 8)) { + MergeUVRow_ = MergeUVRow_MMI; + } + } +#endif { // Allocate a rows of uv. align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); - uint8* row_v = row_u + ((halfwidth + 31) & ~31); + uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); @@ -394,23 +413,24 @@ int ARGBToNV12(const uint8* src_argb, // Same as NV12 but U and V swapped. 
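For orientation, a minimal caller sketch for the ARGBToNV21 entry point whose diff follows; the helper name and buffer sizing are illustrative, and the umbrella libyuv.h header is assumed to be on the include path.

#include <cstdint>
#include <vector>
#include "libyuv.h"  // assumed umbrella header exposing ARGBToNV21

// Convert one width x height ARGB (BGRA byte order) frame to NV21:
// a full-size Y plane followed by an interleaved VU plane at half resolution.
bool ArgbFrameToNV21(const uint8_t* argb, int width, int height,
                     std::vector<uint8_t>* nv21) {
  const int halfwidth = (width + 1) / 2;
  const int halfheight = (height + 1) / 2;
  const int y_size = width * height;
  nv21->resize(y_size + 2 * halfwidth * halfheight);
  uint8_t* dst_y = nv21->data();
  uint8_t* dst_vu = dst_y + y_size;  // V and U bytes interleaved per 2x2 block
  return libyuv::ARGBToNV21(argb, width * 4, dst_y, width,
                            dst_vu, halfwidth * 2, width, height) == 0;
}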
LIBYUV_API -int ARGBToNV21(const uint8* src_argb, +int ARGBToNV21(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_uv, - int dst_stride_uv, + uint8_t* dst_vu, + int dst_stride_vu, int width, int height) { int y; int halfwidth = (width + 1) >> 1; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, - uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) = MergeUVRow_C; - if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) { + void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_vu, int width) = MergeUVRow_C; + if (!src_argb || !dst_y || !dst_vu || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -471,6 +491,23 @@ int ARGBToNV21(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYRow = ARGBToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_MMI; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToUVRow = ARGBToUVRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_MMI; + } + } +#endif + #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -495,19 +532,162 @@ int ARGBToNV21(const uint8* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - ARGBToYRow = ARGBToYRow_Any_DSPR2; +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow_ = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } +#endif +#if defined(HAS_MERGEUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + MergeUVRow_ = MergeUVRow_Any_MMI; + if (IS_ALIGNED(halfwidth, 8)) { + MergeUVRow_ = MergeUVRow_MMI; + } + } +#endif + { + // Allocate a rows of uv. 
+ align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); + uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_vu += dst_stride_vu; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + } + free_aligned_buffer_64(row_u); + } + return 0; +} + +LIBYUV_API +int ABGRToNV12(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ABGRToUVRow_C; + void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) = + ABGRToYRow_C; + void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_uv, int width) = MergeUVRow_C; + if (!src_abgr || !dst_y || !dst_uv || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } +#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVRow = ABGRToUVRow_Any_SSSE3; + ABGRToYRow = ABGRToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_SSSE3; + ABGRToYRow = ABGRToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToUVRow = ABGRToUVRow_Any_AVX2; + ABGRToYRow = ABGRToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToUVRow = ABGRToUVRow_AVX2; + ABGRToYRow = ABGRToYRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToYRow = ABGRToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ABGRToYRow = ABGRToYRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToUVRow = ABGRToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYRow = ABGRToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_MSA; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToUVRow = ABGRToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ABGRToUVRow = ABGRToUVRow_MSA; + } + } +#endif +#if defined(HAS_ABGRTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ABGRToYRow = ABGRToYRow_Any_MMI; if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_DSPR2; + ABGRToYRow = ABGRToYRow_MMI; } } #endif -#if defined(HAS_ARGBTOUVROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - ARGBToUVRow = ARGBToUVRow_Any_DSPR2; +#if defined(HAS_ABGRTOUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ABGRToUVRow = ABGRToUVRow_Any_MMI; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_DSPR2; + ABGRToUVRow = ABGRToUVRow_MMI; + } + } +#endif +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeUVRow_ = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_SSE2; + } + } +#endif +#if 
defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow_ = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow_ = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow_ = MergeUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_NEON; } } #endif @@ -519,24 +699,32 @@ int ARGBToNV21(const uint8* src_argb, } } #endif +#if defined(HAS_MERGEUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + MergeUVRow_ = MergeUVRow_Any_MMI; + if (IS_ALIGNED(halfwidth, 8)) { + MergeUVRow_ = MergeUVRow_MMI; + } + } +#endif { // Allocate a rows of uv. align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); - uint8* row_v = row_u + ((halfwidth + 31) & ~31); + uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); for (y = 0; y < height - 1; y += 2) { - ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_uv, halfwidth); - ARGBToYRow(src_argb, dst_y, width); - ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); - src_argb += src_stride_argb * 2; + ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width); + MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + ABGRToYRow(src_abgr, dst_y, width); + ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); + src_abgr += src_stride_abgr * 2; dst_y += dst_stride_y * 2; dst_uv += dst_stride_uv; } if (height & 1) { - ARGBToUVRow(src_argb, 0, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_uv, halfwidth); - ARGBToYRow(src_argb, dst_y, width); + ABGRToUVRow(src_abgr, 0, row_u, row_v, width); + MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + ABGRToYRow(src_abgr, dst_y, width); } free_aligned_buffer_64(row_u); } @@ -545,19 +733,20 @@ int ARGBToNV21(const uint8* src_argb, // Convert ARGB to YUY2. 
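As a usage sketch for the ARGBToYUY2 conversion below: YUY2 is a packed 4:2:2 format at two bytes per pixel, so a single destination buffer is enough. The helper is illustrative and libyuv.h is assumed.

#include <cstdint>
#include <vector>
#include "libyuv.h"  // assumed umbrella header exposing ARGBToYUY2

bool ArgbFrameToYuy2(const uint8_t* argb, int width, int height,
                     std::vector<uint8_t>* yuy2) {
  // YUY2 packs Y0 U Y1 V for every pair of pixels: 4 bytes per 2 pixels.
  const int dst_stride = ((width + 1) / 2) * 4;
  yuy2->resize(static_cast<size_t>(dst_stride) * height);
  return libyuv::ARGBToYUY2(argb, width * 4, yuy2->data(), dst_stride,
                            width, height) == 0;
}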
LIBYUV_API -int ARGBToYUY2(const uint8* src_argb, +int ARGBToYUY2(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_yuy2, + uint8_t* dst_yuy2, int dst_stride_yuy2, int width, int height) { int y; - void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, uint8* dst_u, - uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_yuy2, int width) = + void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_yuy2, int width) = I422ToYUY2Row_C; if (!src_argb || !dst_yuy2 || width <= 0 || height == 0) { @@ -627,6 +816,22 @@ int ARGBToYUY2(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYRow = ARGBToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_MMI; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToUVRow = ARGBToUVRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_MMI; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; @@ -635,6 +840,14 @@ int ARGBToYUY2(const uint8* src_argb, } } #endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToYUY2Row = I422ToYUY2Row_Any_NEON; @@ -643,22 +856,6 @@ int ARGBToYUY2(const uint8* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - ARGBToYRow = ARGBToYRow_Any_DSPR2; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_DSPR2; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - ARGBToUVRow = ARGBToUVRow_Any_DSPR2; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_DSPR2; - } - } -#endif #if defined(HAS_I422TOYUY2ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToYUY2Row = I422ToYUY2Row_Any_MSA; @@ -667,12 +864,20 @@ int ARGBToYUY2(const uint8* src_argb, } } #endif +#if defined(HAS_I422TOYUY2ROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + I422ToYUY2Row = I422ToYUY2Row_Any_MMI; + if (IS_ALIGNED(width, 8)) { + I422ToYUY2Row = I422ToYUY2Row_MMI; + } + } +#endif { // Allocate a rows of yuv. align_buffer_64(row_y, ((width + 63) & ~63) * 2); - uint8* row_u = row_y + ((width + 63) & ~63); - uint8* row_v = row_u + ((width + 63) & ~63) / 2; + uint8_t* row_u = row_y + ((width + 63) & ~63); + uint8_t* row_v = row_u + ((width + 63) & ~63) / 2; for (y = 0; y < height; ++y) { ARGBToUVRow(src_argb, 0, row_u, row_v, width); @@ -689,19 +894,20 @@ int ARGBToYUY2(const uint8* src_argb, // Convert ARGB to UYVY. 
LIBYUV_API -int ARGBToUYVY(const uint8* src_argb, +int ARGBToUYVY(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_uyvy, + uint8_t* dst_uyvy, int dst_stride_uyvy, int width, int height) { int y; - void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, uint8* dst_u, - uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_uyvy, int width) = + void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_uyvy, int width) = I422ToUYVYRow_C; if (!src_argb || !dst_uyvy || width <= 0 || height == 0) { @@ -771,6 +977,22 @@ int ARGBToUYVY(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYRow = ARGBToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_MMI; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToUVRow = ARGBToUVRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_MMI; + } + } +#endif #if defined(HAS_I422TOUYVYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; @@ -779,6 +1001,14 @@ int ARGBToUYVY(const uint8* src_argb, } } #endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif #if defined(HAS_I422TOUYVYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToUYVYRow = I422ToUYVYRow_Any_NEON; @@ -787,22 +1017,6 @@ int ARGBToUYVY(const uint8* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - ARGBToYRow = ARGBToYRow_Any_DSPR2; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_DSPR2; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - ARGBToUVRow = ARGBToUVRow_Any_DSPR2; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_DSPR2; - } - } -#endif #if defined(HAS_I422TOUYVYROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToUYVYRow = I422ToUYVYRow_Any_MSA; @@ -811,12 +1025,20 @@ int ARGBToUYVY(const uint8* src_argb, } } #endif +#if defined(HAS_I422TOUYVYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + I422ToUYVYRow = I422ToUYVYRow_MMI; + } + } +#endif { // Allocate a rows of yuv. align_buffer_64(row_y, ((width + 63) & ~63) * 2); - uint8* row_u = row_y + ((width + 63) & ~63); - uint8* row_v = row_u + ((width + 63) & ~63) / 2; + uint8_t* row_u = row_y + ((width + 63) & ~63); + uint8_t* row_v = row_u + ((width + 63) & ~63) / 2; for (y = 0; y < height; ++y) { ARGBToUVRow(src_argb, 0, row_u, row_v, width); @@ -833,14 +1055,14 @@ int ARGBToUYVY(const uint8* src_argb, // Convert ARGB to I400. 
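A similarly small caller for the ARGBToI400 path below, which keeps only the Y plane; the helper name is illustrative and libyuv.h is assumed.

#include <cstdint>
#include <vector>
#include "libyuv.h"  // assumed umbrella header exposing ARGBToI400

// I400 is a single full-resolution luma (Y) plane, i.e. a grayscale image.
bool ArgbFrameToGray(const uint8_t* argb, int width, int height,
                     std::vector<uint8_t>* gray) {
  gray->resize(static_cast<size_t>(width) * height);
  return libyuv::ARGBToI400(argb, width * 4, gray->data(), width,
                            width, height) == 0;
}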
LIBYUV_API -int ARGBToI400(const uint8* src_argb, +int ARGBToI400(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, int width, int height) { int y; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; if (!src_argb || !dst_y || width <= 0 || height == 0) { return -1; @@ -880,14 +1102,6 @@ int ARGBToI400(const uint8* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - ARGBToYRow = ARGBToYRow_Any_DSPR2; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_DSPR2; - } - } -#endif #if defined(HAS_ARGBTOYROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYRow = ARGBToYRow_Any_MSA; @@ -896,6 +1110,14 @@ int ARGBToI400(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYRow = ARGBToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToYRow(src_argb, dst_y, width); @@ -906,31 +1128,31 @@ int ARGBToI400(const uint8* src_argb, } // Shuffle table for converting ARGB to RGBA. -static uvec8 kShuffleMaskARGBToRGBA = {3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, - 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u}; +static const uvec8 kShuffleMaskARGBToRGBA = { + 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u}; // Convert ARGB to RGBA. LIBYUV_API -int ARGBToRGBA(const uint8* src_argb, +int ARGBToRGBA(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_rgba, + uint8_t* dst_rgba, int dst_stride_rgba, int width, int height) { return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba, - (const uint8*)(&kShuffleMaskARGBToRGBA), width, height); + (const uint8_t*)(&kShuffleMaskARGBToRGBA), width, height); } // Convert ARGB To RGB24. LIBYUV_API -int ARGBToRGB24(const uint8* src_argb, +int ARGBToRGB24(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_rgb24, + uint8_t* dst_rgb24, int dst_stride_rgb24, int width, int height) { int y; - void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int width) = + void (*ARGBToRGB24Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = ARGBToRGB24Row_C; if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) { return -1; @@ -954,6 +1176,22 @@ int ARGBToRGB24(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToRGB24Row = ARGBToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI) + if (TestCpuFlag(kCpuHasAVX512VBMI)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX512VBMI; + if (IS_ALIGNED(width, 32)) { + ARGBToRGB24Row = ARGBToRGB24Row_AVX512VBMI; + } + } +#endif #if defined(HAS_ARGBTORGB24ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; @@ -970,6 +1208,14 @@ int ARGBToRGB24(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTORGB24ROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_MMI; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB24Row = ARGBToRGB24Row_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB24Row(src_argb, dst_rgb24, width); @@ -981,14 +1227,14 @@ int ARGBToRGB24(const uint8* src_argb, // Convert ARGB To RAW. 
LIBYUV_API -int ARGBToRAW(const uint8* src_argb, +int ARGBToRAW(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_raw, + uint8_t* dst_raw, int dst_stride_raw, int width, int height) { int y; - void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int width) = + void (*ARGBToRAWRow)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = ARGBToRAWRow_C; if (!src_argb || !dst_raw || width <= 0 || height == 0) { return -1; @@ -1012,6 +1258,14 @@ int ARGBToRAW(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTORAWROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRAWRow = ARGBToRAWRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToRAWRow = ARGBToRAWRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTORAWROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToRAWRow = ARGBToRAWRow_Any_NEON; @@ -1028,6 +1282,14 @@ int ARGBToRAW(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTORAWROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToRAWRow = ARGBToRAWRow_Any_MMI; + if (IS_ALIGNED(width, 4)) { + ARGBToRAWRow = ARGBToRAWRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRAWRow(src_argb, dst_raw, width); @@ -1038,22 +1300,22 @@ int ARGBToRAW(const uint8* src_argb, } // Ordered 8x8 dither for 888 to 565. Values from 0 to 7. -static const uint8 kDither565_4x4[16] = { +static const uint8_t kDither565_4x4[16] = { 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, }; // Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). LIBYUV_API -int ARGBToRGB565Dither(const uint8* src_argb, +int ARGBToRGB565Dither(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_rgb565, + uint8_t* dst_rgb565, int dst_stride_rgb565, - const uint8* dither4x4, + const uint8_t* dither4x4, int width, int height) { int y; - void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) = + void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, + const uint32_t dither4, int width) = ARGBToRGB565DitherRow_C; if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { return -1; @@ -1098,11 +1360,19 @@ int ARGBToRGB565Dither(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTORGB565DITHERROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB565DitherRow(src_argb, dst_rgb565, - *(uint32*)(dither4x4 + ((y & 3) << 2)), - width); /* NOLINT */ + *(const uint32_t*)(dither4x4 + ((y & 3) << 2)), + width); src_argb += src_stride_argb; dst_rgb565 += dst_stride_rgb565; } @@ -1112,15 +1382,15 @@ int ARGBToRGB565Dither(const uint8* src_argb, // Convert ARGB To RGB565. // TODO(fbarchard): Consider using dither function low level with zeros. 
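For the ARGBToRGB565 conversion below, the destination is two bytes per pixel; the dithered variant above takes the same arguments plus a 16-byte 4x4 dither table such as kDither565_4x4. A hedged caller sketch, with libyuv.h assumed:

#include <cstdint>
#include <vector>
#include "libyuv.h"  // assumed umbrella header exposing ARGBToRGB565

bool ArgbFrameToRgb565(const uint8_t* argb, int width, int height,
                       std::vector<uint8_t>* rgb565) {
  const int dst_stride = width * 2;  // 16 bits per pixel
  rgb565->resize(static_cast<size_t>(dst_stride) * height);
  return libyuv::ARGBToRGB565(argb, width * 4, rgb565->data(), dst_stride,
                              width, height) == 0;
}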
LIBYUV_API -int ARGBToRGB565(const uint8* src_argb, +int ARGBToRGB565(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_rgb565, + uint8_t* dst_rgb565, int dst_stride_rgb565, int width, int height) { int y; - void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int width) = - ARGBToRGB565Row_C; + void (*ARGBToRGB565Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToRGB565Row_C; if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { return -1; } @@ -1167,6 +1437,14 @@ int ARGBToRGB565(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTORGB565ROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_MMI; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565Row = ARGBToRGB565Row_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB565Row(src_argb, dst_rgb565, width); @@ -1178,15 +1456,15 @@ int ARGBToRGB565(const uint8* src_argb, // Convert ARGB To ARGB1555. LIBYUV_API -int ARGBToARGB1555(const uint8* src_argb, +int ARGBToARGB1555(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_argb1555, + uint8_t* dst_argb1555, int dst_stride_argb1555, int width, int height) { int y; - void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int width) = - ARGBToARGB1555Row_C; + void (*ARGBToARGB1555Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToARGB1555Row_C; if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) { return -1; } @@ -1233,6 +1511,14 @@ int ARGBToARGB1555(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTOARGB1555ROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MMI; + if (IS_ALIGNED(width, 4)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToARGB1555Row(src_argb, dst_argb1555, width); @@ -1244,15 +1530,15 @@ int ARGBToARGB1555(const uint8* src_argb, // Convert ARGB To ARGB4444. LIBYUV_API -int ARGBToARGB4444(const uint8* src_argb, +int ARGBToARGB4444(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_argb4444, + uint8_t* dst_argb4444, int dst_stride_argb4444, int width, int height) { int y; - void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int width) = - ARGBToARGB4444Row_C; + void (*ARGBToARGB4444Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToARGB4444Row_C; if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) { return -1; } @@ -1299,6 +1585,14 @@ int ARGBToARGB4444(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTOARGB4444ROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MMI; + if (IS_ALIGNED(width, 4)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToARGB4444Row(src_argb, dst_argb4444, width); @@ -1308,22 +1602,121 @@ int ARGBToARGB4444(const uint8* src_argb, return 0; } +// Convert ABGR To AR30. +LIBYUV_API +int ABGRToAR30(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + int y; + void (*ABGRToAR30Row)(const uint8_t* src_abgr, uint8_t* dst_rgb, int width) = + ABGRToAR30Row_C; + if (!src_abgr || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } + // Coalesce rows. 
+ if (src_stride_abgr == width * 4 && dst_stride_ar30 == width * 4) { + width *= height; + height = 1; + src_stride_abgr = dst_stride_ar30 = 0; + } +#if defined(HAS_ABGRTOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ABGRToAR30Row = ABGRToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToAR30Row = ABGRToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ABGRToAR30Row = ABGRToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + ABGRToAR30Row(src_abgr, dst_ar30, width); + src_abgr += src_stride_abgr; + dst_ar30 += dst_stride_ar30; + } + return 0; +} + +// Convert ARGB To AR30. +LIBYUV_API +int ARGBToAR30(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + int y; + void (*ARGBToAR30Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = + ARGBToAR30Row_C; + if (!src_argb || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_ar30 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ar30 = 0; + } +#if defined(HAS_ARGBTOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAR30Row = ARGBToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAR30Row = ARGBToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAR30Row = ARGBToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + ARGBToAR30Row(src_argb, dst_ar30, width); + src_argb += src_stride_argb; + dst_ar30 += dst_stride_ar30; + } + return 0; +} + // Convert ARGB to J420. (JPeg full range I420). 
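A caller sketch for the full-range ARGBToJ420 path below; J420 uses the same planar layout as I420 (full-size Y, quarter-size U and V). Helper and buffer names are illustrative, libyuv.h assumed.

#include <cstdint>
#include <vector>
#include "libyuv.h"  // assumed umbrella header exposing ARGBToJ420

bool ArgbFrameToJ420(const uint8_t* argb, int width, int height,
                     std::vector<uint8_t>* j420) {
  const int halfwidth = (width + 1) / 2;
  const int halfheight = (height + 1) / 2;
  const int y_size = width * height;
  const int uv_size = halfwidth * halfheight;
  j420->resize(y_size + 2 * uv_size);
  uint8_t* dst_y = j420->data();
  uint8_t* dst_u = dst_y + y_size;
  uint8_t* dst_v = dst_u + uv_size;
  return libyuv::ARGBToJ420(argb, width * 4, dst_y, width, dst_u, halfwidth,
                            dst_v, halfwidth, width, height) == 0;
}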
LIBYUV_API -int ARGBToJ420(const uint8* src_argb, +int ARGBToJ420(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_yj, + uint8_t* dst_yj, int dst_stride_yj, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; - void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; - void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) = + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = ARGBToYJRow_C; if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; @@ -1376,6 +1769,14 @@ int ARGBToJ420(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYJRow = ARGBToYJRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_MMI; + } + } +#endif #if defined(HAS_ARGBTOUVJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToUVJRow = ARGBToUVJRow_Any_MSA; @@ -1384,6 +1785,14 @@ int ARGBToJ420(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTOUVJROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToUVJRow = ARGBToUVJRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_MMI; + } + } +#endif for (y = 0; y < height - 1; y += 2) { ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width); @@ -1403,20 +1812,21 @@ int ARGBToJ420(const uint8* src_argb, // Convert ARGB to J422. (JPeg full range I422). LIBYUV_API -int ARGBToJ422(const uint8* src_argb, +int ARGBToJ422(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_yj, + uint8_t* dst_yj, int dst_stride_yj, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; - void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; - void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) = + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = ARGBToYJRow_C; if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; @@ -1476,6 +1886,14 @@ int ARGBToJ422(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYJRow = ARGBToYJRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_MMI; + } + } +#endif #if defined(HAS_ARGBTOUVJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToUVJRow = ARGBToUVJRow_Any_MSA; @@ -1484,6 +1902,14 @@ int ARGBToJ422(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTOUVJROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToUVJRow = ARGBToUVJRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); @@ -1498,14 +1924,14 @@ int ARGBToJ422(const uint8* src_argb, // Convert ARGB to J400. 
LIBYUV_API -int ARGBToJ400(const uint8* src_argb, +int ARGBToJ400(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_yj, + uint8_t* dst_yj, int dst_stride_yj, int width, int height) { int y; - void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) = + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = ARGBToYJRow_C; if (!src_argb || !dst_yj || width <= 0 || height == 0) { return -1; @@ -1553,6 +1979,14 @@ int ARGBToJ400(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYJRow = ARGBToYJRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToYJRow(src_argb, dst_yj, width); diff --git a/files/source/convert_jpeg.cc b/files/source/convert_jpeg.cc index 216a9f26..f440c7c2 100644 --- a/files/source/convert_jpeg.cc +++ b/files/source/convert_jpeg.cc @@ -22,18 +22,18 @@ extern "C" { #ifdef HAVE_JPEG struct I420Buffers { - uint8* y; + uint8_t* y; int y_stride; - uint8* u; + uint8_t* u; int u_stride; - uint8* v; + uint8_t* v; int v_stride; int w; int h; }; static void JpegCopyI420(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { I420Buffers* dest = (I420Buffers*)(opaque); @@ -47,7 +47,7 @@ static void JpegCopyI420(void* opaque, } static void JpegI422ToI420(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { I420Buffers* dest = (I420Buffers*)(opaque); @@ -61,7 +61,7 @@ static void JpegI422ToI420(void* opaque, } static void JpegI444ToI420(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { I420Buffers* dest = (I420Buffers*)(opaque); @@ -75,7 +75,7 @@ static void JpegI444ToI420(void* opaque, } static void JpegI400ToI420(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { I420Buffers* dest = (I420Buffers*)(opaque); @@ -89,9 +89,12 @@ static void JpegI400ToI420(void* opaque, // Query size of MJPG in pixels. LIBYUV_API -int MJPGSize(const uint8* sample, size_t sample_size, int* width, int* height) { +int MJPGSize(const uint8_t* src_mjpg, + size_t src_size_mjpg, + int* width, + int* height) { MJpegDecoder mjpeg_decoder; - LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg); if (ret) { *width = mjpeg_decoder.GetWidth(); *height = mjpeg_decoder.GetHeight(); @@ -101,36 +104,38 @@ int MJPGSize(const uint8* sample, size_t sample_size, int* width, int* height) { } // MJPG (Motion JPeg) to I420 -// TODO(fbarchard): review w and h requirement. dw and dh may be enough. +// TODO(fbarchard): review src_width and src_height requirement. dst_width and +// dst_height may be enough. LIBYUV_API -int MJPGToI420(const uint8* sample, - size_t sample_size, - uint8* y, - int y_stride, - uint8* u, - int u_stride, - uint8* v, - int v_stride, - int w, - int h, - int dw, - int dh) { - if (sample_size == kUnknownDataSize) { +int MJPGToI420(const uint8_t* src_mjpg, + size_t src_size_mjpg, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_width, + int src_height, + int dst_width, + int dst_height) { + if (src_size_mjpg == kUnknownDataSize) { // ERROR: MJPEG frame size unknown return -1; } // TODO(fbarchard): Port MJpeg to C. 
MJpegDecoder mjpeg_decoder; - LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); - if (ret && - (mjpeg_decoder.GetWidth() != w || mjpeg_decoder.GetHeight() != h)) { + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg); + if (ret && (mjpeg_decoder.GetWidth() != src_width || + mjpeg_decoder.GetHeight() != src_height)) { // ERROR: MJPEG frame has unexpected dimensions mjpeg_decoder.UnloadFrame(); return 1; // runtime failure } if (ret) { - I420Buffers bufs = {y, y_stride, u, u_stride, v, v_stride, dw, dh}; + I420Buffers bufs = {dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, dst_width, dst_height}; // YUV420 if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && @@ -140,7 +145,8 @@ int MJPGToI420(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh); + ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dst_width, + dst_height); // YUV422 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && @@ -151,7 +157,8 @@ int MJPGToI420(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh); + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dst_width, + dst_height); // YUV444 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && @@ -162,18 +169,158 @@ int MJPGToI420(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh); + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dst_width, + dst_height); // YUV400 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceGrayscale && mjpeg_decoder.GetNumComponents() == 1 && mjpeg_decoder.GetVertSampFactor(0) == 1 && mjpeg_decoder.GetHorizSampFactor(0) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh); + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dst_width, + dst_height); } else { - // TODO(fbarchard): Implement conversion for any other colorspace/sample - // factors that occur in practice. - // ERROR: Unable to convert MJPEG frame because format is not supported + // TODO(fbarchard): Implement conversion for any other + // colorspace/subsample factors that occur in practice. ERROR: Unable to + // convert MJPEG frame because format is not supported + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return ret ? 
0 : 1; +} + +struct NV21Buffers { + uint8_t* y; + int y_stride; + uint8_t* vu; + int vu_stride; + int w; + int h; +}; + +static void JpegI420ToNV21(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + NV21Buffers* dest = (NV21Buffers*)(opaque); + I420ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->vu += ((rows + 1) >> 1) * dest->vu_stride; + dest->h -= rows; +} + +static void JpegI422ToNV21(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + NV21Buffers* dest = (NV21Buffers*)(opaque); + I422ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->vu += ((rows + 1) >> 1) * dest->vu_stride; + dest->h -= rows; +} + +static void JpegI444ToNV21(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + NV21Buffers* dest = (NV21Buffers*)(opaque); + I444ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->vu += ((rows + 1) >> 1) * dest->vu_stride; + dest->h -= rows; +} + +static void JpegI400ToNV21(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + NV21Buffers* dest = (NV21Buffers*)(opaque); + I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu, + dest->vu_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->vu += ((rows + 1) >> 1) * dest->vu_stride; + dest->h -= rows; +} + +// MJPG (Motion JPeg) to NV21 +LIBYUV_API +int MJPGToNV21(const uint8_t* src_mjpg, + size_t src_size_mjpg, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int src_width, + int src_height, + int dst_width, + int dst_height) { + if (src_size_mjpg == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port MJpeg to C. 
+ MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg); + if (ret && (mjpeg_decoder.GetWidth() != src_width || + mjpeg_decoder.GetHeight() != src_height)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + NV21Buffers bufs = {dst_y, dst_stride_y, dst_vu, + dst_stride_vu, dst_width, dst_height}; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV21, &bufs, dst_width, + dst_height); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV21, &bufs, dst_width, + dst_height); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV21, &bufs, dst_width, + dst_height); + // YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV21, &bufs, dst_width, + dst_height); + } else { + // Unknown colorspace. mjpeg_decoder.UnloadFrame(); return 1; } @@ -181,16 +328,15 @@ int MJPGToI420(const uint8* sample, return ret ? 0 : 1; } -#ifdef HAVE_JPEG struct ARGBBuffers { - uint8* argb; + uint8_t* argb; int argb_stride; int w; int h; }; static void JpegI420ToARGB(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { ARGBBuffers* dest = (ARGBBuffers*)(opaque); @@ -201,7 +347,7 @@ static void JpegI420ToARGB(void* opaque, } static void JpegI422ToARGB(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { ARGBBuffers* dest = (ARGBBuffers*)(opaque); @@ -212,7 +358,7 @@ static void JpegI422ToARGB(void* opaque, } static void JpegI444ToARGB(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { ARGBBuffers* dest = (ARGBBuffers*)(opaque); @@ -223,7 +369,7 @@ static void JpegI444ToARGB(void* opaque, } static void JpegI400ToARGB(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { ARGBBuffers* dest = (ARGBBuffers*)(opaque); @@ -233,32 +379,33 @@ static void JpegI400ToARGB(void* opaque, } // MJPG (Motion JPeg) to ARGB -// TODO(fbarchard): review w and h requirement. dw and dh may be enough. 
+// TODO(fbarchard): review src_width and src_height requirement. dst_width and +// dst_height may be enough. LIBYUV_API -int MJPGToARGB(const uint8* sample, - size_t sample_size, - uint8* argb, - int argb_stride, - int w, - int h, - int dw, - int dh) { - if (sample_size == kUnknownDataSize) { +int MJPGToARGB(const uint8_t* src_mjpg, + size_t src_size_mjpg, + uint8_t* dst_argb, + int dst_stride_argb, + int src_width, + int src_height, + int dst_width, + int dst_height) { + if (src_size_mjpg == kUnknownDataSize) { // ERROR: MJPEG frame size unknown return -1; } // TODO(fbarchard): Port MJpeg to C. MJpegDecoder mjpeg_decoder; - LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); - if (ret && - (mjpeg_decoder.GetWidth() != w || mjpeg_decoder.GetHeight() != h)) { + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg); + if (ret && (mjpeg_decoder.GetWidth() != src_width || + mjpeg_decoder.GetHeight() != src_height)) { // ERROR: MJPEG frame has unexpected dimensions mjpeg_decoder.UnloadFrame(); return 1; // runtime failure } if (ret) { - ARGBBuffers bufs = {argb, argb_stride, dw, dh}; + ARGBBuffers bufs = {dst_argb, dst_stride_argb, dst_width, dst_height}; // YUV420 if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && @@ -268,7 +415,8 @@ int MJPGToARGB(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh); + ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dst_width, + dst_height); // YUV422 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && @@ -279,7 +427,8 @@ int MJPGToARGB(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh); + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dst_width, + dst_height); // YUV444 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && @@ -290,27 +439,28 @@ int MJPGToARGB(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh); + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dst_width, + dst_height); // YUV400 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceGrayscale && mjpeg_decoder.GetNumComponents() == 1 && mjpeg_decoder.GetVertSampFactor(0) == 1 && mjpeg_decoder.GetHorizSampFactor(0) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh); + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dst_width, + dst_height); } else { - // TODO(fbarchard): Implement conversion for any other colorspace/sample - // factors that occur in practice. - // ERROR: Unable to convert MJPEG frame because format is not supported + // TODO(fbarchard): Implement conversion for any other + // colorspace/subsample factors that occur in practice. ERROR: Unable to + // convert MJPEG frame because format is not supported mjpeg_decoder.UnloadFrame(); return 1; } } return ret ? 
0 : 1; } -#endif -#endif +#endif // HAVE_JPEG #ifdef __cplusplus } // extern "C" diff --git a/files/source/convert_to_argb.cc b/files/source/convert_to_argb.cc index 63a5104b..bde1aa88 100644 --- a/files/source/convert_to_argb.cc +++ b/files/source/convert_to_argb.cc @@ -28,11 +28,19 @@ extern "C" { // src_height is used to compute location of planes, and indicate inversion // sample_size is measured in bytes and is the size of the frame. // With MJPEG it is the compressed size of the frame. + +// TODO(fbarchard): Add the following: +// H010ToARGB +// I010ToARGB +// J400ToARGB +// J422ToARGB +// J444ToARGB + LIBYUV_API -int ConvertToARGB(const uint8* sample, +int ConvertToARGB(const uint8_t* sample, size_t sample_size, - uint8* crop_argb, - int argb_stride, + uint8_t* dst_argb, + int dst_stride_argb, int crop_x, int crop_y, int src_width, @@ -40,11 +48,11 @@ int ConvertToARGB(const uint8* sample, int crop_width, int crop_height, enum RotationMode rotation, - uint32 fourcc) { - uint32 format = CanonicalFourCC(fourcc); + uint32_t fourcc) { + uint32_t format = CanonicalFourCC(fourcc); int aligned_src_width = (src_width + 1) & ~1; - const uint8* src; - const uint8* src_uv; + const uint8_t* src; + const uint8_t* src_uv; int abs_src_height = (src_height < 0) ? -src_height : src_height; int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height; int r = 0; @@ -52,17 +60,17 @@ int ConvertToARGB(const uint8* sample, // One pass rotation is available for some formats. For the rest, convert // to ARGB (with optional vertical flipping) into a temporary ARGB buffer, // and then rotate the ARGB to the final destination buffer. - // For in-place conversion, if destination crop_argb is same as source sample, + // For in-place conversion, if destination dst_argb is same as source sample, // also enable temporary buffer. LIBYUV_BOOL need_buf = - (rotation && format != FOURCC_ARGB) || crop_argb == sample; - uint8* dest_argb = crop_argb; - int dest_argb_stride = argb_stride; - uint8* rotate_buffer = NULL; + (rotation && format != FOURCC_ARGB) || dst_argb == sample; + uint8_t* dest_argb = dst_argb; + int dest_dst_stride_argb = dst_stride_argb; + uint8_t* rotate_buffer = NULL; int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; - if (crop_argb == NULL || sample == NULL || src_width <= 0 || - crop_width <= 0 || src_height == 0 || crop_height == 0) { + if (dst_argb == NULL || sample == NULL || src_width <= 0 || crop_width <= 0 || + src_height == 0 || crop_height == 0) { return -1; } if (src_height < 0) { @@ -71,104 +79,117 @@ int ConvertToARGB(const uint8* sample, if (need_buf) { int argb_size = crop_width * 4 * abs_crop_height; - rotate_buffer = (uint8*)malloc(argb_size); /* NOLINT */ + rotate_buffer = (uint8_t*)malloc(argb_size); /* NOLINT */ if (!rotate_buffer) { return 1; // Out of memory runtime error. 
} - crop_argb = rotate_buffer; - argb_stride = crop_width * 4; + dst_argb = rotate_buffer; + dst_stride_argb = crop_width * 4; } switch (format) { // Single plane formats case FOURCC_YUY2: src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = YUY2ToARGB(src, aligned_src_width * 2, crop_argb, argb_stride, + r = YUY2ToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_UYVY: src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = UYVYToARGB(src, aligned_src_width * 2, crop_argb, argb_stride, + r = UYVYToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_24BG: src = sample + (src_width * crop_y + crop_x) * 3; - r = RGB24ToARGB(src, src_width * 3, crop_argb, argb_stride, crop_width, + r = RGB24ToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_RAW: src = sample + (src_width * crop_y + crop_x) * 3; - r = RAWToARGB(src, src_width * 3, crop_argb, argb_stride, crop_width, + r = RAWToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_ARGB: if (!need_buf && !rotation) { src = sample + (src_width * crop_y + crop_x) * 4; - r = ARGBToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width, - inv_crop_height); + r = ARGBToARGB(src, src_width * 4, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); } break; case FOURCC_BGRA: src = sample + (src_width * crop_y + crop_x) * 4; - r = BGRAToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width, + r = BGRAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_ABGR: src = sample + (src_width * crop_y + crop_x) * 4; - r = ABGRToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width, + r = ABGRToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_RGBA: src = sample + (src_width * crop_y + crop_x) * 4; - r = RGBAToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width, + r = RGBAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_AR30: + src = sample + (src_width * crop_y + crop_x) * 4; + r = AR30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_AB30: + src = sample + (src_width * crop_y + crop_x) * 4; + r = AB30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_RGBP: src = sample + (src_width * crop_y + crop_x) * 2; - r = RGB565ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width, - inv_crop_height); + r = RGB565ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); break; case FOURCC_RGBO: src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB1555ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width, - inv_crop_height); + r = ARGB1555ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); break; case FOURCC_R444: src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB4444ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width, - inv_crop_height); + r = ARGB4444ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); break; case FOURCC_I400: src = sample + src_width * crop_y + crop_x; - r = I400ToARGB(src, src_width, crop_argb, argb_stride, crop_width, + r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, 
inv_crop_height); break; // Biplanar formats case FOURCC_NV12: src = sample + (src_width * crop_y + crop_x); - src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; - r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, crop_argb, - argb_stride, crop_width, inv_crop_height); + src_uv = + sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x; + r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb, + dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_NV21: src = sample + (src_width * crop_y + crop_x); - src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + src_uv = + sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x; // Call NV12 but with u and v parameters swapped. - r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, crop_argb, - argb_stride, crop_width, inv_crop_height); + r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb, + dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_M420: src = sample + (src_width * crop_y) * 12 / 8 + crop_x; - r = M420ToARGB(src, src_width, crop_argb, argb_stride, crop_width, + r = M420ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; + // Triplanar formats case FOURCC_I420: case FOURCC_YV12: { - const uint8* src_y = sample + (src_width * crop_y + crop_x); - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u; + const uint8_t* src_v; int halfwidth = (src_width + 1) / 2; int halfheight = (abs_src_height + 1) / 2; if (format == FOURCC_YV12) { @@ -183,31 +204,42 @@ int ConvertToARGB(const uint8* sample, halfwidth * (halfheight + crop_y / 2) + crop_x / 2; } r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - crop_argb, argb_stride, crop_width, inv_crop_height); + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + + case FOURCC_H420: { + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + const uint8_t* src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + r = H420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } case FOURCC_J420: { - const uint8* src_y = sample + (src_width * crop_y + crop_x); - const uint8* src_u; - const uint8* src_v; int halfwidth = (src_width + 1) / 2; int halfheight = (abs_src_height + 1) / 2; - src_u = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; - src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + const uint8_t* src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - crop_argb, argb_stride, crop_width, inv_crop_height); + dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } case FOURCC_I422: case FOURCC_YV16: { - const uint8* src_y = sample + src_width * crop_y + crop_x; - const uint8* src_u; - const uint8* src_v; int halfwidth = (src_width + 1) / 2; + const 
uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; if (format == FOURCC_YV16) { src_v = sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2; @@ -220,14 +252,27 @@ int ConvertToARGB(const uint8* sample, halfwidth * (abs_src_height + crop_y) + crop_x / 2; } r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - crop_argb, argb_stride, crop_width, inv_crop_height); + dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } + + case FOURCC_H422: { + int halfwidth = (src_width + 1) / 2; + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u = + sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2; + const uint8_t* src_v = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + r = H422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + case FOURCC_I444: case FOURCC_YV24: { - const uint8* src_y = sample + src_width * crop_y + crop_x; - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; if (format == FOURCC_YV24) { src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; @@ -236,12 +281,12 @@ int ConvertToARGB(const uint8* sample, src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; } r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, - crop_argb, argb_stride, crop_width, inv_crop_height); + dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } #ifdef HAVE_JPEG case FOURCC_MJPG: - r = MJPGToARGB(sample, sample_size, crop_argb, argb_stride, src_width, + r = MJPGToARGB(sample, sample_size, dst_argb, dst_stride_argb, src_width, abs_src_height, crop_width, inv_crop_height); break; #endif @@ -251,13 +296,13 @@ int ConvertToARGB(const uint8* sample, if (need_buf) { if (!r) { - r = ARGBRotate(crop_argb, argb_stride, dest_argb, dest_argb_stride, + r = ARGBRotate(dst_argb, dst_stride_argb, dest_argb, dest_dst_stride_argb, crop_width, abs_crop_height, rotation); } free(rotate_buffer); } else if (rotation) { src = sample + (src_width * crop_y + crop_x) * 4; - r = ARGBRotate(src, src_width * 4, crop_argb, argb_stride, crop_width, + r = ARGBRotate(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, inv_crop_height, rotation); } diff --git a/files/source/convert_to_i420.cc b/files/source/convert_to_i420.cc index a50689db..584be0ac 100644 --- a/files/source/convert_to_i420.cc +++ b/files/source/convert_to_i420.cc @@ -25,14 +25,14 @@ extern "C" { // sample_size is measured in bytes and is the size of the frame. // With MJPEG it is the compressed size of the frame. 
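// [Editor's note] Illustrative sketch, not part of the diff: rough frame-size
// arithmetic for the uncompressed formats handled in these converters,
// matching the comment above that sample_size is the byte size of the frame
// (for MJPEG it is the compressed size instead). The helper is hypothetical.
#include <stddef.h>

static size_t FrameSizeBytes(int width, int height, int bytes_per_pixel_kind) {
  size_t half_w = (size_t)((width + 1) / 2);
  size_t half_h = (size_t)((height + 1) / 2);
  switch (bytes_per_pixel_kind) {
    case 0:  // I420/YV12/NV12/NV21: 8-bit Y plane plus quarter-size U and V.
      return (size_t)width * height + 2 * half_w * half_h;
    case 1:  // YUY2/UYVY/RGB565/ARGB1555/ARGB4444: 2 bytes per pixel.
      return (size_t)width * height * 2;
    case 2:  // RGB24/RAW: 3 bytes per pixel.
      return (size_t)width * height * 3;
    default:  // ARGB/BGRA/ABGR/RGBA/AR30/AB30: 4 bytes per pixel.
      return (size_t)width * height * 4;
  }
}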
LIBYUV_API -int ConvertToI420(const uint8* sample, +int ConvertToI420(const uint8_t* sample, size_t sample_size, - uint8* y, - int y_stride, - uint8* u, - int u_stride, - uint8* v, - int v_stride, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, int crop_x, int crop_y, int src_width, @@ -40,11 +40,11 @@ int ConvertToI420(const uint8* sample, int crop_width, int crop_height, enum RotationMode rotation, - uint32 fourcc) { - uint32 format = CanonicalFourCC(fourcc); + uint32_t fourcc) { + uint32_t format = CanonicalFourCC(fourcc); int aligned_src_width = (src_width + 1) & ~1; - const uint8* src; - const uint8* src_uv; + const uint8_t* src; + const uint8_t* src_uv; const int abs_src_height = (src_height < 0) ? -src_height : src_height; // TODO(nisse): Why allow crop_height < 0? const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; @@ -52,177 +52,189 @@ int ConvertToI420(const uint8* sample, LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 && format != FOURCC_NV12 && format != FOURCC_NV21 && format != FOURCC_YV12) || - y == sample; - uint8* tmp_y = y; - uint8* tmp_u = u; - uint8* tmp_v = v; - int tmp_y_stride = y_stride; - int tmp_u_stride = u_stride; - int tmp_v_stride = v_stride; - uint8* rotate_buffer = NULL; + dst_y == sample; + uint8_t* tmp_y = dst_y; + uint8_t* tmp_u = dst_u; + uint8_t* tmp_v = dst_v; + int tmp_y_stride = dst_stride_y; + int tmp_u_stride = dst_stride_u; + int tmp_v_stride = dst_stride_v; + uint8_t* rotate_buffer = NULL; const int inv_crop_height = (src_height < 0) ? -abs_crop_height : abs_crop_height; - if (!y || !u || !v || !sample || src_width <= 0 || crop_width <= 0 || - src_height == 0 || crop_height == 0) { + if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 || + crop_width <= 0 || src_height == 0 || crop_height == 0) { return -1; } // One pass rotation is available for some formats. For the rest, convert // to I420 (with optional vertical flipping) into a temporary I420 buffer, // and then rotate the I420 to the final destination buffer. - // For in-place conversion, if destination y is same as source sample, + // For in-place conversion, if destination dst_y is same as source sample, // also enable temporary buffer. if (need_buf) { int y_size = crop_width * abs_crop_height; int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2); - rotate_buffer = (uint8*)malloc(y_size + uv_size * 2); /* NOLINT */ + rotate_buffer = (uint8_t*)malloc(y_size + uv_size * 2); /* NOLINT */ if (!rotate_buffer) { return 1; // Out of memory runtime error. 
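// [Editor's note] Illustrative sketch, not part of the diff: the layout of
// the temporary I420 buffer allocated just above (Y plane followed by U and
// V, with strides crop_width and (crop_width + 1) / 2). Names are
// hypothetical.
#include <stdint.h>
#include <stdlib.h>

typedef struct {
  uint8_t* y;
  uint8_t* u;
  uint8_t* v;
  int y_stride;
  int uv_stride;
} I420View;  // hypothetical helper type

static uint8_t* AllocI420(int width, int height, I420View* out) {
  int y_size = width * height;
  int uv_size = ((width + 1) / 2) * ((height + 1) / 2);
  uint8_t* buf = (uint8_t*)malloc(y_size + uv_size * 2);
  if (!buf) {
    return NULL;  // mirrors the "out of memory" early return above
  }
  out->y = buf;
  out->u = buf + y_size;
  out->v = out->u + uv_size;
  out->y_stride = width;
  out->uv_stride = (width + 1) / 2;
  return buf;  // caller frees, just as ConvertToI420 frees rotate_buffer
}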
} - y = rotate_buffer; - u = y + y_size; - v = u + uv_size; - y_stride = crop_width; - u_stride = v_stride = ((crop_width + 1) / 2); + dst_y = rotate_buffer; + dst_u = dst_y + y_size; + dst_v = dst_u + uv_size; + dst_stride_y = crop_width; + dst_stride_u = dst_stride_v = ((crop_width + 1) / 2); } switch (format) { // Single plane formats case FOURCC_YUY2: src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = YUY2ToI420(src, aligned_src_width * 2, y, y_stride, u, u_stride, v, - v_stride, crop_width, inv_crop_height); + r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_UYVY: src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = UYVYToI420(src, aligned_src_width * 2, y, y_stride, u, u_stride, v, - v_stride, crop_width, inv_crop_height); + r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_RGBP: src = sample + (src_width * crop_y + crop_x) * 2; - r = RGB565ToI420(src, src_width * 2, y, y_stride, u, u_stride, v, - v_stride, crop_width, inv_crop_height); + r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_RGBO: src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB1555ToI420(src, src_width * 2, y, y_stride, u, u_stride, v, - v_stride, crop_width, inv_crop_height); + r = ARGB1555ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_R444: src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB4444ToI420(src, src_width * 2, y, y_stride, u, u_stride, v, - v_stride, crop_width, inv_crop_height); + r = ARGB4444ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_24BG: src = sample + (src_width * crop_y + crop_x) * 3; - r = RGB24ToI420(src, src_width * 3, y, y_stride, u, u_stride, v, v_stride, - crop_width, inv_crop_height); + r = RGB24ToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_RAW: src = sample + (src_width * crop_y + crop_x) * 3; - r = RAWToI420(src, src_width * 3, y, y_stride, u, u_stride, v, v_stride, - crop_width, inv_crop_height); + r = RAWToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_ARGB: src = sample + (src_width * crop_y + crop_x) * 4; - r = ARGBToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride, - crop_width, inv_crop_height); + r = ARGBToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_BGRA: src = sample + (src_width * crop_y + crop_x) * 4; - r = BGRAToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride, - crop_width, inv_crop_height); + r = BGRAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_ABGR: src = sample + (src_width * crop_y + crop_x) * 4; - r = ABGRToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride, - crop_width, inv_crop_height); + r = ABGRToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + 
inv_crop_height); break; case FOURCC_RGBA: src = sample + (src_width * crop_y + crop_x) * 4; - r = RGBAToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride, - crop_width, inv_crop_height); + r = RGBAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; + // TODO(fbarchard): Add AR30 and AB30 case FOURCC_I400: src = sample + src_width * crop_y + crop_x; - r = I400ToI420(src, src_width, y, y_stride, u, u_stride, v, v_stride, - crop_width, inv_crop_height); + r = I400ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, crop_width, inv_crop_height); break; // Biplanar formats case FOURCC_NV12: src = sample + (src_width * crop_y + crop_x); - src_uv = sample + (src_width * src_height) + + src_uv = sample + (src_width * abs_src_height) + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); - r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, y, - y_stride, u, u_stride, v, v_stride, crop_width, - inv_crop_height, rotation); + r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height, rotation); break; case FOURCC_NV21: src = sample + (src_width * crop_y + crop_x); - src_uv = sample + (src_width * src_height) + + src_uv = sample + (src_width * abs_src_height) + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); - // Call NV12 but with u and v parameters swapped. - r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, y, - y_stride, v, v_stride, u, u_stride, crop_width, - inv_crop_height, rotation); + // Call NV12 but with dst_u and dst_v parameters swapped. + r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y, + dst_stride_y, dst_v, dst_stride_v, dst_u, + dst_stride_u, crop_width, inv_crop_height, rotation); break; case FOURCC_M420: src = sample + (src_width * crop_y) * 12 / 8 + crop_x; - r = M420ToI420(src, src_width, y, y_stride, u, u_stride, v, v_stride, - crop_width, inv_crop_height); + r = M420ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, crop_width, inv_crop_height); break; // Triplanar formats case FOURCC_I420: case FOURCC_YV12: { - const uint8* src_y = sample + (src_width * crop_y + crop_x); - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u; + const uint8_t* src_v; int halfwidth = (src_width + 1) / 2; int halfheight = (abs_src_height + 1) / 2; if (format == FOURCC_YV12) { - src_v = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) + + (crop_x / 2); src_u = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2); } else { - src_u = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; + src_u = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) + + (crop_x / 2); src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2); } - r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth, y, - y_stride, u, u_stride, v, v_stride, crop_width, - inv_crop_height, rotation); + r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_y, dst_stride_y, dst_u, 
dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height, rotation); break; } case FOURCC_I422: case FOURCC_YV16: { - const uint8* src_y = sample + src_width * crop_y + crop_x; - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; int halfwidth = (src_width + 1) / 2; if (format == FOURCC_YV16) { src_v = sample + src_width * abs_src_height + halfwidth * crop_y + - crop_x / 2; + (crop_x / 2); src_u = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; + halfwidth * (abs_src_height + crop_y) + (crop_x / 2); } else { src_u = sample + src_width * abs_src_height + halfwidth * crop_y + - crop_x / 2; + (crop_x / 2); src_v = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; + halfwidth * (abs_src_height + crop_y) + (crop_x / 2); } - r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth, y, - y_stride, u, u_stride, v, v_stride, crop_width, - inv_crop_height); + r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height); break; } case FOURCC_I444: case FOURCC_YV24: { - const uint8* src_y = sample + src_width * crop_y + crop_x; - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; if (format == FOURCC_YV24) { src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; @@ -230,15 +242,16 @@ int ConvertToI420(const uint8* sample, src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; } - r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width, y, - y_stride, u, u_stride, v, v_stride, crop_width, - inv_crop_height); + r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height); break; } #ifdef HAVE_JPEG case FOURCC_MJPG: - r = MJPGToI420(sample, sample_size, y, y_stride, u, u_stride, v, v_stride, - src_width, abs_src_height, crop_width, inv_crop_height); + r = MJPGToI420(sample, sample_size, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, src_width, + abs_src_height, crop_width, inv_crop_height); break; #endif default: @@ -247,9 +260,10 @@ int ConvertToI420(const uint8* sample, if (need_buf) { if (!r) { - r = I420Rotate(y, y_stride, u, u_stride, v, v_stride, tmp_y, tmp_y_stride, - tmp_u, tmp_u_stride, tmp_v, tmp_v_stride, crop_width, - abs_crop_height, rotation); + r = I420Rotate(dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, tmp_y, tmp_y_stride, tmp_u, tmp_u_stride, + tmp_v, tmp_v_stride, crop_width, abs_crop_height, + rotation); } free(rotate_buffer); } diff --git a/files/source/cpu_id.cc b/files/source/cpu_id.cc index afb5d282..48e2b615 100644 --- a/files/source/cpu_id.cc +++ b/files/source/cpu_id.cc @@ -19,16 +19,10 @@ #include <immintrin.h> // For _xgetbv() #endif -#if !defined(__native_client__) -#include <stdlib.h> // For getenv() -#endif - // For ArmCpuCaps() but unittested on all platforms #include <stdio.h> #include <string.h> -#include "libyuv/basic_types.h" // For CPU_X86 - #ifdef __cplusplus namespace libyuv { extern "C" { @@ -43,16 +37,20 @@ extern "C" { #define SAFEBUFFERS #endif +// 
cpu_info_ variable for SIMD instruction sets detected. +LIBYUV_API int cpu_info_ = 0; + +// TODO(fbarchard): Consider using int for cpuid so casting is not needed. // Low level cpuid for X86. #if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ defined(__x86_64__)) && \ !defined(__pnacl__) && !defined(__CLR_VER) LIBYUV_API -void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { +void CpuId(int info_eax, int info_ecx, int* cpu_info) { #if defined(_MSC_VER) // Visual C version uses intrinsic or inline x86 assembly. #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) - __cpuidex((int*)(cpu_info), info_eax, info_ecx); + __cpuidex(cpu_info, info_eax, info_ecx); #elif defined(_M_IX86) __asm { mov eax, info_eax @@ -66,14 +64,14 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { } #else // Visual C but not x86 if (info_ecx == 0) { - __cpuid((int*)(cpu_info), info_eax); + __cpuid(cpu_info, info_eax); } else { cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0u; } #endif // GCC version uses inline x86 assembly. #else // defined(_MSC_VER) - uint32 info_ebx, info_edx; + int info_ebx, info_edx; asm volatile( #if defined(__i386__) && defined(__PIC__) // Preserve ebx for fpic 32 bit. @@ -94,7 +92,7 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { } #else // (defined(_M_IX86) || defined(_M_X64) ... LIBYUV_API -void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) { +void CpuId(int eax, int ecx, int* cpu_info) { (void)eax; (void)ecx; cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; @@ -118,9 +116,9 @@ void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) { !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__) // X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. int GetXCR0() { - uint32 xcr0 = 0u; + int xcr0 = 0; #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) - xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required. + xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT #elif defined(__i386__) || defined(__x86_64__) asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx"); #endif // defined(__i386__) || defined(__x86_64__) @@ -154,7 +152,7 @@ LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { } // aarch64 uses asimd for Neon. p = strstr(cpuinfo_line, " asimd"); - if (p && (p[6] == ' ' || p[6] == '\n')) { + if (p) { fclose(f); return kCpuHasNEON; } @@ -164,27 +162,40 @@ LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { return 0; } +// TODO(fbarchard): Consider read_msa_ir(). +// TODO(fbarchard): Add unittest. LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name, const char ase[]) { char cpuinfo_line[512]; - int len = (int)strlen(ase); FILE* f = fopen(cpuinfo_name, "r"); if (!f) { // ase enabled if /proc/cpuinfo is unavailable. 
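// [Editor's note] Illustrative sketch, not part of the diff: a minimal use of
// the CpuId() wrapper above. That leaf 0 returns the vendor string in
// EBX, EDX, ECX is general x86 behaviour, not something this diff states;
// the wrapper stores EAX..EDX into cpu_info[0..3], and on non-x86 builds it
// zero-fills the array. Header path is assumed.
#include <stdio.h>
#include <string.h>
#include "libyuv/cpu_id.h"

static void PrintVendor(void) {
  int cpu_info[4] = {0, 0, 0, 0};
  char vendor[13];
  libyuv::CpuId(0, 0, cpu_info);
  memcpy(vendor + 0, &cpu_info[1], 4);  // EBX
  memcpy(vendor + 4, &cpu_info[3], 4);  // EDX
  memcpy(vendor + 8, &cpu_info[2], 4);  // ECX
  vendor[12] = '\0';
  printf("x86 vendor: %s\n", vendor);   // e.g. "GenuineIntel"
}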
if (strcmp(ase, " msa") == 0) { return kCpuHasMSA; } - return kCpuHasDSPR2; + if (strcmp(ase, " mmi") == 0) { + return kCpuHasMMI; + } + return 0; } while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) { char* p = strstr(cpuinfo_line, ase); - if (p && (p[len] == ' ' || p[len] == '\n')) { + if (p) { fclose(f); if (strcmp(ase, " msa") == 0) { return kCpuHasMSA; } - return kCpuHasDSPR2; + return 0; + } + } else if (memcmp(cpuinfo_line, "cpu model", 9) == 0) { + char* p = strstr(cpuinfo_line, "Loongson-3"); + if (p) { + fclose(f); + if (strcmp(ase, " mmi") == 0) { + return kCpuHasMMI; + } + return 0; } } } @@ -192,35 +203,14 @@ LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name, return 0; } -// CPU detect function for SIMD instruction sets. -LIBYUV_API -int cpu_info_ = 0; // cpu_info is not initialized yet. - -// Test environment variable for disabling CPU features. Any non-zero value -// to disable. Zero ignored to make it easy to set the variable on/off. -#if !defined(__native_client__) && !defined(_M_ARM) - -static LIBYUV_BOOL TestEnv(const char* name) { - const char* var = getenv(name); - if (var) { - if (var[0] != '0') { - return LIBYUV_TRUE; - } - } - return LIBYUV_FALSE; -} -#else // nacl does not support getenv(). -static LIBYUV_BOOL TestEnv(const char*) { - return LIBYUV_FALSE; -} -#endif - -LIBYUV_API SAFEBUFFERS int InitCpuFlags(void) { +static SAFEBUFFERS int GetCpuFlags(void) { int cpu_info = 0; -#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86) - uint32 cpu_info0[4] = {0, 0, 0, 0}; - uint32 cpu_info1[4] = {0, 0, 0, 0}; - uint32 cpu_info7[4] = {0, 0, 0, 0}; +#if !defined(__pnacl__) && !defined(__CLR_VER) && \ + (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ + defined(_M_IX86)) + int cpu_info0[4] = {0, 0, 0, 0}; + int cpu_info1[4] = {0, 0, 0, 0}; + int cpu_info7[4] = {0, 0, 0, 0}; CpuId(0, 0, cpu_info0); CpuId(1, 0, cpu_info1); if (cpu_info0[0] >= 7) { @@ -241,60 +231,23 @@ LIBYUV_API SAFEBUFFERS int InitCpuFlags(void) { // Detect AVX512bw if ((GetXCR0() & 0xe0) == 0xe0) { - cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0; + cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0; + cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0; + cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0; + cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0; + cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0; + cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0; + cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0; } } - - // Environment variable overrides for testing. 
- if (TestEnv("LIBYUV_DISABLE_X86")) { - cpu_info &= ~kCpuHasX86; - } - if (TestEnv("LIBYUV_DISABLE_SSE2")) { - cpu_info &= ~kCpuHasSSE2; - } - if (TestEnv("LIBYUV_DISABLE_SSSE3")) { - cpu_info &= ~kCpuHasSSSE3; - } - if (TestEnv("LIBYUV_DISABLE_SSE41")) { - cpu_info &= ~kCpuHasSSE41; - } - if (TestEnv("LIBYUV_DISABLE_SSE42")) { - cpu_info &= ~kCpuHasSSE42; - } - if (TestEnv("LIBYUV_DISABLE_AVX")) { - cpu_info &= ~kCpuHasAVX; - } - if (TestEnv("LIBYUV_DISABLE_AVX2")) { - cpu_info &= ~kCpuHasAVX2; - } - if (TestEnv("LIBYUV_DISABLE_ERMS")) { - cpu_info &= ~kCpuHasERMS; - } - if (TestEnv("LIBYUV_DISABLE_FMA3")) { - cpu_info &= ~kCpuHasFMA3; - } - if (TestEnv("LIBYUV_DISABLE_AVX3")) { - cpu_info &= ~kCpuHasAVX3; - } - if (TestEnv("LIBYUV_DISABLE_F16C")) { - cpu_info &= ~kCpuHasF16C; - } - #endif #if defined(__mips__) && defined(__linux__) -#if defined(__mips_dspr2) - cpu_info |= kCpuHasDSPR2; -#endif #if defined(__mips_msa) cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa"); +#elif defined(_MIPS_ARCH_LOONGSON3A) + cpu_info = MipsCpuCaps("/proc/cpuinfo", " mmi"); #endif cpu_info |= kCpuHasMIPS; - if (getenv("LIBYUV_DISABLE_DSPR2")) { - cpu_info &= ~kCpuHasDSPR2; - } - if (getenv("LIBYUV_DISABLE_MSA")) { - cpu_info &= ~kCpuHasMSA; - } #endif #if defined(__arm__) || defined(__aarch64__) // gcc -mfpu=neon defines __ARM_NEON__ @@ -313,22 +266,22 @@ LIBYUV_API SAFEBUFFERS int InitCpuFlags(void) { cpu_info = ArmCpuCaps("/proc/cpuinfo"); #endif cpu_info |= kCpuHasARM; - if (TestEnv("LIBYUV_DISABLE_NEON")) { - cpu_info &= ~kCpuHasNEON; - } #endif // __arm__ - if (TestEnv("LIBYUV_DISABLE_ASM")) { - cpu_info = 0; - } cpu_info |= kCpuInitialized; - cpu_info_ = cpu_info; return cpu_info; } // Note that use of this function is not thread safe. LIBYUV_API -void MaskCpuFlags(int enable_flags) { - cpu_info_ = InitCpuFlags() & enable_flags; +int MaskCpuFlags(int enable_flags) { + int cpu_info = GetCpuFlags() & enable_flags; + SetCpuFlags(cpu_info); + return cpu_info; +} + +LIBYUV_API +int InitCpuFlags(void) { + return MaskCpuFlags(-1); } #ifdef __cplusplus diff --git a/files/source/mjpeg_decoder.cc b/files/source/mjpeg_decoder.cc index 3acf9563..5c5e5ead 100644 --- a/files/source/mjpeg_decoder.cc +++ b/files/source/mjpeg_decoder.cc @@ -25,7 +25,8 @@ #endif #endif -struct FILE; // For jpeglib.h. + +#include <stdio.h> // For jpeglib.h. // C++ build requires extern C for jpeg internals. #ifdef __cplusplus @@ -102,7 +103,7 @@ MJpegDecoder::~MJpegDecoder() { DestroyOutputBuffers(); } -LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) { +LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) { if (!ValidateJpeg(src, src_len)) { return LIBYUV_FALSE; } @@ -129,7 +130,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) { if (scanlines_[i]) { delete scanlines_[i]; } - scanlines_[i] = new uint8*[scanlines_size]; + scanlines_[i] = new uint8_t*[scanlines_size]; scanlines_sizes_[i] = scanlines_size; } @@ -145,7 +146,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) { if (databuf_[i]) { delete databuf_[i]; } - databuf_[i] = new uint8[databuf_size]; + databuf_[i] = new uint8_t[databuf_size]; databuf_strides_[i] = databuf_stride; } @@ -243,7 +244,7 @@ LIBYUV_BOOL MJpegDecoder::UnloadFrame() { } // TODO(fbarchard): Allow rectangle to be specified: x, y, width, height. 
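// [Editor's note] Illustrative sketch, not part of the diff: with the
// LIBYUV_DISABLE_* environment overrides removed above, narrowing the
// detected feature set is now done programmatically through MaskCpuFlags(),
// which ANDs the detected flags with enable_flags (InitCpuFlags() is simply
// MaskCpuFlags(-1)). Flag names below appear elsewhere in this diff; the
// header path is assumed.
#include "libyuv/cpu_id.h"

static void ConfigureCpu(void) {
  // Detect everything (also primes the cached cpu_info_).
  int flags = libyuv::InitCpuFlags();
  if (flags & libyuv::kCpuHasAVX2) {
    // Re-run detection but mask AVX2 out, e.g. to A/B test the SSSE3 paths.
    libyuv::MaskCpuFlags(~libyuv::kCpuHasAVX2);
  }
  // Later, row-function dispatch queries the cached value:
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3)) {
    // SSSE3 code paths will be selected by the dispatchers.
  }
}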
-LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8** planes, +LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8_t** planes, int dst_width, int dst_height) { if (dst_width != GetWidth() || dst_height > GetHeight()) { @@ -477,9 +478,9 @@ void MJpegDecoder::AllocOutputBuffers(int num_outbufs) { // it. DestroyOutputBuffers(); - scanlines_ = new uint8**[num_outbufs]; + scanlines_ = new uint8_t**[num_outbufs]; scanlines_sizes_ = new int[num_outbufs]; - databuf_ = new uint8*[num_outbufs]; + databuf_ = new uint8_t*[num_outbufs]; databuf_strides_ = new int[num_outbufs]; for (int i = 0; i < num_outbufs; ++i) { @@ -535,9 +536,9 @@ LIBYUV_BOOL MJpegDecoder::FinishDecode() { return LIBYUV_TRUE; } -void MJpegDecoder::SetScanlinePointers(uint8** data) { +void MJpegDecoder::SetScanlinePointers(uint8_t** data) { for (int i = 0; i < num_outbufs_; ++i) { - uint8* data_i = data[i]; + uint8_t* data_i = data[i]; for (int j = 0; j < scanlines_sizes_[i]; ++j) { scanlines_[i][j] = data_i; data_i += GetComponentStride(i); @@ -560,13 +561,13 @@ JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper( if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 && subsample_y[1] == 2 && subsample_x[2] == 2 && subsample_y[2] == 2) { return kJpegYuv420; - } else if (subsample_x[0] == 1 && subsample_y[0] == 1 && - subsample_x[1] == 2 && subsample_y[1] == 1 && - subsample_x[2] == 2 && subsample_y[2] == 1) { + } + if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 && + subsample_y[1] == 1 && subsample_x[2] == 2 && subsample_y[2] == 1) { return kJpegYuv422; - } else if (subsample_x[0] == 1 && subsample_y[0] == 1 && - subsample_x[1] == 1 && subsample_y[1] == 1 && - subsample_x[2] == 1 && subsample_y[2] == 1) { + } + if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 1 && + subsample_y[1] == 1 && subsample_x[2] == 1 && subsample_y[2] == 1) { return kJpegYuv444; } } else if (number_of_components == 1) { // Grey-scale images. diff --git a/files/source/mjpeg_validate.cc b/files/source/mjpeg_validate.cc index cc38b99a..ba0a03ab 100644 --- a/files/source/mjpeg_validate.cc +++ b/files/source/mjpeg_validate.cc @@ -18,13 +18,13 @@ extern "C" { #endif // Helper function to scan for EOI marker (0xff 0xd9). -static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) { - if (sample_size >= 2) { - const uint8* end = sample + sample_size - 1; - const uint8* it = sample; +static LIBYUV_BOOL ScanEOI(const uint8_t* src_mjpg, size_t src_size_mjpg) { + if (src_size_mjpg >= 2) { + const uint8_t* end = src_mjpg + src_size_mjpg - 1; + const uint8_t* it = src_mjpg; while (it < end) { // TODO(fbarchard): scan for 0xd9 instead. - it = static_cast<const uint8*>(memchr(it, 0xff, end - it)); + it = (const uint8_t*)(memchr(it, 0xff, end - it)); if (it == NULL) { break; } @@ -34,35 +34,35 @@ static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) { ++it; // Skip over current 0xff. } } - // ERROR: Invalid jpeg end code not found. Size sample_size + // ERROR: Invalid jpeg end code not found. Size src_size_mjpg return LIBYUV_FALSE; } // Helper function to validate the jpeg appears intact. -LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) { +LIBYUV_BOOL ValidateJpeg(const uint8_t* src_mjpg, size_t src_size_mjpg) { // Maximum size that ValidateJpeg will consider valid. 
const size_t kMaxJpegSize = 0x7fffffffull; const size_t kBackSearchSize = 1024; - if (sample_size < 64 || sample_size > kMaxJpegSize || !sample) { - // ERROR: Invalid jpeg size: sample_size + if (src_size_mjpg < 64 || src_size_mjpg > kMaxJpegSize || !src_mjpg) { + // ERROR: Invalid jpeg size: src_size_mjpg return LIBYUV_FALSE; } // SOI marker - if (sample[0] != 0xff || sample[1] != 0xd8 || sample[2] != 0xff) { + if (src_mjpg[0] != 0xff || src_mjpg[1] != 0xd8 || src_mjpg[2] != 0xff) { // ERROR: Invalid jpeg initial start code return LIBYUV_FALSE; } // Look for the End Of Image (EOI) marker near the end of the buffer. - if (sample_size > kBackSearchSize) { - if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) { + if (src_size_mjpg > kBackSearchSize) { + if (ScanEOI(src_mjpg + src_size_mjpg - kBackSearchSize, kBackSearchSize)) { return LIBYUV_TRUE; // Success: Valid jpeg. } // Reduce search size for forward search. - sample_size = sample_size - kBackSearchSize + 1; + src_size_mjpg = src_size_mjpg - kBackSearchSize + 1; } // Step over SOI marker and scan for EOI. - return ScanEOI(sample + 2, sample_size - 2); + return ScanEOI(src_mjpg + 2, src_size_mjpg - 2); } #ifdef __cplusplus diff --git a/files/source/planar_functions.cc b/files/source/planar_functions.cc index b8a53e85..9cab230f 100644 --- a/files/source/planar_functions.cc +++ b/files/source/planar_functions.cc @@ -26,14 +26,14 @@ extern "C" { // Copy a plane of data LIBYUV_API -void CopyPlane(const uint8* src_y, +void CopyPlane(const uint8_t* src_y, int src_stride_y, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, int width, int height) { int y; - void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; + void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; // Negative height means invert the image. if (height < 0) { height = -height; @@ -50,6 +50,7 @@ void CopyPlane(const uint8* src_y, if (src_y == dst_y && src_stride_y == dst_stride_y) { return; } + #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; @@ -70,11 +71,6 @@ void CopyPlane(const uint8* src_y, CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif -#if defined(HAS_COPYROW_MIPS) - if (TestCpuFlag(kCpuHasMIPS)) { - CopyRow = CopyRow_MIPS; - } -#endif // Copy plane for (y = 0; y < height; ++y) { @@ -87,14 +83,14 @@ void CopyPlane(const uint8* src_y, // TODO(fbarchard): Consider support for negative height. // TODO(fbarchard): Consider stride measured in bytes. LIBYUV_API -void CopyPlane_16(const uint16* src_y, +void CopyPlane_16(const uint16_t* src_y, int src_stride_y, - uint16* dst_y, + uint16_t* dst_y, int dst_stride_y, int width, int height) { int y; - void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C; + void (*CopyRow)(const uint16_t* src, uint16_t* dst, int width) = CopyRow_16_C; // Coalesce rows. 
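// [Editor's note] Illustrative sketch, not part of the diff: ValidateJpeg()
// above is a cheap sanity check (SOI marker up front, EOI marker searched
// from the back) that callers can run before handing a possibly truncated
// MJPEG frame to the decoder. MJPGToI420's parameter order is taken from the
// convert_to_i420.cc hunk earlier in this diff and is only compiled in when
// libyuv is built with HAVE_JPEG. The umbrella header path is assumed.
#include <stddef.h>
#include <stdint.h>
#include "libyuv.h"

static int DecodeMjpegFrame(const uint8_t* jpg, size_t jpg_size,
                            uint8_t* dst_y, int dst_stride_y,
                            uint8_t* dst_u, int dst_stride_u,
                            uint8_t* dst_v, int dst_stride_v,
                            int width, int height) {
  if (!libyuv::ValidateJpeg(jpg, jpg_size)) {
    return -1;  // missing SOI/EOI; do not attempt to decode
  }
  return libyuv::MJPGToI420(jpg, jpg_size,
                            dst_y, dst_stride_y,
                            dst_u, dst_stride_u,
                            dst_v, dst_stride_v,
                            width, height,   // source dimensions
                            width, height);  // destination dimensions
}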
if (src_stride_y == width && dst_stride_y == width) { width *= height; @@ -116,11 +112,6 @@ void CopyPlane_16(const uint16* src_y, CopyRow = CopyRow_16_NEON; } #endif -#if defined(HAS_COPYROW_16_MIPS) - if (TestCpuFlag(kCpuHasMIPS)) { - CopyRow = CopyRow_16_MIPS; - } -#endif // Copy plane for (y = 0; y < height; ++y) { @@ -130,19 +121,119 @@ void CopyPlane_16(const uint16* src_y, } } +// Convert a plane of 16 bit data to 8 bit +LIBYUV_API +void Convert16To8Plane(const uint16_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int scale, // 16384 for 10 bits + int width, + int height) { + int y; + void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale, + int width) = Convert16To8Row_C; + + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_CONVERT16TO8ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Convert16To8Row = Convert16To8Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + Convert16To8Row = Convert16To8Row_SSSE3; + } + } +#endif +#if defined(HAS_CONVERT16TO8ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Convert16To8Row = Convert16To8Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + Convert16To8Row = Convert16To8Row_AVX2; + } + } +#endif + + // Convert plane + for (y = 0; y < height; ++y) { + Convert16To8Row(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Convert a plane of 8 bit data to 16 bit +LIBYUV_API +void Convert8To16Plane(const uint8_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int scale, // 16384 for 10 bits + int width, + int height) { + int y; + void (*Convert8To16Row)(const uint8_t* src_y, uint16_t* dst_y, int scale, + int width) = Convert8To16Row_C; + + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_CONVERT8TO16ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + Convert8To16Row = Convert8To16Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + Convert8To16Row = Convert8To16Row_SSE2; + } + } +#endif +#if defined(HAS_CONVERT8TO16ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Convert8To16Row = Convert8To16Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + Convert8To16Row = Convert8To16Row_AVX2; + } + } +#endif + + // Convert plane + for (y = 0; y < height; ++y) { + Convert8To16Row(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + // Copy I422. LIBYUV_API -int I422Copy(const uint8* src_y, +int I422Copy(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { @@ -171,17 +262,17 @@ int I422Copy(const uint8* src_y, // Copy I444. 
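// [Editor's note] Illustrative sketch, not part of the diff: using the new
// Convert16To8Plane() to narrow 10-bit samples stored in uint16_t down to
// 8 bits, with scale = 16384 as suggested by the "// 16384 for 10 bits"
// comment above. The exact scale semantics beyond that comment, and the
// header path, are assumptions.
#include <stdint.h>
#include "libyuv/planar_functions.h"

static void TenBitTo8Bit(const uint16_t* src_y, int src_stride_y,
                         uint8_t* dst_y, int dst_stride_y,
                         int width, int height) {
  libyuv::Convert16To8Plane(src_y, src_stride_y,
                            dst_y, dst_stride_y,
                            16384,  // 10-bit input, per the comment above
                            width, height);
  // Convert8To16Plane() goes the other way with the same style of scale
  // argument.
}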
LIBYUV_API -int I444Copy(const uint8* src_y, +int I444Copy(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { @@ -209,9 +300,9 @@ int I444Copy(const uint8* src_y, // Copy I400. LIBYUV_API -int I400ToI400(const uint8* src_y, +int I400ToI400(const uint8_t* src_y, int src_stride_y, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, int width, int height) { @@ -230,13 +321,13 @@ int I400ToI400(const uint8* src_y, // Convert I420 to I400. LIBYUV_API -int I420ToI400(const uint8* src_y, +int I420ToI400(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, int width, int height) { @@ -261,16 +352,16 @@ int I420ToI400(const uint8* src_y, // Support function for NV12 etc UV channels. // Width and height are plane sizes (typically half pixel width). LIBYUV_API -void SplitUVPlane(const uint8* src_uv, +void SplitUVPlane(const uint8_t* src_uv, int src_stride_uv, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; - void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) = SplitUVRow_C; // Negative height means invert the image. if (height < 0) { @@ -311,13 +402,19 @@ void SplitUVPlane(const uint8* src_uv, } } #endif -#if defined(HAS_SPLITUVROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_u, 4) && - IS_ALIGNED(dst_stride_u, 4) && IS_ALIGNED(dst_v, 4) && - IS_ALIGNED(dst_stride_v, 4)) { - SplitUVRow = SplitUVRow_Any_DSPR2; - if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_DSPR2; +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; + } + } +#endif +#if defined(HAS_SPLITUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + SplitUVRow = SplitUVRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + SplitUVRow = SplitUVRow_MMI; } } #endif @@ -332,18 +429,17 @@ void SplitUVPlane(const uint8* src_uv, } LIBYUV_API -void MergeUVPlane(const uint8* src_u, +void MergeUVPlane(const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_uv, + uint8_t* dst_uv, int dst_stride_uv, int width, int height) { int y; - void (*MergeUVRow)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) = MergeUVRow_C; - // Coalesce rows. + void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_uv, int width) = MergeUVRow_C; // Negative height means invert the image. if (height < 0) { height = -height; @@ -389,6 +485,14 @@ void MergeUVPlane(const uint8* src_u, } } #endif +#if defined(HAS_MERGEUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + MergeUVRow = MergeUVRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + MergeUVRow = MergeUVRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { // Merge a row of U and V into a row of UV. @@ -399,15 +503,204 @@ void MergeUVPlane(const uint8* src_u, } } +// Convert NV21 to NV12. 
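// [Editor's note] Illustrative sketch, not part of the diff: SplitUVPlane()
// and MergeUVPlane() above operate on the half-resolution chroma planes, so
// for 4:2:0 data the plane width/height passed in are (width + 1) / 2 and
// (height + 1) / 2. This de-interleaves an NV12 UV plane into I420 U and V.
// Header path assumed.
#include <stdint.h>
#include "libyuv/planar_functions.h"

static void Nv12ChromaToI420(const uint8_t* src_uv, int src_stride_uv,
                             uint8_t* dst_u, int dst_stride_u,
                             uint8_t* dst_v, int dst_stride_v,
                             int width, int height) {
  int halfwidth = (width + 1) / 2;
  int halfheight = (height + 1) / 2;
  libyuv::SplitUVPlane(src_uv, src_stride_uv,
                       dst_u, dst_stride_u,
                       dst_v, dst_stride_v,
                       halfwidth, halfheight);
  // MergeUVPlane() with the same plane sizes performs the inverse,
  // re-interleaving U and V back into an NV12-style UV plane.
}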
+LIBYUV_API +int NV21ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + void (*UVToVURow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) = + UVToVURow_C; + + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_vu || !dst_uv || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_vu = src_vu + (halfheight - 1) * src_stride_vu; + src_stride_y = -src_stride_y; + src_stride_vu = -src_stride_vu; + } + // Coalesce rows. + if (src_stride_vu == halfwidth * 2 && dst_stride_uv == halfwidth * 2) { + halfwidth *= halfheight; + halfheight = 1; + src_stride_vu = dst_stride_uv = 0; + } + +#if defined(HAS_UVToVUROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UVToVURow = UVToVURow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + UVToVURow = UVToVURow_NEON; + } + } +#endif + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + + for (y = 0; y < halfheight; ++y) { + UVToVURow(src_vu, dst_uv, halfwidth); + src_vu += src_stride_vu; + dst_uv += dst_stride_uv; + } + return 0; +} + +// Support function for NV12 etc RGB channels. +// Width and height are plane sizes (typically half pixel width). +LIBYUV_API +void SplitRGBPlane(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { + int y; + void (*SplitRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, + uint8_t* dst_b, int width) = SplitRGBRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_r = dst_r + (height - 1) * dst_stride_r; + dst_g = dst_g + (height - 1) * dst_stride_g; + dst_b = dst_b + (height - 1) * dst_stride_b; + dst_stride_r = -dst_stride_r; + dst_stride_g = -dst_stride_g; + dst_stride_b = -dst_stride_b; + } + // Coalesce rows. + if (src_stride_rgb == width * 3 && dst_stride_r == width && + dst_stride_g == width && dst_stride_b == width) { + width *= height; + height = 1; + src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0; + } +#if defined(HAS_SPLITRGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + SplitRGBRow = SplitRGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + SplitRGBRow = SplitRGBRow_SSSE3; + } + } +#endif +#if defined(HAS_SPLITRGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitRGBRow = SplitRGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitRGBRow = SplitRGBRow_NEON; + } + } +#endif +#if defined(HAS_SPLITRGBROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + SplitRGBRow = SplitRGBRow_Any_MMI; + if (IS_ALIGNED(width, 4)) { + SplitRGBRow = SplitRGBRow_MMI; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Copy a row of RGB. 
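// [Editor's note] Illustrative sketch, not part of the diff: calling the new
// NV21ToNV12() above to swap the VU ordering of an NV21 frame into NV12.
// Per the code above, dst_y may be NULL to convert only the chroma plane.
// Declaration header is assumed.
#include <stdint.h>
#include "libyuv/planar_functions.h"

static int SwapVuToUv(const uint8_t* src_y, int src_stride_y,
                      const uint8_t* src_vu, int src_stride_vu,
                      uint8_t* dst_y, int dst_stride_y,
                      uint8_t* dst_uv, int dst_stride_uv,
                      int width, int height) {
  return libyuv::NV21ToNV12(src_y, src_stride_y,
                            src_vu, src_stride_vu,
                            dst_y, dst_stride_y,  // may be NULL: Y copy skipped
                            dst_uv, dst_stride_uv,
                            width, height);
}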
+ SplitRGBRow(src_rgb, dst_r, dst_g, dst_b, width); + dst_r += dst_stride_r; + dst_g += dst_stride_g; + dst_b += dst_stride_b; + src_rgb += src_stride_rgb; + } +} + +LIBYUV_API +void MergeRGBPlane(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + uint8_t* dst_rgb, + int dst_stride_rgb, + int width, + int height) { + int y; + void (*MergeRGBRow)(const uint8_t* src_r, const uint8_t* src_g, + const uint8_t* src_b, uint8_t* dst_rgb, int width) = + MergeRGBRow_C; + // Coalesce rows. + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; + dst_stride_rgb = -dst_stride_rgb; + } + // Coalesce rows. + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_rgb == width * 3) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0; + } +#if defined(HAS_MERGERGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + MergeRGBRow = MergeRGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + MergeRGBRow = MergeRGBRow_SSSE3; + } + } +#endif +#if defined(HAS_MERGERGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeRGBRow = MergeRGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MergeRGBRow = MergeRGBRow_NEON; + } + } +#endif +#if defined(HAS_MERGERGBROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + MergeRGBRow = MergeRGBRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + MergeRGBRow = MergeRGBRow_MMI; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Merge a row of U and V into a row of RGB. + MergeRGBRow(src_r, src_g, src_b, dst_rgb, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_rgb += dst_stride_rgb; + } +} + // Mirror a plane of data. -void MirrorPlane(const uint8* src_y, +void MirrorPlane(const uint8_t* src_y, int src_stride_y, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, int width, int height) { int y; - void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; + void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; // Negative height means invert the image. if (height < 0) { height = -height; @@ -438,14 +731,6 @@ void MirrorPlane(const uint8* src_y, } } #endif -// TODO(fbarchard): Mirror on mips handle unaligned memory. -#if defined(HAS_MIRRORROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_y, 4) && - IS_ALIGNED(src_stride_y, 4) && IS_ALIGNED(dst_y, 4) && - IS_ALIGNED(dst_stride_y, 4)) { - MirrorRow = MirrorRow_DSPR2; - } -#endif #if defined(HAS_MIRRORROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { MirrorRow = MirrorRow_Any_MSA; @@ -454,6 +739,14 @@ void MirrorPlane(const uint8* src_y, } } #endif +#if defined(HAS_MIRRORROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + MirrorRow = MirrorRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + MirrorRow = MirrorRow_MMI; + } + } +#endif // Mirror plane for (y = 0; y < height; ++y) { @@ -465,20 +758,20 @@ void MirrorPlane(const uint8* src_y, // Convert YUY2 to I422. 
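// [Editor's note] Illustrative sketch, not part of the diff: the new
// SplitRGBPlane()/MergeRGBPlane() pair above converts between packed 24-bit
// RGB and three planar channels; a split followed by a merge with the same
// geometry is a round trip. Header path assumed.
#include <stdint.h>
#include "libyuv/planar_functions.h"

static void RoundTripRGB(const uint8_t* src_rgb, int src_stride_rgb,
                         uint8_t* plane_r, uint8_t* plane_g, uint8_t* plane_b,
                         uint8_t* dst_rgb, int dst_stride_rgb,
                         int width, int height) {
  // Planar buffers here use a tight stride of `width` bytes each.
  libyuv::SplitRGBPlane(src_rgb, src_stride_rgb,
                        plane_r, width, plane_g, width, plane_b, width,
                        width, height);
  libyuv::MergeRGBPlane(plane_r, width, plane_g, width, plane_b, width,
                        dst_rgb, dst_stride_rgb, width, height);
}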
LIBYUV_API -int YUY2ToI422(const uint8* src_yuy2, +int YUY2ToI422(const uint8_t* src_yuy2, int src_stride_yuy2, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; - void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, - int width) = YUY2ToUV422Row_C; - void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) = + void (*YUY2ToUV422Row)(const uint8_t* src_yuy2, uint8_t* dst_u, + uint8_t* dst_v, int width) = YUY2ToUV422Row_C; + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = YUY2ToYRow_C; if (!src_yuy2 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; @@ -537,6 +830,16 @@ int YUY2ToI422(const uint8* src_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + YUY2ToYRow = YUY2ToYRow_Any_MMI; + YUY2ToUV422Row = YUY2ToUV422Row_Any_MMI; + if (IS_ALIGNED(width, 8)) { + YUY2ToYRow = YUY2ToYRow_MMI; + YUY2ToUV422Row = YUY2ToUV422Row_MMI; + } + } +#endif for (y = 0; y < height; ++y) { YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); @@ -551,20 +854,20 @@ int YUY2ToI422(const uint8* src_yuy2, // Convert UYVY to I422. LIBYUV_API -int UYVYToI422(const uint8* src_uyvy, +int UYVYToI422(const uint8_t* src_uyvy, int src_stride_uyvy, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; - void (*UYVYToUV422Row)(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, - int width) = UYVYToUV422Row_C; - void (*UYVYToYRow)(const uint8* src_uyvy, uint8* dst_y, int width) = + void (*UYVYToUV422Row)(const uint8_t* src_uyvy, uint8_t* dst_u, + uint8_t* dst_v, int width) = UYVYToUV422Row_C; + void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = UYVYToYRow_C; if (!src_uyvy || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; @@ -623,6 +926,16 @@ int UYVYToI422(const uint8* src_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + UYVYToYRow = UYVYToYRow_Any_MMI; + UYVYToUV422Row = UYVYToUV422Row_Any_MMI; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_MMI; + UYVYToUV422Row = UYVYToUV422Row_MMI; + } + } +#endif for (y = 0; y < height; ++y) { UYVYToUV422Row(src_uyvy, dst_u, dst_v, width); @@ -637,14 +950,14 @@ int UYVYToI422(const uint8* src_uyvy, // Convert YUY2 to Y. 
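// [Editor's note] Illustrative sketch, not part of the diff: YUY2ToI422()
// above produces 4:2:2 planar output, so the U and V planes are half width
// but full height. The umbrella header path is assumed.
#include <stdint.h>
#include "libyuv.h"

static int Yuy2CaptureToI422(const uint8_t* src_yuy2, int src_stride_yuy2,
                             uint8_t* dst_y, uint8_t* dst_u, uint8_t* dst_v,
                             int width, int height) {
  int halfwidth = (width + 1) / 2;
  return libyuv::YUY2ToI422(src_yuy2, src_stride_yuy2,
                            dst_y, width,      // Y: full width
                            dst_u, halfwidth,  // U: half width, full height
                            dst_v, halfwidth,  // V: half width, full height
                            width, height);
}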
LIBYUV_API -int YUY2ToY(const uint8* src_yuy2, +int YUY2ToY(const uint8_t* src_yuy2, int src_stride_yuy2, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, int width, int height) { int y; - void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) = + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = YUY2ToYRow_C; if (!src_yuy2 || !dst_y || width <= 0 || height == 0) { return -1; @@ -693,6 +1006,14 @@ int YUY2ToY(const uint8* src_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + YUY2ToYRow = YUY2ToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + YUY2ToYRow = YUY2ToYRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { YUY2ToYRow(src_yuy2, dst_y, width); @@ -704,9 +1025,9 @@ int YUY2ToY(const uint8* src_yuy2, // Mirror I400 with optional flipping LIBYUV_API -int I400Mirror(const uint8* src_y, +int I400Mirror(const uint8_t* src_y, int src_stride_y, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, int width, int height) { @@ -726,17 +1047,17 @@ int I400Mirror(const uint8* src_y, // Mirror I420 with optional flipping LIBYUV_API -int I420Mirror(const uint8* src_y, +int I420Mirror(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { @@ -768,14 +1089,14 @@ int I420Mirror(const uint8* src_y, // ARGB mirror. LIBYUV_API -int ARGBMirror(const uint8* src_argb, +int ARGBMirror(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) = + void (*ARGBMirrorRow)(const uint8_t* src, uint8_t* dst, int width) = ARGBMirrorRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; @@ -818,6 +1139,14 @@ int ARGBMirror(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBMIRRORROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBMirrorRow = ARGBMirrorRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBMirrorRow = ARGBMirrorRow_MMI; + } + } +#endif // Mirror plane for (y = 0; y < height; ++y) { @@ -833,8 +1162,8 @@ int ARGBMirror(const uint8* src_argb, // the same blend function for all pixels if possible. LIBYUV_API ARGBBlendRow GetARGBBlend() { - void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width) = ARGBBlendRow_C; + void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, + uint8_t* dst_argb, int width) = ARGBBlendRow_C; #if defined(HAS_ARGBBLENDROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBBlendRow = ARGBBlendRow_SSSE3; @@ -846,22 +1175,32 @@ ARGBBlendRow GetARGBBlend() { ARGBBlendRow = ARGBBlendRow_NEON; } #endif +#if defined(HAS_ARGBBLENDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBBlendRow = ARGBBlendRow_MSA; + } +#endif +#if defined(HAS_ARGBBLENDROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBBlendRow = ARGBBlendRow_MMI; + } +#endif return ARGBBlendRow; } // Alpha Blend 2 ARGB images and store to destination. 
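// [Editor's note] Illustrative sketch, not part of the diff: ARGBMirror()
// above flips an image horizontally; as with most functions in this file, a
// negative height additionally flips it vertically by walking the source
// bottom-up. Header path assumed.
#include <stdint.h>
#include "libyuv/planar_functions.h"

static int MirrorBothAxes(const uint8_t* src_argb, int src_stride_argb,
                          uint8_t* dst_argb, int dst_stride_argb,
                          int width, int height) {
  // Horizontal mirror plus vertical flip (negative height), i.e. a 180
  // degree rotation of the image.
  return libyuv::ARGBMirror(src_argb, src_stride_argb,
                            dst_argb, dst_stride_argb,
                            width, -height);
}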
LIBYUV_API -int ARGBBlend(const uint8* src_argb0, +int ARGBBlend(const uint8_t* src_argb0, int src_stride_argb0, - const uint8* src_argb1, + const uint8_t* src_argb1, int src_stride_argb1, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width) = GetARGBBlend(); + void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, + uint8_t* dst_argb, int width) = GetARGBBlend(); if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -890,19 +1229,19 @@ int ARGBBlend(const uint8* src_argb0, // Alpha Blend plane and store to destination. LIBYUV_API -int BlendPlane(const uint8* src_y0, +int BlendPlane(const uint8_t* src_y0, int src_stride_y0, - const uint8* src_y1, + const uint8_t* src_y1, int src_stride_y1, - const uint8* alpha, + const uint8_t* alpha, int alpha_stride, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, int width, int height) { int y; - void (*BlendPlaneRow)(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) = + void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1, + const uint8_t* alpha, uint8_t* dst, int width) = BlendPlaneRow_C; if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) { return -1; @@ -938,6 +1277,14 @@ int BlendPlane(const uint8* src_y0, } } #endif +#if defined(HAS_BLENDPLANEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + BlendPlaneRow = BlendPlaneRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + BlendPlaneRow = BlendPlaneRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width); @@ -952,36 +1299,36 @@ int BlendPlane(const uint8* src_y0, #define MAXTWIDTH 2048 // Alpha Blend YUV images and store to destination. LIBYUV_API -int I420Blend(const uint8* src_y0, +int I420Blend(const uint8_t* src_y0, int src_stride_y0, - const uint8* src_u0, + const uint8_t* src_u0, int src_stride_u0, - const uint8* src_v0, + const uint8_t* src_v0, int src_stride_v0, - const uint8* src_y1, + const uint8_t* src_y1, int src_stride_y1, - const uint8* src_u1, + const uint8_t* src_u1, int src_stride_u1, - const uint8* src_v1, + const uint8_t* src_v1, int src_stride_v1, - const uint8* alpha, + const uint8_t* alpha, int alpha_stride, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; // Half width/height for UV. 
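// [Editor's note] Illustrative sketch, not part of the diff: BlendPlane()
// above blends two 8-bit planes through an 8-bit alpha plane of the same
// size. That alpha weights the first source (255 = take src0) is an
// assumption from typical alpha-blend semantics, not stated in this diff.
// Header path assumed; rows here are tightly packed.
#include <stdint.h>
#include "libyuv/planar_functions.h"

static int CrossFadePlane(const uint8_t* fg, const uint8_t* bg,
                          const uint8_t* alpha, uint8_t* dst,
                          int width, int height) {
  return libyuv::BlendPlane(fg, width,     // foreground plane
                            bg, width,     // background plane
                            alpha, width,  // per-pixel alpha, same geometry
                            dst, width,
                            width, height);
}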
int halfwidth = (width + 1) >> 1; - void (*BlendPlaneRow)(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) = + void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1, + const uint8_t* alpha, uint8_t* dst, int width) = BlendPlaneRow_C; - void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C; + void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width) = ScaleRowDown2Box_C; if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 || !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; @@ -1014,6 +1361,14 @@ int I420Blend(const uint8* src_y0, } } #endif +#if defined(HAS_BLENDPLANEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + BlendPlaneRow = BlendPlaneRow_Any_MMI; + if (IS_ALIGNED(halfwidth, 8)) { + BlendPlaneRow = BlendPlaneRow_MMI; + } + } +#endif if (!IS_ALIGNED(width, 2)) { ScaleRowDown2 = ScaleRowDown2Box_Odd_C; } @@ -1050,6 +1405,17 @@ int I420Blend(const uint8* src_y0, } } #endif +#if defined(HAS_SCALEROWDOWN2_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ScaleRowDown2 = ScaleRowDown2Box_Odd_MMI; + if (IS_ALIGNED(width, 2)) { + ScaleRowDown2 = ScaleRowDown2Box_Any_MMI; + if (IS_ALIGNED(halfwidth, 8)) { + ScaleRowDown2 = ScaleRowDown2Box_MMI; + } + } + } +#endif // Row buffer for intermediate alpha pixels. align_buffer_64(halfalpha, halfwidth); @@ -1076,17 +1442,17 @@ int I420Blend(const uint8* src_y0, // Multiply 2 ARGB images and store to destination. LIBYUV_API -int ARGBMultiply(const uint8* src_argb0, +int ARGBMultiply(const uint8_t* src_argb0, int src_stride_argb0, - const uint8* src_argb1, + const uint8_t* src_argb1, int src_stride_argb1, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst, - int width) = ARGBMultiplyRow_C; + void (*ARGBMultiplyRow)(const uint8_t* src0, const uint8_t* src1, + uint8_t* dst, int width) = ARGBMultiplyRow_C; if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1135,6 +1501,14 @@ int ARGBMultiply(const uint8* src_argb0, } } #endif +#if defined(HAS_ARGBMULTIPLYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBMultiplyRow = ARGBMultiplyRow_MMI; + } + } +#endif // Multiply plane for (y = 0; y < height; ++y) { @@ -1148,16 +1522,16 @@ int ARGBMultiply(const uint8* src_argb0, // Add 2 ARGB images and store to destination. LIBYUV_API -int ARGBAdd(const uint8* src_argb0, +int ARGBAdd(const uint8_t* src_argb0, int src_stride_argb0, - const uint8* src_argb1, + const uint8_t* src_argb1, int src_stride_argb1, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst, + void (*ARGBAddRow)(const uint8_t* src0, const uint8_t* src1, uint8_t* dst, int width) = ARGBAddRow_C; if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1212,6 +1586,14 @@ int ARGBAdd(const uint8* src_argb0, } } #endif +#if defined(HAS_ARGBADDROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBAddRow = ARGBAddRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBAddRow = ARGBAddRow_MMI; + } + } +#endif // Add plane for (y = 0; y < height; ++y) { @@ -1225,17 +1607,17 @@ int ARGBAdd(const uint8* src_argb0, // Subtract 2 ARGB images and store to destination. 
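// [Editor's note] Illustrative sketch, not part of the diff: I420Blend()
// above takes a full-resolution alpha plane and, as the ScaleRowDown2
// selection shows, box-filters it down internally to blend the half-size
// U and V planes. Header path assumed; all planes tightly packed here.
#include <stdint.h>
#include "libyuv/planar_functions.h"

static int BlendTwoI420Frames(const uint8_t* y0, const uint8_t* u0,
                              const uint8_t* v0, const uint8_t* y1,
                              const uint8_t* u1, const uint8_t* v1,
                              const uint8_t* alpha,  // width x height, 8-bit
                              uint8_t* dst_y, uint8_t* dst_u, uint8_t* dst_v,
                              int width, int height) {
  int halfwidth = (width + 1) >> 1;
  return libyuv::I420Blend(y0, width, u0, halfwidth, v0, halfwidth,
                           y1, width, u1, halfwidth, v1, halfwidth,
                           alpha, width,
                           dst_y, width, dst_u, halfwidth, dst_v, halfwidth,
                           width, height);
}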
LIBYUV_API -int ARGBSubtract(const uint8* src_argb0, +int ARGBSubtract(const uint8_t* src_argb0, int src_stride_argb0, - const uint8* src_argb1, + const uint8_t* src_argb1, int src_stride_argb1, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst, - int width) = ARGBSubtractRow_C; + void (*ARGBSubtractRow)(const uint8_t* src0, const uint8_t* src1, + uint8_t* dst, int width) = ARGBSubtractRow_C; if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1284,6 +1666,14 @@ int ARGBSubtract(const uint8* src_argb0, } } #endif +#if defined(HAS_ARGBSUBTRACTROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBSubtractRow = ARGBSubtractRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBSubtractRow = ARGBSubtractRow_MMI; + } + } +#endif // Subtract plane for (y = 0; y < height; ++y) { @@ -1295,20 +1685,20 @@ int ARGBSubtract(const uint8* src_argb0, return 0; } // Convert I422 to RGBA with matrix -static int I422ToRGBAMatrix(const uint8* src_y, +static int I422ToRGBAMatrix(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_rgba, + uint8_t* dst_rgba, int dst_stride_rgba, const struct YuvConstants* yuvconstants, int width, int height) { int y; - void (*I422ToRGBARow)(const uint8* y_buf, const uint8* u_buf, - const uint8* v_buf, uint8* rgb_buf, + void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGBARow_C; if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { @@ -1344,15 +1734,6 @@ static int I422ToRGBAMatrix(const uint8* src_y, } } #endif -#if defined(HAS_I422TORGBAROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) { - I422ToRGBARow = I422ToRGBARow_DSPR2; - } -#endif #if defined(HAS_I422TORGBAROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToRGBARow = I422ToRGBARow_Any_MSA; @@ -1374,13 +1755,13 @@ static int I422ToRGBAMatrix(const uint8* src_y, // Convert I422 to RGBA. LIBYUV_API -int I422ToRGBA(const uint8* src_y, +int I422ToRGBA(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_rgba, + uint8_t* dst_rgba, int dst_stride_rgba, int width, int height) { @@ -1391,13 +1772,13 @@ int I422ToRGBA(const uint8* src_y, // Convert I422 to BGRA. LIBYUV_API -int I422ToBGRA(const uint8* src_y, +int I422ToBGRA(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_bgra, + uint8_t* dst_bgra, int dst_stride_bgra, int width, int height) { @@ -1410,17 +1791,17 @@ int I422ToBGRA(const uint8* src_y, // Convert NV12 to RGB565. 
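// [Editor's note] Illustrative sketch, not part of the diff: the ARGB
// arithmetic helpers above (multiply, add, subtract) share one call shape,
// so a quick difference image for visual regression checks is just
// ARGBSubtract over two same-size frames. That the subtraction saturates
// per channel is an assumption, not stated in this diff. Header path assumed.
#include <stdint.h>
#include "libyuv/planar_functions.h"

static int ArgbDifference(const uint8_t* frame_a, int stride_a,
                          const uint8_t* frame_b, int stride_b,
                          uint8_t* dst_diff, int dst_stride,
                          int width, int height) {
  // Per-channel subtract of frame_b from frame_a.
  return libyuv::ARGBSubtract(frame_a, stride_a,
                              frame_b, stride_b,
                              dst_diff, dst_stride,
                              width, height);
}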
LIBYUV_API -int NV12ToRGB565(const uint8* src_y, +int NV12ToRGB565(const uint8_t* src_y, int src_stride_y, - const uint8* src_uv, + const uint8_t* src_uv, int src_stride_uv, - uint8* dst_rgb565, + uint8_t* dst_rgb565, int dst_stride_rgb565, int width, int height) { int y; void (*NV12ToRGB565Row)( - const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf, + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C; if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) { return -1; @@ -1477,14 +1858,14 @@ int NV12ToRGB565(const uint8* src_y, // Convert RAW to RGB24. LIBYUV_API -int RAWToRGB24(const uint8* src_raw, +int RAWToRGB24(const uint8_t* src_raw, int src_stride_raw, - uint8* dst_rgb24, + uint8_t* dst_rgb24, int dst_stride_rgb24, int width, int height) { int y; - void (*RAWToRGB24Row)(const uint8* src_rgb, uint8* dst_rgb24, int width) = + void (*RAWToRGB24Row)(const uint8_t* src_rgb, uint8_t* dst_rgb24, int width) = RAWToRGB24Row_C; if (!src_raw || !dst_rgb24 || width <= 0 || height == 0) { return -1; @@ -1525,6 +1906,14 @@ int RAWToRGB24(const uint8* src_raw, } } #endif +#if defined(HAS_RAWTORGB24ROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RAWToRGB24Row = RAWToRGB24Row_Any_MMI; + if (IS_ALIGNED(width, 4)) { + RAWToRGB24Row = RAWToRGB24Row_MMI; + } + } +#endif for (y = 0; y < height; ++y) { RAWToRGB24Row(src_raw, dst_rgb24, width); @@ -1535,13 +1924,13 @@ int RAWToRGB24(const uint8* src_raw, } LIBYUV_API -void SetPlane(uint8* dst_y, +void SetPlane(uint8_t* dst_y, int dst_stride_y, int width, int height, - uint32 value) { + uint32_t value) { int y; - void (*SetRow)(uint8 * dst, uint8 value, int width) = SetRow_C; + void (*SetRow)(uint8_t * dst, uint8_t value, int width) = SetRow_C; if (height < 0) { height = -height; dst_y = dst_y + (height - 1) * dst_stride_y; @@ -1574,6 +1963,11 @@ void SetPlane(uint8* dst_y, SetRow = SetRow_ERMS; } #endif +#if defined(HAS_SETROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 16)) { + SetRow = SetRow_MSA; + } +#endif // Set plane for (y = 0; y < height; ++y) { @@ -1584,11 +1978,11 @@ void SetPlane(uint8* dst_y, // Draw a rectangle into I420 LIBYUV_API -int I420Rect(uint8* dst_y, +int I420Rect(uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int x, int y, @@ -1599,9 +1993,9 @@ int I420Rect(uint8* dst_y, int value_v) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - uint8* start_y = dst_y + y * dst_stride_y + x; - uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); - uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); + uint8_t* start_y = dst_y + y * dst_stride_y + x; + uint8_t* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); + uint8_t* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 || y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 || value_v < 0 || value_v > 255) { @@ -1616,15 +2010,16 @@ int I420Rect(uint8* dst_y, // Draw a rectangle into ARGB LIBYUV_API -int ARGBRect(uint8* dst_argb, +int ARGBRect(uint8_t* dst_argb, int dst_stride_argb, int dst_x, int dst_y, int width, int height, - uint32 value) { + uint32_t value) { int y; - void (*ARGBSetRow)(uint8 * dst_argb, uint32 value, int width) = ARGBSetRow_C; + void (*ARGBSetRow)(uint8_t * dst_argb, uint32_t value, int width) = + ARGBSetRow_C; if (!dst_argb || width <= 0 || height == 0 
|| dst_x < 0 || dst_y < 0) { return -1; } @@ -1685,15 +2080,15 @@ int ARGBRect(uint8* dst_argb, // f is foreground pixel premultiplied by alpha LIBYUV_API -int ARGBAttenuate(const uint8* src_argb, +int ARGBAttenuate(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, int width) = - ARGBAttenuateRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1740,6 +2135,14 @@ int ARGBAttenuate(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBATTENUATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBAttenuateRow = ARGBAttenuateRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBAttenuateRow(src_argb, dst_argb, width); @@ -1751,14 +2154,14 @@ int ARGBAttenuate(const uint8* src_argb, // Convert preattentuated ARGB to unattenuated ARGB. LIBYUV_API -int ARGBUnattenuate(const uint8* src_argb, +int ARGBUnattenuate(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb, + void (*ARGBUnattenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBUnattenuateRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1802,14 +2205,14 @@ int ARGBUnattenuate(const uint8* src_argb, // Convert ARGB to Grayed ARGB. LIBYUV_API -int ARGBGrayTo(const uint8* src_argb, +int ARGBGrayTo(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, int width) = + void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBGrayRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1840,6 +2243,11 @@ int ARGBGrayTo(const uint8* src_argb, ARGBGrayRow = ARGBGrayRow_MSA; } #endif +#if defined(HAS_ARGBGRAYROW_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { + ARGBGrayRow = ARGBGrayRow_MMI; + } +#endif for (y = 0; y < height; ++y) { ARGBGrayRow(src_argb, dst_argb, width); @@ -1851,16 +2259,16 @@ int ARGBGrayTo(const uint8* src_argb, // Make a rectangle of ARGB gray scale. LIBYUV_API -int ARGBGray(uint8* dst_argb, +int ARGBGray(uint8_t* dst_argb, int dst_stride_argb, int dst_x, int dst_y, int width, int height) { int y; - void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, int width) = + void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBGrayRow_C; - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; } @@ -1885,6 +2293,11 @@ int ARGBGray(uint8* dst_argb, ARGBGrayRow = ARGBGrayRow_MSA; } #endif +#if defined(HAS_ARGBGRAYROW_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { + ARGBGrayRow = ARGBGrayRow_MMI; + } +#endif for (y = 0; y < height; ++y) { ARGBGrayRow(dst, dst, width); @@ -1895,15 +2308,15 @@ int ARGBGray(uint8* dst_argb, // Make a rectangle of ARGB Sepia tone. 
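// An in-place usage sketch for ARGBSepia below (dimensions and offsets are
// illustrative): only the rectangle starting at (dst_x, dst_y) is rewritten; the rest
// of the image is untouched.
#include "libyuv.h"
#include <vector>
void SepiaExample() {
  const int kWidth = 64, kHeight = 64;
  std::vector<uint8_t> argb(kWidth * kHeight * 4, 128);
  // Apply sepia to a 32x32 region whose top-left corner is at (16, 16).
  libyuv::ARGBSepia(argb.data(), kWidth * 4, 16, 16, 32, 32);
}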
LIBYUV_API -int ARGBSepia(uint8* dst_argb, +int ARGBSepia(uint8_t* dst_argb, int dst_stride_argb, int dst_x, int dst_y, int width, int height) { int y; - void (*ARGBSepiaRow)(uint8 * dst_argb, int width) = ARGBSepiaRow_C; - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + void (*ARGBSepiaRow)(uint8_t * dst_argb, int width) = ARGBSepiaRow_C; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; } @@ -1928,6 +2341,11 @@ int ARGBSepia(uint8* dst_argb, ARGBSepiaRow = ARGBSepiaRow_MSA; } #endif +#if defined(HAS_ARGBSEPIAROW_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { + ARGBSepiaRow = ARGBSepiaRow_MMI; + } +#endif for (y = 0; y < height; ++y) { ARGBSepiaRow(dst, width); @@ -1939,16 +2357,16 @@ int ARGBSepia(uint8* dst_argb, // Apply a 4x4 matrix to each ARGB pixel. // Note: Normally for shading, but can be used to swizzle or invert. LIBYUV_API -int ARGBColorMatrix(const uint8* src_argb, +int ARGBColorMatrix(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, - const int8* matrix_argb, + const int8_t* matrix_argb, int width, int height) { int y; - void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) = + void (*ARGBColorMatrixRow)(const uint8_t* src_argb, uint8_t* dst_argb, + const int8_t* matrix_argb, int width) = ARGBColorMatrixRow_C; if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) { return -1; @@ -1974,6 +2392,16 @@ int ARGBColorMatrix(const uint8* src_argb, ARGBColorMatrixRow = ARGBColorMatrixRow_NEON; } #endif +#if defined(HAS_ARGBCOLORMATRIXROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_MSA; + } +#endif +#if defined(HAS_ARGBCOLORMATRIXROW_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_MMI; + } +#endif for (y = 0; y < height; ++y) { ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width); src_argb += src_stride_argb; @@ -1985,15 +2413,15 @@ int ARGBColorMatrix(const uint8* src_argb, // Apply a 4x3 matrix to each ARGB pixel. // Deprecated. LIBYUV_API -int RGBColorMatrix(uint8* dst_argb, +int RGBColorMatrix(uint8_t* dst_argb, int dst_stride_argb, - const int8* matrix_rgb, + const int8_t* matrix_rgb, int dst_x, int dst_y, int width, int height) { - SIMD_ALIGNED(int8 matrix_argb[16]); - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + SIMD_ALIGNED(int8_t matrix_argb[16]); + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; @@ -2015,24 +2443,24 @@ int RGBColorMatrix(uint8* dst_argb, matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0; matrix_argb[15] = 64; // 1.0 - return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb, dst, + return ARGBColorMatrix((const uint8_t*)(dst), dst_stride_argb, dst, dst_stride_argb, &matrix_argb[0], width, height); } // Apply a color table each ARGB pixel. // Table contains 256 ARGB values. 
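// A sketch of how the 256-entry table is consumed, based on a reading of
// ARGBColorTableRow_C: each channel value v is replaced by table_argb[v * 4 + channel].
// The table here inverts every channel; names are illustrative.
#include "libyuv.h"
#include <vector>
void ColorTableInvertExample(int width, int height) {
  std::vector<uint8_t> argb(width * height * 4, 100);
  uint8_t table[256 * 4];
  for (int i = 0; i < 256; ++i) {
    table[i * 4 + 0] = table[i * 4 + 1] = table[i * 4 + 2] = table[i * 4 + 3] =
        static_cast<uint8_t>(255 - i);
  }
  libyuv::ARGBColorTable(argb.data(), width * 4, table, 0, 0, width, height);
}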
LIBYUV_API -int ARGBColorTable(uint8* dst_argb, +int ARGBColorTable(uint8_t* dst_argb, int dst_stride_argb, - const uint8* table_argb, + const uint8_t* table_argb, int dst_x, int dst_y, int width, int height) { int y; - void (*ARGBColorTableRow)(uint8 * dst_argb, const uint8* table_argb, + void (*ARGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, int width) = ARGBColorTableRow_C; - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; @@ -2058,17 +2486,17 @@ int ARGBColorTable(uint8* dst_argb, // Apply a color table each ARGB pixel but preserve destination alpha. // Table contains 256 ARGB values. LIBYUV_API -int RGBColorTable(uint8* dst_argb, +int RGBColorTable(uint8_t* dst_argb, int dst_stride_argb, - const uint8* table_argb, + const uint8_t* table_argb, int dst_x, int dst_y, int width, int height) { int y; - void (*RGBColorTableRow)(uint8 * dst_argb, const uint8* table_argb, + void (*RGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, int width) = RGBColorTableRow_C; - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; @@ -2101,7 +2529,7 @@ int RGBColorTable(uint8* dst_argb, // Caveat - although SSE2 saturates, the C function does not and should be used // with care if doing anything but quantization. LIBYUV_API -int ARGBQuantize(uint8* dst_argb, +int ARGBQuantize(uint8_t* dst_argb, int dst_stride_argb, int scale, int interval_size, @@ -2111,9 +2539,9 @@ int ARGBQuantize(uint8* dst_argb, int width, int height) { int y; - void (*ARGBQuantizeRow)(uint8 * dst_argb, int scale, int interval_size, + void (*ARGBQuantizeRow)(uint8_t * dst_argb, int scale, int interval_size, int interval_offset, int width) = ARGBQuantizeRow_C; - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 || interval_size < 1 || interval_size > 255) { return -1; @@ -2134,6 +2562,11 @@ int ARGBQuantize(uint8* dst_argb, ARGBQuantizeRow = ARGBQuantizeRow_NEON; } #endif +#if defined(HAS_ARGBQUANTIZEROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBQuantizeRow = ARGBQuantizeRow_MSA; + } +#endif for (y = 0; y < height; ++y) { ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width); dst += dst_stride_argb; @@ -2144,17 +2577,17 @@ int ARGBQuantize(uint8* dst_argb, // Computes table of cumulative sum for image where the value is the sum // of all values above and to the left of the entry. Used by ARGBBlur. 
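// The cumulative sum is a per-channel summed-area table: with S(-1, y) = S(x, -1) = 0,
//   S(x, y) = p(x, y) + S(x - 1, y) + S(x, y - 1) - S(x - 1, y - 1)
// so the sum over any axis-aligned box (x0, y0)..(x1, y1) falls out of four lookups:
//   box = S(x1, y1) - S(x0 - 1, y1) - S(x1, y0 - 1) + S(x0 - 1, y0 - 1)
// This standard integral-image identity is what lets ARGBBlur average a radius-r window
// in constant time per pixel. The table stores 4 int32_t values per pixel, one per
// channel, and dst_stride32_cumsum is counted in int32_t units rather than bytes.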
LIBYUV_API -int ARGBComputeCumulativeSum(const uint8* src_argb, +int ARGBComputeCumulativeSum(const uint8_t* src_argb, int src_stride_argb, - int32* dst_cumsum, + int32_t* dst_cumsum, int dst_stride32_cumsum, int width, int height) { int y; - void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width) = + void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum, + const int32_t* previous_cumsum, int width) = ComputeCumulativeSumRow_C; - int32* previous_cumsum = dst_cumsum; + int32_t* previous_cumsum = dst_cumsum; if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) { return -1; } @@ -2163,6 +2596,12 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; } #endif +#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI; + } +#endif + memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel. for (y = 0; y < height; ++y) { ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width); @@ -2178,25 +2617,25 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, // aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory // as the buffer is treated as circular. LIBYUV_API -int ARGBBlur(const uint8* src_argb, +int ARGBBlur(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, - int32* dst_cumsum, + int32_t* dst_cumsum, int dst_stride32_cumsum, int width, int height, int radius) { int y; - void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width) = + void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum, + const int32_t* previous_cumsum, int width) = ComputeCumulativeSumRow_C; - void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft, - int width, int area, uint8* dst, - int count) = CumulativeSumToAverageRow_C; - int32* cumsum_bot_row; - int32* max_cumsum_bot_row; - int32* cumsum_top_row; + void (*CumulativeSumToAverageRow)( + const int32_t* topleft, const int32_t* botleft, int width, int area, + uint8_t* dst, int count) = CumulativeSumToAverageRow_C; + int32_t* cumsum_bot_row; + int32_t* max_cumsum_bot_row; + int32_t* cumsum_top_row; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; @@ -2221,6 +2660,11 @@ int ARGBBlur(const uint8* src_argb, CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2; } #endif +#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI; + } +#endif // Compute enough CumulativeSum for first row to be blurred. After this // one row of CumulativeSum is updated at a time. ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum, @@ -2250,7 +2694,7 @@ int ARGBBlur(const uint8* src_argb, // Increment cumsum_bot_row pointer with circular buffer wrap around and // then fill in a row of CumulativeSum. if ((y + radius) < height) { - const int32* prev_cumsum_bot_row = cumsum_bot_row; + const int32_t* prev_cumsum_bot_row = cumsum_bot_row; cumsum_bot_row += dst_stride32_cumsum; if (cumsum_bot_row >= max_cumsum_bot_row) { cumsum_bot_row = dst_cumsum; @@ -2288,16 +2732,16 @@ int ARGBBlur(const uint8* src_argb, // Multiply ARGB image by a specified ARGB value. 
LIBYUV_API -int ARGBShade(const uint8* src_argb, +int ARGBShade(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height, - uint32 value) { + uint32_t value) { int y; - void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value) = ARGBShadeRow_C; + void (*ARGBShadeRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width, + uint32_t value) = ARGBShadeRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) { return -1; } @@ -2327,6 +2771,11 @@ int ARGBShade(const uint8* src_argb, ARGBShadeRow = ARGBShadeRow_MSA; } #endif +#if defined(HAS_ARGBSHADEROW_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { + ARGBShadeRow = ARGBShadeRow_MMI; + } +#endif for (y = 0; y < height; ++y) { ARGBShadeRow(src_argb, dst_argb, width, value); @@ -2338,17 +2787,17 @@ int ARGBShade(const uint8* src_argb, // Interpolate 2 planes by specified amount (0 to 255). LIBYUV_API -int InterpolatePlane(const uint8* src0, +int InterpolatePlane(const uint8_t* src0, int src_stride0, - const uint8* src1, + const uint8_t* src1, int src_stride1, - uint8* dst, + uint8_t* dst, int dst_stride, int width, int height, int interpolation) { int y; - void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr, + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; if (!src0 || !src1 || !dst || width <= 0 || height == 0) { @@ -2390,14 +2839,6 @@ int InterpolatePlane(const uint8* src0, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src0, 4) && - IS_ALIGNED(src_stride0, 4) && IS_ALIGNED(src1, 4) && - IS_ALIGNED(src_stride1, 4) && IS_ALIGNED(dst, 4) && - IS_ALIGNED(dst_stride, 4) && IS_ALIGNED(width, 4)) { - InterpolateRow = InterpolateRow_DSPR2; - } -#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; @@ -2406,6 +2847,14 @@ int InterpolatePlane(const uint8* src0, } } #endif +#if defined(HAS_INTERPOLATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + InterpolateRow = InterpolateRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + InterpolateRow = InterpolateRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { InterpolateRow(dst, src0, src1 - src0, width, interpolation); @@ -2418,11 +2867,11 @@ int InterpolatePlane(const uint8* src0, // Interpolate 2 ARGB images by specified amount (0 to 255). LIBYUV_API -int ARGBInterpolate(const uint8* src_argb0, +int ARGBInterpolate(const uint8_t* src_argb0, int src_stride_argb0, - const uint8* src_argb1, + const uint8_t* src_argb1, int src_stride_argb1, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height, @@ -2434,23 +2883,23 @@ int ARGBInterpolate(const uint8* src_argb0, // Interpolate 2 YUV images by specified amount (0 to 255). 
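// A cross-fade sketch using the single-plane InterpolatePlane above; I420Interpolate
// below applies the same blend to all three planes of two I420 frames. The interpolation
// argument is a fraction out of 256, so 0 reproduces src0, 128 is an even blend, and 255
// is (nearly) src1; per pixel the result is roughly (src0 * (256 - f) + src1 * f) >> 8.
// Buffer names are illustrative.
#include "libyuv.h"
#include <vector>
void CrossFadeExample(int width, int height) {
  std::vector<uint8_t> frame0(width * height, 0);
  std::vector<uint8_t> frame1(width * height, 255);
  std::vector<uint8_t> blended(width * height);
  // f = 64: a 75% / 25% mix of frame0 and frame1.
  libyuv::InterpolatePlane(frame0.data(), width, frame1.data(), width,
                           blended.data(), width, width, height, 64);
}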
LIBYUV_API -int I420Interpolate(const uint8* src0_y, +int I420Interpolate(const uint8_t* src0_y, int src0_stride_y, - const uint8* src0_u, + const uint8_t* src0_u, int src0_stride_u, - const uint8* src0_v, + const uint8_t* src0_v, int src0_stride_v, - const uint8* src1_y, + const uint8_t* src1_y, int src1_stride_y, - const uint8* src1_u, + const uint8_t* src1_u, int src1_stride_u, - const uint8* src1_v, + const uint8_t* src1_v, int src1_stride_v, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height, @@ -2472,16 +2921,16 @@ int I420Interpolate(const uint8* src0_y, // Shuffle ARGB channel order. e.g. BGRA to ARGB. LIBYUV_API -int ARGBShuffle(const uint8* src_bgra, +int ARGBShuffle(const uint8_t* src_bgra, int src_stride_bgra, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, - const uint8* shuffler, + const uint8_t* shuffler, int width, int height) { int y; - void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb, - const uint8* shuffler, int width) = ARGBShuffleRow_C; + void (*ARGBShuffleRow)(const uint8_t* src_bgra, uint8_t* dst_argb, + const uint8_t* shuffler, int width) = ARGBShuffleRow_C; if (!src_bgra || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -2497,14 +2946,6 @@ int ARGBShuffle(const uint8* src_bgra, height = 1; src_stride_bgra = dst_stride_argb = 0; } -#if defined(HAS_ARGBSHUFFLEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBShuffleRow = ARGBShuffleRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBShuffleRow = ARGBShuffleRow_SSE2; - } - } -#endif #if defined(HAS_ARGBSHUFFLEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; @@ -2537,6 +2978,14 @@ int ARGBShuffle(const uint8* src_bgra, } } #endif +#if defined(HAS_ARGBSHUFFLEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBShuffleRow = ARGBShuffleRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBShuffleRow = ARGBShuffleRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBShuffleRow(src_bgra, dst_argb, shuffler, width); @@ -2547,23 +2996,23 @@ int ARGBShuffle(const uint8* src_bgra, } // Sobel ARGB effect. -static int ARGBSobelize(const uint8* src_argb, +static int ARGBSobelize(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height, - void (*SobelRow)(const uint8* src_sobelx, - const uint8* src_sobely, - uint8* dst, + void (*SobelRow)(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst, int width)) { int y; - void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int width) = + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_g, int width) = ARGBToYJRow_C; - void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, - int width) = SobelYRow_C; - void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobely, int width) = + void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1, + uint8_t* dst_sobely, int width) = SobelYRow_C; + void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1, + const uint8_t* src_y2, uint8_t* dst_sobely, int width) = SobelXRow_C; const int kEdge = 16; // Extra pixels at start of row for extrude/align. 
if (!src_argb || !dst_argb || width <= 0 || height == 0) { @@ -2608,6 +3057,14 @@ static int ARGBSobelize(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBToYJRow = ARGBToYJRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_MMI; + } + } +#endif #if defined(HAS_SOBELYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { @@ -2619,6 +3076,16 @@ static int ARGBSobelize(const uint8* src_argb, SobelYRow = SobelYRow_NEON; } #endif +#if defined(HAS_SOBELYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelYRow = SobelYRow_MSA; + } +#endif +#if defined(HAS_SOBELYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + SobelYRow = SobelYRow_MMI; + } +#endif #if defined(HAS_SOBELXROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelXRow = SobelXRow_SSE2; @@ -2629,18 +3096,28 @@ static int ARGBSobelize(const uint8* src_argb, SobelXRow = SobelXRow_NEON; } #endif +#if defined(HAS_SOBELXROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelXRow = SobelXRow_MSA; + } +#endif +#if defined(HAS_SOBELXROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + SobelXRow = SobelXRow_MMI; + } +#endif { // 3 rows with edges before/after. const int kRowSize = (width + kEdge + 31) & ~31; align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge)); - uint8* row_sobelx = rows; - uint8* row_sobely = rows + kRowSize; - uint8* row_y = rows + kRowSize * 2; + uint8_t* row_sobelx = rows; + uint8_t* row_sobely = rows + kRowSize; + uint8_t* row_y = rows + kRowSize * 2; // Convert first row. - uint8* row_y0 = row_y + kEdge; - uint8* row_y1 = row_y0 + kRowSize; - uint8* row_y2 = row_y1 + kRowSize; + uint8_t* row_y0 = row_y + kEdge; + uint8_t* row_y1 = row_y0 + kRowSize; + uint8_t* row_y2 = row_y1 + kRowSize; ARGBToYJRow(src_argb, row_y0, width); row_y0[-1] = row_y0[0]; memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind. @@ -2664,7 +3141,7 @@ static int ARGBSobelize(const uint8* src_argb, // Cycle thru circular queue of 3 row_y buffers. { - uint8* row_yt = row_y0; + uint8_t* row_yt = row_y0; row_y0 = row_y1; row_y1 = row_y2; row_y2 = row_yt; @@ -2679,14 +3156,14 @@ static int ARGBSobelize(const uint8* src_argb, // Sobel ARGB effect. LIBYUV_API -int ARGBSobel(const uint8* src_argb, +int ARGBSobel(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { - void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) = SobelRow_C; + void (*SobelRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, + uint8_t* dst_argb, int width) = SobelRow_C; #if defined(HAS_SOBELROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelRow = SobelRow_Any_SSE2; @@ -2711,20 +3188,28 @@ int ARGBSobel(const uint8* src_argb, } } #endif +#if defined(HAS_SOBELROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + SobelRow = SobelRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + SobelRow = SobelRow_MMI; + } + } +#endif return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height, SobelRow); } // Sobel ARGB effect with planar output. 
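// A sketch of edge-map extraction with ARGBSobelToPlane below: the ARGB input is
// converted to luma internally (the ARGBToYJRow setup above), Sobel X and Y responses
// are computed per row, and their saturating sum is written to a single 8-bit plane.
// Names are illustrative.
#include "libyuv.h"
#include <vector>
void SobelPlaneExample(int width, int height) {
  std::vector<uint8_t> argb(width * height * 4, 0);
  std::vector<uint8_t> edges(width * height);
  libyuv::ARGBSobelToPlane(argb.data(), width * 4, edges.data(), width, width,
                           height);
}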
LIBYUV_API -int ARGBSobelToPlane(const uint8* src_argb, +int ARGBSobelToPlane(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, int width, int height) { - void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_, int width) = SobelToPlaneRow_C; + void (*SobelToPlaneRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, + uint8_t* dst_, int width) = SobelToPlaneRow_C; #if defined(HAS_SOBELTOPLANEROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelToPlaneRow = SobelToPlaneRow_Any_SSE2; @@ -2749,6 +3234,14 @@ int ARGBSobelToPlane(const uint8* src_argb, } } #endif +#if defined(HAS_SOBELTOPLANEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + SobelToPlaneRow = SobelToPlaneRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + SobelToPlaneRow = SobelToPlaneRow_MMI; + } + } +#endif return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width, height, SobelToPlaneRow); } @@ -2756,14 +3249,14 @@ int ARGBSobelToPlane(const uint8* src_argb, // SobelXY ARGB effect. // Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel. LIBYUV_API -int ARGBSobelXY(const uint8* src_argb, +int ARGBSobelXY(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { - void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) = SobelXYRow_C; + void (*SobelXYRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, + uint8_t* dst_argb, int width) = SobelXYRow_C; #if defined(HAS_SOBELXYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelXYRow = SobelXYRow_Any_SSE2; @@ -2788,21 +3281,29 @@ int ARGBSobelXY(const uint8* src_argb, } } #endif +#if defined(HAS_SOBELXYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + SobelXYRow = SobelXYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + SobelXYRow = SobelXYRow_MMI; + } + } +#endif return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height, SobelXYRow); } // Apply a 4x4 polynomial to each ARGB pixel. LIBYUV_API -int ARGBPolynomial(const uint8* src_argb, +int ARGBPolynomial(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, const float* poly, int width, int height) { int y; - void (*ARGBPolynomialRow)(const uint8* src_argb, uint8* dst_argb, + void (*ARGBPolynomialRow)(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width) = ARGBPolynomialRow_C; if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) { return -1; @@ -2842,16 +3343,16 @@ int ARGBPolynomial(const uint8* src_argb, // Convert plane of 16 bit shorts to half floats. // Source values are multiplied by scale before storing as half float. 
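// A usage sketch for HalfFloatPlane below: converting 10-bit samples (0..1023, stored
// in uint16_t) to half floats normalized to 0.0..1.0 by passing scale = 1.0f / 1023.0f.
// Strides are assumed to be in bytes, matching the rest of the API; names are
// illustrative.
#include "libyuv.h"
#include <vector>
void HalfFloatExample(int width, int height) {
  std::vector<uint16_t> src(width * height, 512);  // mid-grey in a 10-bit range
  std::vector<uint16_t> dst(width * height);       // receives IEEE half-float bits
  libyuv::HalfFloatPlane(src.data(), width * 2, dst.data(), width * 2,
                         1.0f / 1023.0f, width, height);
}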
LIBYUV_API -int HalfFloatPlane(const uint16* src_y, +int HalfFloatPlane(const uint16_t* src_y, int src_stride_y, - uint16* dst_y, + uint16_t* dst_y, int dst_stride_y, float scale, int width, int height) { int y; - void (*HalfFloatRow)(const uint16* src, uint16* dst, float scale, int width) = - HalfFloatRow_C; + void (*HalfFloatRow)(const uint16_t* src, uint16_t* dst, float scale, + int width) = HalfFloatRow_C; if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } @@ -2903,6 +3404,14 @@ int HalfFloatPlane(const uint16* src_y, } } #endif +#if defined(HAS_HALFFLOATROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + HalfFloatRow = HalfFloatRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + HalfFloatRow = HalfFloatRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { HalfFloatRow(src_y, dst_y, scale, width); @@ -2912,19 +3421,40 @@ int HalfFloatPlane(const uint16* src_y, return 0; } +// Convert a buffer of bytes to floats, scale the values and store as floats. +LIBYUV_API +int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width) { + void (*ByteToFloatRow)(const uint8_t* src, float* dst, float scale, + int width) = ByteToFloatRow_C; + if (!src_y || !dst_y || width <= 0) { + return -1; + } +#if defined(HAS_BYTETOFLOATROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ByteToFloatRow = ByteToFloatRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ByteToFloatRow = ByteToFloatRow_NEON; + } + } +#endif + + ByteToFloatRow(src_y, dst_y, scale, width); + return 0; +} + // Apply a lumacolortable to each ARGB pixel. LIBYUV_API -int ARGBLumaColorTable(const uint8* src_argb, +int ARGBLumaColorTable(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, - const uint8* luma, + const uint8_t* luma, int width, int height) { int y; void (*ARGBLumaColorTableRow)( - const uint8* src_argb, uint8* dst_argb, int width, const uint8* luma, - const uint32 lumacoeff) = ARGBLumaColorTableRow_C; + const uint8_t* src_argb, uint8_t* dst_argb, int width, + const uint8_t* luma, const uint32_t lumacoeff) = ARGBLumaColorTableRow_C; if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) { return -1; } @@ -2956,15 +3486,15 @@ int ARGBLumaColorTable(const uint8* src_argb, // Copy Alpha from one ARGB image to another. LIBYUV_API -int ARGBCopyAlpha(const uint8* src_argb, +int ARGBCopyAlpha(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) = - ARGBCopyAlphaRow_C; + void (*ARGBCopyAlphaRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBCopyAlphaRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -2996,6 +3526,14 @@ int ARGBCopyAlpha(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBCOPYALPHAROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBCopyAlphaRow(src_argb, dst_argb, width); @@ -3007,10 +3545,10 @@ int ARGBCopyAlpha(const uint8* src_argb, // Extract just the alpha channel from ARGB. 
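// A sketch for ARGBExtractAlpha below: byte 3 of every ARGB pixel is copied into a
// tightly packed 8-bit plane, which can later be reapplied with ARGBCopyYToAlpha.
// Names are illustrative.
#include "libyuv.h"
#include <vector>
void ExtractAlphaExample(int width, int height) {
  std::vector<uint8_t> argb(width * height * 4, 255);
  std::vector<uint8_t> alpha(width * height);
  libyuv::ARGBExtractAlpha(argb.data(), width * 4, alpha.data(), width, width,
                           height);
}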
LIBYUV_API -int ARGBExtractAlpha(const uint8* src_argb, - int src_stride, - uint8* dst_a, - int dst_stride, +int ARGBExtractAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_a, + int dst_stride_a, int width, int height) { if (!src_argb || !dst_a || width <= 0 || height == 0) { @@ -3019,17 +3557,17 @@ int ARGBExtractAlpha(const uint8* src_argb, // Negative height means invert the image. if (height < 0) { height = -height; - src_argb += (height - 1) * src_stride; - src_stride = -src_stride; + src_argb += (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride == width * 4 && dst_stride == width) { + if (src_stride_argb == width * 4 && dst_stride_a == width) { width *= height; height = 1; - src_stride = dst_stride = 0; + src_stride_argb = dst_stride_a = 0; } - void (*ARGBExtractAlphaRow)(const uint8* src_argb, uint8* dst_a, int width) = - ARGBExtractAlphaRow_C; + void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a, + int width) = ARGBExtractAlphaRow_C; #if defined(HAS_ARGBEXTRACTALPHAROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2 @@ -3048,26 +3586,38 @@ int ARGBExtractAlpha(const uint8* src_argb, : ARGBExtractAlphaRow_Any_NEON; } #endif +#if defined(HAS_ARGBEXTRACTALPHAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA + : ARGBExtractAlphaRow_Any_MSA; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_MMI + : ARGBExtractAlphaRow_Any_MMI; + } +#endif for (int y = 0; y < height; ++y) { ARGBExtractAlphaRow(src_argb, dst_a, width); - src_argb += src_stride; - dst_a += dst_stride; + src_argb += src_stride_argb; + dst_a += dst_stride_a; } return 0; } // Copy a planar Y channel to the alpha channel of a destination ARGB image. LIBYUV_API -int ARGBCopyYToAlpha(const uint8* src_y, +int ARGBCopyYToAlpha(const uint8_t* src_y, int src_stride_y, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) = - ARGBCopyYToAlphaRow_C; + void (*ARGBCopyYToAlphaRow)(const uint8_t* src_y, uint8_t* dst_argb, + int width) = ARGBCopyYToAlphaRow_C; if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -3099,6 +3649,14 @@ int ARGBCopyYToAlpha(const uint8* src_y, } } #endif +#if defined(HAS_ARGBCOPYYTOALPHAROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_MMI; + } + } +#endif for (y = 0; y < height; ++y) { ARGBCopyYToAlphaRow(src_y, dst_argb, width); @@ -3112,19 +3670,19 @@ int ARGBCopyYToAlpha(const uint8* src_y, // directly. A SplitUVRow_Odd function could copy the remaining chroma. 
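// A usage sketch for YUY2ToNV12 below (names are illustrative): YUY2 stores 2 bytes per
// pixel (Y0 U Y1 V ...); the output is a full-size Y plane plus a half-height
// interleaved UV plane, with the chroma rows reduced vertically by the InterpolateRow
// helper set up inside the function.
#include "libyuv.h"
#include <vector>
void Yuy2ToNv12Example(int width, int height) {
  int half_width = (width + 1) / 2;
  int half_height = (height + 1) / 2;
  std::vector<uint8_t> yuy2(half_width * 4 * height);  // 4 bytes per 2 pixels
  std::vector<uint8_t> dst_y(width * height);
  std::vector<uint8_t> dst_uv(half_width * 2 * half_height);
  libyuv::YUY2ToNV12(yuy2.data(), half_width * 4, dst_y.data(), width,
                     dst_uv.data(), half_width * 2, width, height);
}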
LIBYUV_API -int YUY2ToNV12(const uint8* src_yuy2, +int YUY2ToNV12(const uint8_t* src_yuy2, int src_stride_yuy2, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_uv, + uint8_t* dst_uv, int dst_stride_uv, int width, int height) { int y; int halfwidth = (width + 1) >> 1; - void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) = SplitUVRow_C; - void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr, + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) { @@ -3160,6 +3718,22 @@ int YUY2ToNV12(const uint8* src_yuy2, } } #endif +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; + } + } +#endif +#if defined(HAS_SPLITUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + SplitUVRow = SplitUVRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + SplitUVRow = SplitUVRow_MMI; + } + } +#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -3192,6 +3766,14 @@ int YUY2ToNV12(const uint8* src_yuy2, } } #endif +#if defined(HAS_INTERPOLATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + InterpolateRow = InterpolateRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + InterpolateRow = InterpolateRow_MMI; + } + } +#endif { int awidth = halfwidth * 2; @@ -3220,19 +3802,19 @@ int YUY2ToNV12(const uint8* src_yuy2, } LIBYUV_API -int UYVYToNV12(const uint8* src_uyvy, +int UYVYToNV12(const uint8_t* src_uyvy, int src_stride_uyvy, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_uv, + uint8_t* dst_uv, int dst_stride_uv, int width, int height) { int y; int halfwidth = (width + 1) >> 1; - void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) = SplitUVRow_C; - void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr, + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) { @@ -3268,6 +3850,22 @@ int UYVYToNV12(const uint8* src_uyvy, } } #endif +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; + } + } +#endif +#if defined(HAS_SPLITUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + SplitUVRow = SplitUVRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + SplitUVRow = SplitUVRow_MMI; + } + } +#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -3300,6 +3898,14 @@ int UYVYToNV12(const uint8* src_uyvy, } } #endif +#if defined(HAS_INTERPOLATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + InterpolateRow = InterpolateRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + InterpolateRow = InterpolateRow_MMI; + } + } +#endif { int awidth = halfwidth * 2; diff --git a/files/source/rotate.cc b/files/source/rotate.cc index 4330884c..d414186a 100644 --- a/files/source/rotate.cc +++ b/files/source/rotate.cc @@ -22,18 +22,18 @@ extern "C" { #endif LIBYUV_API -void TransposePlane(const uint8* src, +void TransposePlane(const uint8_t* src, int src_stride, - uint8* dst, + uint8_t* dst, 
int dst_stride, int width, int height) { int i = height; #if defined(HAS_TRANSPOSEWX16_MSA) - void (*TransposeWx16)(const uint8* src, int src_stride, uint8* dst, + void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width) = TransposeWx16_C; #else - void (*TransposeWx8)(const uint8* src, int src_stride, uint8* dst, + void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width) = TransposeWx8_C; #endif #if defined(HAS_TRANSPOSEWX8_NEON) @@ -49,6 +49,11 @@ void TransposePlane(const uint8* src, } } #endif +#if defined(HAS_TRANSPOSEWX8_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + TransposeWx8 = TransposeWx8_MMI; + } +#endif #if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { TransposeWx8 = TransposeWx8_Fast_Any_SSSE3; @@ -57,16 +62,6 @@ void TransposePlane(const uint8* src, } } #endif -#if defined(HAS_TRANSPOSEWX8_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - if (IS_ALIGNED(width, 4) && IS_ALIGNED(src, 4) && - IS_ALIGNED(src_stride, 4)) { - TransposeWx8 = TransposeWx8_Fast_DSPR2; - } else { - TransposeWx8 = TransposeWx8_DSPR2; - } - } -#endif #if defined(HAS_TRANSPOSEWX16_MSA) if (TestCpuFlag(kCpuHasMSA)) { TransposeWx16 = TransposeWx16_Any_MSA; @@ -100,9 +95,9 @@ void TransposePlane(const uint8* src, } LIBYUV_API -void RotatePlane90(const uint8* src, +void RotatePlane90(const uint8_t* src, int src_stride, - uint8* dst, + uint8_t* dst, int dst_stride, int width, int height) { @@ -115,9 +110,9 @@ void RotatePlane90(const uint8* src, } LIBYUV_API -void RotatePlane270(const uint8* src, +void RotatePlane270(const uint8_t* src, int src_stride, - uint8* dst, + uint8_t* dst, int dst_stride, int width, int height) { @@ -130,20 +125,20 @@ void RotatePlane270(const uint8* src, } LIBYUV_API -void RotatePlane180(const uint8* src, +void RotatePlane180(const uint8_t* src, int src_stride, - uint8* dst, + uint8_t* dst, int dst_stride, int width, int height) { // Swap first and last row and mirror the content. Uses a temporary row. align_buffer_64(row, width); - const uint8* src_bot = src + src_stride * (height - 1); - uint8* dst_bot = dst + dst_stride * (height - 1); + const uint8_t* src_bot = src + src_stride * (height - 1); + uint8_t* dst_bot = dst + dst_stride * (height - 1); int half_height = (height + 1) >> 1; int y; - void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; - void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; + void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; + void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; #if defined(HAS_MIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MirrorRow = MirrorRow_Any_NEON; @@ -168,14 +163,6 @@ void RotatePlane180(const uint8* src, } } #endif -// TODO(fbarchard): Mirror on mips handle unaligned memory. -#if defined(HAS_MIRRORROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src, 4) && - IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst, 4) && - IS_ALIGNED(dst_stride, 4)) { - MirrorRow = MirrorRow_DSPR2; - } -#endif #if defined(HAS_MIRRORROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { MirrorRow = MirrorRow_Any_MSA; @@ -184,6 +171,14 @@ void RotatePlane180(const uint8* src, } } #endif +#if defined(HAS_MIRRORROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + MirrorRow = MirrorRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + MirrorRow = MirrorRow_MMI; + } + } +#endif #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width, 32) ? 
CopyRow_SSE2 : CopyRow_Any_SSE2; @@ -204,9 +199,9 @@ void RotatePlane180(const uint8* src, CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif -#if defined(HAS_COPYROW_MIPS) - if (TestCpuFlag(kCpuHasMIPS)) { - CopyRow = CopyRow_MIPS; +#if defined(HAS_COPYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + CopyRow = IS_ALIGNED(width, 8) ? CopyRow_MMI : CopyRow_Any_MMI; } #endif @@ -224,22 +219,22 @@ void RotatePlane180(const uint8* src, } LIBYUV_API -void TransposeUV(const uint8* src, +void TransposeUV(const uint8_t* src, int src_stride, - uint8* dst_a, + uint8_t* dst_a, int dst_stride_a, - uint8* dst_b, + uint8_t* dst_b, int dst_stride_b, int width, int height) { int i = height; #if defined(HAS_TRANSPOSEUVWX16_MSA) - void (*TransposeUVWx16)(const uint8* src, int src_stride, uint8* dst_a, - int dst_stride_a, uint8* dst_b, int dst_stride_b, + void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a, + int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width) = TransposeUVWx16_C; #else - void (*TransposeUVWx8)(const uint8* src, int src_stride, uint8* dst_a, - int dst_stride_a, uint8* dst_b, int dst_stride_b, + void (*TransposeUVWx8)(const uint8_t* src, int src_stride, uint8_t* dst_a, + int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width) = TransposeUVWx8_C; #endif #if defined(HAS_TRANSPOSEUVWX8_NEON) @@ -255,10 +250,12 @@ void TransposeUV(const uint8* src, } } #endif -#if defined(HAS_TRANSPOSEUVWX8_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) && IS_ALIGNED(src, 4) && - IS_ALIGNED(src_stride, 4)) { - TransposeUVWx8 = TransposeUVWx8_DSPR2; +#if defined(HAS_TRANSPOSEUVWX8_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + TransposeUVWx8 = TransposeUVWx8_Any_MMI; + if (IS_ALIGNED(width, 4)) { + TransposeUVWx8 = TransposeUVWx8_MMI; + } } #endif #if defined(HAS_TRANSPOSEUVWX16_MSA) @@ -299,11 +296,11 @@ void TransposeUV(const uint8* src, } LIBYUV_API -void RotateUV90(const uint8* src, +void RotateUV90(const uint8_t* src, int src_stride, - uint8* dst_a, + uint8_t* dst_a, int dst_stride_a, - uint8* dst_b, + uint8_t* dst_b, int dst_stride_b, int width, int height) { @@ -315,11 +312,11 @@ void RotateUV90(const uint8* src, } LIBYUV_API -void RotateUV270(const uint8* src, +void RotateUV270(const uint8_t* src, int src_stride, - uint8* dst_a, + uint8_t* dst_a, int dst_stride_a, - uint8* dst_b, + uint8_t* dst_b, int dst_stride_b, int width, int height) { @@ -334,17 +331,17 @@ void RotateUV270(const uint8* src, // Rotate 180 is a horizontal and vertical flip. 
LIBYUV_API -void RotateUV180(const uint8* src, +void RotateUV180(const uint8_t* src, int src_stride, - uint8* dst_a, + uint8_t* dst_a, int dst_stride_a, - uint8* dst_b, + uint8_t* dst_b, int dst_stride_b, int width, int height) { int i; - void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) = - MirrorUVRow_C; + void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v, + int width) = MirrorUVRow_C; #if defined(HAS_MIRRORUVROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { MirrorUVRow = MirrorUVRow_NEON; @@ -355,10 +352,14 @@ void RotateUV180(const uint8* src, MirrorUVRow = MirrorUVRow_SSSE3; } #endif -#if defined(HAS_MIRRORUVROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src, 4) && - IS_ALIGNED(src_stride, 4)) { - MirrorUVRow = MirrorUVRow_DSPR2; +#if defined(HAS_MIRRORUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) { + MirrorUVRow = MirrorUVRow_MSA; + } +#endif +#if defined(HAS_MIRRORUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 8)) { + MirrorUVRow = MirrorUVRow_MMI; } #endif @@ -374,9 +375,9 @@ void RotateUV180(const uint8* src, } LIBYUV_API -int RotatePlane(const uint8* src, +int RotatePlane(const uint8_t* src, int src_stride, - uint8* dst, + uint8_t* dst, int dst_stride, int width, int height, @@ -413,17 +414,17 @@ int RotatePlane(const uint8* src, } LIBYUV_API -int I420Rotate(const uint8* src_y, +int I420Rotate(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height, @@ -481,15 +482,75 @@ int I420Rotate(const uint8* src_y, } LIBYUV_API -int NV12ToI420Rotate(const uint8* src_y, +int I444Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum libyuv::RotationMode mode) { + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v) { + return -1; + } + + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case libyuv::kRotate0: + // copy frame + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; + case libyuv::kRotate90: + RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; + case libyuv::kRotate270: + RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; + case libyuv::kRotate180: + RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int NV12ToI420Rotate(const uint8_t* src_y, int src_stride_y, - const uint8* src_uv, + const uint8_t* src_uv, int src_stride_uv, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int width, int height, diff --git a/files/source/rotate_any.cc b/files/source/rotate_any.cc index 562096b9..b3baf084 100644 --- a/files/source/rotate_any.cc +++ b/files/source/rotate_any.cc @@ -19,8 +19,8 @@ extern "C" { #endif #define TANY(NAMEANY, TPOS_SIMD, MASK) \ - void NAMEANY(const uint8* src, int src_stride, uint8* dst, int dst_stride, \ - int width) { \ + void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst, \ + int dst_stride, int width) { \ int r = width & MASK; \ int n = width - r; \ if (n > 0) { \ @@ -35,20 +35,21 @@ TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7) #ifdef HAS_TRANSPOSEWX8_SSSE3 TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7) #endif +#ifdef HAS_TRANSPOSEWX8_MMI +TANY(TransposeWx8_Any_MMI, TransposeWx8_MMI, 7) +#endif #ifdef HAS_TRANSPOSEWX8_FAST_SSSE3 TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15) #endif -#ifdef HAS_TRANSPOSEWX8_DSPR2 -TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7) -#endif #ifdef HAS_TRANSPOSEWX16_MSA TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15) #endif #undef TANY #define TUVANY(NAMEANY, TPOS_SIMD, MASK) \ - void NAMEANY(const uint8* src, int src_stride, uint8* dst_a, \ - int dst_stride_a, uint8* dst_b, int dst_stride_b, int width) { \ + void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst_a, \ + int dst_stride_a, uint8_t* dst_b, int dst_stride_b, \ + int width) { \ int r = width & MASK; \ int n = width - r; \ if (n > 0) { \ @@ -64,8 +65,8 @@ TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7) #ifdef HAS_TRANSPOSEUVWX8_SSE2 TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7) #endif -#ifdef HAS_TRANSPOSEUVWX8_DSPR2 -TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7) +#ifdef HAS_TRANSPOSEUVWX8_MMI +TUVANY(TransposeUVWx8_Any_MMI, TransposeUVWx8_MMI, 7) #endif #ifdef HAS_TRANSPOSEUVWX16_MSA TUVANY(TransposeUVWx16_Any_MSA, 
TransposeUVWx16_MSA, 7) diff --git a/files/source/rotate_argb.cc b/files/source/rotate_argb.cc index ede4eafa..a93fd55f 100644 --- a/files/source/rotate_argb.cc +++ b/files/source/rotate_argb.cc @@ -14,113 +14,110 @@ #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" #include "libyuv/row.h" +#include "libyuv/scale_row.h" /* for ScaleARGBRowDownEven_ */ #ifdef __cplusplus namespace libyuv { extern "C" { #endif -// ARGBScale has a function to copy pixels to a row, striding each source -// pixel by a constant. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || \ - (defined(__x86_64__) && !defined(__native_client__)) || \ - defined(__i386__)) -#define HAS_SCALEARGBROWDOWNEVEN_SSE2 -void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, - int src_stride, - int src_stepx, - uint8* dst_ptr, - int dst_width); -#endif -#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) -#define HAS_SCALEARGBROWDOWNEVEN_NEON -void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, - int src_stride, - int src_stepx, - uint8* dst_ptr, - int dst_width); -#endif - -void ScaleARGBRowDownEven_C(const uint8* src_ptr, - int, - int src_stepx, - uint8* dst_ptr, - int dst_width); - -static void ARGBTranspose(const uint8* src, - int src_stride, - uint8* dst, - int dst_stride, +static void ARGBTranspose(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, int width, int height) { int i; - int src_pixel_step = src_stride >> 2; - void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride, - int src_step, uint8* dst_ptr, int dst_width) = - ScaleARGBRowDownEven_C; + int src_pixel_step = src_stride_argb >> 2; + void (*ScaleARGBRowDownEven)( + const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step, + uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C; #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest. - ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2; + if (IS_ALIGNED(height, 4)) { // Width of dest. + ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2; + } } #endif #if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) { // Width of dest. - ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON; + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_NEON; + if (IS_ALIGNED(height, 4)) { // Width of dest. + ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MSA; + if (IS_ALIGNED(height, 4)) { // Width of dest. + ScaleARGBRowDownEven = ScaleARGBRowDownEven_MSA; + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MMI; + if (IS_ALIGNED(height, 4)) { // Width of dest. + ScaleARGBRowDownEven = ScaleARGBRowDownEven_MMI; + } } #endif for (i = 0; i < width; ++i) { // column of source to row of dest. 
- ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height); - dst += dst_stride; - src += 4; + ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height); + dst_argb += dst_stride_argb; + src_argb += 4; } } -void ARGBRotate90(const uint8* src, - int src_stride, - uint8* dst, - int dst_stride, +void ARGBRotate90(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, int width, int height) { // Rotate by 90 is a ARGBTranspose with the source read // from bottom to top. So set the source pointer to the end // of the buffer and flip the sign of the source stride. - src += src_stride * (height - 1); - src_stride = -src_stride; - ARGBTranspose(src, src_stride, dst, dst_stride, width, height); + src_argb += src_stride_argb * (height - 1); + src_stride_argb = -src_stride_argb; + ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); } -void ARGBRotate270(const uint8* src, - int src_stride, - uint8* dst, - int dst_stride, +void ARGBRotate270(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, int width, int height) { // Rotate by 270 is a ARGBTranspose with the destination written // from bottom to top. So set the destination pointer to the end // of the buffer and flip the sign of the destination stride. - dst += dst_stride * (width - 1); - dst_stride = -dst_stride; - ARGBTranspose(src, src_stride, dst, dst_stride, width, height); + dst_argb += dst_stride_argb * (width - 1); + dst_stride_argb = -dst_stride_argb; + ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); } -void ARGBRotate180(const uint8* src, - int src_stride, - uint8* dst, - int dst_stride, +void ARGBRotate180(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, int width, int height) { // Swap first and last row and mirror the content. Uses a temporary row. align_buffer_64(row, width * 4); - const uint8* src_bot = src + src_stride * (height - 1); - uint8* dst_bot = dst + dst_stride * (height - 1); + const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1); + uint8_t* dst_bot = dst_argb + dst_stride_argb * (height - 1); int half_height = (height + 1) >> 1; int y; - void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) = + void (*ARGBMirrorRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBMirrorRow_C; - void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; + void (*CopyRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = + CopyRow_C; #if defined(HAS_ARGBMIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBMirrorRow = ARGBMirrorRow_Any_NEON; @@ -153,6 +150,14 @@ void ARGBRotate180(const uint8* src, } } #endif +#if defined(HAS_ARGBMIRRORROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBMirrorRow = ARGBMirrorRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBMirrorRow = ARGBMirrorRow_MMI; + } + } +#endif #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; @@ -173,29 +178,24 @@ void ARGBRotate180(const uint8* src, CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif -#if defined(HAS_COPYROW_MIPS) - if (TestCpuFlag(kCpuHasMIPS)) { - CopyRow = CopyRow_MIPS; - } -#endif // Odd height will harmlessly mirror the middle row twice. 
for (y = 0; y < half_height; ++y) { - ARGBMirrorRow(src, row, width); // Mirror first row into a buffer - ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row - CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last - src += src_stride; - dst += dst_stride; - src_bot -= src_stride; - dst_bot -= dst_stride; + ARGBMirrorRow(src_argb, row, width); // Mirror first row into a buffer + ARGBMirrorRow(src_bot, dst_argb, width); // Mirror last row into first row + CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + src_bot -= src_stride_argb; + dst_bot -= dst_stride_argb; } free_aligned_buffer_64(row); } LIBYUV_API -int ARGBRotate(const uint8* src_argb, +int ARGBRotate(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int width, int height, diff --git a/files/source/rotate_common.cc b/files/source/rotate_common.cc index 89357e73..ff212ade 100644 --- a/files/source/rotate_common.cc +++ b/files/source/rotate_common.cc @@ -16,9 +16,9 @@ namespace libyuv { extern "C" { #endif -void TransposeWx8_C(const uint8* src, +void TransposeWx8_C(const uint8_t* src, int src_stride, - uint8* dst, + uint8_t* dst, int dst_stride, int width) { int i; @@ -36,11 +36,11 @@ void TransposeWx8_C(const uint8* src, } } -void TransposeUVWx8_C(const uint8* src, +void TransposeUVWx8_C(const uint8_t* src, int src_stride, - uint8* dst_a, + uint8_t* dst_a, int dst_stride_a, - uint8* dst_b, + uint8_t* dst_b, int dst_stride_b, int width) { int i; @@ -67,9 +67,9 @@ void TransposeUVWx8_C(const uint8* src, } } -void TransposeWxH_C(const uint8* src, +void TransposeWxH_C(const uint8_t* src, int src_stride, - uint8* dst, + uint8_t* dst, int dst_stride, int width, int height) { @@ -82,11 +82,11 @@ void TransposeWxH_C(const uint8* src, } } -void TransposeUVWxH_C(const uint8* src, +void TransposeUVWxH_C(const uint8_t* src, int src_stride, - uint8* dst_a, + uint8_t* dst_a, int dst_stride_a, - uint8* dst_b, + uint8_t* dst_b, int dst_stride_b, int width, int height) { diff --git a/files/source/rotate_gcc.cc b/files/source/rotate_gcc.cc index 74b48ac4..04e19e29 100644 --- a/files/source/rotate_gcc.cc +++ b/files/source/rotate_gcc.cc @@ -22,9 +22,9 @@ extern "C" { // Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit. #if defined(HAS_TRANSPOSEWX8_SSSE3) -void TransposeWx8_SSSE3(const uint8* src, +void TransposeWx8_SSSE3(const uint8_t* src, int src_stride, - uint8* dst, + uint8_t* dst, int dst_stride, int width) { asm volatile( @@ -112,9 +112,9 @@ void TransposeWx8_SSSE3(const uint8* src, // Transpose 16x8. 64 bit #if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) -void TransposeWx8_Fast_SSSE3(const uint8* src, +void TransposeWx8_Fast_SSSE3(const uint8_t* src, int src_stride, - uint8* dst, + uint8_t* dst, int dst_stride, int width) { asm volatile( @@ -255,11 +255,11 @@ void TransposeWx8_Fast_SSSE3(const uint8* src, // Transpose UV 8x8. 64 bit. #if defined(HAS_TRANSPOSEUVWX8_SSE2) -void TransposeUVWx8_SSE2(const uint8* src, +void TransposeUVWx8_SSE2(const uint8_t* src, int src_stride, - uint8* dst_a, + uint8_t* dst_a, int dst_stride_a, - uint8* dst_b, + uint8_t* dst_b, int dst_stride_b, int width) { asm volatile( diff --git a/files/source/rotate_mmi.cc b/files/source/rotate_mmi.cc new file mode 100644 index 00000000..f8de6083 --- /dev/null +++ b/files/source/rotate_mmi.cc @@ -0,0 +1,291 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for Mips MMI. +#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) + +void TransposeWx8_MMI(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13; + uint8_t* src_tmp = nullptr; + + __asm__ volatile( + "1: \n\t" + "ldc1 %[tmp12], 0x00(%[src]) \n\t" + "dadd %[src_tmp], %[src], %[src_stride] \n\t" + "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" + + /* tmp0 = (00 10 01 11 02 12 03 13) */ + "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" + /* tmp1 = (04 14 05 15 06 16 07 17) */ + "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" + + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" + + /* tmp2 = (20 30 21 31 22 32 23 33) */ + "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" + /* tmp3 = (24 34 25 35 26 36 27 37) */ + "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" + + /* tmp4 = (00 10 20 30 01 11 21 31) */ + "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t" + /* tmp5 = (02 12 22 32 03 13 23 33) */ + "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t" + /* tmp6 = (04 14 24 34 05 15 25 35) */ + "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t" + /* tmp7 = (06 16 26 36 07 17 27 37) */ + "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t" + + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" + + /* tmp0 = (40 50 41 51 42 52 43 53) */ + "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" + /* tmp1 = (44 54 45 55 46 56 47 57) */ + "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" + + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" + + /* tmp2 = (60 70 61 71 62 72 63 73) */ + "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" + /* tmp3 = (64 74 65 75 66 76 67 77) */ + "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" + + /* tmp8 = (40 50 60 70 41 51 61 71) */ + "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t" + /* tmp9 = (42 52 62 72 43 53 63 73) */ + "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t" + /* tmp10 = (44 54 64 74 45 55 65 75) */ + "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t" + /* tmp11 = (46 56 66 76 47 57 67 77) */ + "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t" + + /* tmp0 = (00 10 20 30 40 50 60 70) */ + "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t" + /* tmp1 = (01 11 21 31 41 51 61 71) */ + "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t" + "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" + "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" + "dadd %[dst], %[dst], %[dst_stride] \n\t" + "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" + "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" + + /* tmp0 = (02 12 22 32 42 52 62 72) */ + "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t" + /* tmp1 = (03 13 23 33 43 53 63 73) */ + "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t" + "dadd %[dst], %[dst], %[dst_stride] \n\t" + "gssdlc1 
%[tmp0], 0x07(%[dst]) \n\t" + "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" + "dadd %[dst], %[dst], %[dst_stride] \n\t" + "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" + "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" + + /* tmp0 = (04 14 24 34 44 54 64 74) */ + "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t" + /* tmp1 = (05 15 25 35 45 55 65 75) */ + "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t" + "dadd %[dst], %[dst], %[dst_stride] \n\t" + "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" + "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" + "dadd %[dst], %[dst], %[dst_stride] \n\t" + "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" + "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" + + /* tmp0 = (06 16 26 36 46 56 66 76) */ + "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t" + /* tmp1 = (07 17 27 37 47 57 67 77) */ + "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t" + "dadd %[dst], %[dst], %[dst_stride] \n\t" + "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" + "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" + "dadd %[dst], %[dst], %[dst_stride] \n\t" + "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" + "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" + + "dadd %[dst], %[dst], %[dst_stride] \n\t" + "daddi %[src], %[src], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + + : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), + [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5), + [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8), + [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11), + [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst] "+&r"(dst), + [src_tmp] "+&r"(src_tmp) + : [src] "r"(src), [width] "r"(width), [src_stride] "r"(src_stride), + [dst_stride] "r"(dst_stride) + : "memory"); +} + +void TransposeUVWx8_MMI(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13; + uint8_t* src_tmp = nullptr; + + __asm__ volatile( + "1: \n\t" + /* tmp12 = (u00 v00 u01 v01 u02 v02 u03 v03) */ + "ldc1 %[tmp12], 0x00(%[src]) \n\t" + "dadd %[src_tmp], %[src], %[src_stride] \n\t" + /* tmp13 = (u10 v10 u11 v11 u12 v12 u13 v13) */ + "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" + + /* tmp0 = (u00 u10 v00 v10 u01 u11 v01 v11) */ + "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" + /* tmp1 = (u02 u12 v02 v12 u03 u13 v03 v13) */ + "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" + + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + /* tmp12 = (u20 v20 u21 v21 u22 v22 u23 v23) */ + "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + /* tmp13 = (u30 v30 u31 v31 u32 v32 u33 v33) */ + "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" + + /* tmp2 = (u20 u30 v20 v30 u21 u31 v21 v31) */ + "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" + /* tmp3 = (u22 u32 v22 v32 u23 u33 v23 v33) */ + "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" + + /* tmp4 = (u00 u10 u20 u30 v00 v10 v20 v30) */ + "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t" + /* tmp5 = (u01 u11 u21 u31 v01 v11 v21 v31) */ + "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t" + /* tmp6 = (u02 u12 u22 u32 v02 v12 v22 v32) */ + "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t" + /* tmp7 = (u03 u13 u23 u33 v03 v13 v23 v33) */ + "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t" + + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + /* tmp12 = (u40 v40 u41 v41 u42 v42 u43 v43) */ + "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" + /* tmp13 = (u50 v50 u51 v51 u52 v52 u53 v53) */ + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" + + /* tmp0 
= (u40 u50 v40 v50 u41 u51 v41 v51) */ + "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" + /* tmp1 = (u42 u52 v42 v52 u43 u53 v43 v53) */ + "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" + + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + /* tmp12 = (u60 v60 u61 v61 u62 v62 u63 v63) */ + "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" + /* tmp13 = (u70 v70 u71 v71 u72 v72 u73 v73) */ + "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" + "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" + + /* tmp2 = (u60 u70 v60 v70 u61 u71 v61 v71) */ + "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" + /* tmp3 = (u62 u72 v62 v72 u63 u73 v63 v73) */ + "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" + + /* tmp8 = (u40 u50 u60 u70 v40 v50 v60 v70) */ + "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t" + /* tmp9 = (u41 u51 u61 u71 v41 v51 v61 v71) */ + "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t" + /* tmp10 = (u42 u52 u62 u72 v42 v52 v62 v72) */ + "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t" + /* tmp11 = (u43 u53 u63 u73 v43 v53 v63 v73) */ + "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t" + + /* tmp0 = (u00 u10 u20 u30 u40 u50 u60 u70) */ + "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t" + /* tmp1 = (v00 v10 v20 v30 v40 v50 v60 v70) */ + "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t" + "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" + "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" + "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" + "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" + + /* tmp0 = (u01 u11 u21 u31 u41 u51 u61 u71) */ + "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t" + /* tmp1 = (v01 v11 v21 v31 v41 v51 v61 v71) */ + "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t" + "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" + "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" + "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" + "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" + "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" + "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" + + /* tmp0 = (u02 u12 u22 u32 u42 u52 u62 u72) */ + "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t" + /* tmp1 = (v02 v12 v22 v32 v42 v52 v62 v72) */ + "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t" + "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" + "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" + "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" + "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" + "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" + "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" + + /* tmp0 = (u03 u13 u23 u33 u43 u53 u63 u73) */ + "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t" + /* tmp1 = (v03 v13 v23 v33 v43 v53 v63 v73) */ + "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t" + "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" + "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" + "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" + "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" + "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" + "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" + + "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" + "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" + "daddiu %[src], %[src], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), + [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5), + [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8), + [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11), + [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst_a] "+&r"(dst_a), + [dst_b] "+&r"(dst_b), [src_tmp] "+&r"(src_tmp) + : [src] "r"(src), [width] "r"(width), [dst_stride_a] "r"(dst_stride_a), + [dst_stride_b] "r"(dst_stride_b), [src_stride] "r"(src_stride) + : "memory"); +} + +#endif // !defined(LIBYUV_DISABLE_MMI) && 
defined(_MIPS_ARCH_LOONGSON3A) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/files/source/rotate_msa.cc b/files/source/rotate_msa.cc index 8907765a..99bdca65 100644 --- a/files/source/rotate_msa.cc +++ b/files/source/rotate_msa.cc @@ -51,9 +51,9 @@ extern "C" { out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \ } -void TransposeWx16_C(const uint8* src, +void TransposeWx16_C(const uint8_t* src, int src_stride, - uint8* dst, + uint8_t* dst, int dst_stride, int width) { TransposeWx8_C(src, src_stride, dst, dst_stride, width); @@ -61,11 +61,11 @@ void TransposeWx16_C(const uint8* src, width); } -void TransposeUVWx16_C(const uint8* src, +void TransposeUVWx16_C(const uint8_t* src, int src_stride, - uint8* dst_a, + uint8_t* dst_a, int dst_stride_a, - uint8* dst_b, + uint8_t* dst_b, int dst_stride_b, int width) { TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, @@ -74,13 +74,13 @@ void TransposeUVWx16_C(const uint8* src, dst_stride_a, (dst_b + 8), dst_stride_b, width); } -void TransposeWx16_MSA(const uint8* src, +void TransposeWx16_MSA(const uint8_t* src, int src_stride, - uint8* dst, + uint8_t* dst, int dst_stride, int width) { int x; - const uint8* s; + const uint8_t* s; v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; @@ -153,15 +153,15 @@ void TransposeWx16_MSA(const uint8* src, } } -void TransposeUVWx16_MSA(const uint8* src, +void TransposeUVWx16_MSA(const uint8_t* src, int src_stride, - uint8* dst_a, + uint8_t* dst_a, int dst_stride_a, - uint8* dst_b, + uint8_t* dst_b, int dst_stride_b, int width) { int x; - const uint8* s; + const uint8_t* s; v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; diff --git a/files/source/rotate_neon.cc b/files/source/rotate_neon.cc index ef5c2356..fdc0dd47 100644 --- a/files/source/rotate_neon.cc +++ b/files/source/rotate_neon.cc @@ -21,40 +21,32 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ !defined(__aarch64__) -static uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, - 2, 6, 10, 14, 3, 7, 11, 15}; +static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; -void TransposeWx8_NEON(const uint8* src, +void TransposeWx8_NEON(const uint8_t* src, int src_stride, - uint8* dst, + uint8_t* dst, int dst_stride, int width) { - const uint8* src_temp; - asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %5, #8 \n" - - // handle 8x8 blocks. this should be the majority of the plane - "1: \n" + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %5, #8 \n" + + // handle 8x8 blocks. 
this should be the majority of the plane + "1: \n" "mov %0, %1 \n" - MEMACCESS(0) "vld1.8 {d0}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d1}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d2}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d3}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d4}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d5}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d6}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d7}, [%0] \n" "vtrn.8 d1, d0 \n" @@ -79,21 +71,13 @@ void TransposeWx8_NEON(const uint8* src, "mov %0, %3 \n" - MEMACCESS(0) "vst1.8 {d1}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d0}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d3}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d2}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d5}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d4}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d7}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d6}, [%0] \n" "add %1, #8 \n" // src += 8 @@ -101,183 +85,138 @@ void TransposeWx8_NEON(const uint8* src, "subs %5, #8 \n" // w -= 8 "bge 1b \n" - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %5, #8 \n" - "beq 4f \n" - - // some residual, so between 1 and 7 lines left to transpose - "cmp %5, #2 \n" - "blt 3f \n" - - "cmp %5, #4 \n" - "blt 2f \n" - - // 4x8 block - "mov %0, %1 \n" - MEMACCESS(0) - "vld1.32 {d0[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d0[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d1[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d1[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d2[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d2[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d3[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d3[1]}, [%0] \n" - - "mov %0, %3 \n" - - MEMACCESS(6) - "vld1.8 {q3}, [%6] \n" - - "vtbl.8 d4, {d0, d1}, d6 \n" - "vtbl.8 d5, {d0, d1}, d7 \n" - "vtbl.8 d0, {d2, d3}, d6 \n" - "vtbl.8 d1, {d2, d3}, d7 \n" - - // TODO(frkoenig): Rework shuffle above to - // write out with 4 instead of 8 writes. 
- MEMACCESS(0) - "vst1.32 {d4[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d4[1]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d5[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d5[1]}, [%0] \n" - - "add %0, %3, #4 \n" - MEMACCESS(0) - "vst1.32 {d0[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d0[1]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d1[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d1[1]}, [%0] \n" - - "add %1, #4 \n" // src += 4 - "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride - "subs %5, #4 \n" // w -= 4 - "beq 4f \n" - - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %5, #2 \n" - "blt 3f \n" - - // 2x8 block - "2: \n" - "mov %0, %1 \n" - MEMACCESS(0) - "vld1.16 {d0[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d1[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d0[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d1[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d0[2]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d1[2]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d0[3]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d1[3]}, [%0] \n" - - "vtrn.8 d0, d1 \n" - - "mov %0, %3 \n" - - MEMACCESS(0) - "vst1.64 {d0}, [%0], %4 \n" - MEMACCESS(0) - "vst1.64 {d1}, [%0] \n" - - "add %1, #2 \n" // src += 2 - "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride - "subs %5, #2 \n" // w -= 2 - "beq 4f \n" - - // 1x8 block - "3: \n" - MEMACCESS(1) - "vld1.8 {d0[0]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[1]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[2]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[3]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[4]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[5]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[6]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[7]}, [%1] \n" - - MEMACCESS(3) - "vst1.64 {d0}, [%3] \n" - - "4: \n" - - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(src_stride), // %2 - "+r"(dst), // %3 - "+r"(dst_stride), // %4 - "+r"(width) // %5 - : "r"(&kVTbl4x4Transpose) // %6 - : "memory", "cc", "q0", "q1", "q2", "q3" - ); + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %5, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %5, #2 \n" + "blt 3f \n" + + "cmp %5, #4 \n" + "blt 2f \n" + + // 4x8 block + "mov %0, %1 \n" + "vld1.32 {d0[0]}, [%0], %2 \n" + "vld1.32 {d0[1]}, [%0], %2 \n" + "vld1.32 {d1[0]}, [%0], %2 \n" + "vld1.32 {d1[1]}, [%0], %2 \n" + "vld1.32 {d2[0]}, [%0], %2 \n" + "vld1.32 {d2[1]}, [%0], %2 \n" + "vld1.32 {d3[0]}, [%0], %2 \n" + "vld1.32 {d3[1]}, [%0] \n" + + "mov %0, %3 \n" + + "vld1.8 {q3}, [%6] \n" + + "vtbl.8 d4, {d0, d1}, d6 \n" + "vtbl.8 d5, {d0, d1}, d7 \n" + "vtbl.8 d0, {d2, d3}, d6 \n" + "vtbl.8 d1, {d2, d3}, d7 \n" + + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. 
+ "vst1.32 {d4[0]}, [%0], %4 \n" + "vst1.32 {d4[1]}, [%0], %4 \n" + "vst1.32 {d5[0]}, [%0], %4 \n" + "vst1.32 {d5[1]}, [%0] \n" + + "add %0, %3, #4 \n" + "vst1.32 {d0[0]}, [%0], %4 \n" + "vst1.32 {d0[1]}, [%0], %4 \n" + "vst1.32 {d1[0]}, [%0], %4 \n" + "vst1.32 {d1[1]}, [%0] \n" + + "add %1, #4 \n" // src += 4 + "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride + "subs %5, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %5, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "vld1.16 {d0[0]}, [%0], %2 \n" + "vld1.16 {d1[0]}, [%0], %2 \n" + "vld1.16 {d0[1]}, [%0], %2 \n" + "vld1.16 {d1[1]}, [%0], %2 \n" + "vld1.16 {d0[2]}, [%0], %2 \n" + "vld1.16 {d1[2]}, [%0], %2 \n" + "vld1.16 {d0[3]}, [%0], %2 \n" + "vld1.16 {d1[3]}, [%0] \n" + + "vtrn.8 d0, d1 \n" + + "mov %0, %3 \n" + + "vst1.64 {d0}, [%0], %4 \n" + "vst1.64 {d1}, [%0] \n" + + "add %1, #2 \n" // src += 2 + "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride + "subs %5, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + "vld1.8 {d0[0]}, [%1], %2 \n" + "vld1.8 {d0[1]}, [%1], %2 \n" + "vld1.8 {d0[2]}, [%1], %2 \n" + "vld1.8 {d0[3]}, [%1], %2 \n" + "vld1.8 {d0[4]}, [%1], %2 \n" + "vld1.8 {d0[5]}, [%1], %2 \n" + "vld1.8 {d0[6]}, [%1], %2 \n" + "vld1.8 {d0[7]}, [%1] \n" + + "vst1.64 {d0}, [%3] \n" + + "4: \n" + + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst), // %3 + "+r"(dst_stride), // %4 + "+r"(width) // %5 + : "r"(&kVTbl4x4Transpose) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3"); } -static uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11, - 4, 12, 5, 13, 6, 14, 7, 15}; +static const uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11, + 4, 12, 5, 13, 6, 14, 7, 15}; -void TransposeUVWx8_NEON(const uint8* src, +void TransposeUVWx8_NEON(const uint8_t* src, int src_stride, - uint8* dst_a, + uint8_t* dst_a, int dst_stride_a, - uint8* dst_b, + uint8_t* dst_b, int dst_stride_b, int width) { - const uint8* src_temp; - asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %7, #8 \n" - - // handle 8x8 blocks. this should be the majority of the plane - "1: \n" + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %7, #8 \n" + + // handle 8x8 blocks. 
this should be the majority of the plane + "1: \n" "mov %0, %1 \n" - MEMACCESS(0) "vld2.8 {d0, d1}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d2, d3}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d4, d5}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d6, d7}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d16, d17}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d18, d19}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d20, d21}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d22, d23}, [%0] \n" "vtrn.8 q1, q0 \n" @@ -306,40 +245,24 @@ void TransposeUVWx8_NEON(const uint8* src, "mov %0, %3 \n" - MEMACCESS(0) "vst1.8 {d2}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d0}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d6}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d4}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d18}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d16}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d22}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d20}, [%0] \n" "mov %0, %5 \n" - MEMACCESS(0) "vst1.8 {d3}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d1}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d7}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d5}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d19}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d17}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d23}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d21}, [%0] \n" "add %1, #8*2 \n" // src += 8*2 @@ -348,187 +271,142 @@ void TransposeUVWx8_NEON(const uint8* src, "subs %7, #8 \n" // w -= 8 "bge 1b \n" - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %7, #8 \n" - "beq 4f \n" - - // some residual, so between 1 and 7 lines left to transpose - "cmp %7, #2 \n" - "blt 3f \n" - - "cmp %7, #4 \n" - "blt 2f \n" - - // TODO(frkoenig): Clean this up - // 4x8 block - "mov %0, %1 \n" - MEMACCESS(0) - "vld1.64 {d0}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d1}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d2}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d3}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d4}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d5}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d6}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d7}, [%0] \n" - - MEMACCESS(8) - "vld1.8 {q15}, [%8] \n" - - "vtrn.8 q0, q1 \n" - "vtrn.8 q2, q3 \n" - - "vtbl.8 d16, {d0, d1}, d30 \n" - "vtbl.8 d17, {d0, d1}, d31 \n" - "vtbl.8 d18, {d2, d3}, d30 \n" - "vtbl.8 d19, {d2, d3}, d31 \n" - "vtbl.8 d20, {d4, d5}, d30 \n" - "vtbl.8 d21, {d4, d5}, d31 \n" - "vtbl.8 d22, {d6, d7}, d30 \n" - "vtbl.8 d23, {d6, d7}, d31 \n" - - "mov %0, %3 \n" - - MEMACCESS(0) - "vst1.32 {d16[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d16[1]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d17[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d17[1]}, [%0], %4 \n" - - "add %0, %3, #4 \n" - MEMACCESS(0) - "vst1.32 {d20[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d20[1]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d21[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d21[1]}, [%0] \n" - - "mov %0, %5 \n" - - MEMACCESS(0) - "vst1.32 {d18[0]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d18[1]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d19[0]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d19[1]}, [%0], %6 \n" - - "add %0, %5, #4 \n" - MEMACCESS(0) - "vst1.32 {d22[0]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d22[1]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d23[0]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d23[1]}, [%0] \n" - - "add %1, #4*2 \n" // src += 4 * 2 - "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a - "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b - "subs %7, #4 \n" // w -= 4 - "beq 4f \n" - - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %7, #2 \n" - "blt 3f 
\n" - - // 2x8 block - "2: \n" - "mov %0, %1 \n" - MEMACCESS(0) - "vld2.16 {d0[0], d2[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d1[0], d3[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d0[1], d2[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d1[1], d3[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d0[2], d2[2]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d1[2], d3[2]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d0[3], d2[3]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d1[3], d3[3]}, [%0] \n" - - "vtrn.8 d0, d1 \n" - "vtrn.8 d2, d3 \n" - - "mov %0, %3 \n" - - MEMACCESS(0) - "vst1.64 {d0}, [%0], %4 \n" - MEMACCESS(0) - "vst1.64 {d2}, [%0] \n" - - "mov %0, %5 \n" - - MEMACCESS(0) - "vst1.64 {d1}, [%0], %6 \n" - MEMACCESS(0) - "vst1.64 {d3}, [%0] \n" - - "add %1, #2*2 \n" // src += 2 * 2 - "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a - "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b - "subs %7, #2 \n" // w -= 2 - "beq 4f \n" - - // 1x8 block - "3: \n" - MEMACCESS(1) - "vld2.8 {d0[0], d1[0]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[1], d1[1]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[2], d1[2]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[3], d1[3]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[4], d1[4]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[5], d1[5]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[6], d1[6]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[7], d1[7]}, [%1] \n" - - MEMACCESS(3) - "vst1.64 {d0}, [%3] \n" - MEMACCESS(5) - "vst1.64 {d1}, [%5] \n" - - "4: \n" - - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(src_stride), // %2 - "+r"(dst_a), // %3 - "+r"(dst_stride_a), // %4 - "+r"(dst_b), // %5 - "+r"(dst_stride_b), // %6 - "+r"(width) // %7 - : "r"(&kVTbl4x4TransposeDi) // %8 - : "memory", "cc", - "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" - ); + // add 8 back to counter. if the result is 0 there are + // no residuals. 
+ "adds %7, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %7, #2 \n" + "blt 3f \n" + + "cmp %7, #4 \n" + "blt 2f \n" + + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + "vld1.64 {d0}, [%0], %2 \n" + "vld1.64 {d1}, [%0], %2 \n" + "vld1.64 {d2}, [%0], %2 \n" + "vld1.64 {d3}, [%0], %2 \n" + "vld1.64 {d4}, [%0], %2 \n" + "vld1.64 {d5}, [%0], %2 \n" + "vld1.64 {d6}, [%0], %2 \n" + "vld1.64 {d7}, [%0] \n" + + "vld1.8 {q15}, [%8] \n" + + "vtrn.8 q0, q1 \n" + "vtrn.8 q2, q3 \n" + + "vtbl.8 d16, {d0, d1}, d30 \n" + "vtbl.8 d17, {d0, d1}, d31 \n" + "vtbl.8 d18, {d2, d3}, d30 \n" + "vtbl.8 d19, {d2, d3}, d31 \n" + "vtbl.8 d20, {d4, d5}, d30 \n" + "vtbl.8 d21, {d4, d5}, d31 \n" + "vtbl.8 d22, {d6, d7}, d30 \n" + "vtbl.8 d23, {d6, d7}, d31 \n" + + "mov %0, %3 \n" + + "vst1.32 {d16[0]}, [%0], %4 \n" + "vst1.32 {d16[1]}, [%0], %4 \n" + "vst1.32 {d17[0]}, [%0], %4 \n" + "vst1.32 {d17[1]}, [%0], %4 \n" + + "add %0, %3, #4 \n" + "vst1.32 {d20[0]}, [%0], %4 \n" + "vst1.32 {d20[1]}, [%0], %4 \n" + "vst1.32 {d21[0]}, [%0], %4 \n" + "vst1.32 {d21[1]}, [%0] \n" + + "mov %0, %5 \n" + + "vst1.32 {d18[0]}, [%0], %6 \n" + "vst1.32 {d18[1]}, [%0], %6 \n" + "vst1.32 {d19[0]}, [%0], %6 \n" + "vst1.32 {d19[1]}, [%0], %6 \n" + + "add %0, %5, #4 \n" + "vst1.32 {d22[0]}, [%0], %6 \n" + "vst1.32 {d22[1]}, [%0], %6 \n" + "vst1.32 {d23[0]}, [%0], %6 \n" + "vst1.32 {d23[1]}, [%0] \n" + + "add %1, #4*2 \n" // src += 4 * 2 + "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * + // dst_stride_a + "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * + // dst_stride_b + "subs %7, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %7, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "vld2.16 {d0[0], d2[0]}, [%0], %2 \n" + "vld2.16 {d1[0], d3[0]}, [%0], %2 \n" + "vld2.16 {d0[1], d2[1]}, [%0], %2 \n" + "vld2.16 {d1[1], d3[1]}, [%0], %2 \n" + "vld2.16 {d0[2], d2[2]}, [%0], %2 \n" + "vld2.16 {d1[2], d3[2]}, [%0], %2 \n" + "vld2.16 {d0[3], d2[3]}, [%0], %2 \n" + "vld2.16 {d1[3], d3[3]}, [%0] \n" + + "vtrn.8 d0, d1 \n" + "vtrn.8 d2, d3 \n" + + "mov %0, %3 \n" + + "vst1.64 {d0}, [%0], %4 \n" + "vst1.64 {d2}, [%0] \n" + + "mov %0, %5 \n" + + "vst1.64 {d1}, [%0], %6 \n" + "vst1.64 {d3}, [%0] \n" + + "add %1, #2*2 \n" // src += 2 * 2 + "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * + // dst_stride_a + "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * + // dst_stride_b + "subs %7, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + "vld2.8 {d0[0], d1[0]}, [%1], %2 \n" + "vld2.8 {d0[1], d1[1]}, [%1], %2 \n" + "vld2.8 {d0[2], d1[2]}, [%1], %2 \n" + "vld2.8 {d0[3], d1[3]}, [%1], %2 \n" + "vld2.8 {d0[4], d1[4]}, [%1], %2 \n" + "vld2.8 {d0[5], d1[5]}, [%1], %2 \n" + "vld2.8 {d0[6], d1[6]}, [%1], %2 \n" + "vld2.8 {d0[7], d1[7]}, [%1] \n" + + "vst1.64 {d0}, [%3] \n" + "vst1.64 {d1}, [%5] \n" + + "4: \n" + + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst_a), // %3 + "+r"(dst_stride_a), // %4 + "+r"(dst_b), // %5 + "+r"(dst_stride_b), // %6 + "+r"(width) // %7 + : "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); } #endif // defined(__ARM_NEON__) && !defined(__aarch64__) diff --git a/files/source/rotate_neon64.cc b/files/source/rotate_neon64.cc index f52b0ed0..f469baac 100644 --- a/files/source/rotate_neon64.cc +++ b/files/source/rotate_neon64.cc @@ -21,41 +21,32 @@ extern "C" { // This module is for GCC Neon armv8 64 bit. 
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) -static uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, - 2, 6, 10, 14, 3, 7, 11, 15}; +static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; -void TransposeWx8_NEON(const uint8* src, +void TransposeWx8_NEON(const uint8_t* src, int src_stride, - uint8* dst, + uint8_t* dst, int dst_stride, int width) { - const uint8* src_temp; - int64 width64 = (int64)width; // Work around clang 3.4 warning. - asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %3, %3, #8 \n" - - // handle 8x8 blocks. this should be the majority of the plane - "1: \n" + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %w3, %w3, #8 \n" + + // handle 8x8 blocks. this should be the majority of the plane + "1: \n" "mov %0, %1 \n" - MEMACCESS(0) "ld1 {v0.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v1.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v2.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v3.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v4.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v5.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v6.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v7.8b}, [%0] \n" "trn2 v16.8b, v0.8b, v1.8b \n" @@ -87,459 +78,345 @@ void TransposeWx8_NEON(const uint8* src, "mov %0, %2 \n" - MEMACCESS(0) "st1 {v17.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v16.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v19.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v18.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v21.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v20.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v23.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v22.8b}, [%0] \n" "add %1, %1, #8 \n" // src += 8 "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride - "subs %3, %3, #8 \n" // w -= 8 + "subs %w3, %w3, #8 \n" // w -= 8 "b.ge 1b \n" - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %3, %3, #8 \n" - "b.eq 4f \n" - - // some residual, so between 1 and 7 lines left to transpose - "cmp %3, #2 \n" - "b.lt 3f \n" - - "cmp %3, #4 \n" - "b.lt 2f \n" - - // 4x8 block - "mov %0, %1 \n" - MEMACCESS(0) - "ld1 {v0.s}[0], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.s}[1], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.s}[2], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.s}[3], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.s}[0], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.s}[1], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.s}[2], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.s}[3], [%0] \n" - - "mov %0, %2 \n" - - MEMACCESS(4) - "ld1 {v2.16b}, [%4] \n" - - "tbl v3.16b, {v0.16b}, v2.16b \n" - "tbl v0.16b, {v1.16b}, v2.16b \n" - - // TODO(frkoenig): Rework shuffle above to - // write out with 4 instead of 8 writes. 
- MEMACCESS(0) - "st1 {v3.s}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v3.s}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v3.s}[2], [%0], %6 \n" - MEMACCESS(0) - "st1 {v3.s}[3], [%0] \n" - - "add %0, %2, #4 \n" - MEMACCESS(0) - "st1 {v0.s}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v0.s}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v0.s}[2], [%0], %6 \n" - MEMACCESS(0) - "st1 {v0.s}[3], [%0] \n" - - "add %1, %1, #4 \n" // src += 4 - "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride - "subs %3, %3, #4 \n" // w -= 4 - "b.eq 4f \n" - - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %3, #2 \n" - "b.lt 3f \n" - - // 2x8 block - "2: \n" - "mov %0, %1 \n" - MEMACCESS(0) - "ld1 {v0.h}[0], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.h}[0], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.h}[1], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.h}[1], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.h}[2], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.h}[2], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.h}[3], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.h}[3], [%0] \n" - - "trn2 v2.8b, v0.8b, v1.8b \n" - "trn1 v3.8b, v0.8b, v1.8b \n" - - "mov %0, %2 \n" - - MEMACCESS(0) - "st1 {v3.8b}, [%0], %6 \n" - MEMACCESS(0) - "st1 {v2.8b}, [%0] \n" - - "add %1, %1, #2 \n" // src += 2 - "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride - "subs %3, %3, #2 \n" // w -= 2 - "b.eq 4f \n" - - // 1x8 block - "3: \n" - MEMACCESS(1) - "ld1 {v0.b}[0], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[1], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[2], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[3], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[4], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[5], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[6], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[7], [%1] \n" - - MEMACCESS(2) - "st1 {v0.8b}, [%2] \n" - - "4: \n" - - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst), // %2 - "+r"(width64) // %3 - : "r"(&kVTbl4x4Transpose), // %4 - "r"(static_cast<ptrdiff_t>(src_stride)), // %5 - "r"(static_cast<ptrdiff_t>(dst_stride)) // %6 - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23" - ); + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %w3, %w3, #8 \n" + "b.eq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %w3, #2 \n" + "b.lt 3f \n" + + "cmp %w3, #4 \n" + "b.lt 2f \n" + + // 4x8 block + "mov %0, %1 \n" + "ld1 {v0.s}[0], [%0], %5 \n" + "ld1 {v0.s}[1], [%0], %5 \n" + "ld1 {v0.s}[2], [%0], %5 \n" + "ld1 {v0.s}[3], [%0], %5 \n" + "ld1 {v1.s}[0], [%0], %5 \n" + "ld1 {v1.s}[1], [%0], %5 \n" + "ld1 {v1.s}[2], [%0], %5 \n" + "ld1 {v1.s}[3], [%0] \n" + + "mov %0, %2 \n" + + "ld1 {v2.16b}, [%4] \n" + + "tbl v3.16b, {v0.16b}, v2.16b \n" + "tbl v0.16b, {v1.16b}, v2.16b \n" + + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. 
+ "st1 {v3.s}[0], [%0], %6 \n" + "st1 {v3.s}[1], [%0], %6 \n" + "st1 {v3.s}[2], [%0], %6 \n" + "st1 {v3.s}[3], [%0] \n" + + "add %0, %2, #4 \n" + "st1 {v0.s}[0], [%0], %6 \n" + "st1 {v0.s}[1], [%0], %6 \n" + "st1 {v0.s}[2], [%0], %6 \n" + "st1 {v0.s}[3], [%0] \n" + + "add %1, %1, #4 \n" // src += 4 + "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride + "subs %w3, %w3, #4 \n" // w -= 4 + "b.eq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %w3, #2 \n" + "b.lt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "ld1 {v0.h}[0], [%0], %5 \n" + "ld1 {v1.h}[0], [%0], %5 \n" + "ld1 {v0.h}[1], [%0], %5 \n" + "ld1 {v1.h}[1], [%0], %5 \n" + "ld1 {v0.h}[2], [%0], %5 \n" + "ld1 {v1.h}[2], [%0], %5 \n" + "ld1 {v0.h}[3], [%0], %5 \n" + "ld1 {v1.h}[3], [%0] \n" + + "trn2 v2.8b, v0.8b, v1.8b \n" + "trn1 v3.8b, v0.8b, v1.8b \n" + + "mov %0, %2 \n" + + "st1 {v3.8b}, [%0], %6 \n" + "st1 {v2.8b}, [%0] \n" + + "add %1, %1, #2 \n" // src += 2 + "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride + "subs %w3, %w3, #2 \n" // w -= 2 + "b.eq 4f \n" + + // 1x8 block + "3: \n" + "ld1 {v0.b}[0], [%1], %5 \n" + "ld1 {v0.b}[1], [%1], %5 \n" + "ld1 {v0.b}[2], [%1], %5 \n" + "ld1 {v0.b}[3], [%1], %5 \n" + "ld1 {v0.b}[4], [%1], %5 \n" + "ld1 {v0.b}[5], [%1], %5 \n" + "ld1 {v0.b}[6], [%1], %5 \n" + "ld1 {v0.b}[7], [%1] \n" + + "st1 {v0.8b}, [%2] \n" + + "4: \n" + + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst), // %2 + "+r"(width) // %3 + : "r"(&kVTbl4x4Transpose), // %4 + "r"(static_cast<ptrdiff_t>(src_stride)), // %5 + "r"(static_cast<ptrdiff_t>(dst_stride)) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23"); } -static uint8 kVTbl4x4TransposeDi[32] = { +static const uint8_t kVTbl4x4TransposeDi[32] = { 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54, 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55}; -void TransposeUVWx8_NEON(const uint8* src, +void TransposeUVWx8_NEON(const uint8_t* src, int src_stride, - uint8* dst_a, + uint8_t* dst_a, int dst_stride_a, - uint8* dst_b, + uint8_t* dst_b, int dst_stride_b, int width) { - const uint8* src_temp; - int64 width64 = (int64)width; // Work around clang 3.4 warning. - asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %4, %4, #8 \n" - - // handle 8x8 blocks. 
this should be the majority of the plane - "1: \n" - "mov %0, %1 \n" - - MEMACCESS(0) - "ld1 {v0.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v2.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v3.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v4.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v5.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v6.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v7.16b}, [%0] \n" - - "trn1 v16.16b, v0.16b, v1.16b \n" - "trn2 v17.16b, v0.16b, v1.16b \n" - "trn1 v18.16b, v2.16b, v3.16b \n" - "trn2 v19.16b, v2.16b, v3.16b \n" - "trn1 v20.16b, v4.16b, v5.16b \n" - "trn2 v21.16b, v4.16b, v5.16b \n" - "trn1 v22.16b, v6.16b, v7.16b \n" - "trn2 v23.16b, v6.16b, v7.16b \n" - - "trn1 v0.8h, v16.8h, v18.8h \n" - "trn2 v1.8h, v16.8h, v18.8h \n" - "trn1 v2.8h, v20.8h, v22.8h \n" - "trn2 v3.8h, v20.8h, v22.8h \n" - "trn1 v4.8h, v17.8h, v19.8h \n" - "trn2 v5.8h, v17.8h, v19.8h \n" - "trn1 v6.8h, v21.8h, v23.8h \n" - "trn2 v7.8h, v21.8h, v23.8h \n" - - "trn1 v16.4s, v0.4s, v2.4s \n" - "trn2 v17.4s, v0.4s, v2.4s \n" - "trn1 v18.4s, v1.4s, v3.4s \n" - "trn2 v19.4s, v1.4s, v3.4s \n" - "trn1 v20.4s, v4.4s, v6.4s \n" - "trn2 v21.4s, v4.4s, v6.4s \n" - "trn1 v22.4s, v5.4s, v7.4s \n" - "trn2 v23.4s, v5.4s, v7.4s \n" - - "mov %0, %2 \n" - - MEMACCESS(0) - "st1 {v16.d}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v18.d}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v17.d}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v19.d}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v16.d}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v18.d}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v17.d}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v19.d}[1], [%0] \n" - - "mov %0, %3 \n" - - MEMACCESS(0) - "st1 {v20.d}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v22.d}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v21.d}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v23.d}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v20.d}[1], [%0], %7 \n" - MEMACCESS(0) - "st1 {v22.d}[1], [%0], %7 \n" - MEMACCESS(0) - "st1 {v21.d}[1], [%0], %7 \n" - MEMACCESS(0) - "st1 {v23.d}[1], [%0] \n" - - "add %1, %1, #16 \n" // src += 8*2 - "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a - "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b - "subs %4, %4, #8 \n" // w -= 8 - "b.ge 1b \n" - - // add 8 back to counter. if the result is 0 there are - // no residuals. 
- "adds %4, %4, #8 \n" - "b.eq 4f \n" - - // some residual, so between 1 and 7 lines left to transpose - "cmp %4, #2 \n" - "b.lt 3f \n" - - "cmp %4, #4 \n" - "b.lt 2f \n" - - // TODO(frkoenig): Clean this up - // 4x8 block - "mov %0, %1 \n" - MEMACCESS(0) - "ld1 {v0.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v2.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v3.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v4.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v5.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v6.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v7.8b}, [%0] \n" - - MEMACCESS(8) - "ld1 {v30.16b}, [%8], #16 \n" - "ld1 {v31.16b}, [%8] \n" - - "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" - "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" - "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" - "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" - - "mov %0, %2 \n" - - MEMACCESS(0) - "st1 {v16.s}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v16.s}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v16.s}[2], [%0], %6 \n" - MEMACCESS(0) - "st1 {v16.s}[3], [%0], %6 \n" - - "add %0, %2, #4 \n" - MEMACCESS(0) - "st1 {v18.s}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v18.s}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v18.s}[2], [%0], %6 \n" - MEMACCESS(0) - "st1 {v18.s}[3], [%0] \n" - - "mov %0, %3 \n" - - MEMACCESS(0) - "st1 {v17.s}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v17.s}[1], [%0], %7 \n" - MEMACCESS(0) - "st1 {v17.s}[2], [%0], %7 \n" - MEMACCESS(0) - "st1 {v17.s}[3], [%0], %7 \n" - - "add %0, %3, #4 \n" - MEMACCESS(0) - "st1 {v19.s}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v19.s}[1], [%0], %7 \n" - MEMACCESS(0) - "st1 {v19.s}[2], [%0], %7 \n" - MEMACCESS(0) - "st1 {v19.s}[3], [%0] \n" - - "add %1, %1, #8 \n" // src += 4 * 2 - "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a - "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b - "subs %4, %4, #4 \n" // w -= 4 - "b.eq 4f \n" - - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %4, #2 \n" - "b.lt 3f \n" - - // 2x8 block - "2: \n" - "mov %0, %1 \n" - MEMACCESS(0) - "ld2 {v0.h, v1.h}[0], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v2.h, v3.h}[0], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v0.h, v1.h}[1], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v2.h, v3.h}[1], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v0.h, v1.h}[2], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v2.h, v3.h}[2], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v0.h, v1.h}[3], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v2.h, v3.h}[3], [%0] \n" - - "trn1 v4.8b, v0.8b, v2.8b \n" - "trn2 v5.8b, v0.8b, v2.8b \n" - "trn1 v6.8b, v1.8b, v3.8b \n" - "trn2 v7.8b, v1.8b, v3.8b \n" - - "mov %0, %2 \n" - - MEMACCESS(0) - "st1 {v4.d}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v6.d}[0], [%0] \n" - - "mov %0, %3 \n" - - MEMACCESS(0) - "st1 {v5.d}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v7.d}[0], [%0] \n" - - "add %1, %1, #4 \n" // src += 2 * 2 - "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a - "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b - "subs %4, %4, #2 \n" // w -= 2 - "b.eq 4f \n" - - // 1x8 block - "3: \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[0], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[1], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[2], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[3], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[4], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[5], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[6], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[7], [%1] \n" - - MEMACCESS(2) - "st1 {v0.d}[0], 
[%2] \n" - MEMACCESS(3) - "st1 {v1.d}[0], [%3] \n" - - "4: \n" - - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst_a), // %2 - "+r"(dst_b), // %3 - "+r"(width64) // %4 - : "r"(static_cast<ptrdiff_t>(src_stride)), // %5 - "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6 - "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7 - "r"(&kVTbl4x4TransposeDi) // %8 - : "memory", "cc", - "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v30", "v31" - ); + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %w4, %w4, #8 \n" + + // handle 8x8 blocks. this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" + + "ld1 {v0.16b}, [%0], %5 \n" + "ld1 {v1.16b}, [%0], %5 \n" + "ld1 {v2.16b}, [%0], %5 \n" + "ld1 {v3.16b}, [%0], %5 \n" + "ld1 {v4.16b}, [%0], %5 \n" + "ld1 {v5.16b}, [%0], %5 \n" + "ld1 {v6.16b}, [%0], %5 \n" + "ld1 {v7.16b}, [%0] \n" + + "trn1 v16.16b, v0.16b, v1.16b \n" + "trn2 v17.16b, v0.16b, v1.16b \n" + "trn1 v18.16b, v2.16b, v3.16b \n" + "trn2 v19.16b, v2.16b, v3.16b \n" + "trn1 v20.16b, v4.16b, v5.16b \n" + "trn2 v21.16b, v4.16b, v5.16b \n" + "trn1 v22.16b, v6.16b, v7.16b \n" + "trn2 v23.16b, v6.16b, v7.16b \n" + + "trn1 v0.8h, v16.8h, v18.8h \n" + "trn2 v1.8h, v16.8h, v18.8h \n" + "trn1 v2.8h, v20.8h, v22.8h \n" + "trn2 v3.8h, v20.8h, v22.8h \n" + "trn1 v4.8h, v17.8h, v19.8h \n" + "trn2 v5.8h, v17.8h, v19.8h \n" + "trn1 v6.8h, v21.8h, v23.8h \n" + "trn2 v7.8h, v21.8h, v23.8h \n" + + "trn1 v16.4s, v0.4s, v2.4s \n" + "trn2 v17.4s, v0.4s, v2.4s \n" + "trn1 v18.4s, v1.4s, v3.4s \n" + "trn2 v19.4s, v1.4s, v3.4s \n" + "trn1 v20.4s, v4.4s, v6.4s \n" + "trn2 v21.4s, v4.4s, v6.4s \n" + "trn1 v22.4s, v5.4s, v7.4s \n" + "trn2 v23.4s, v5.4s, v7.4s \n" + + "mov %0, %2 \n" + + "st1 {v16.d}[0], [%0], %6 \n" + "st1 {v18.d}[0], [%0], %6 \n" + "st1 {v17.d}[0], [%0], %6 \n" + "st1 {v19.d}[0], [%0], %6 \n" + "st1 {v16.d}[1], [%0], %6 \n" + "st1 {v18.d}[1], [%0], %6 \n" + "st1 {v17.d}[1], [%0], %6 \n" + "st1 {v19.d}[1], [%0] \n" + + "mov %0, %3 \n" + + "st1 {v20.d}[0], [%0], %7 \n" + "st1 {v22.d}[0], [%0], %7 \n" + "st1 {v21.d}[0], [%0], %7 \n" + "st1 {v23.d}[0], [%0], %7 \n" + "st1 {v20.d}[1], [%0], %7 \n" + "st1 {v22.d}[1], [%0], %7 \n" + "st1 {v21.d}[1], [%0], %7 \n" + "st1 {v23.d}[1], [%0] \n" + + "add %1, %1, #16 \n" // src += 8*2 + "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * + // dst_stride_a + "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * + // dst_stride_b + "subs %w4, %w4, #8 \n" // w -= 8 + "b.ge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. 
+ "adds %w4, %w4, #8 \n" + "b.eq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %w4, #2 \n" + "b.lt 3f \n" + + "cmp %w4, #4 \n" + "b.lt 2f \n" + + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + "ld1 {v0.8b}, [%0], %5 \n" + "ld1 {v1.8b}, [%0], %5 \n" + "ld1 {v2.8b}, [%0], %5 \n" + "ld1 {v3.8b}, [%0], %5 \n" + "ld1 {v4.8b}, [%0], %5 \n" + "ld1 {v5.8b}, [%0], %5 \n" + "ld1 {v6.8b}, [%0], %5 \n" + "ld1 {v7.8b}, [%0] \n" + + "ld1 {v30.16b}, [%8], #16 \n" + "ld1 {v31.16b}, [%8] \n" + + "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" + "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" + "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" + "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" + + "mov %0, %2 \n" + + "st1 {v16.s}[0], [%0], %6 \n" + "st1 {v16.s}[1], [%0], %6 \n" + "st1 {v16.s}[2], [%0], %6 \n" + "st1 {v16.s}[3], [%0], %6 \n" + + "add %0, %2, #4 \n" + "st1 {v18.s}[0], [%0], %6 \n" + "st1 {v18.s}[1], [%0], %6 \n" + "st1 {v18.s}[2], [%0], %6 \n" + "st1 {v18.s}[3], [%0] \n" + + "mov %0, %3 \n" + + "st1 {v17.s}[0], [%0], %7 \n" + "st1 {v17.s}[1], [%0], %7 \n" + "st1 {v17.s}[2], [%0], %7 \n" + "st1 {v17.s}[3], [%0], %7 \n" + + "add %0, %3, #4 \n" + "st1 {v19.s}[0], [%0], %7 \n" + "st1 {v19.s}[1], [%0], %7 \n" + "st1 {v19.s}[2], [%0], %7 \n" + "st1 {v19.s}[3], [%0] \n" + + "add %1, %1, #8 \n" // src += 4 * 2 + "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * + // dst_stride_a + "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * + // dst_stride_b + "subs %w4, %w4, #4 \n" // w -= 4 + "b.eq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %w4, #2 \n" + "b.lt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "ld2 {v0.h, v1.h}[0], [%0], %5 \n" + "ld2 {v2.h, v3.h}[0], [%0], %5 \n" + "ld2 {v0.h, v1.h}[1], [%0], %5 \n" + "ld2 {v2.h, v3.h}[1], [%0], %5 \n" + "ld2 {v0.h, v1.h}[2], [%0], %5 \n" + "ld2 {v2.h, v3.h}[2], [%0], %5 \n" + "ld2 {v0.h, v1.h}[3], [%0], %5 \n" + "ld2 {v2.h, v3.h}[3], [%0] \n" + + "trn1 v4.8b, v0.8b, v2.8b \n" + "trn2 v5.8b, v0.8b, v2.8b \n" + "trn1 v6.8b, v1.8b, v3.8b \n" + "trn2 v7.8b, v1.8b, v3.8b \n" + + "mov %0, %2 \n" + + "st1 {v4.d}[0], [%0], %6 \n" + "st1 {v6.d}[0], [%0] \n" + + "mov %0, %3 \n" + + "st1 {v5.d}[0], [%0], %7 \n" + "st1 {v7.d}[0], [%0] \n" + + "add %1, %1, #4 \n" // src += 2 * 2 + "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * + // dst_stride_a + "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * + // dst_stride_b + "subs %w4, %w4, #2 \n" // w -= 2 + "b.eq 4f \n" + + // 1x8 block + "3: \n" + "ld2 {v0.b, v1.b}[0], [%1], %5 \n" + "ld2 {v0.b, v1.b}[1], [%1], %5 \n" + "ld2 {v0.b, v1.b}[2], [%1], %5 \n" + "ld2 {v0.b, v1.b}[3], [%1], %5 \n" + "ld2 {v0.b, v1.b}[4], [%1], %5 \n" + "ld2 {v0.b, v1.b}[5], [%1], %5 \n" + "ld2 {v0.b, v1.b}[6], [%1], %5 \n" + "ld2 {v0.b, v1.b}[7], [%1] \n" + + "st1 {v0.d}[0], [%2] \n" + "st1 {v1.d}[0], [%3] \n" + + "4: \n" + + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst_a), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "r"(static_cast<ptrdiff_t>(src_stride)), // %5 + "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6 + "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7 + "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31"); } #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) diff --git a/files/source/rotate_win.cc b/files/source/rotate_win.cc index 93a5c28a..e887dd52 100644 --- 
a/files/source/rotate_win.cc +++ b/files/source/rotate_win.cc @@ -17,11 +17,11 @@ extern "C" { #endif // This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) -__declspec(naked) void TransposeWx8_SSSE3(const uint8* src, +__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src, int src_stride, - uint8* dst, + uint8_t* dst, int dst_stride, int width) { __asm { @@ -112,11 +112,11 @@ __declspec(naked) void TransposeWx8_SSSE3(const uint8* src, } } -__declspec(naked) void TransposeUVWx8_SSE2(const uint8* src, +__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src, int src_stride, - uint8* dst_a, + uint8_t* dst_a, int dst_stride_a, - uint8* dst_b, + uint8_t* dst_b, int dst_stride_b, int w) { __asm { @@ -172,7 +172,7 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8* src, movdqa xmm7, xmm5 lea eax, [eax + 8 * edi + 16] neg edi - // Second round of bit swap. + // Second round of bit swap. movdqa xmm5, xmm0 punpcklwd xmm0, xmm2 punpckhwd xmm5, xmm2 @@ -192,8 +192,8 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8* src, punpckhwd xmm6, xmm7 movdqa xmm7, xmm6 - // Third round of bit swap. - // Write to the destination pointer. + // Third round of bit swap. + // Write to the destination pointer. movdqa xmm6, xmm0 punpckldq xmm0, xmm4 punpckhdq xmm6, xmm4 diff --git a/files/source/row_any.cc b/files/source/row_any.cc index 1092a9c0..06ca723a 100644 --- a/files/source/row_any.cc +++ b/files/source/row_any.cc @@ -31,25 +31,25 @@ extern "C" { #define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift)) // Any 4 planes to 1 with yuvconstants -#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ - const uint8* a_buf, uint8* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 5]); \ - memset(temp, 0, 64 * 4); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 192, a_buf + n, r); \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ - yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ - SS(r, DUVSHIFT) * BPP); \ +#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 5]); \ + memset(temp, 0, 64 * 4); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 192, a_buf + n, r); \ + ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ + yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ + SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_I422ALPHATOARGBROW_SSSE3 @@ -67,86 +67,117 @@ ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7) #undef 
ANY41C // Any 3 planes to 1. -#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ - uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 4]); \ - memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ - SS(r, DUVSHIFT) * BPP); \ +#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 4]); \ + memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ + SS(r, DUVSHIFT) * BPP); \ } + +// Merge functions. +#ifdef HAS_MERGERGBROW_SSSE3 +ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15) +#endif +#ifdef HAS_MERGERGBROW_NEON +ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15) +#endif +#ifdef HAS_MERGERGBROW_MMI +ANY31(MergeRGBRow_Any_MMI, MergeRGBRow_MMI, 0, 0, 3, 7) +#endif #ifdef HAS_I422TOYUY2ROW_SSE2 ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15) ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15) #endif +#ifdef HAS_I422TOYUY2ROW_AVX2 +ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31) +ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31) +#endif #ifdef HAS_I422TOYUY2ROW_NEON ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) #endif #ifdef HAS_I422TOYUY2ROW_MSA ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31) #endif +#ifdef HAS_I422TOYUY2ROW_MMI +ANY31(I422ToYUY2Row_Any_MMI, I422ToYUY2Row_MMI, 1, 1, 4, 7) +#endif #ifdef HAS_I422TOUYVYROW_NEON ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) #endif #ifdef HAS_I422TOUYVYROW_MSA ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31) #endif +#ifdef HAS_I422TOUYVYROW_MMI +ANY31(I422ToUYVYRow_Any_MMI, I422ToUYVYRow_MMI, 1, 1, 4, 7) +#endif #ifdef HAS_BLENDPLANEROW_AVX2 ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31) #endif #ifdef HAS_BLENDPLANEROW_SSSE3 ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7) #endif +#ifdef HAS_BLENDPLANEROW_MMI +ANY31(BlendPlaneRow_Any_MMI, BlendPlaneRow_MMI, 0, 0, 1, 7) +#endif #undef ANY31 // Note that odd width replication includes 444 due to implementation // on arm that subsamples 444 to 422 internally. 
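All of these ANYxx macros expand to the same wrapper shape: run the SIMD row function over the largest width that is a multiple of MASK + 1, stage the ragged tail in zeroed aligned temporaries, run the SIMD function once more on a full block, and copy back only the valid output bytes. A hedged, expanded sketch for a hypothetical one-plane row function; Any_Wrap and RowFn are illustrative names, not libyuv symbols.

#include <stdint.h>
#include <string.h>

typedef void (*RowFn)(const uint8_t* src, uint8_t* dst, int width);

// mask + 1 is the SIMD row function's width granularity (e.g. 15 -> 16 pixels).
static void Any_Wrap(RowFn simd_row, const uint8_t* src, uint8_t* dst,
                     int width, int mask) {
  uint8_t temp[128 * 2];
  memset(temp, 0, sizeof(temp)); /* keep msan happy, as in the macros above */
  int r = width & mask;          // ragged tail pixels
  int n = width & ~mask;         // multiple-of-(mask + 1) prefix
  if (n > 0) {
    simd_row(src, dst, n);       // fast path over the aligned prefix
  }
  if (r > 0) {
    memcpy(temp, src + n, r);              // stage the tail
    simd_row(temp, temp + 128, mask + 1);  // run one full SIMD block
    memcpy(dst + n, temp + 128, r);        // copy back only the valid bytes
  }
}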
// Any 3 planes to 1 with yuvconstants -#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ - uint8* dst_ptr, const struct YuvConstants* yuvconstants, \ - int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 4]); \ - memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - if (width & 1) { \ - temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \ - temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ - } \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ - SS(r, DUVSHIFT) * BPP); \ +#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 4]); \ + memset(temp, 0, 128 * 3); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + if (width & 1) { \ + temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ + temp[256 + SS(r, UVSHIFT)] = temp[256 + SS(r, UVSHIFT) - 1]; \ + } \ + ANY_SIMD(temp, temp + 128, temp + 256, temp + 384, yuvconstants, \ + MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 384, \ + SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_I422TOARGBROW_SSSE3 ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7) #endif +#ifdef HAS_I422TOAR30ROW_SSSE3 +ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7) +#endif +#ifdef HAS_I422TOAR30ROW_AVX2 +ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15) +#endif #ifdef HAS_I444TOARGBROW_SSSE3 ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7) ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7) ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7) ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7) -ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7) +ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15) #endif // HAS_I444TOARGBROW_SSSE3 #ifdef HAS_I422TORGB24ROW_AVX2 -ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15) +ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31) #endif #ifdef HAS_I422TOARGBROW_AVX2 ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15) @@ -175,12 +206,6 @@ ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7) ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7) ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7) #endif -#ifdef HAS_I422TOARGBROW_DSPR2 -ANY31C(I444ToARGBRow_Any_DSPR2, I444ToARGBRow_DSPR2, 0, 0, 4, 7) -ANY31C(I422ToARGBRow_Any_DSPR2, I422ToARGBRow_DSPR2, 1, 0, 4, 7) -ANY31C(I422ToARGB4444Row_Any_DSPR2, I422ToARGB4444Row_DSPR2, 1, 0, 2, 7) -ANY31C(I422ToARGB1555Row_Any_DSPR2, 
I422ToARGB1555Row_DSPR2, 1, 0, 2, 7) -#endif #ifdef HAS_I422TOARGBROW_MSA ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7) ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7) @@ -192,22 +217,57 @@ ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) #endif #undef ANY31C +// Any 3 planes of 16 bit to 1 with yuvconstants +// TODO(fbarchard): consider sharing this code with ANY31C +#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ + void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, \ + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ + int width) { \ + SIMD_ALIGNED(T temp[16 * 3]); \ + SIMD_ALIGNED(uint8_t out[64]); \ + memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r * SBPP); \ + memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_I210TOAR30ROW_SSSE3 +ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I210TOARGBROW_SSSE3 +ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I210TOARGBROW_AVX2 +ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_I210TOAR30ROW_AVX2 +ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#undef ANY31CT + // Any 2 planes to 1. -#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* uv_buf, uint8* dst_ptr, \ - int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ - } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ - SS(r, UVSHIFT) * SBPP2); \ - ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ +#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ + int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ + } \ + memcpy(temp, y_buf + n * SBPP, r * SBPP); \ + memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ } // Merge functions. @@ -223,7 +283,15 @@ ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15) #ifdef HAS_MERGEUVROW_MSA ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15) #endif - +#ifdef HAS_MERGEUVROW_MMI +ANY21(MergeUVRow_Any_MMI, MergeUVRow_MMI, 0, 1, 1, 2, 7) +#endif +#ifdef HAS_NV21TOYUV24ROW_NEON +ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15) +#endif +#ifdef HAS_NV21TOYUV24ROW_AVX2 +ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31) +#endif // Math functions. 
#ifdef HAS_ARGBMULTIPLYROW_SSE2 ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3) @@ -255,12 +323,21 @@ ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7) #ifdef HAS_ARGBMULTIPLYROW_MSA ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3) #endif +#ifdef HAS_ARGBMULTIPLYROW_MMI +ANY21(ARGBMultiplyRow_Any_MMI, ARGBMultiplyRow_MMI, 0, 4, 4, 4, 1) +#endif #ifdef HAS_ARGBADDROW_MSA ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7) #endif +#ifdef HAS_ARGBADDROW_MMI +ANY21(ARGBAddRow_Any_MMI, ARGBAddRow_MMI, 0, 4, 4, 4, 1) +#endif #ifdef HAS_ARGBSUBTRACTROW_MSA ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7) #endif +#ifdef HAS_ARGBSUBTRACTROW_MMI +ANY21(ARGBSubtractRow_Any_MMI, ARGBSubtractRow_MMI, 0, 4, 4, 4, 1) +#endif #ifdef HAS_SOBELROW_SSE2 ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15) #endif @@ -270,6 +347,9 @@ ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7) #ifdef HAS_SOBELROW_MSA ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15) #endif +#ifdef HAS_SOBELROW_MMI +ANY21(SobelRow_Any_MMI, SobelRow_MMI, 0, 1, 1, 4, 7) +#endif #ifdef HAS_SOBELTOPLANEROW_SSE2 ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15) #endif @@ -279,6 +359,9 @@ ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15) #ifdef HAS_SOBELTOPLANEROW_MSA ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31) #endif +#ifdef HAS_SOBELTOPLANEROW_MMI +ANY21(SobelToPlaneRow_Any_MMI, SobelToPlaneRow_MMI, 0, 1, 1, 1, 7) +#endif #ifdef HAS_SOBELXYROW_SSE2 ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15) #endif @@ -288,24 +371,27 @@ ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7) #ifdef HAS_SOBELXYROW_MSA ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15) #endif +#ifdef HAS_SOBELXYROW_MMI +ANY21(SobelXYRow_Any_MMI, SobelXYRow_MMI, 0, 1, 1, 4, 7) +#endif #undef ANY21 // Any 2 planes to 1 with yuvconstants -#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* uv_buf, uint8* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ - SS(r, UVSHIFT) * SBPP2); \ - ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ +#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 3]); \ + memset(temp, 0, 128 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n * SBPP, r * SBPP); \ + memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(temp, temp + 128, temp + 256, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \ } // Biplanar to RGB. 
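One detail worth noting in the rewritten ANY21C above: its scratch buffer grows from 64 * 3 to 128 * 3 bytes, with the lanes moving from offsets 64/128 to 128/256. The widest variants registered in the hunks below, for example NV12ToRGB24Row_AVX2 with MASK 31 and 3 output bytes per pixel, emit up to 96 bytes in the single tail iteration, which would overrun a 64-byte lane, so each lane must hold at least (MASK + 1) * BPP bytes. The compile-time checks below capture that constraint for two entries that appear in this diff; they are an illustrative sketch, not code from the patch.

/* Illustrative capacity check for the Any-wrapper scratch lanes (C11). */
#include <assert.h>
static_assert((15 + 1) * 4 <= 128,
              "I422ToARGBRow_AVX2 tail: 16 px * 4 bytes per pixel");
static_assert((31 + 1) * 3 <= 128,
              "NV12ToRGB24Row_AVX2 tail: 32 px * 3 bytes per pixel");

The source-side lanes stay well under the same bound: the Y lane needs MASK + 1 bytes and the UV lane needs SS(MASK + 1, UVSHIFT) * SBPP2 bytes, at most 32 bytes each for these variants.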
@@ -318,9 +404,6 @@ ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15) #ifdef HAS_NV12TOARGBROW_NEON ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7) #endif -#ifdef HAS_NV12TOARGBROW_DSPR2 -ANY21C(NV12ToARGBRow_Any_DSPR2, NV12ToARGBRow_DSPR2, 1, 1, 2, 4, 7) -#endif #ifdef HAS_NV12TOARGBROW_MSA ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7) #endif @@ -336,6 +419,24 @@ ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7) #ifdef HAS_NV21TOARGBROW_MSA ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7) #endif +#ifdef HAS_NV12TORGB24ROW_NEON +ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7) +#endif +#ifdef HAS_NV21TORGB24ROW_NEON +ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7) +#endif +#ifdef HAS_NV12TORGB24ROW_SSSE3 +ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15) +#endif +#ifdef HAS_NV21TORGB24ROW_SSSE3 +ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15) +#endif +#ifdef HAS_NV12TORGB24ROW_AVX2 +ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31) +#endif +#ifdef HAS_NV21TORGB24ROW_AVX2 +ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31) +#endif #ifdef HAS_NV12TORGB565ROW_SSSE3 ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7) #endif @@ -352,8 +453,8 @@ ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7) // Any 1 to 1. #define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[128 * 2]); \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 2]); \ memset(temp, 0, 128); /* for YUY2 and msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ @@ -381,6 +482,15 @@ ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3) ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3) ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3) #endif +#if defined(HAS_ARGBTORGB24ROW_AVX2) +ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31) +#endif +#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI) +ANY11(ARGBToRGB24Row_Any_AVX512VBMI, ARGBToRGB24Row_AVX512VBMI, 0, 4, 3, 31) +#endif +#if defined(HAS_ARGBTORAWROW_AVX2) +ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31) +#endif #if defined(HAS_ARGBTORGB565ROW_AVX2) ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) #endif @@ -388,6 +498,18 @@ ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7) #endif +#if defined(HAS_ABGRTOAR30ROW_SSSE3) +ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3) +#endif +#if defined(HAS_ARGBTOAR30ROW_SSSE3) +ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3) +#endif +#if defined(HAS_ABGRTOAR30ROW_AVX2) +ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7) +#endif +#if defined(HAS_ARGBTOAR30ROW_AVX2) +ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7) +#endif #if defined(HAS_J400TOARGBROW_SSE2) ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7) #endif @@ -437,12 +559,24 @@ ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7) ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15) ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15) #endif +#if 
defined(HAS_ARGBTORGB24ROW_MMI) +ANY11(ARGBToRGB24Row_Any_MMI, ARGBToRGB24Row_MMI, 0, 4, 3, 3) +ANY11(ARGBToRAWRow_Any_MMI, ARGBToRAWRow_MMI, 0, 4, 3, 3) +ANY11(ARGBToRGB565Row_Any_MMI, ARGBToRGB565Row_MMI, 0, 4, 2, 3) +ANY11(ARGBToARGB1555Row_Any_MMI, ARGBToARGB1555Row_MMI, 0, 4, 2, 3) +ANY11(ARGBToARGB4444Row_Any_MMI, ARGBToARGB4444Row_MMI, 0, 4, 2, 3) +ANY11(J400ToARGBRow_Any_MMI, J400ToARGBRow_MMI, 0, 1, 4, 3) +ANY11(I400ToARGBRow_Any_MMI, I400ToARGBRow_MMI, 0, 1, 4, 7) +#endif #if defined(HAS_RAWTORGB24ROW_NEON) ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7) #endif #if defined(HAS_RAWTORGB24ROW_MSA) ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15) #endif +#if defined(HAS_RAWTORGB24ROW_MMI) +ANY11(RAWToRGB24Row_Any_MMI, RAWToRGB24Row_MMI, 0, 3, 3, 3) +#endif #ifdef HAS_ARGBTOYROW_AVX2 ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) #endif @@ -474,57 +608,87 @@ ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7) #ifdef HAS_ARGBTOYROW_MSA ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15) #endif +#ifdef HAS_ARGBTOYROW_MMI +ANY11(ARGBToYRow_Any_MMI, ARGBToYRow_MMI, 0, 4, 1, 7) +#endif #ifdef HAS_ARGBTOYJROW_NEON ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7) #endif #ifdef HAS_ARGBTOYJROW_MSA ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15) #endif +#ifdef HAS_ARGBTOYJROW_MMI +ANY11(ARGBToYJRow_Any_MMI, ARGBToYJRow_MMI, 0, 4, 1, 7) +#endif #ifdef HAS_BGRATOYROW_NEON ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7) #endif #ifdef HAS_BGRATOYROW_MSA ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15) #endif +#ifdef HAS_BGRATOYROW_MMI +ANY11(BGRAToYRow_Any_MMI, BGRAToYRow_MMI, 0, 4, 1, 7) +#endif #ifdef HAS_ABGRTOYROW_NEON ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7) #endif #ifdef HAS_ABGRTOYROW_MSA ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7) #endif +#ifdef HAS_ABGRTOYROW_MMI +ANY11(ABGRToYRow_Any_MMI, ABGRToYRow_MMI, 0, 4, 1, 7) +#endif #ifdef HAS_RGBATOYROW_NEON ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7) #endif #ifdef HAS_RGBATOYROW_MSA ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15) #endif +#ifdef HAS_RGBATOYROW_MMI +ANY11(RGBAToYRow_Any_MMI, RGBAToYRow_MMI, 0, 4, 1, 7) +#endif #ifdef HAS_RGB24TOYROW_NEON ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7) #endif #ifdef HAS_RGB24TOYROW_MSA ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15) #endif +#ifdef HAS_RGB24TOYROW_MMI +ANY11(RGB24ToYRow_Any_MMI, RGB24ToYRow_MMI, 0, 3, 1, 7) +#endif #ifdef HAS_RAWTOYROW_NEON ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7) #endif #ifdef HAS_RAWTOYROW_MSA ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15) #endif +#ifdef HAS_RAWTOYROW_MMI +ANY11(RAWToYRow_Any_MMI, RAWToYRow_MMI, 0, 3, 1, 7) +#endif #ifdef HAS_RGB565TOYROW_NEON ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7) #endif #ifdef HAS_RGB565TOYROW_MSA ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15) #endif +#ifdef HAS_RGB565TOYROW_MMI +ANY11(RGB565ToYRow_Any_MMI, RGB565ToYRow_MMI, 0, 2, 1, 7) +#endif #ifdef HAS_ARGB1555TOYROW_NEON ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7) #endif #ifdef HAS_ARGB1555TOYROW_MSA ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15) #endif +#ifdef HAS_ARGB1555TOYROW_MMI +ANY11(ARGB1555ToYRow_Any_MMI, ARGB1555ToYRow_MMI, 0, 2, 1, 7) +#endif #ifdef HAS_ARGB4444TOYROW_NEON ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7) #endif +#ifdef HAS_ARGB4444TOYROW_MMI +ANY11(ARGB4444ToYRow_Any_MMI, ARGB4444ToYRow_MMI, 0, 2, 1, 7) 
+#endif #ifdef HAS_YUY2TOYROW_NEON ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15) #endif @@ -534,66 +698,66 @@ ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15) #ifdef HAS_YUY2TOYROW_MSA ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31) #endif +#ifdef HAS_YUY2TOYROW_MMI +ANY11(YUY2ToYRow_Any_MMI, YUY2ToYRow_MMI, 1, 4, 1, 7) +#endif #ifdef HAS_UYVYTOYROW_MSA ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31) #endif +#ifdef HAS_UYVYTOYROW_MMI +ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15) +#endif +#ifdef HAS_AYUVTOYROW_NEON +ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15) +#endif +#ifdef HAS_AYUVTOYROW_NEON +ANY11(UVToVURow_Any_NEON, UVToVURow_NEON, 0, 2, 2, 15) +#endif #ifdef HAS_RGB24TOARGBROW_NEON ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7) #endif #ifdef HAS_RGB24TOARGBROW_MSA ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15) #endif +#ifdef HAS_RGB24TOARGBROW_MMI +ANY11(RGB24ToARGBRow_Any_MMI, RGB24ToARGBRow_MMI, 0, 3, 4, 3) +#endif #ifdef HAS_RAWTOARGBROW_NEON ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7) #endif #ifdef HAS_RAWTOARGBROW_MSA ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15) #endif +#ifdef HAS_RAWTOARGBROW_MMI +ANY11(RAWToARGBRow_Any_MMI, RAWToARGBRow_MMI, 0, 3, 4, 3) +#endif #ifdef HAS_RGB565TOARGBROW_NEON ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7) #endif #ifdef HAS_RGB565TOARGBROW_MSA ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15) #endif +#ifdef HAS_RGB565TOARGBROW_MMI +ANY11(RGB565ToARGBRow_Any_MMI, RGB565ToARGBRow_MMI, 0, 2, 4, 3) +#endif #ifdef HAS_ARGB1555TOARGBROW_NEON ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7) #endif #ifdef HAS_ARGB1555TOARGBROW_MSA ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15) #endif +#ifdef HAS_ARGB1555TOARGBROW_MMI +ANY11(ARGB1555ToARGBRow_Any_MMI, ARGB1555ToARGBRow_MMI, 0, 2, 4, 3) +#endif #ifdef HAS_ARGB4444TOARGBROW_NEON ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7) #endif -#ifdef HAS_RGB24TOARGBROW_DSPR2 -ANY11(RGB24ToARGBRow_Any_DSPR2, RGB24ToARGBRow_DSPR2, 0, 3, 4, 7) -#endif -#ifdef HAS_RAWTOARGBROW_DSPR2 -ANY11(RAWToARGBRow_Any_DSPR2, RAWToARGBRow_DSPR2, 0, 3, 4, 7) -#endif -#ifdef HAS_RGB565TOARGBROW_DSPR2 -ANY11(RGB565ToARGBRow_Any_DSPR2, RGB565ToARGBRow_DSPR2, 0, 2, 4, 7) -#endif -#ifdef HAS_ARGB1555TOARGBROW_DSPR2 -ANY11(ARGB1555ToARGBRow_Any_DSPR2, ARGB1555ToARGBRow_DSPR2, 0, 2, 4, 7) -#endif -#ifdef HAS_ARGB4444TOARGBROW_DSPR2 -ANY11(ARGB4444ToARGBRow_Any_DSPR2, ARGB4444ToARGBRow_DSPR2, 0, 2, 4, 7) -#endif -#ifdef HAS_BGRATOYROW_DSPR2 -ANY11(BGRAToYRow_Any_DSPR2, BGRAToYRow_DSPR2, 0, 4, 1, 7) -#endif -#ifdef HAS_ARGBTOYROW_DSPR2 -ANY11(ARGBToYRow_Any_DSPR2, ARGBToYRow_DSPR2, 0, 4, 1, 7) -#endif -#ifdef HAS_ABGRTOYROW_DSPR2 -ANY11(ABGRToYRow_Any_DSPR2, ABGRToYRow_DSPR2, 0, 4, 1, 7) -#endif -#ifdef HAS_RGBATOYROW_DSPR2 -ANY11(RGBAToYRow_Any_DSPR2, RGBAToYRow_DSPR2, 0, 4, 1, 7) -#endif #ifdef HAS_ARGB4444TOARGBROW_MSA ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15) #endif +#ifdef HAS_ARGB4444TOARGBROW_MMI +ANY11(ARGB4444ToARGBRow_Any_MMI, ARGB4444ToARGBRow_MMI, 0, 2, 4, 3) +#endif #ifdef HAS_ARGBATTENUATEROW_SSSE3 ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3) #endif @@ -612,21 +776,30 @@ ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7) #ifdef HAS_ARGBATTENUATEROW_MSA ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7) #endif +#ifdef 
HAS_ARGBATTENUATEROW_MMI +ANY11(ARGBAttenuateRow_Any_MMI, ARGBAttenuateRow_MMI, 0, 4, 4, 1) +#endif #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7) #endif #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 -ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 32) +ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 31) #endif #ifdef HAS_ARGBEXTRACTALPHAROW_NEON ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15) #endif +#ifdef HAS_ARGBEXTRACTALPHAROW_MSA +ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15) +#endif +#ifdef HAS_ARGBEXTRACTALPHAROW_MMI +ANY11(ARGBExtractAlphaRow_Any_MMI, ARGBExtractAlphaRow_MMI, 0, 4, 1, 7) +#endif #undef ANY11 // Any 1 to 1 blended. Destination is read, modify, write. #define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 2]); \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 2]); \ memset(temp, 0, 64 * 2); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ @@ -645,33 +818,39 @@ ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15) #ifdef HAS_ARGBCOPYALPHAROW_SSE2 ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7) #endif +#ifdef HAS_ARGBCOPYALPHAROW_MMI +ANY11B(ARGBCopyAlphaRow_Any_MMI, ARGBCopyAlphaRow_MMI, 0, 4, 4, 1) +#endif #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15) #endif #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) #endif +#ifdef HAS_ARGBCOPYYTOALPHAROW_MMI +ANY11B(ARGBCopyYToAlphaRow_Any_MMI, ARGBCopyYToAlphaRow_MMI, 0, 1, 4, 7) +#endif #undef ANY11B // Any 1 to 1 with parameter. 
-#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, T param, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 2]); \ - memset(temp, 0, 64); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, param, n); \ - } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - ANY_SIMD(temp, temp + 64, param, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ +#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 2]); \ + memset(temp, 0, 64); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, param, n); \ + } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp, temp + 64, param, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ } #if defined(HAS_ARGBTORGB565DITHERROW_SSE2) ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2, - const uint32, + const uint32_t, 4, 2, 3) @@ -679,7 +858,7 @@ ANY11P(ARGBToRGB565DitherRow_Any_SSE2, #if defined(HAS_ARGBTORGB565DITHERROW_AVX2) ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2, - const uint32, + const uint32_t, 4, 2, 7) @@ -687,7 +866,7 @@ ANY11P(ARGBToRGB565DitherRow_Any_AVX2, #if defined(HAS_ARGBTORGB565DITHERROW_NEON) ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON, - const uint32, + const uint32_t, 4, 2, 7) @@ -695,64 +874,146 @@ ANY11P(ARGBToRGB565DitherRow_Any_NEON, #if defined(HAS_ARGBTORGB565DITHERROW_MSA) ANY11P(ARGBToRGB565DitherRow_Any_MSA, ARGBToRGB565DitherRow_MSA, - const uint32, + const uint32_t, 4, 2, 7) #endif -#ifdef HAS_ARGBSHUFFLEROW_SSE2 -ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3) +#if defined(HAS_ARGBTORGB565DITHERROW_MMI) +ANY11P(ARGBToRGB565DitherRow_Any_MMI, + ARGBToRGB565DitherRow_MMI, + const uint32_t, + 4, + 2, + 3) #endif #ifdef HAS_ARGBSHUFFLEROW_SSSE3 -ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8*, 4, 4, 7) +ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7) #endif #ifdef HAS_ARGBSHUFFLEROW_AVX2 -ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15) +ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15) #endif #ifdef HAS_ARGBSHUFFLEROW_NEON -ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3) +ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3) #endif #ifdef HAS_ARGBSHUFFLEROW_MSA -ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8*, 4, 4, 7) +ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7) #endif +#ifdef HAS_ARGBSHUFFLEROW_MMI +ANY11P(ARGBShuffleRow_Any_MMI, ARGBShuffleRow_MMI, const uint8_t*, 4, 4, 1) +#endif +#undef ANY11P #undef ANY11P // Any 1 to 1 with parameter and shorts. BPP measures in shorts. 
-#define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ - void NAMEANY(const uint16* src_ptr, uint16* dst_ptr, T param, int width) { \ - SIMD_ALIGNED(uint16 temp[16 * 2]); \ - memset(temp, 0, 32); /* for msan */ \ +#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ + void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \ + SIMD_ALIGNED(STYPE temp[32]); \ + SIMD_ALIGNED(DTYPE out[32]); \ + memset(temp, 0, 32 * SBPP); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, param, n); \ + ANY_SIMD(src_ptr, dst_ptr, scale, n); \ } \ memcpy(temp, src_ptr + n, r * SBPP); \ - ANY_SIMD(temp, temp + 16, param, MASK + 1); \ - memcpy(dst_ptr + n, temp + 16, r * BPP); \ + ANY_SIMD(temp, out, scale, MASK + 1); \ + memcpy(dst_ptr + n, out, r * BPP); \ + } + +#ifdef HAS_CONVERT16TO8ROW_SSSE3 +ANY11C(Convert16To8Row_Any_SSSE3, + Convert16To8Row_SSSE3, + 2, + 1, + uint16_t, + uint8_t, + 15) +#endif +#ifdef HAS_CONVERT16TO8ROW_AVX2 +ANY11C(Convert16To8Row_Any_AVX2, + Convert16To8Row_AVX2, + 2, + 1, + uint16_t, + uint8_t, + 31) +#endif +#ifdef HAS_CONVERT8TO16ROW_SSE2 +ANY11C(Convert8To16Row_Any_SSE2, + Convert8To16Row_SSE2, + 1, + 2, + uint8_t, + uint16_t, + 15) +#endif +#ifdef HAS_CONVERT8TO16ROW_AVX2 +ANY11C(Convert8To16Row_Any_AVX2, + Convert8To16Row_AVX2, + 1, + 2, + uint8_t, + uint16_t, + 31) +#endif +#undef ANY11C + +// Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts. +#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \ + void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \ + SIMD_ALIGNED(ST temp[32]); \ + SIMD_ALIGNED(T out[32]); \ + memset(temp, 0, SBPP * 32); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, param, n); \ + } \ + memcpy(temp, src_ptr + n, r * SBPP); \ + ANY_SIMD(temp, out, param, MASK + 1); \ + memcpy(dst_ptr + n, out, r * BPP); \ } #ifdef HAS_HALFFLOATROW_SSE2 -ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 2, 2, 7) +ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7) #endif #ifdef HAS_HALFFLOATROW_AVX2 -ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 2, 2, 15) +ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15) #endif #ifdef HAS_HALFFLOATROW_F16C -ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 2, 2, 15) -ANY11P16(HalfFloat1Row_Any_F16C, HalfFloat1Row_F16C, float, 2, 2, 15) +ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15) +ANY11P16(HalfFloat1Row_Any_F16C, + HalfFloat1Row_F16C, + uint16_t, + uint16_t, + 2, + 2, + 15) #endif #ifdef HAS_HALFFLOATROW_NEON -ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 2, 2, 7) -ANY11P16(HalfFloat1Row_Any_NEON, HalfFloat1Row_NEON, float, 2, 2, 7) +ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7) +ANY11P16(HalfFloat1Row_Any_NEON, + HalfFloat1Row_NEON, + uint16_t, + uint16_t, + 2, + 2, + 7) +#endif +#ifdef HAS_HALFFLOATROW_MSA +ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31) +#endif +#ifdef HAS_BYTETOFLOATROW_NEON +ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 3, 7) #endif #undef ANY11P16 // Any 1 to 1 with yuvconstants #define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \ const struct YuvConstants* 
yuvconstants, int width) { \ - SIMD_ALIGNED(uint8 temp[128 * 2]); \ + SIMD_ALIGNED(uint8_t temp[128 * 2]); \ memset(temp, 0, 128); /* for YUY2 and msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ @@ -782,20 +1043,20 @@ ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7) #undef ANY11C // Any 1 to 1 interpolate. Takes 2 rows of source via stride. -#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ - void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, \ - int width, int source_y_fraction) { \ - SIMD_ALIGNED(uint8 temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ - } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \ - ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ +#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, \ + ptrdiff_t src_stride_ptr, int width, int source_y_fraction) { \ + SIMD_ALIGNED(uint8_t temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ + } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ } #ifdef HAS_INTERPOLATEROW_AVX2 @@ -807,18 +1068,18 @@ ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15) #ifdef HAS_INTERPOLATEROW_NEON ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) #endif -#ifdef HAS_INTERPOLATEROW_DSPR2 -ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3) -#endif #ifdef HAS_INTERPOLATEROW_MSA ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31) #endif +#ifdef HAS_INTERPOLATEROW_MMI +ANY11T(InterpolateRow_Any_MMI, InterpolateRow_MMI, 1, 1, 7) +#endif #undef ANY11T // Any 1 to 1 mirror. #define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 2]); \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 2]); \ memset(temp, 0, 64); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ @@ -842,6 +1103,9 @@ ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15) #ifdef HAS_MIRRORROW_MSA ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63) #endif +#ifdef HAS_MIRRORROW_MMI +ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7) +#endif #ifdef HAS_ARGBMIRRORROW_AVX2 ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) #endif @@ -854,49 +1118,53 @@ ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3) #ifdef HAS_ARGBMIRRORROW_MSA ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) #endif +#ifdef HAS_ARGBMIRRORROW_MMI +ANY11M(ARGBMirrorRow_Any_MMI, ARGBMirrorRow_MMI, 4, 1) +#endif #undef ANY11M // Any 1 plane. 
(memset) -#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ - void NAMEANY(uint8* dst_ptr, T v32, int width) { \ - SIMD_ALIGNED(uint8 temp[64]); \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(dst_ptr, v32, n); \ - } \ - ANY_SIMD(temp, v32, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp, r * BPP); \ +#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \ + SIMD_ALIGNED(uint8_t temp[64]); \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, v32, n); \ + } \ + ANY_SIMD(temp, v32, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp, r * BPP); \ } #ifdef HAS_SETROW_X86 -ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3) +ANY1(SetRow_Any_X86, SetRow_X86, uint8_t, 1, 3) #endif #ifdef HAS_SETROW_NEON -ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15) +ANY1(SetRow_Any_NEON, SetRow_NEON, uint8_t, 1, 15) #endif #ifdef HAS_ARGBSETROW_NEON -ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3) +ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3) #endif #ifdef HAS_ARGBSETROW_MSA -ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32, 4, 3) +ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3) #endif #undef ANY1 // Any 1 to 2. Outputs UV planes. -#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) { \ - SIMD_ALIGNED(uint8 temp[128 * 3]); \ - memset(temp, 0, 128); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_u, dst_v, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ - memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ - memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \ +#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \ + int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 3]); \ + memset(temp, 0, 128); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_u, dst_v, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ + memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ + memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \ } #ifdef HAS_SPLITUVROW_SSE2 @@ -908,8 +1176,11 @@ ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31) #ifdef HAS_SPLITUVROW_NEON ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15) #endif -#ifdef HAS_SPLITUVROW_DSPR2 -ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15) +#ifdef HAS_SPLITUVROW_MSA +ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31) +#endif +#ifdef HAS_SPLITUVROW_MMI +ANY12(SplitUVRow_Any_MMI, SplitUVRow_MMI, 0, 2, 0, 7) #endif #ifdef HAS_ARGBTOUV444ROW_SSSE3 ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15) @@ -932,14 +1203,47 @@ ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15) ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31) ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31) #endif +#ifdef HAS_YUY2TOUV422ROW_MMI +ANY12(ARGBToUV444Row_Any_MMI, ARGBToUV444Row_MMI, 0, 4, 0, 7) +ANY12(UYVYToUV422Row_Any_MMI, UYVYToUV422Row_MMI, 1, 4, 1, 15) +ANY12(YUY2ToUV422Row_Any_MMI, YUY2ToUV422Row_MMI, 1, 4, 1, 15) +#endif #undef ANY12 +// Any 1 to 3. Outputs RGB planes. 
+#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ + uint8_t* dst_b, int width) { \ + SIMD_ALIGNED(uint8_t temp[16 * 6]); \ + memset(temp, 0, 16 * 3); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \ + } \ + memcpy(temp, src_ptr + n * BPP, r * BPP); \ + ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \ + memcpy(dst_r + n, temp + 16 * 3, r); \ + memcpy(dst_g + n, temp + 16 * 4, r); \ + memcpy(dst_b + n, temp + 16 * 5, r); \ + } + +#ifdef HAS_SPLITRGBROW_SSSE3 +ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15) +#endif +#ifdef HAS_SPLITRGBROW_NEON +ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15) +#endif +#ifdef HAS_SPLITRGBROW_MMI +ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3) +#endif + // Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. // 128 byte row allows for 32 avx ARGB pixels. #define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, int src_stride_ptr, uint8* dst_u, \ - uint8* dst_v, int width) { \ - SIMD_ALIGNED(uint8 temp[128 * 4]); \ + void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, \ + uint8_t* dst_v, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 4]); \ memset(temp, 0, 128 * 2); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ @@ -987,83 +1291,138 @@ ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) #ifdef HAS_ARGBTOUVROW_MSA ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31) #endif +#ifdef HAS_ARGBTOUVROW_MMI +ANY12S(ARGBToUVRow_Any_MMI, ARGBToUVRow_MMI, 0, 4, 15) +#endif #ifdef HAS_ARGBTOUVJROW_NEON ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15) #endif #ifdef HAS_ARGBTOUVJROW_MSA ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31) #endif +#ifdef HAS_ARGBTOUVJROW_MMI +ANY12S(ARGBToUVJRow_Any_MMI, ARGBToUVJRow_MMI, 0, 4, 15) +#endif #ifdef HAS_BGRATOUVROW_NEON ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15) #endif #ifdef HAS_BGRATOUVROW_MSA ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31) #endif +#ifdef HAS_BGRATOUVROW_MMI +ANY12S(BGRAToUVRow_Any_MMI, BGRAToUVRow_MMI, 0, 4, 15) +#endif #ifdef HAS_ABGRTOUVROW_NEON ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15) #endif #ifdef HAS_ABGRTOUVROW_MSA ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31) #endif +#ifdef HAS_ABGRTOUVROW_MMI +ANY12S(ABGRToUVRow_Any_MMI, ABGRToUVRow_MMI, 0, 4, 15) +#endif #ifdef HAS_RGBATOUVROW_NEON ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15) #endif #ifdef HAS_RGBATOUVROW_MSA ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31) #endif +#ifdef HAS_RGBATOUVROW_MMI +ANY12S(RGBAToUVRow_Any_MMI, RGBAToUVRow_MMI, 0, 4, 15) +#endif #ifdef HAS_RGB24TOUVROW_NEON ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15) #endif #ifdef HAS_RGB24TOUVROW_MSA ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15) #endif +#ifdef HAS_RGB24TOUVROW_MMI +ANY12S(RGB24ToUVRow_Any_MMI, RGB24ToUVRow_MMI, 0, 3, 15) +#endif #ifdef HAS_RAWTOUVROW_NEON ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15) #endif #ifdef HAS_RAWTOUVROW_MSA ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15) #endif +#ifdef HAS_RAWTOUVROW_MMI +ANY12S(RAWToUVRow_Any_MMI, RAWToUVRow_MMI, 0, 3, 15) +#endif #ifdef HAS_RGB565TOUVROW_NEON ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15) #endif #ifdef HAS_RGB565TOUVROW_MSA ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15) #endif +#ifdef 
HAS_RGB565TOUVROW_MMI +ANY12S(RGB565ToUVRow_Any_MMI, RGB565ToUVRow_MMI, 0, 2, 15) +#endif #ifdef HAS_ARGB1555TOUVROW_NEON ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15) #endif #ifdef HAS_ARGB1555TOUVROW_MSA ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15) #endif +#ifdef HAS_ARGB1555TOUVROW_MMI +ANY12S(ARGB1555ToUVRow_Any_MMI, ARGB1555ToUVRow_MMI, 0, 2, 15) +#endif #ifdef HAS_ARGB4444TOUVROW_NEON ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15) #endif +#ifdef HAS_ARGB4444TOUVROW_MMI +ANY12S(ARGB4444ToUVRow_Any_MMI, ARGB4444ToUVRow_MMI, 0, 2, 15) +#endif #ifdef HAS_YUY2TOUVROW_NEON ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15) #endif #ifdef HAS_UYVYTOUVROW_NEON ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15) #endif -#ifdef HAS_BGRATOUVROW_DSPR2 -ANY12S(BGRAToUVRow_Any_DSPR2, BGRAToUVRow_DSPR2, 0, 4, 15) -#endif -#ifdef HAS_ABGRTOUVROW_DSPR2 -ANY12S(ABGRToUVRow_Any_DSPR2, ABGRToUVRow_DSPR2, 0, 4, 15) -#endif -#ifdef HAS_RGBATOUVROW_DSPR2 -ANY12S(RGBAToUVRow_Any_DSPR2, RGBAToUVRow_DSPR2, 0, 4, 15) -#endif -#ifdef HAS_ARGBTOUVROW_DSPR2 -ANY12S(ARGBToUVRow_Any_DSPR2, ARGBToUVRow_DSPR2, 0, 4, 15) -#endif #ifdef HAS_YUY2TOUVROW_MSA ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31) #endif +#ifdef HAS_YUY2TOUVROW_MMI +ANY12S(YUY2ToUVRow_Any_MMI, YUY2ToUVRow_MMI, 1, 4, 15) +#endif #ifdef HAS_UYVYTOUVROW_MSA ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31) #endif +#ifdef HAS_UYVYTOUVROW_MMI +ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15) +#endif #undef ANY12S +// Any 1 to 1 with source stride (2 rows of source). Outputs UV plane. +// 128 byte row allows for 32 avx ARGB pixels. +#define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_vu, \ + int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 3]); \ + memset(temp, 0, 128 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, src_stride_ptr, dst_vu, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ + SS(r, UVSHIFT) * BPP); \ + if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ + memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ + BPP); \ + memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ + temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ + } \ + ANY_SIMD(temp, 128, temp + 256, MASK + 1); \ + memcpy(dst_vu + (n >> 1) * 2, temp + 256, SS(r, 1) * 2); \ + } + +#ifdef HAS_AYUVTOVUROW_NEON +ANY11S(AYUVToUVRow_Any_NEON, AYUVToUVRow_NEON, 0, 4, 15) +ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15) +#endif +#undef ANY11S + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/row_common.cc b/files/source/row_common.cc index bf953eef..8951d003 100644 --- a/files/source/row_common.cc +++ b/files/source/row_common.cc @@ -10,6 +10,7 @@ #include "libyuv/row.h" +#include <stdio.h> #include <string.h> // For memcpy and memset. 
#include "libyuv/basic_types.h" @@ -23,59 +24,69 @@ extern "C" { #define USE_BRANCHLESS 1 #if USE_BRANCHLESS -static __inline int32 clamp0(int32 v) { +static __inline int32_t clamp0(int32_t v) { return ((-(v) >> 31) & (v)); } -static __inline int32 clamp255(int32 v) { +static __inline int32_t clamp255(int32_t v) { return (((255 - (v)) >> 31) | (v)) & 255; } -static __inline uint32 Clamp(int32 val) { - int v = clamp0(val); - return (uint32)(clamp255(v)); +static __inline int32_t clamp1023(int32_t v) { + return (((1023 - (v)) >> 31) | (v)) & 1023; } -static __inline uint32 Abs(int32 v) { +static __inline uint32_t Abs(int32_t v) { int m = v >> 31; return (v + m) ^ m; } #else // USE_BRANCHLESS -static __inline int32 clamp0(int32 v) { +static __inline int32_t clamp0(int32_t v) { return (v < 0) ? 0 : v; } -static __inline int32 clamp255(int32 v) { +static __inline int32_t clamp255(int32_t v) { return (v > 255) ? 255 : v; } -static __inline uint32 Clamp(int32 val) { - int v = clamp0(val); - return (uint32)(clamp255(v)); +static __inline int32_t clamp1023(int32_t v) { + return (v > 1023) ? 1023 : v; } -static __inline uint32 Abs(int32 v) { +static __inline uint32_t Abs(int32_t v) { return (v < 0) ? -v : v; } #endif // USE_BRANCHLESS +static __inline uint32_t Clamp(int32_t val) { + int v = clamp0(val); + return (uint32_t)(clamp255(v)); +} + +static __inline uint32_t Clamp10(int32_t val) { + int v = clamp0(val); + return (uint32_t)(clamp1023(v)); +} -#ifdef LIBYUV_LITTLE_ENDIAN -#define WRITEWORD(p, v) *(uint32*)(p) = v +// Little Endian +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ + defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) || \ + (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define WRITEWORD(p, v) *(uint32_t*)(p) = v #else -static inline void WRITEWORD(uint8* p, uint32 v) { - p[0] = (uint8)(v & 255); - p[1] = (uint8)((v >> 8) & 255); - p[2] = (uint8)((v >> 16) & 255); - p[3] = (uint8)((v >> 24) & 255); +static inline void WRITEWORD(uint8_t* p, uint32_t v) { + p[0] = (uint8_t)(v & 255); + p[1] = (uint8_t)((v >> 8) & 255); + p[2] = (uint8_t)((v >> 16) & 255); + p[3] = (uint8_t)((v >> 24) & 255); } #endif -void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) { +void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_rgb24[0]; - uint8 g = src_rgb24[1]; - uint8 r = src_rgb24[2]; + uint8_t b = src_rgb24[0]; + uint8_t g = src_rgb24[1]; + uint8_t r = src_rgb24[2]; dst_argb[0] = b; dst_argb[1] = g; dst_argb[2] = r; @@ -85,12 +96,12 @@ void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) { } } -void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) { +void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 r = src_raw[0]; - uint8 g = src_raw[1]; - uint8 b = src_raw[2]; + uint8_t r = src_raw[0]; + uint8_t g = src_raw[1]; + uint8_t b = src_raw[2]; dst_argb[0] = b; dst_argb[1] = g; dst_argb[2] = r; @@ -100,12 +111,12 @@ void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) { } } -void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width) { +void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { int x; for (x = 0; x < width; ++x) { - uint8 r = src_raw[0]; - uint8 g = src_raw[1]; - uint8 b = src_raw[2]; + uint8_t r = src_raw[0]; + uint8_t g = src_raw[1]; + uint8_t b = src_raw[2]; dst_rgb24[0] = b; 
dst_rgb24[1] = g; dst_rgb24[2] = r; @@ -114,12 +125,14 @@ void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width) { } } -void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) { +void RGB565ToARGBRow_C(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_rgb565[0] & 0x1f; - uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8 r = src_rgb565[1] >> 3; + uint8_t b = src_rgb565[0] & 0x1f; + uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r = src_rgb565[1] >> 3; dst_argb[0] = (b << 3) | (b >> 2); dst_argb[1] = (g << 2) | (g >> 4); dst_argb[2] = (r << 3) | (r >> 2); @@ -129,15 +142,15 @@ void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) { } } -void ARGB1555ToARGBRow_C(const uint8* src_argb1555, - uint8* dst_argb, +void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555, + uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb1555[0] & 0x1f; - uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8 r = (src_argb1555[1] & 0x7c) >> 2; - uint8 a = src_argb1555[1] >> 7; + uint8_t b = src_argb1555[0] & 0x1f; + uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r = (src_argb1555[1] & 0x7c) >> 2; + uint8_t a = src_argb1555[1] >> 7; dst_argb[0] = (b << 3) | (b >> 2); dst_argb[1] = (g << 3) | (g >> 2); dst_argb[2] = (r << 3) | (r >> 2); @@ -147,15 +160,15 @@ void ARGB1555ToARGBRow_C(const uint8* src_argb1555, } } -void ARGB4444ToARGBRow_C(const uint8* src_argb4444, - uint8* dst_argb, +void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444, + uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb4444[0] & 0x0f; - uint8 g = src_argb4444[0] >> 4; - uint8 r = src_argb4444[1] & 0x0f; - uint8 a = src_argb4444[1] >> 4; + uint8_t b = src_argb4444[0] & 0x0f; + uint8_t g = src_argb4444[0] >> 4; + uint8_t r = src_argb4444[1] & 0x0f; + uint8_t a = src_argb4444[1] >> 4; dst_argb[0] = (b << 4) | b; dst_argb[1] = (g << 4) | g; dst_argb[2] = (r << 4) | r; @@ -165,12 +178,53 @@ void ARGB4444ToARGBRow_C(const uint8* src_argb4444, } } -void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { +void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t ar30 = *(const uint32_t*)src_ar30; + uint32_t b = (ar30 >> 2) & 0xff; + uint32_t g = (ar30 >> 12) & 0xff; + uint32_t r = (ar30 >> 22) & 0xff; + uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits. + *(uint32_t*)(dst_argb) = b | (g << 8) | (r << 16) | (a << 24); + dst_argb += 4; + src_ar30 += 4; + } +} + +void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t ar30 = *(const uint32_t*)src_ar30; + uint32_t b = (ar30 >> 2) & 0xff; + uint32_t g = (ar30 >> 12) & 0xff; + uint32_t r = (ar30 >> 22) & 0xff; + uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits. 
+ *(uint32_t*)(dst_abgr) = r | (g << 8) | (b << 16) | (a << 24); + dst_abgr += 4; + src_ar30 += 4; + } +} + +void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t ar30 = *(const uint32_t*)src_ar30; + uint32_t b = ar30 & 0x3ff; + uint32_t ga = ar30 & 0xc00ffc00; + uint32_t r = (ar30 >> 20) & 0x3ff; + *(uint32_t*)(dst_ab30) = r | ga | (b << 20); + dst_ab30 += 4; + src_ar30 += 4; + } +} + +void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb[0]; - uint8 g = src_argb[1]; - uint8 r = src_argb[2]; + uint8_t b = src_argb[0]; + uint8_t g = src_argb[1]; + uint8_t r = src_argb[2]; dst_rgb[0] = b; dst_rgb[1] = g; dst_rgb[2] = r; @@ -179,12 +233,12 @@ void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { } } -void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) { +void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb[0]; - uint8 g = src_argb[1]; - uint8 r = src_argb[2]; + uint8_t b = src_argb[0]; + uint8_t g = src_argb[1]; + uint8_t r = src_argb[2]; dst_rgb[0] = r; dst_rgb[1] = g; dst_rgb[2] = b; @@ -193,25 +247,25 @@ void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) { } } -void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { +void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_argb[0] >> 3; - uint8 g0 = src_argb[1] >> 2; - uint8 r0 = src_argb[2] >> 3; - uint8 b1 = src_argb[4] >> 3; - uint8 g1 = src_argb[5] >> 2; - uint8 r1 = src_argb[6] >> 3; + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 2; + uint8_t r0 = src_argb[2] >> 3; + uint8_t b1 = src_argb[4] >> 3; + uint8_t g1 = src_argb[5] >> 2; + uint8_t r1 = src_argb[6] >> 3; WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27)); dst_rgb += 4; src_argb += 8; } if (width & 1) { - uint8 b0 = src_argb[0] >> 3; - uint8 g0 = src_argb[1] >> 2; - uint8 r0 = src_argb[2] >> 3; - *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 2; + uint8_t r0 = src_argb[2] >> 3; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); } } @@ -223,20 +277,20 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { // endian will not affect order of the original matrix. But the dither4 // will containing the first pixel in the lower byte for little endian // or the upper byte for big endian. 
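To make the byte-order remark above concrete: the C row that follows reads one dither byte per pixel as ((const unsigned char*)(&dither4))[x & 3], adds it to each channel, clamps to 255, and truncates to 5/6/5 bits. Below is a small worked sketch of that per-pixel step; DitherOnePixel, Clamp255, and the example dither word are illustrative only and not taken from libyuv.

/* Illustrative only: per-pixel dither lookup and RGB565 packing. */
#include <stdint.h>

static int Clamp255(int v) { return v > 255 ? 255 : v; }

uint16_t DitherOnePixel(const uint8_t* argb, uint32_t dither4, int x) {
  int d = ((const unsigned char*)&dither4)[x & 3]; /* same lookup as the row */
  uint16_t b = (uint16_t)(Clamp255(argb[0] + d) >> 3);
  uint16_t g = (uint16_t)(Clamp255(argb[1] + d) >> 2);
  uint16_t r = (uint16_t)(Clamp255(argb[2] + d) >> 3);
  return (uint16_t)(b | (g << 5) | (r << 11));
}

On a little-endian target, a caller packing the example values {0, 4, 6, 2} in pixel order would pass dither4 = 0x02060400, so pixel 0 of each group of four receives 0, pixel 1 receives 4, and so on, matching the comment above.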
-void ARGBToRGB565DitherRow_C(const uint8* src_argb, - uint8* dst_rgb, - const uint32 dither4, +void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, int width) { int x; for (x = 0; x < width - 1; x += 2) { int dither0 = ((const unsigned char*)(&dither4))[x & 3]; int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3]; - uint8 b0 = clamp255(src_argb[0] + dither0) >> 3; - uint8 g0 = clamp255(src_argb[1] + dither0) >> 2; - uint8 r0 = clamp255(src_argb[2] + dither0) >> 3; - uint8 b1 = clamp255(src_argb[4] + dither1) >> 3; - uint8 g1 = clamp255(src_argb[5] + dither1) >> 2; - uint8 r1 = clamp255(src_argb[6] + dither1) >> 3; + uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; + uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; + uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; + uint8_t b1 = clamp255(src_argb[4] + dither1) >> 3; + uint8_t g1 = clamp255(src_argb[5] + dither1) >> 2; + uint8_t r1 = clamp255(src_argb[6] + dither1) >> 3; WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27)); dst_rgb += 4; @@ -244,112 +298,138 @@ void ARGBToRGB565DitherRow_C(const uint8* src_argb, } if (width & 1) { int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3]; - uint8 b0 = clamp255(src_argb[0] + dither0) >> 3; - uint8 g0 = clamp255(src_argb[1] + dither0) >> 2; - uint8 r0 = clamp255(src_argb[2] + dither0) >> 3; - *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; + uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; + uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); } } -void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { +void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_argb[0] >> 3; - uint8 g0 = src_argb[1] >> 3; - uint8 r0 = src_argb[2] >> 3; - uint8 a0 = src_argb[3] >> 7; - uint8 b1 = src_argb[4] >> 3; - uint8 g1 = src_argb[5] >> 3; - uint8 r1 = src_argb[6] >> 3; - uint8 a1 = src_argb[7] >> 7; - *(uint32*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) | - (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31); + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 3; + uint8_t r0 = src_argb[2] >> 3; + uint8_t a0 = src_argb[3] >> 7; + uint8_t b1 = src_argb[4] >> 3; + uint8_t g1 = src_argb[5] >> 3; + uint8_t r1 = src_argb[6] >> 3; + uint8_t a1 = src_argb[7] >> 7; + *(uint32_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) | + (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31); dst_rgb += 4; src_argb += 8; } if (width & 1) { - uint8 b0 = src_argb[0] >> 3; - uint8 g0 = src_argb[1] >> 3; - uint8 r0 = src_argb[2] >> 3; - uint8 a0 = src_argb[3] >> 7; - *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 3; + uint8_t r0 = src_argb[2] >> 3; + uint8_t a0 = src_argb[3] >> 7; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); } } -void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { +void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_argb[0] >> 4; - uint8 g0 = src_argb[1] >> 4; - uint8 r0 = src_argb[2] >> 4; - uint8 a0 = src_argb[3] >> 4; - uint8 b1 = src_argb[4] >> 4; - uint8 g1 = src_argb[5] >> 4; - uint8 r1 = src_argb[6] >> 4; - uint8 a1 = src_argb[7] >> 4; - 
*(uint32*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | (b1 << 16) | - (g1 << 20) | (r1 << 24) | (a1 << 28); + uint8_t b0 = src_argb[0] >> 4; + uint8_t g0 = src_argb[1] >> 4; + uint8_t r0 = src_argb[2] >> 4; + uint8_t a0 = src_argb[3] >> 4; + uint8_t b1 = src_argb[4] >> 4; + uint8_t g1 = src_argb[5] >> 4; + uint8_t r1 = src_argb[6] >> 4; + uint8_t a1 = src_argb[7] >> 4; + *(uint32_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | + (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28); dst_rgb += 4; src_argb += 8; } if (width & 1) { - uint8 b0 = src_argb[0] >> 4; - uint8 g0 = src_argb[1] >> 4; - uint8 r0 = src_argb[2] >> 4; - uint8 a0 = src_argb[3] >> 4; - *(uint16*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); + uint8_t b0 = src_argb[0] >> 4; + uint8_t g0 = src_argb[1] >> 4; + uint8_t r0 = src_argb[2] >> 4; + uint8_t a0 = src_argb[3] >> 4; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); + } +} + +void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t b0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2); + uint32_t g0 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2); + uint32_t r0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2); + uint32_t a0 = (src_abgr[3] >> 6); + *(uint32_t*)(dst_ar30) = r0 | (g0 << 10) | (b0 << 20) | (a0 << 30); + dst_ar30 += 4; + src_abgr += 4; + } +} + +void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t b0 = (src_argb[0] >> 6) | ((uint32_t)(src_argb[0]) << 2); + uint32_t g0 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2); + uint32_t r0 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2); + uint32_t a0 = (src_argb[3] >> 6); + *(uint32_t*)(dst_ar30) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30); + dst_ar30 += 4; + src_argb += 4; } } -static __inline int RGBToY(uint8 r, uint8 g, uint8 b) { +static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; } -static __inline int RGBToU(uint8 r, uint8 g, uint8 b) { +static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { return (112 * b - 74 * g - 38 * r + 0x8080) >> 8; } -static __inline int RGBToV(uint8 r, uint8 g, uint8 b) { +static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { return (112 * r - 94 * g - 18 * b + 0x8080) >> 8; } // ARGBToY_C and ARGBToUV_C -#define MAKEROWY(NAME, R, G, B, BPP) \ - void NAME##ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \ - uint8* dst_u, uint8* dst_v, int width) { \ - const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP]) >> \ - 2; \ - uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP]) >> \ - 2; \ - uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + BPP]) >> \ - 2; \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ - uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ - uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ - 
dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - } \ +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP]) >> \ + 2; \ + uint8_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP]) >> \ + 2; \ + uint8_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP]) >> \ + 2; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ + uint8_t ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ + uint8_t ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + } \ } MAKEROWY(ARGB, 2, 1, 0, 4) @@ -385,65 +465,65 @@ MAKEROWY(RAW, 0, 1, 2, 3) // g -0.41869 * 255 = -106.76595 = -107 // r 0.50000 * 255 = 127.5 = 127 -static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) { +static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { return (38 * r + 75 * g + 15 * b + 64) >> 7; } -static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) { +static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; } -static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) { +static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; } #define AVGB(a, b) (((a) + (b) + 1) >> 1) // ARGBToYJ_C and ARGBToUVJ_C -#define MAKEROWYJ(NAME, R, G, B, BPP) \ - void NAME##ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \ - uint8* dst_u, uint8* dst_v, int width) { \ - const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ - AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ - uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ - AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ - uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ - AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]); \ - uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]); \ - uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - } \ +#define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8_t* 
src_rgb0, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ + AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ + AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ + AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \ + uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + } \ } MAKEROWYJ(ARGB, 2, 1, 0, 4) #undef MAKEROWYJ -void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) { +void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_rgb565[0] & 0x1f; - uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8 r = src_rgb565[1] >> 3; + uint8_t b = src_rgb565[0] & 0x1f; + uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r = src_rgb565[1] >> 3; b = (b << 3) | (b >> 2); g = (g << 2) | (g >> 4); r = (r << 3) | (r >> 2); @@ -453,12 +533,12 @@ void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) { } } -void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) { +void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb1555[0] & 0x1f; - uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8 r = (src_argb1555[1] & 0x7c) >> 2; + uint8_t b = src_argb1555[0] & 0x1f; + uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r = (src_argb1555[1] & 0x7c) >> 2; b = (b << 3) | (b >> 2); g = (g << 3) | (g >> 2); r = (r << 3) | (r >> 2); @@ -468,12 +548,12 @@ void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) { } } -void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) { +void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb4444[0] & 0x0f; - uint8 g = src_argb4444[0] >> 4; - uint8 r = src_argb4444[1] & 0x0f; + uint8_t b = src_argb4444[0] & 0x0f; + uint8_t g = src_argb4444[0] >> 4; + uint8_t r = src_argb4444[1] & 0x0f; b = (b << 4) | b; g = (g << 4) | g; r = (r << 4) | r; @@ -483,29 +563,29 @@ void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) { } } -void RGB565ToUVRow_C(const uint8* src_rgb565, +void RGB565ToUVRow_C(const uint8_t* src_rgb565, int src_stride_rgb565, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565; + const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565; int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_rgb565[0] & 0x1f; - uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8 r0 = src_rgb565[1] >> 3; - uint8 b1 = src_rgb565[2] & 0x1f; - uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3); - uint8 r1 = src_rgb565[3] >> 3; - uint8 b2 = next_rgb565[0] & 0x1f; - uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] 
& 0x07) << 3); - uint8 r2 = next_rgb565[1] >> 3; - uint8 b3 = next_rgb565[2] & 0x1f; - uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); - uint8 r3 = next_rgb565[3] >> 3; - uint8 b = (b0 + b1 + b2 + b3); // 565 * 4 = 787. - uint8 g = (g0 + g1 + g2 + g3); - uint8 r = (r0 + r1 + r2 + r3); + uint8_t b0 = src_rgb565[0] & 0x1f; + uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r0 = src_rgb565[1] >> 3; + uint8_t b1 = src_rgb565[2] & 0x1f; + uint8_t g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3); + uint8_t r1 = src_rgb565[3] >> 3; + uint8_t b2 = next_rgb565[0] & 0x1f; + uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); + uint8_t r2 = next_rgb565[1] >> 3; + uint8_t b3 = next_rgb565[2] & 0x1f; + uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); + uint8_t r3 = next_rgb565[3] >> 3; + uint8_t b = (b0 + b1 + b2 + b3); // 565 * 4 = 787. + uint8_t g = (g0 + g1 + g2 + g3); + uint8_t r = (r0 + r1 + r2 + r3); b = (b << 1) | (b >> 6); // 787 -> 888. r = (r << 1) | (r >> 6); dst_u[0] = RGBToU(r, g, b); @@ -516,15 +596,15 @@ void RGB565ToUVRow_C(const uint8* src_rgb565, dst_v += 1; } if (width & 1) { - uint8 b0 = src_rgb565[0] & 0x1f; - uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8 r0 = src_rgb565[1] >> 3; - uint8 b2 = next_rgb565[0] & 0x1f; - uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); - uint8 r2 = next_rgb565[1] >> 3; - uint8 b = (b0 + b2); // 565 * 2 = 676. - uint8 g = (g0 + g2); - uint8 r = (r0 + r2); + uint8_t b0 = src_rgb565[0] & 0x1f; + uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r0 = src_rgb565[1] >> 3; + uint8_t b2 = next_rgb565[0] & 0x1f; + uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); + uint8_t r2 = next_rgb565[1] >> 3; + uint8_t b = (b0 + b2); // 565 * 2 = 676. + uint8_t g = (g0 + g2); + uint8_t r = (r0 + r2); b = (b << 2) | (b >> 4); // 676 -> 888 g = (g << 1) | (g >> 6); r = (r << 2) | (r >> 4); @@ -533,29 +613,29 @@ void RGB565ToUVRow_C(const uint8* src_rgb565, } } -void ARGB1555ToUVRow_C(const uint8* src_argb1555, +void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, int src_stride_argb1555, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555; + const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555; int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_argb1555[0] & 0x1f; - uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8 r0 = (src_argb1555[1] & 0x7c) >> 2; - uint8 b1 = src_argb1555[2] & 0x1f; - uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3); - uint8 r1 = (src_argb1555[3] & 0x7c) >> 2; - uint8 b2 = next_argb1555[0] & 0x1f; - uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); - uint8 r2 = (next_argb1555[1] & 0x7c) >> 2; - uint8 b3 = next_argb1555[2] & 0x1f; - uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3); - uint8 r3 = (next_argb1555[3] & 0x7c) >> 2; - uint8 b = (b0 + b1 + b2 + b3); // 555 * 4 = 777. 
- uint8 g = (g0 + g1 + g2 + g3); - uint8 r = (r0 + r1 + r2 + r3); + uint8_t b0 = src_argb1555[0] & 0x1f; + uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; + uint8_t b1 = src_argb1555[2] & 0x1f; + uint8_t g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3); + uint8_t r1 = (src_argb1555[3] & 0x7c) >> 2; + uint8_t b2 = next_argb1555[0] & 0x1f; + uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); + uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2; + uint8_t b3 = next_argb1555[2] & 0x1f; + uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3); + uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2; + uint8_t b = (b0 + b1 + b2 + b3); // 555 * 4 = 777. + uint8_t g = (g0 + g1 + g2 + g3); + uint8_t r = (r0 + r1 + r2 + r3); b = (b << 1) | (b >> 6); // 777 -> 888. g = (g << 1) | (g >> 6); r = (r << 1) | (r >> 6); @@ -567,15 +647,15 @@ void ARGB1555ToUVRow_C(const uint8* src_argb1555, dst_v += 1; } if (width & 1) { - uint8 b0 = src_argb1555[0] & 0x1f; - uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8 r0 = (src_argb1555[1] & 0x7c) >> 2; - uint8 b2 = next_argb1555[0] & 0x1f; - uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); - uint8 r2 = next_argb1555[1] >> 3; - uint8 b = (b0 + b2); // 555 * 2 = 666. - uint8 g = (g0 + g2); - uint8 r = (r0 + r2); + uint8_t b0 = src_argb1555[0] & 0x1f; + uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; + uint8_t b2 = next_argb1555[0] & 0x1f; + uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); + uint8_t r2 = next_argb1555[1] >> 3; + uint8_t b = (b0 + b2); // 555 * 2 = 666. + uint8_t g = (g0 + g2); + uint8_t r = (r0 + r2); b = (b << 2) | (b >> 4); // 666 -> 888. g = (g << 2) | (g >> 4); r = (r << 2) | (r >> 4); @@ -584,29 +664,29 @@ void ARGB1555ToUVRow_C(const uint8* src_argb1555, } } -void ARGB4444ToUVRow_C(const uint8* src_argb4444, +void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, int src_stride_argb4444, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444; + const uint8_t* next_argb4444 = src_argb4444 + src_stride_argb4444; int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_argb4444[0] & 0x0f; - uint8 g0 = src_argb4444[0] >> 4; - uint8 r0 = src_argb4444[1] & 0x0f; - uint8 b1 = src_argb4444[2] & 0x0f; - uint8 g1 = src_argb4444[2] >> 4; - uint8 r1 = src_argb4444[3] & 0x0f; - uint8 b2 = next_argb4444[0] & 0x0f; - uint8 g2 = next_argb4444[0] >> 4; - uint8 r2 = next_argb4444[1] & 0x0f; - uint8 b3 = next_argb4444[2] & 0x0f; - uint8 g3 = next_argb4444[2] >> 4; - uint8 r3 = next_argb4444[3] & 0x0f; - uint8 b = (b0 + b1 + b2 + b3); // 444 * 4 = 666. - uint8 g = (g0 + g1 + g2 + g3); - uint8 r = (r0 + r1 + r2 + r3); + uint8_t b0 = src_argb4444[0] & 0x0f; + uint8_t g0 = src_argb4444[0] >> 4; + uint8_t r0 = src_argb4444[1] & 0x0f; + uint8_t b1 = src_argb4444[2] & 0x0f; + uint8_t g1 = src_argb4444[2] >> 4; + uint8_t r1 = src_argb4444[3] & 0x0f; + uint8_t b2 = next_argb4444[0] & 0x0f; + uint8_t g2 = next_argb4444[0] >> 4; + uint8_t r2 = next_argb4444[1] & 0x0f; + uint8_t b3 = next_argb4444[2] & 0x0f; + uint8_t g3 = next_argb4444[2] >> 4; + uint8_t r3 = next_argb4444[3] & 0x0f; + uint8_t b = (b0 + b1 + b2 + b3); // 444 * 4 = 666. 
+ uint8_t g = (g0 + g1 + g2 + g3); + uint8_t r = (r0 + r1 + r2 + r3); b = (b << 2) | (b >> 4); // 666 -> 888. g = (g << 2) | (g >> 4); r = (r << 2) | (r >> 4); @@ -618,15 +698,15 @@ void ARGB4444ToUVRow_C(const uint8* src_argb4444, dst_v += 1; } if (width & 1) { - uint8 b0 = src_argb4444[0] & 0x0f; - uint8 g0 = src_argb4444[0] >> 4; - uint8 r0 = src_argb4444[1] & 0x0f; - uint8 b2 = next_argb4444[0] & 0x0f; - uint8 g2 = next_argb4444[0] >> 4; - uint8 r2 = next_argb4444[1] & 0x0f; - uint8 b = (b0 + b2); // 444 * 2 = 555. - uint8 g = (g0 + g2); - uint8 r = (r0 + r2); + uint8_t b0 = src_argb4444[0] & 0x0f; + uint8_t g0 = src_argb4444[0] >> 4; + uint8_t r0 = src_argb4444[1] & 0x0f; + uint8_t b2 = next_argb4444[0] & 0x0f; + uint8_t g2 = next_argb4444[0] >> 4; + uint8_t r2 = next_argb4444[1] & 0x0f; + uint8_t b = (b0 + b2); // 444 * 2 = 555. + uint8_t g = (g0 + g2); + uint8_t r = (r0 + r2); b = (b << 3) | (b >> 2); // 555 -> 888. g = (g << 3) | (g >> 2); r = (r << 3) | (r >> 2); @@ -635,15 +715,15 @@ void ARGB4444ToUVRow_C(const uint8* src_argb4444, } } -void ARGBToUV444Row_C(const uint8* src_argb, - uint8* dst_u, - uint8* dst_v, +void ARGBToUV444Row_C(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, int width) { int x; for (x = 0; x < width; ++x) { - uint8 ab = src_argb[0]; - uint8 ag = src_argb[1]; - uint8 ar = src_argb[2]; + uint8_t ab = src_argb[0]; + uint8_t ag = src_argb[1]; + uint8_t ar = src_argb[2]; dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); src_argb += 4; @@ -652,10 +732,10 @@ void ARGBToUV444Row_C(const uint8* src_argb, } } -void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) { +void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]); + uint8_t y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]); dst_argb[2] = dst_argb[1] = dst_argb[0] = y; dst_argb[3] = src_argb[3]; dst_argb += 4; @@ -664,7 +744,7 @@ void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) { } // Convert a row of image to Sepia tone. -void ARGBSepiaRow_C(uint8* dst_argb, int width) { +void ARGBSepiaRow_C(uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { int b = dst_argb[0]; @@ -683,9 +763,9 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) { // Apply color matrix to a row of image. Matrix is signed. // TODO(fbarchard): Consider adding rounding (+32). -void ARGBColorMatrixRow_C(const uint8* src_argb, - uint8* dst_argb, - const int8* matrix_argb, +void ARGBColorMatrixRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, int width) { int x; for (x = 0; x < width; ++x) { @@ -715,7 +795,9 @@ void ARGBColorMatrixRow_C(const uint8* src_argb, } // Apply color table to a row of image. -void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { +void ARGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { int x; for (x = 0; x < width; ++x) { int b = dst_argb[0]; @@ -731,7 +813,9 @@ void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { } // Apply color table to a row of image. 
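
The narrow-channel rows above repeatedly widen 4/5/6 bit values back to 8 bits by replicating the high bits into the low bits (e.g. b = (b << 3) | (b >> 2)). A minimal standalone sketch of that trick on one RGB565 pixel; the helper names are illustrative only, not part of libyuv:

#include <stdint.h>
#include <stdio.h>

static uint8_t Expand5(uint8_t v) { return (uint8_t)((v << 3) | (v >> 2)); }  // 5 -> 8 bits
static uint8_t Expand6(uint8_t v) { return (uint8_t)((v << 2) | (v >> 4)); }  // 6 -> 8 bits

int main(void) {
  uint16_t rgb565 = 0xF81F;  // full red and blue, no green.
  printf("B=%u G=%u R=%u\n", Expand5(rgb565 & 0x1f),
         Expand6((rgb565 >> 5) & 0x3f), Expand5((uint8_t)(rgb565 >> 11)));
  // prints B=255 G=0 R=255
  return 0;
}
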
-void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { +void RGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { int x; for (x = 0; x < width; ++x) { int b = dst_argb[0]; @@ -744,7 +828,7 @@ void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { } } -void ARGBQuantizeRow_C(uint8* dst_argb, +void ARGBQuantizeRow_C(uint8_t* dst_argb, int scale, int interval_size, int interval_offset, @@ -764,21 +848,21 @@ void ARGBQuantizeRow_C(uint8* dst_argb, #define REPEAT8(v) (v) | ((v) << 8) #define SHADE(f, v) v* f >> 24 -void ARGBShadeRow_C(const uint8* src_argb, - uint8* dst_argb, +void ARGBShadeRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, int width, - uint32 value) { - const uint32 b_scale = REPEAT8(value & 0xff); - const uint32 g_scale = REPEAT8((value >> 8) & 0xff); - const uint32 r_scale = REPEAT8((value >> 16) & 0xff); - const uint32 a_scale = REPEAT8(value >> 24); + uint32_t value) { + const uint32_t b_scale = REPEAT8(value & 0xff); + const uint32_t g_scale = REPEAT8((value >> 8) & 0xff); + const uint32_t r_scale = REPEAT8((value >> 16) & 0xff); + const uint32_t a_scale = REPEAT8(value >> 24); int i; for (i = 0; i < width; ++i) { - const uint32 b = REPEAT8(src_argb[0]); - const uint32 g = REPEAT8(src_argb[1]); - const uint32 r = REPEAT8(src_argb[2]); - const uint32 a = REPEAT8(src_argb[3]); + const uint32_t b = REPEAT8(src_argb[0]); + const uint32_t g = REPEAT8(src_argb[1]); + const uint32_t r = REPEAT8(src_argb[2]); + const uint32_t a = REPEAT8(src_argb[3]); dst_argb[0] = SHADE(b, b_scale); dst_argb[1] = SHADE(g, g_scale); dst_argb[2] = SHADE(r, r_scale); @@ -793,20 +877,20 @@ void ARGBShadeRow_C(const uint8* src_argb, #define REPEAT8(v) (v) | ((v) << 8) #define SHADE(f, v) v* f >> 16 -void ARGBMultiplyRow_C(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +void ARGBMultiplyRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { int i; for (i = 0; i < width; ++i) { - const uint32 b = REPEAT8(src_argb0[0]); - const uint32 g = REPEAT8(src_argb0[1]); - const uint32 r = REPEAT8(src_argb0[2]); - const uint32 a = REPEAT8(src_argb0[3]); - const uint32 b_scale = src_argb1[0]; - const uint32 g_scale = src_argb1[1]; - const uint32 r_scale = src_argb1[2]; - const uint32 a_scale = src_argb1[3]; + const uint32_t b = REPEAT8(src_argb0[0]); + const uint32_t g = REPEAT8(src_argb0[1]); + const uint32_t r = REPEAT8(src_argb0[2]); + const uint32_t a = REPEAT8(src_argb0[3]); + const uint32_t b_scale = src_argb1[0]; + const uint32_t g_scale = src_argb1[1]; + const uint32_t r_scale = src_argb1[2]; + const uint32_t a_scale = src_argb1[3]; dst_argb[0] = SHADE(b, b_scale); dst_argb[1] = SHADE(g, g_scale); dst_argb[2] = SHADE(r, r_scale); @@ -821,9 +905,9 @@ void ARGBMultiplyRow_C(const uint8* src_argb0, #define SHADE(f, v) clamp255(v + f) -void ARGBAddRow_C(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +void ARGBAddRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { int i; for (i = 0; i < width; ++i) { @@ -848,9 +932,9 @@ void ARGBAddRow_C(const uint8* src_argb0, #define SHADE(f, v) clamp0(f - v) -void ARGBSubtractRow_C(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +void ARGBSubtractRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { int i; for (i = 0; i < width; ++i) { @@ -874,10 +958,10 @@ void ARGBSubtractRow_C(const uint8* src_argb0, #undef 
SHADE // Sobel functions which mimics SSSE3. -void SobelXRow_C(const uint8* src_y0, - const uint8* src_y1, - const uint8* src_y2, - uint8* dst_sobelx, +void SobelXRow_C(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, int width) { int i; for (i = 0; i < width; ++i) { @@ -891,13 +975,13 @@ void SobelXRow_C(const uint8* src_y0, int b_diff = b - b_sub; int c_diff = c - c_sub; int sobel = Abs(a_diff + b_diff * 2 + c_diff); - dst_sobelx[i] = (uint8)(clamp255(sobel)); + dst_sobelx[i] = (uint8_t)(clamp255(sobel)); } } -void SobelYRow_C(const uint8* src_y0, - const uint8* src_y1, - uint8* dst_sobely, +void SobelYRow_C(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, int width) { int i; for (i = 0; i < width; ++i) { @@ -911,62 +995,62 @@ void SobelYRow_C(const uint8* src_y0, int b_diff = b - b_sub; int c_diff = c - c_sub; int sobel = Abs(a_diff + b_diff * 2 + c_diff); - dst_sobely[i] = (uint8)(clamp255(sobel)); + dst_sobely[i] = (uint8_t)(clamp255(sobel)); } } -void SobelRow_C(const uint8* src_sobelx, - const uint8* src_sobely, - uint8* dst_argb, +void SobelRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, int width) { int i; for (i = 0; i < width; ++i) { int r = src_sobelx[i]; int b = src_sobely[i]; int s = clamp255(r + b); - dst_argb[0] = (uint8)(s); - dst_argb[1] = (uint8)(s); - dst_argb[2] = (uint8)(s); - dst_argb[3] = (uint8)(255u); + dst_argb[0] = (uint8_t)(s); + dst_argb[1] = (uint8_t)(s); + dst_argb[2] = (uint8_t)(s); + dst_argb[3] = (uint8_t)(255u); dst_argb += 4; } } -void SobelToPlaneRow_C(const uint8* src_sobelx, - const uint8* src_sobely, - uint8* dst_y, +void SobelToPlaneRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, int width) { int i; for (i = 0; i < width; ++i) { int r = src_sobelx[i]; int b = src_sobely[i]; int s = clamp255(r + b); - dst_y[i] = (uint8)(s); + dst_y[i] = (uint8_t)(s); } } -void SobelXYRow_C(const uint8* src_sobelx, - const uint8* src_sobely, - uint8* dst_argb, +void SobelXYRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, int width) { int i; for (i = 0; i < width; ++i) { int r = src_sobelx[i]; int b = src_sobely[i]; int g = clamp255(r + b); - dst_argb[0] = (uint8)(b); - dst_argb[1] = (uint8)(g); - dst_argb[2] = (uint8)(r); - dst_argb[3] = (uint8)(255u); + dst_argb[0] = (uint8_t)(b); + dst_argb[1] = (uint8_t)(g); + dst_argb[2] = (uint8_t)(r); + dst_argb[3] = (uint8_t)(255u); dst_argb += 4; } } -void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { +void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { // Copy a Y to RGB. int x; for (x = 0; x < width; ++x) { - uint8 y = src_y[0]; + uint8_t y = src_y[0]; dst_argb[2] = dst_argb[1] = dst_argb[0] = y; dst_argb[3] = 255u; dst_argb += 4; @@ -1223,12 +1307,14 @@ const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { #undef YG // C reference code that mimics the YUV assembly. -static __inline void YuvPixel(uint8 y, - uint8 u, - uint8 v, - uint8* b, - uint8* g, - uint8* r, +// Reads 8 bit YUV and leaves result as 16 bit. 
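
The YuvPixel helpers that follow do this conversion in integer fixed point. For orientation only, a self-contained one-pixel sketch using the widely published 8.8 fixed-point BT.601 studio-range coefficients; this is an approximation for illustration, not libyuv's exact constant tables:

#include <stdint.h>
#include <stdio.h>

static uint8_t Clamp8(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

static void YuvToRgbRef(uint8_t y, uint8_t u, uint8_t v,
                        uint8_t* r, uint8_t* g, uint8_t* b) {
  int c = y - 16, d = u - 128, e = v - 128;          // remove studio-swing offsets.
  *r = Clamp8((298 * c + 409 * e + 128) >> 8);       // 8.8 fixed-point coefficients.
  *g = Clamp8((298 * c - 100 * d - 208 * e + 128) >> 8);
  *b = Clamp8((298 * c + 516 * d + 128) >> 8);
}

int main(void) {
  uint8_t r, g, b;
  YuvToRgbRef(235, 128, 128, &r, &g, &b);  // studio white -> (255, 255, 255)
  printf("R=%u G=%u B=%u\n", r, g, b);
  return 0;
}
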
+ +static __inline void YuvPixel(uint8_t y, + uint8_t u, + uint8_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, const struct YuvConstants* yuvconstants) { #if defined(__aarch64__) int ub = -yuvconstants->kUVToRB[0]; @@ -1259,10 +1345,117 @@ static __inline void YuvPixel(uint8 y, int yg = yuvconstants->kYToRgb[0]; #endif - uint32 y1 = (uint32)(y * 0x0101 * yg) >> 16; - *b = Clamp((int32)(-(u * ub) + y1 + bb) >> 6); - *g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6); - *r = Clamp((int32)(-(v * vr) + y1 + br) >> 6); + uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; + *b = Clamp((int32_t)(-(u * ub) + y1 + bb) >> 6); + *g = Clamp((int32_t)(-(u * ug + v * vg) + y1 + bg) >> 6); + *r = Clamp((int32_t)(-(v * vr) + y1 + br) >> 6); +} + +// Reads 8 bit YUV and leaves result as 16 bit. +static __inline void YuvPixel8_16(uint8_t y, + uint8_t u, + uint8_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = -yuvconstants->kUVToRB[1]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#elif defined(__arm__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[4]; + int vr = -yuvconstants->kUVToRB[4]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#else + int ub = yuvconstants->kUVToB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = yuvconstants->kUVToR[1]; + int bb = yuvconstants->kUVBiasB[0]; + int bg = yuvconstants->kUVBiasG[0]; + int br = yuvconstants->kUVBiasR[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + + uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; + *b = (int)(-(u * ub) + y1 + bb); + *g = (int)(-(u * ug + v * vg) + y1 + bg); + *r = (int)(-(v * vr) + y1 + br); +} + +// C reference code that mimics the YUV 16 bit assembly. +// Reads 10 bit YUV and leaves result as 16 bit. 
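
YuvPixel16 below keeps 6 fractional bits in its int results and YuvPixel10 then shifts and clamps them back to 8 bits, after first reducing the 10 bit chroma to 8 bits with a clamp. A tiny sketch of that bit arithmetic, with arbitrary example values:

#include <stdint.h>
#include <stdio.h>

static int Clamp255(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

int main(void) {
  uint16_t u10 = 720;                    // a 10 bit chroma sample.
  int u8 = Clamp255(u10 >> 2);           // 10 -> 8 bits: 180.
  int chan_10_6 = (200 << 6) + 37;       // a 16 bit channel with 6 fractional bits.
  int chan8 = Clamp255(chan_10_6 >> 6);  // drop the fraction and clamp: 200.
  printf("u8=%d chan8=%d\n", u8, chan8);
  return 0;
}
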
+static __inline void YuvPixel16(int16_t y, + int16_t u, + int16_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = -yuvconstants->kUVToRB[1]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#elif defined(__arm__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[4]; + int vr = -yuvconstants->kUVToRB[4]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#else + int ub = yuvconstants->kUVToB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = yuvconstants->kUVToR[1]; + int bb = yuvconstants->kUVBiasB[0]; + int bg = yuvconstants->kUVBiasG[0]; + int br = yuvconstants->kUVBiasR[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + + uint32_t y1 = (uint32_t)((y << 6) * yg) >> 16; + u = clamp255(u >> 2); + v = clamp255(v >> 2); + *b = (int)(-(u * ub) + y1 + bb); + *g = (int)(-(u * ug + v * vg) + y1 + bg); + *r = (int)(-(v * vr) + y1 + br); +} + +// C reference code that mimics the YUV 10 bit assembly. +// Reads 10 bit YUV and clamps down to 8 bit RGB. +static __inline void YuvPixel10(uint16_t y, + uint16_t u, + uint16_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { + int b16; + int g16; + int r16; + YuvPixel16(y, u, v, &b16, &g16, &r16, yuvconstants); + *b = Clamp(b16 >> 6); + *g = Clamp(g16 >> 6); + *r = Clamp(r16 >> 6); } // Y contribution to R,G,B. Scale and bias. @@ -1270,11 +1463,11 @@ static __inline void YuvPixel(uint8 y, #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ // C reference code that mimics the YUV assembly. -static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) { - uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16; - *b = Clamp((int32)(y1 + YGB) >> 6); - *g = Clamp((int32)(y1 + YGB) >> 6); - *r = Clamp((int32)(y1 + YGB) >> 6); +static __inline void YPixel(uint8_t y, uint8_t* b, uint8_t* g, uint8_t* r) { + uint32_t y1 = (uint32_t)(y * 0x0101 * YG) >> 16; + *b = Clamp((int32_t)(y1 + YGB) >> 6); + *g = Clamp((int32_t)(y1 + YGB) >> 6); + *r = Clamp((int32_t)(y1 + YGB) >> 6); } #undef YG @@ -1284,16 +1477,16 @@ static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) { (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON)) // C mimic assembly. // TODO(fbarchard): Remove subsampling from Neon. 
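
The NEON-mimicking I444 path below halves the chroma with the rounding average (a + b + 1) >> 1, the same idea as the AVGB macro earlier. A one-line sketch of why the +1 matters:

#include <stdio.h>

static int AvgRound(int a, int b) { return (a + b + 1) >> 1; }  // rounds to nearest.
static int AvgTrunc(int a, int b) { return (a + b) >> 1; }      // truncates.

int main(void) {
  printf("%d vs %d\n", AvgRound(100, 101), AvgTrunc(100, 101));  // 101 vs 100
  return 0;
}
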
-void I444ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I444ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint8 u = (src_u[0] + src_u[1] + 1) >> 1; - uint8 v = (src_v[0] + src_v[1] + 1) >> 1; + uint8_t u = (src_u[0] + src_u[1] + 1) >> 1; + uint8_t v = (src_v[0] + src_v[1] + 1) >> 1; YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; @@ -1312,10 +1505,10 @@ void I444ToARGBRow_C(const uint8* src_y, } } #else -void I444ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I444ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; @@ -1332,10 +1525,10 @@ void I444ToARGBRow_C(const uint8* src_y, #endif // Also used for 420 -void I422ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I422ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; @@ -1358,11 +1551,105 @@ void I422ToARGBRow_C(const uint8* src_y, } } -void I422AlphaToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - const uint8* src_a, - uint8* rgb_buf, +// 10 bit YUV to ARGB +void I210ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) { + uint32_t ar30; + b = b >> 4; // convert 10.6 to 10 bit. + g = g >> 4; + r = r >> 4; + b = Clamp10(b); + g = Clamp10(g); + r = Clamp10(r); + ar30 = b | ((uint32_t)g << 10) | ((uint32_t)r << 20) | 0xc0000000; + (*(uint32_t*)rgb_buf) = ar30; +} + +// 10 bit YUV to 10 bit AR30 +void I210ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + YuvPixel16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf + 4, b, g, r); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + } +} + +// 8 bit YUV to 10 bit AR30 +// Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits. 
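
The AR30 rows above and below pack 10-bit B, G and R plus a 2-bit alpha into one 32-bit word, widening 8-bit channels by shifting up 2 and replicating the top 2 bits. A standalone sketch of that packing in the ARGB-to-AR30 channel order; the helper is illustrative, not the libyuv API:

#include <stdint.h>
#include <stdio.h>

static uint32_t To10(uint8_t v) { return ((uint32_t)v << 2) | (v >> 6); }  // 8 -> 10 bits

int main(void) {
  uint8_t b = 0, g = 128, r = 255, a = 255;
  uint32_t ar30 = To10(b) | (To10(g) << 10) | (To10(r) << 20) |
                  ((uint32_t)(a >> 6) << 30);
  printf("AR30 = 0x%08x\n", (unsigned)ar30);  // prints 0xfff80800
  return 0;
}
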
+void I422ToAR30Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf + 4, b, g, r); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + } +} + +void I422AlphaToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; @@ -1386,10 +1673,10 @@ void I422AlphaToARGBRow_C(const uint8* src_y, } } -void I422ToRGB24Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I422ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; @@ -1409,18 +1696,18 @@ void I422ToRGB24Row_C(const uint8* src_y, } } -void I422ToARGB4444Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { - uint8 b0; - uint8 g0; - uint8 r0; - uint8 b1; - uint8 g1; - uint8 r1; + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); @@ -1431,8 +1718,8 @@ void I422ToARGB4444Row_C(const uint8* src_y, b1 = b1 >> 4; g1 = g1 >> 4; r1 = r1 >> 4; - *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) | - (g1 << 20) | (r1 << 24) | 0xf000f000; + *(uint32_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) | + (g1 << 20) | (r1 << 24) | 0xf000f000; src_y += 2; src_u += 1; src_v += 1; @@ -1443,22 +1730,22 @@ void I422ToARGB4444Row_C(const uint8* src_y, b0 = b0 >> 4; g0 = g0 >> 4; r0 = r0 >> 4; - *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000; + *(uint16_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000; } } -void I422ToARGB1555Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, +void I422ToARGB1555Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { - uint8 b0; - uint8 g0; - uint8 r0; - uint8 b1; - uint8 g1; - uint8 r1; + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); @@ -1469,8 +1756,8 @@ void I422ToARGB1555Row_C(const uint8* src_y, b1 = b1 >> 3; g1 = g1 >> 3; r1 = r1 >> 3; - *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) | - (g1 << 21) | (r1 << 26) | 0x80008000; + *(uint32_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) | + (g1 << 21) | (r1 << 26) | 0x80008000; src_y += 2; src_u += 1; src_v += 1; @@ -1481,22 +1768,22 @@ void I422ToARGB1555Row_C(const uint8* src_y, b0 = b0 >> 3; g0 = g0 >> 3; r0 = r0 >> 3; - *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | 
(r0 << 10) | 0x8000; + *(uint16_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000; } } -void I422ToRGB565Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - uint8 b0; - uint8 g0; - uint8 r0; - uint8 b1; - uint8 g1; - uint8 r1; + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); @@ -1507,7 +1794,7 @@ void I422ToRGB565Row_C(const uint8* src_y, b1 = b1 >> 3; g1 = g1 >> 2; r1 = r1 >> 3; - *(uint32*)(dst_rgb565) = + *(uint32_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27); src_y += 2; src_u += 1; @@ -1519,13 +1806,13 @@ void I422ToRGB565Row_C(const uint8* src_y, b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; - *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); } } -void NV12ToARGBRow_C(const uint8* src_y, - const uint8* src_uv, - uint8* rgb_buf, +void NV12ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; @@ -1547,9 +1834,9 @@ void NV12ToARGBRow_C(const uint8* src_y, } } -void NV21ToARGBRow_C(const uint8* src_y, - const uint8* src_vu, - uint8* rgb_buf, +void NV21ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; @@ -1571,17 +1858,59 @@ void NV21ToARGBRow_C(const uint8* src_y, } } -void NV12ToRGB565Row_C(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, +void NV12ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); + src_y += 2; + src_uv += 2; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + } +} + +void NV21ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); + src_y += 2; + src_vu += 2; + rgb_buf += 6; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + } +} + +void NV12ToRGB565Row_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - uint8 b0; - uint8 g0; - uint8 r0; - uint8 b1; - uint8 g1; - uint8 r1; + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); @@ -1592,7 +1921,7 @@ void NV12ToRGB565Row_C(const uint8* src_y, b1 = b1 >> 3; g1 = g1 >> 2; r1 = r1 >> 3; - *(uint32*)(dst_rgb565) = + *(uint32_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27); src_y += 2; src_uv += 2; @@ -1603,12 +1932,12 @@ void NV12ToRGB565Row_C(const uint8* src_y, b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; - *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); } } -void YUY2ToARGBRow_C(const uint8* src_yuy2, - uint8* rgb_buf, +void YUY2ToARGBRow_C(const uint8_t* src_yuy2, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; @@ -1629,8 +1958,8 @@ void YUY2ToARGBRow_C(const uint8* src_yuy2, } } -void UYVYToARGBRow_C(const uint8* src_uyvy, - uint8* rgb_buf, +void UYVYToARGBRow_C(const uint8_t* src_uyvy, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; @@ -1651,10 +1980,10 @@ void UYVYToARGBRow_C(const uint8* src_uyvy, } } -void I422ToRGBARow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I422ToRGBARow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; @@ -1677,7 +2006,7 @@ void I422ToRGBARow_C(const uint8* src_y, } } -void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) { +void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width) { int x; for (x = 0; x < width - 1; x += 2) { YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); @@ -1693,7 +2022,7 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) { } } -void MirrorRow_C(const uint8* src, uint8* dst, int width) { +void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { int x; src += width - 1; for (x = 0; x < width - 1; x += 2) { @@ -1706,7 +2035,10 @@ void MirrorRow_C(const uint8* src, uint8* dst, int width) { } } -void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { +void MirrorUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { int x; src_uv += (width - 1) << 1; for (x = 0; x < width - 1; x += 2) { @@ -1722,10 +2054,10 @@ void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { } } -void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) { +void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { int x; - const uint32* src32 = (const uint32*)(src); - uint32* dst32 = (uint32*)(dst); + const uint32_t* src32 = (const uint32_t*)(src); + uint32_t* dst32 = (uint32_t*)(dst); src32 += width - 1; for (x = 0; x < width - 1; x += 2) { dst32[x] = src32[0]; @@ -1737,7 +2069,10 @@ void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) { } } -void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { +void SplitUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { int x; for (x = 0; x < 
width - 1; x += 2) { dst_u[x] = src_uv[0]; @@ -1752,9 +2087,9 @@ void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { } } -void MergeUVRow_C(const uint8* src_u, - const uint8* src_v, - uint8* dst_uv, +void MergeUVRow_C(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width) { int x; for (x = 0; x < width - 1; x += 2) { @@ -1770,20 +2105,110 @@ void MergeUVRow_C(const uint8* src_u, } } -void CopyRow_C(const uint8* src, uint8* dst, int count) { +void SplitRGBRow_C(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_r[x] = src_rgb[0]; + dst_g[x] = src_rgb[1]; + dst_b[x] = src_rgb[2]; + src_rgb += 3; + } +} + +void MergeRGBRow_C(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_rgb[0] = src_r[x]; + dst_rgb[1] = src_g[x]; + dst_rgb[2] = src_b[x]; + dst_rgb += 3; + } +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits +void MergeUVRow_16_C(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int scale, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_uv[0] = src_u[x] * scale; + dst_uv[1] = src_v[x] * scale; + dst_uv[2] = src_u[x + 1] * scale; + dst_uv[3] = src_v[x + 1] * scale; + dst_uv += 4; + } + if (width & 1) { + dst_uv[0] = src_u[width - 1] * scale; + dst_uv[1] = src_v[width - 1] * scale; + } +} + +void MultiplyRow_16_C(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_y[x] = src_y[x] * scale; + } +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +void Convert16To8Row_C(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_y[x] = clamp255((src_y[x] * scale) >> 16); + } +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 1024 = 10 bits +void Convert8To16Row_C(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + int x; + scale *= 0x0101; // replicates the byte. + for (x = 0; x < width; ++x) { + dst_y[x] = (src_y[x] * scale) >> 16; + } +} + +void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) { memcpy(dst, src, count); } -void CopyRow_16_C(const uint16* src, uint16* dst, int count) { +void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count) { memcpy(dst, src, count * 2); } -void SetRow_C(uint8* dst, uint8 v8, int width) { +void SetRow_C(uint8_t* dst, uint8_t v8, int width) { memset(dst, v8, width); } -void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) { - uint32* d = (uint32*)(dst_argb); +void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) { + uint32_t* d = (uint32_t*)(dst_argb); int x; for (x = 0; x < width; ++x) { d[x] = v32; @@ -1791,10 +2216,10 @@ void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) { } // Filter 2 rows of YUY2 UV's (422) into U and V (420). -void YUY2ToUVRow_C(const uint8* src_yuy2, +void YUY2ToUVRow_C(const uint8_t* src_yuy2, int src_stride_yuy2, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { // Output a row of UV values, filtering 2 rows of YUY2. 
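
YUY2 stores 2 pixels as the 4 bytes Y0 U Y1 V, so one U/V pair already covers 2 columns; producing 4:2:0 additionally averages the chroma of 2 rows. A minimal sketch of that idea with illustrative values, not the libyuv loop itself:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint8_t row0[4] = {16, 100, 17, 200};  // Y0 U Y1 V
  uint8_t row1[4] = {18, 110, 19, 190};
  uint8_t u = (uint8_t)((row0[1] + row1[1] + 1) >> 1);  // 105
  uint8_t v = (uint8_t)((row0[3] + row1[3] + 1) >> 1);  // 195
  printf("U=%u V=%u\n", u, v);
  return 0;
}
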
int x; @@ -1808,9 +2233,9 @@ void YUY2ToUVRow_C(const uint8* src_yuy2, } // Copy row of YUY2 UV's (422) into U and V (422). -void YUY2ToUV422Row_C(const uint8* src_yuy2, - uint8* dst_u, - uint8* dst_v, +void YUY2ToUV422Row_C(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, int width) { // Output a row of UV values. int x; @@ -1824,7 +2249,7 @@ void YUY2ToUV422Row_C(const uint8* src_yuy2, } // Copy row of YUY2 Y's (422) into Y (420/422). -void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) { +void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { // Output a row of Y values. int x; for (x = 0; x < width - 1; x += 2) { @@ -1838,10 +2263,10 @@ void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) { } // Filter 2 rows of UYVY UV's (422) into U and V (420). -void UYVYToUVRow_C(const uint8* src_uyvy, +void UYVYToUVRow_C(const uint8_t* src_uyvy, int src_stride_uyvy, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { // Output a row of UV values. int x; @@ -1855,9 +2280,9 @@ void UYVYToUVRow_C(const uint8* src_uyvy, } // Copy row of UYVY UV's (422) into U and V (422). -void UYVYToUV422Row_C(const uint8* src_uyvy, - uint8* dst_u, - uint8* dst_v, +void UYVYToUV422Row_C(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, int width) { // Output a row of UV values. int x; @@ -1871,7 +2296,7 @@ void UYVYToUV422Row_C(const uint8* src_uyvy, } // Copy row of UYVY Y's (422) into Y (420/422). -void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) { +void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { // Output a row of Y values. int x; for (x = 0; x < width - 1; x += 2) { @@ -1889,19 +2314,19 @@ void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) { // Blend src_argb0 over src_argb1 and store to dst_argb. // dst_argb may be src_argb0 or src_argb1. // This code mimics the SSSE3 version for better testability. 
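
ARGBBlendRow_C below applies a fixed-point BLEND macro per channel (defined earlier in this file, not shown in this hunk). As a generic point of reference only, the textbook "source over destination" formula for one channel, with rounding:

#include <stdint.h>
#include <stdio.h>

// Generic non-premultiplied "over" blend for a single channel (reference only).
static uint8_t BlendOver(uint8_t fg, uint8_t bg, uint8_t a) {
  return (uint8_t)((fg * a + bg * (255 - a) + 127) / 255);
}

int main(void) {
  printf("%u\n", BlendOver(200, 40, 128));  // about half way between 40 and 200: 120
  return 0;
}
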
-void ARGBBlendRow_C(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +void ARGBBlendRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint32 fb = src_argb0[0]; - uint32 fg = src_argb0[1]; - uint32 fr = src_argb0[2]; - uint32 a = src_argb0[3]; - uint32 bb = src_argb1[0]; - uint32 bg = src_argb1[1]; - uint32 br = src_argb1[2]; + uint32_t fb = src_argb0[0]; + uint32_t fg = src_argb0[1]; + uint32_t fr = src_argb0[2]; + uint32_t a = src_argb0[3]; + uint32_t bb = src_argb1[0]; + uint32_t bg = src_argb1[1]; + uint32_t br = src_argb1[2]; dst_argb[0] = BLEND(fb, bb, a); dst_argb[1] = BLEND(fg, bg, a); dst_argb[2] = BLEND(fr, br, a); @@ -1924,13 +2349,13 @@ void ARGBBlendRow_C(const uint8* src_argb0, } if (width & 1) { - uint32 fb = src_argb0[0]; - uint32 fg = src_argb0[1]; - uint32 fr = src_argb0[2]; - uint32 a = src_argb0[3]; - uint32 bb = src_argb1[0]; - uint32 bg = src_argb1[1]; - uint32 br = src_argb1[2]; + uint32_t fb = src_argb0[0]; + uint32_t fg = src_argb0[1]; + uint32_t fr = src_argb0[2]; + uint32_t a = src_argb0[3]; + uint32_t bb = src_argb1[0]; + uint32_t bg = src_argb1[1]; + uint32_t br = src_argb1[2]; dst_argb[0] = BLEND(fb, bb, a); dst_argb[1] = BLEND(fg, bg, a); dst_argb[2] = BLEND(fr, br, a); @@ -1940,10 +2365,10 @@ void ARGBBlendRow_C(const uint8* src_argb0, #undef BLEND #define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8 -void BlendPlaneRow_C(const uint8* src0, - const uint8* src1, - const uint8* alpha, - uint8* dst, +void BlendPlaneRow_C(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, int width) { int x; for (x = 0; x < width - 1; x += 2) { @@ -1964,13 +2389,13 @@ void BlendPlaneRow_C(const uint8* src0, // Multiply source RGB by alpha and store to destination. // This code mimics the SSSE3 version for better testability. -void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { +void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { int i; for (i = 0; i < width - 1; i += 2) { - uint32 b = src_argb[0]; - uint32 g = src_argb[1]; - uint32 r = src_argb[2]; - uint32 a = src_argb[3]; + uint32_t b = src_argb[0]; + uint32_t g = src_argb[1]; + uint32_t r = src_argb[2]; + uint32_t a = src_argb[3]; dst_argb[0] = ATTENUATE(b, a); dst_argb[1] = ATTENUATE(g, a); dst_argb[2] = ATTENUATE(r, a); @@ -1988,10 +2413,10 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { } if (width & 1) { - const uint32 b = src_argb[0]; - const uint32 g = src_argb[1]; - const uint32 r = src_argb[2]; - const uint32 a = src_argb[3]; + const uint32_t b = src_argb[0]; + const uint32_t g = src_argb[1]; + const uint32_t r = src_argb[2]; + const uint32_t a = src_argb[3]; dst_argb[0] = ATTENUATE(b, a); dst_argb[1] = ATTENUATE(g, a); dst_argb[2] = ATTENUATE(r, a); @@ -2007,7 +2432,7 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { // Reciprocal method is off by 1 on some values. ie 125 // 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower. 
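
Each table entry below stores 1.0 in the upper 16 bits and 65536 / a in the lower 16 bits, so ARGBUnattenuateRow_C can undo a premultiply with a multiply and shift instead of a divide. A standalone sketch of the arithmetic; the (v * a) >> 8 premultiply here is illustrative, not libyuv's ATTENUATE macro:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint32_t a = 64;                              // alpha.
  uint32_t original = 200;
  uint32_t attenuated = (original * a) >> 8;    // 50: a simple premultiply.
  uint32_t ia = (0x10000u / a) & 0xffff;        // low half of a table entry: 1024.
  uint32_t recovered = (attenuated * ia) >> 8;  // 200: multiply + shift undoes it.
  printf("attenuated=%u recovered=%u\n", (unsigned)attenuated, (unsigned)recovered);
  return 0;
}
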
#define T(a) 0x01000000 + (0x10000 / a) -const uint32 fixed_invtbl8[256] = { +const uint32_t fixed_invtbl8[256] = { 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), @@ -2047,14 +2472,16 @@ const uint32 fixed_invtbl8[256] = { T(0xfc), T(0xfd), T(0xfe), 0x01000100}; #undef T -void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { +void ARGBUnattenuateRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { - uint32 b = src_argb[0]; - uint32 g = src_argb[1]; - uint32 r = src_argb[2]; - const uint32 a = src_argb[3]; - const uint32 ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point + uint32_t b = src_argb[0]; + uint32_t g = src_argb[1]; + uint32_t r = src_argb[2]; + const uint32_t a = src_argb[3]; + const uint32_t ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point b = (b * ia) >> 8; g = (g * ia) >> 8; r = (r * ia) >> 8; @@ -2068,11 +2495,11 @@ void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { } } -void ComputeCumulativeSumRow_C(const uint8* row, - int32* cumsum, - const int32* previous_cumsum, +void ComputeCumulativeSumRow_C(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, int width) { - int32 row_sum[4] = {0, 0, 0, 0}; + int32_t row_sum[4] = {0, 0, 0, 0}; int x; for (x = 0; x < width; ++x) { row_sum[0] += row[x * 4 + 0]; @@ -2086,19 +2513,19 @@ void ComputeCumulativeSumRow_C(const uint8* row, } } -void CumulativeSumToAverageRow_C(const int32* tl, - const int32* bl, +void CumulativeSumToAverageRow_C(const int32_t* tl, + const int32_t* bl, int w, int area, - uint8* dst, + uint8_t* dst, int count) { float ooa = 1.0f / area; int i; for (i = 0; i < count; ++i) { - dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); - dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); - dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa); - dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa); + dst[0] = (uint8_t)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); + dst[1] = (uint8_t)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); + dst[2] = (uint8_t)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa); + dst[3] = (uint8_t)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa); dst += 4; tl += 4; bl += 4; @@ -2107,9 +2534,9 @@ void CumulativeSumToAverageRow_C(const int32* tl, // Copy pixels from rotated source to destination row with a slope. LIBYUV_API -void ARGBAffineRow_C(const uint8* src_argb, +void ARGBAffineRow_C(const uint8_t* src_argb, int src_argb_stride, - uint8* dst_argb, + uint8_t* dst_argb, const float* uv_dudv, int width) { int i; @@ -2120,8 +2547,8 @@ void ARGBAffineRow_C(const uint8* src_argb, for (i = 0; i < width; ++i) { int x = (int)(uv[0]); int y = (int)(uv[1]); - *(uint32*)(dst_argb) = - *(const uint32*)(src_argb + y * src_argb_stride + x * 4); + *(uint32_t*)(dst_argb) = + *(const uint32_t*)(src_argb + y * src_argb_stride + x * 4); dst_argb += 4; uv[0] += uv_dudv[2]; uv[1] += uv_dudv[3]; @@ -2129,9 +2556,9 @@ void ARGBAffineRow_C(const uint8* src_argb, } // Blend 2 rows into 1. 
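
HalfRow_C and InterpolateRow_C below mix two rows; InterpolateRow_C weights them with a source_y_fraction in the range 0..256. A sketch of that weighting for a single byte; the helper is illustrative, not the row function:

#include <stdint.h>
#include <stdio.h>

static uint8_t Lerp256(uint8_t a, uint8_t b, int frac_b) {
  // frac_b = 0 returns a, frac_b = 256 returns b, with rounding in between.
  return (uint8_t)((a * (256 - frac_b) + b * frac_b + 128) >> 8);
}

int main(void) {
  printf("%u %u %u\n", Lerp256(10, 250, 0), Lerp256(10, 250, 128),
         Lerp256(10, 250, 256));  // prints 10 130 250
  return 0;
}
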
-static void HalfRow_C(const uint8* src_uv, +static void HalfRow_C(const uint8_t* src_uv, ptrdiff_t src_uv_stride, - uint8* dst_uv, + uint8_t* dst_uv, int width) { int x; for (x = 0; x < width; ++x) { @@ -2139,9 +2566,9 @@ static void HalfRow_C(const uint8* src_uv, } } -static void HalfRow_16_C(const uint16* src_uv, +static void HalfRow_16_C(const uint16_t* src_uv, ptrdiff_t src_uv_stride, - uint16* dst_uv, + uint16_t* dst_uv, int width) { int x; for (x = 0; x < width; ++x) { @@ -2150,14 +2577,14 @@ static void HalfRow_16_C(const uint16* src_uv, } // C version 2x2 -> 2x1. -void InterpolateRow_C(uint8* dst_ptr, - const uint8* src_ptr, +void InterpolateRow_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, ptrdiff_t src_stride, int width, int source_y_fraction) { int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; - const uint8* src_ptr1 = src_ptr + src_stride; + const uint8_t* src_ptr1 = src_ptr + src_stride; int x; if (y1_fraction == 0) { memcpy(dst_ptr, src_ptr, width); @@ -2182,14 +2609,14 @@ void InterpolateRow_C(uint8* dst_ptr, } } -void InterpolateRow_16_C(uint16* dst_ptr, - const uint16* src_ptr, +void InterpolateRow_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, ptrdiff_t src_stride, int width, int source_y_fraction) { int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; - const uint16* src_ptr1 = src_ptr + src_stride; + const uint16_t* src_ptr1 = src_ptr + src_stride; int x; if (source_y_fraction == 0) { memcpy(dst_ptr, src_ptr, width * 2); @@ -2212,9 +2639,9 @@ void InterpolateRow_16_C(uint16* dst_ptr, } // Use first 4 shuffler values to reorder ARGB channels. -void ARGBShuffleRow_C(const uint8* src_argb, - uint8* dst_argb, - const uint8* shuffler, +void ARGBShuffleRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, int width) { int index0 = shuffler[0]; int index1 = shuffler[1]; @@ -2224,10 +2651,10 @@ void ARGBShuffleRow_C(const uint8* src_argb, int x; for (x = 0; x < width; ++x) { // To support in-place conversion. 
- uint8 b = src_argb[index0]; - uint8 g = src_argb[index1]; - uint8 r = src_argb[index2]; - uint8 a = src_argb[index3]; + uint8_t b = src_argb[index0]; + uint8_t g = src_argb[index1]; + uint8_t r = src_argb[index2]; + uint8_t a = src_argb[index3]; dst_argb[0] = b; dst_argb[1] = g; dst_argb[2] = r; @@ -2237,10 +2664,10 @@ void ARGBShuffleRow_C(const uint8* src_argb, } } -void I422ToYUY2Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, +void I422ToYUY2Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, int width) { int x; for (x = 0; x < width - 1; x += 2) { @@ -2261,10 +2688,10 @@ void I422ToYUY2Row_C(const uint8* src_y, } } -void I422ToUYVYRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, +void I422ToUYVYRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, int width) { int x; for (x = 0; x < width - 1; x += 2) { @@ -2285,8 +2712,8 @@ void I422ToUYVYRow_C(const uint8* src_y, } } -void ARGBPolynomialRow_C(const uint8* src_argb, - uint8* dst_argb, +void ARGBPolynomialRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, const float* poly, int width) { int i; @@ -2316,10 +2743,10 @@ void ARGBPolynomialRow_C(const uint8* src_argb, dr += poly[14] * r3; da += poly[15] * a3; - dst_argb[0] = Clamp((int32)(db)); - dst_argb[1] = Clamp((int32)(dg)); - dst_argb[2] = Clamp((int32)(dr)); - dst_argb[3] = Clamp((int32)(da)); + dst_argb[0] = Clamp((int32_t)(db)); + dst_argb[1] = Clamp((int32_t)(dg)); + dst_argb[2] = Clamp((int32_t)(dr)); + dst_argb[3] = Clamp((int32_t)(da)); src_argb += 4; dst_argb += 4; } @@ -2335,31 +2762,49 @@ void ARGBPolynomialRow_C(const uint8* src_argb, // simply extract the low bits of the exponent and the high // bits of the mantissa from our float and we're done. -void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width) { +// Work around GCC 7 punning warning -Wstrict-aliasing +#if defined(__GNUC__) +typedef uint32_t __attribute__((__may_alias__)) uint32_alias_t; +#else +typedef uint32_t uint32_alias_t; +#endif + +void HalfFloatRow_C(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { int i; float mult = 1.9259299444e-34f * scale; for (i = 0; i < width; ++i) { float value = src[i] * mult; - dst[i] = (uint16)((*(uint32_t*)&value) >> 13); + dst[i] = (uint16_t)((*(const uint32_alias_t*)&value) >> 13); + } +} + +void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width) { + int i; + for (i = 0; i < width; ++i) { + float value = src[i] * scale; + dst[i] = value; } } -void ARGBLumaColorTableRow_C(const uint8* src_argb, - uint8* dst_argb, +void ARGBLumaColorTableRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, int width, - const uint8* luma, - uint32 lumacoeff) { - uint32 bc = lumacoeff & 0xff; - uint32 gc = (lumacoeff >> 8) & 0xff; - uint32 rc = (lumacoeff >> 16) & 0xff; + const uint8_t* luma, + uint32_t lumacoeff) { + uint32_t bc = lumacoeff & 0xff; + uint32_t gc = (lumacoeff >> 8) & 0xff; + uint32_t rc = (lumacoeff >> 16) & 0xff; int i; for (i = 0; i < width - 1; i += 2) { // Luminance in rows, color values in columns. 
- const uint8* luma0 = + const uint8_t* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) + luma; - const uint8* luma1; + const uint8_t* luma1; dst_argb[0] = luma0[src_argb[0]]; dst_argb[1] = luma0[src_argb[1]]; dst_argb[2] = luma0[src_argb[2]]; @@ -2376,7 +2821,7 @@ void ARGBLumaColorTableRow_C(const uint8* src_argb, } if (width & 1) { // Luminance in rows, color values in columns. - const uint8* luma0 = + const uint8_t* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) + luma; dst_argb[0] = luma0[src_argb[0]]; @@ -2386,7 +2831,7 @@ void ARGBLumaColorTableRow_C(const uint8* src_argb, } } -void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) { +void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { int i; for (i = 0; i < width - 1; i += 2) { dst[3] = src[3]; @@ -2399,7 +2844,7 @@ void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) { } } -void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width) { +void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width) { int i; for (i = 0; i < width - 1; i += 2) { dst_a[0] = src_argb[3]; @@ -2412,7 +2857,7 @@ void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width) { } } -void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) { +void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { int i; for (i = 0; i < width - 1; i += 2) { dst[3] = src[0]; @@ -2431,13 +2876,13 @@ void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) { #if !(defined(_MSC_VER) && defined(_M_IX86)) && \ defined(HAS_I422TORGB565ROW_SSSE3) // row_win.cc has asm version, but GCC uses 2 step wrapper. -void I422ToRGB565Row_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2452,14 +2897,14 @@ void I422ToRGB565Row_SSSE3(const uint8* src_y, #endif #if defined(HAS_I422TOARGB1555ROW_SSSE3) -void I422ToARGB1555Row_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, +void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2474,14 +2919,14 @@ void I422ToARGB1555Row_SSSE3(const uint8* src_y, #endif #if defined(HAS_I422TOARGB4444ROW_SSSE3) -void I422ToARGB4444Row_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2496,13 +2941,13 @@ void I422ToARGB4444Row_SSSE3(const uint8* src_y, #endif #if defined(HAS_NV12TORGB565ROW_SSSE3) -void NV12ToRGB565Row_SSSE3(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, +void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); @@ -2515,14 +2960,102 @@ void NV12ToRGB565Row_SSSE3(const uint8* src_y, } #endif +#if defined(HAS_NV12TORGB24ROW_SSSE3) +void NV12ToRGB24Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); + src_y += twidth; + src_uv += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV21TORGB24ROW_SSSE3) +void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV21ToARGBRow_SSSE3(src_y, src_vu, row, yuvconstants, twidth); + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); + src_y += twidth; + src_vu += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV12TORGB24ROW_AVX2) +void NV12ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB24ROW_AVX2) + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); +#else + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); +#endif + src_y += twidth; + src_uv += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV21TORGB24ROW_AVX2) +void NV21ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; + NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB24ROW_AVX2) + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); +#else + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); +#endif + src_y += twidth; + src_vu += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + #if defined(HAS_I422TORGB565ROW_AVX2) -void I422ToRGB565Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2541,14 +3074,14 @@ void I422ToRGB565Row_AVX2(const uint8* src_y, #endif #if defined(HAS_I422TOARGB1555ROW_AVX2) -void I422ToARGB1555Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, +void I422ToARGB1555Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2567,14 +3100,14 @@ void I422ToARGB1555Row_AVX2(const uint8* src_y, #endif #if defined(HAS_I422TOARGB4444ROW_AVX2) -void I422ToARGB4444Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2593,19 +3126,22 @@ void I422ToARGB4444Row_AVX2(const uint8* src_y, #endif #if defined(HAS_I422TORGB24ROW_AVX2) -void I422ToRGB24Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb24, +void I422ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); - // TODO(fbarchard): ARGBToRGB24Row_AVX2 +#if defined(HAS_ARGBTORGB24ROW_AVX2) + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); +#else ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); +#endif src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; @@ -2616,13 +3152,13 @@ void I422ToRGB24Row_AVX2(const uint8* src_y, #endif #if defined(HAS_NV12TORGB565ROW_AVX2) -void NV12ToRGB565Row_AVX2(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, +void NV12ToRGB565Row_AVX2(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); @@ -2639,6 +3175,175 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y, } #endif +float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) { + float fsum = 0.f; + int i; +#if defined(__clang__) +#pragma clang loop vectorize_width(4) +#endif + for (i = 0; i < width; ++i) { + float v = *src++; + fsum += v * v; + *dst++ = v * scale; + } + return fsum; +} + +float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) { + float fmax = 0.f; + int i; + for (i = 0; i < width; ++i) { + float v = *src++; + float vs = v * scale; + fmax = (v > fmax) ? v : fmax; + *dst++ = vs; + } + return fmax; +} + +void ScaleSamples_C(const float* src, float* dst, float scale, int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = *src++ * scale; + } +} + +void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = + (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8; + ++src; + } +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussCol_C(const uint16_t* src0, + const uint16_t* src1, + const uint16_t* src2, + const uint16_t* src3, + const uint16_t* src4, + uint32_t* dst, + int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++; + } +} + +// Convert biplanar NV21 to packed YUV24 +void NV21ToYUV24Row_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_yuv24[0] = src_vu[0]; // V + dst_yuv24[1] = src_vu[1]; // U + dst_yuv24[2] = src_y[0]; // Y0 + dst_yuv24[3] = src_vu[0]; // V + dst_yuv24[4] = src_vu[1]; // U + dst_yuv24[5] = src_y[1]; // Y1 + src_y += 2; + src_vu += 2; + dst_yuv24 += 6; // Advance 2 pixels. + } + if (width & 1) { + dst_yuv24[0] = src_vu[0]; // V + dst_yuv24[1] = src_vu[1]; // U + dst_yuv24[2] = src_y[0]; // Y0 + } +} + +// Filter 2 rows of AYUV UV's (444) into UV (420). +void AYUVToUVRow_C(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_uv, + int width) { + // Output a row of UV values, filtering 2x2 rows of AYUV. 
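+  // Each output sample below averages a 2x2 block: two adjacent pixels on
+  // this row plus the two directly below, with +2 added before the >>2 so
+  // the average rounds to nearest rather than truncating.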
+ int x; + for (x = 0; x < width; x += 2) { + dst_uv[0] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + + src_ayuv[src_stride_ayuv + 5] + 2) >> + 2; + dst_uv[1] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + + src_ayuv[src_stride_ayuv + 4] + 2) >> + 2; + src_ayuv += 8; + dst_uv += 2; + } + if (width & 1) { + dst_uv[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + + src_ayuv[src_stride_ayuv + 0] + 2) >> + 2; + dst_uv[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + + src_ayuv[src_stride_ayuv + 1] + 2) >> + 2; + } +} + +// Filter 2 rows of AYUV UV's (444) into VU (420). +void AYUVToVURow_C(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_vu, + int width) { + // Output a row of VU values, filtering 2x2 rows of AYUV. + int x; + for (x = 0; x < width; x += 2) { + dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + + src_ayuv[src_stride_ayuv + 4] + 2) >> + 2; + dst_vu[1] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + + src_ayuv[src_stride_ayuv + 5] + 2) >> + 2; + src_ayuv += 8; + dst_vu += 2; + } + if (width & 1) { + dst_vu[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + + src_ayuv[src_stride_ayuv + 0] + 2) >> + 2; + dst_vu[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + + src_ayuv[src_stride_ayuv + 1] + 2) >> + 2; + } +} + +// Copy row of AYUV Y's into Y +void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { + // Output a row of Y values. + int x; + for (x = 0; x < width; ++x) { + dst_y[x] = src_ayuv[2]; // v,u,y,a + src_ayuv += 4; + } +} + +void UVToVURow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t u = src_uv[0]; + uint8_t v = src_uv[1]; + dst_vu[0] = v; + dst_vu[1] = u; + src_uv += 2; + dst_vu += 2; + } +} + +// divide values by weights and provide mask to indicate weight of 0. +void FloatDivToByteRow_C(const float* src_weights, + const float* src_values, + uint8_t* dst_out, + uint8_t* dst_mask, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_out[x] = Clamp(src_values[x] / src_weights[x]); + dst_mask[x] = src_weights[x] > 0 ? 0 : 0xff; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/row_gcc.cc b/files/source/row_gcc.cc index 8735070b..decd3d2e 100644 --- a/files/source/row_gcc.cc +++ b/files/source/row_gcc.cc @@ -22,81 +22,80 @@ extern "C" { #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) // Constants for ARGB -static vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, - 13, 65, 33, 0, 13, 65, 33, 0}; +static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, + 13, 65, 33, 0, 13, 65, 33, 0}; // JPeg full range. 
-static vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, - 15, 75, 38, 0, 15, 75, 38, 0}; +static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, + 15, 75, 38, 0, 15, 75, 38, 0}; #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) -static vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, - 112, -74, -38, 0, 112, -74, -38, 0}; +static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, + 112, -74, -38, 0, 112, -74, -38, 0}; -static vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, - 127, -84, -43, 0, 127, -84, -43, 0}; +static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, + 127, -84, -43, 0, 127, -84, -43, 0}; -static vec8 kARGBToV = { - -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -}; +static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0, + -18, -94, 112, 0, -18, -94, 112, 0}; -static vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, - -20, -107, 127, 0, -20, -107, 127, 0}; +static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, + -20, -107, 127, 0, -20, -107, 127, 0}; // Constants for BGRA -static vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, - 0, 33, 65, 13, 0, 33, 65, 13}; +static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, + 0, 33, 65, 13, 0, 33, 65, 13}; -static vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, - 0, -38, -74, 112, 0, -38, -74, 112}; +static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, + 0, -38, -74, 112, 0, -38, -74, 112}; -static vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, - 0, 112, -94, -18, 0, 112, -94, -18}; +static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, + 0, 112, -94, -18, 0, 112, -94, -18}; // Constants for ABGR -static vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, - 33, 65, 13, 0, 33, 65, 13, 0}; +static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, + 33, 65, 13, 0, 33, 65, 13, 0}; -static vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, - -38, -74, 112, 0, -38, -74, 112, 0}; +static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, + -38, -74, 112, 0, -38, -74, 112, 0}; -static vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, - 112, -94, -18, 0, 112, -94, -18, 0}; +static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, + 112, -94, -18, 0, 112, -94, -18, 0}; // Constants for RGBA. -static vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, - 0, 13, 65, 33, 0, 13, 65, 33}; +static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, + 0, 13, 65, 33, 0, 13, 65, 33}; -static vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, - 0, 112, -74, -38, 0, 112, -74, -38}; +static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, + 0, 112, -74, -38, 0, 112, -74, -38}; -static vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, - 0, -18, -94, 112, 0, -18, -94, 112}; +static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, + 0, -18, -94, 112, 0, -18, -94, 112}; -static uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; +static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; // 7 bit fixed point 0.5. 
-static vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; +static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; -static uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; +static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; -static uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, - 0x8080u, 0x8080u, 0x8080u, 0x8080u}; +static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, + 0x8080u, 0x8080u, 0x8080u, 0x8080u}; #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) #ifdef HAS_RGB24TOARGBROW_SSSE3 // Shuffle table for converting RGB24 to ARGB. -static uvec8 kShuffleMaskRGB24ToARGB = {0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, - 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; +static const uvec8 kShuffleMaskRGB24ToARGB = { + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; // Shuffle table for converting RAW to ARGB. -static uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, - 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; +static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, + 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; // Shuffle table for converting RAW to RGB24. First 8. static const uvec8 kShuffleMaskRAWToRGB24_0 = { @@ -114,15 +113,15 @@ static const uvec8 kShuffleMaskRAWToRGB24_2 = { 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGB to RGB24. -static uvec8 kShuffleMaskARGBToRGB24 = { +static const uvec8 kShuffleMaskARGBToRGB24 = { 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGB to RAW. -static uvec8 kShuffleMaskARGBToRAW = { +static const uvec8 kShuffleMaskARGBToRAW = { 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 -static uvec8 kShuffleMaskARGBToRGB24_0 = { +static const uvec8 kShuffleMaskARGBToRGB24_0 = { 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; // YUY2 shuf 16 Y to 32 Y. 
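The shuffle tables above are consumed by pshufb/vpshufb. A minimal scalar sketch of that instruction's semantics (illustrative only, not code from this patch; the helper name is invented): each output byte copies the source byte selected by the low 4 bits of the control byte, and any control byte with its high bit set (the 128u entries) produces zero, which is how the ARGB-to-RGB24 masks blank the lanes that the later por/packing steps fill.

static void ShuffleBytes16_Ref(const uint8_t src[16],
                               const uint8_t shuf[16],
                               uint8_t dst[16]) {
  // Scalar model of a 16-byte PSHUFB: high bit set selects zero, otherwise
  // the low 4 bits index into src.
  int i;
  for (i = 0; i < 16; ++i) {
    dst[i] = (uint8_t)((shuf[i] & 0x80) ? 0 : src[shuf[i] & 15]);
  }
}

Feeding kShuffleMaskRGB24ToARGB through this model gathers B, G, R for four pixels into ARGB lane positions; the alpha lanes are then forced to 0xff by the por with the 0xff000000 mask in RGB24ToARGBRow_SSSE3.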
@@ -153,392 +152,542 @@ static const lvec8 kShuffleNV21 = { #endif // HAS_RGB24TOARGBROW_SSSE3 #ifdef HAS_J400TOARGBROW_SSE2 -void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm5,%%xmm0 \n" - "por %%xmm5,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm5" - ); +void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm5,%%xmm0 \n" + "por %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_J400TOARGBROW_SSE2 #ifdef HAS_RGB24TOARGBROW_SSSE3 -void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 - "pslld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" - "lea " MEMLEA(0x30,0) ",%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRGB24ToARGB) // %3 - : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : 
"m"(kShuffleMaskRGB24ToARGB) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 - "pslld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" - "lea " MEMLEA(0x30,0) ",%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRAWToARGB) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRAWToARGB) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) { - asm volatile ( - "movdqa %3,%%xmm3 \n" - "movdqa %4,%%xmm4 \n" - "movdqa %5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x4,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x8,0) ",%%xmm2 \n" - "lea " MEMLEA(0x18,0) ",%0 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "movq %%xmm1," MEMACCESS2(0x8,1) " \n" - "movq %%xmm2," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x18,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRAWToRGB24_0), // %3 - "m"(kShuffleMaskRAWToRGB24_1), // %4 - "m"(kShuffleMaskRAWToRGB24_2) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, + uint8_t* dst_rgb24, + int width) { + asm volatile( + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" + "movdqa %5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x4(%0),%%xmm1 \n" + "movdqu 0x8(%0),%%xmm2 \n" + "lea 0x18(%0),%0 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + 
"movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRAWToRGB24_0), // %3 + "m"(kShuffleMaskRAWToRGB24_1), // %4 + "m"(kShuffleMaskRAWToRGB24_2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x20802080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0xa,%%xmm4 \n" - "psrlw $0x5,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "pand %%xmm4,%%xmm0 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) - MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) - "lea " MEMLEA(0x10,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x20802080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xa,%%xmm4 \n" + "psrlw $0x5,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "pand %%xmm4,%%xmm0 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,0x00(%1,%0,2) \n" + "movdqu %%xmm2,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); } -void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x42004200,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "movdqa %%xmm3,%%xmm4 \n" - "psrlw $0x6,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psllw $0x1,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "pmulhuw 
%%xmm5,%%xmm1 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "pand %%xmm7,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) - MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) - "lea " MEMLEA(0x10,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x42004200,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "movdqa %%xmm3,%%xmm4 \n" + "psrlw $0x6,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psllw $0x1,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,0x00(%1,%0,2) \n" + "movdqu %%xmm2,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); } -void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "mov $0xf0f0f0f,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x4,%%xmm5 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "pand %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm3 \n" - "psllw $0x4,%%xmm1 \n" - "psrlw $0x4,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2) - MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2) - "lea " MEMLEA(0x10,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "mov $0xf0f0f0f,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x4,%%xmm5 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "pand %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "psllw $0x4,%%xmm1 \n" + "psrlw $0x4,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw 
%%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,0x00(%1,%0,2) \n" + "movdqu %%xmm1,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) { - asm volatile ( - "movdqa %3,%%xmm6 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x30,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRGB24) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); +void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + + "movdqa %3,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRGB24) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } -void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) { - asm volatile ( - "movdqa %3,%%xmm6 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x30,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRAW) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); +void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + + "movdqa 
%3,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRAW) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } -void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +#ifdef HAS_ARGBTORGB24ROW_AVX2 +// vpermd for 12+12 to 24 +static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7}; + +void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm6 \n" + "vmovdqa %4,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 + "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" + "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes + "vpermd %%ymm1,%%ymm7,%%ymm1 \n" + "vpermd %%ymm2,%%ymm7,%%ymm2 \n" + "vpermd %%ymm3,%%ymm7,%%ymm3 \n" + "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 + "vpermq $0x4f,%%ymm2,%%ymm4 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 + "vpermq $0x93,%%ymm3,%%ymm3 \n" + "vpor %%ymm3,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRGB24), // %3 + "m"(kPermdRGB24_AVX) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } +#endif + +#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI +// Shuffle table for converting ARGBToRGB24 +static const ulvec8 kPermARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, + 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u, + 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u}; +static 
const ulvec8 kPermARGBToRGB24_1 = { + 10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, + 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, + 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u}; +static const ulvec8 kPermARGBToRGB24_2 = { + 21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, + 36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, + 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u}; + +void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vmovdqa %3,%%ymm5 \n" + "vmovdqa %4,%%ymm6 \n" + "vmovdqa %5,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n" + "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n" + "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kPermARGBToRGB24_0), // %3 + "m"(kPermARGBToRGB24_1), // %4 + "m"(kPermARGBToRGB24_2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7"); +} +#endif -void ARGBToRGB565DitherRow_SSE2(const uint8* src, - uint8* dst, - const uint32 dither4, +#ifdef HAS_ARGBTORAWROW_AVX2 +void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm6 \n" + "vmovdqa %4,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 + "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" + "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes + "vpermd %%ymm1,%%ymm7,%%ymm1 \n" + "vpermd %%ymm2,%%ymm7,%%ymm2 \n" + "vpermd %%ymm3,%%ymm7,%%ymm3 \n" + "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 + "vpermq $0x4f,%%ymm2,%%ymm4 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 + "vpermq $0x93,%%ymm3,%%ymm3 \n" + "vpor %%ymm3,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRAW), // %3 + "m"(kPermdRGB24_AVX) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + 
::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, + uint8_t* dst, + const uint32_t dither4, int width) { asm volatile( "movd %3,%%xmm6 \n" @@ -584,9 +733,9 @@ void ARGBToRGB565DitherRow_SSE2(const uint8* src, } #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 -void ARGBToRGB565DitherRow_AVX2(const uint8* src, - uint8* dst, - const uint32 dither4, +void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, + uint8_t* dst, + const uint32_t dither4, int width) { asm volatile( "vbroadcastss %3,%%xmm6 \n" @@ -629,153 +778,335 @@ void ARGBToRGB565DitherRow_AVX2(const uint8* src, } #endif // HAS_ARGBTORGB565DITHERROW_AVX2 -void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1b,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x5,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "pslld $0xa,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "pslld $0xf,%%xmm7 \n" +void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1b,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x5,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pslld $0xa,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "pslld $0xf,%%xmm7 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "psrad $0x10,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x6,%%xmm2 \n" - "psrld $0x9,%%xmm3 \n" - "pand %%xmm7,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm6,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - :: "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "psrad $0x10,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x6,%%xmm2 \n" + "psrld $0x9,%%xmm3 \n" + "pand %%xmm7,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm6,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } -void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0xc,%%xmm4 \n" - "movdqa %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm3 \n" +void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xc,%%xmm4 \n" + "movdqa %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm3 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm3,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "psrlq $0x4,%%xmm0 \n" - "psrlq $0x8,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) 
// %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm3,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "psrlq $0x4,%%xmm0 \n" + "psrlq $0x8,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_RGB24TOARGBROW_SSSE3 +/* + +ARGBToAR30Row: + +Red Blue +With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will +produce a 10 bit value in the low 10 bits of each 16 bit value. This is whats +wanted for the blue channel. The red needs to be shifted 4 left, so multiply by +(1024+4)*16 for red. + +Alpha Green +Alpha and Green are already in the high bits so vpand can zero out the other +bits, keeping just 2 upper bits of alpha and 8 bit green. The same multiplier +could be used for Green - (1024+4) putting the 10 bit green in the lsb. Alpha +would be a simple multiplier to shift it into position. It wants a gap of 10 +above the green. Green is 10 bits, so there are 6 bits in the low short. 4 +more are needed, so a multiplier of 4 gets the 2 bits into the upper 16 bits, +and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift the +result left 10 to position the A and G channels. +*/ + +// Shuffle table for converting RAW to RGB24. Last 8. +static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u, + 128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u}; + +static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u, + 128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u}; + +static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028; +static const uint32_t kMaskRB10 = 0x3ff003ff; +static const uint32_t kMaskAG10 = 0xc000ff00; +static const uint32_t kMulAG10 = 64 * 65536 + 1028; + +void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "movdqa %3,%%xmm2 \n" // shuffler for RB + "movd %4,%%xmm3 \n" // multipler for RB + "movd %5,%%xmm4 \n" // mask for R10 B10 + "movd %6,%%xmm5 \n" // mask for AG + "movd %7,%%xmm6 \n" // multipler for AG + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "sub %0,%1 \n" + + "1: \n" + "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" // R0B0 + "pand %%xmm5,%%xmm0 \n" // A0G0 + "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 + "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 + "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 + "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 + "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 + "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels + "add $0x10,%0 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleRB30), // %3 + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "movdqa %3,%%xmm2 \n" // shuffler for RB + "movd %4,%%xmm3 \n" // multipler for RB + "movd %5,%%xmm4 \n" // mask for R10 B10 + "movd %6,%%xmm5 \n" // mask for AG + "movd %7,%%xmm6 \n" // multipler for AG + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pshufd 
$0x0,%%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "sub %0,%1 \n" + + "1: \n" + "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" // R0B0 + "pand %%xmm5,%%xmm0 \n" // A0G0 + "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 + "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 + "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 + "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 + "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 + "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels + "add $0x10,%0 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleBR30), // %3 reversed shuffler + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +#ifdef HAS_ARGBTOAR30ROW_AVX2 +void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB + "vbroadcastss %4,%%ymm3 \n" // multipler for RB + "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 + "vbroadcastss %6,%%ymm5 \n" // mask for AG + "vbroadcastss %7,%%ymm6 \n" // multipler for AG + "sub %0,%1 \n" + + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels + "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 + "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 + "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 + "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 + "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 + "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels + "add $0x20,%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleRB30), // %3 + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#ifdef HAS_ABGRTOAR30ROW_AVX2 +void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB + "vbroadcastss %4,%%ymm3 \n" // multipler for RB + "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 + "vbroadcastss %6,%%ymm5 \n" // mask for AG + "vbroadcastss %7,%%ymm6 \n" // multipler for AG + "sub %0,%1 \n" + + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels + "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 + "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 + "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 + "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 + "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 + "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels + "add $0x20,%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleBR30), // %3 reversed shuffler + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + #ifdef HAS_ARGBTOYROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. 
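For reference while reading the fixed-point code below, a scalar sketch of the per-pixel math implied by kARGBToY and kAddY16 (illustration only, not part of this file; the helper name is invented): the coefficients are 7-bit fixed point, so the weighted sum is shifted right by 7 and then biased by 16 for limited-range Y.

static uint8_t ARGBPixelToY_Ref(uint8_t b, uint8_t g, uint8_t r) {
  // Weights 13/128, 65/128, 33/128 (kARGBToY); truncating shift, +16 bias.
  return (uint8_t)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}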
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" +void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBTOYROW_SSSE3 #ifdef HAS_ARGBTOYJROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. 
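The corresponding sketch for the full-range JPEG variant below (again illustrative, invented helper name): kAddYJ64 supplies a +64 rounding term before the shift, and there is no +16 bias.

static uint8_t ARGBPixelToYJ_Ref(uint8_t b, uint8_t g, uint8_t r) {
  // Weights 15/128, 75/128, 38/128 (kARGBToYJ); round to nearest, no bias.
  return (uint8_t)((15 * b + 75 * g + 38 * r + 64) >> 7);
}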
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" +void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBTOYJROW_SSSE3 @@ -784,153 +1115,149 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; // Convert 32 ARGB pixels (128 bytes) to 32 Y values. -void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" +void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" - "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "lea " MEMLEA(0x80,0) ",%0 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. - "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" - "vpsrlw $0x7,%%ymm0,%%ymm0 \n" - "vpsrlw $0x7,%%ymm2,%%ymm2 \n" - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. - "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. 
- "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToY), // %3 - "m"(kAddY16), // %4 - "m"(kPermdARGBToY_AVX) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBTOYROW_AVX2 #ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. -void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" +void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" - "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "lea " MEMLEA(0x80,0) ",%0 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. - "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding. - "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" - "vpsrlw $0x7,%%ymm0,%%ymm0 \n" - "vpsrlw $0x7,%%ymm2,%%ymm2 \n" - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. - "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64), // %4 - "m"(kPermdARGBToY_AVX) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding. + "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. 
+ "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBTOYJROW_AVX2 #ifdef HAS_ARGBTOUVROW_SSSE3 -void ARGBToUVRow_SSSE3(const uint8* src_argb0, +void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, int src_stride_argb, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kARGBToV), // %5 - "m"(kARGBToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 
\n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kARGBToV), // %5 + "m"(kARGBToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } #endif // HAS_ARGBTOUVROW_SSSE3 @@ -939,643 +1266,644 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, static const lvec8 kShufARGBToUV_AVX = { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; -void ARGBToUVRow_AVX2(const uint8* src_argb0, +void ARGBToUVRow_AVX2(const uint8_t* src_argb0, int src_stride_argb, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" - "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" - VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 - VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) - VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2) - VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3) - "lea " MEMLEA(0x80,0) ",%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" - - "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kAddUV128), // %5 - "m"(kARGBToV), // %6 - "m"(kARGBToU), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb 
%%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kAddUV128), // %5 + "m"(kARGBToV), // %6 + "m"(kARGBToU), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBTOUVROW_AVX2 #ifdef HAS_ARGBTOUVJROW_AVX2 -void ARGBToUVJRow_AVX2(const uint8* src_argb0, +void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, int src_stride_argb, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" - "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" - VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 - VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) - VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2) - VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3) - "lea " MEMLEA(0x80,0) ",%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - - "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kAddUVJ128), // %5 - "m"(kARGBToVJ), // %6 - "m"(kARGBToUJ), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw 
%%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kAddUVJ128), // %5 + "m"(kARGBToVJ), // %6 + "m"(kARGBToUJ), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBTOUVJROW_AVX2 #ifdef HAS_ARGBTOUVJROW_SSSE3 -void ARGBToUVJRow_SSSE3(const uint8* src_argb0, +void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, int src_stride_argb, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kARGBToVJ), // %5 - "m"(kARGBToUJ), // %6 - "m"(kAddUVJ128) // %7 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + 
"pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kARGBToVJ), // %5 + "m"(kARGBToUJ), // %6 + "m"(kAddUVJ128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } #endif // HAS_ARGBTOUVJROW_SSSE3 #ifdef HAS_ARGBTOUV444ROW_SSSE3 -void ARGBToUV444Row_SSSE3(const uint8* src_argb, - uint8* dst_u, - uint8* dst_v, +void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "movdqa %4,%%xmm3 \n" - "movdqa %5,%%xmm4 \n" - "movdqa %6,%%xmm5 \n" - "sub %1,%2 \n" + asm volatile( + "movdqa %4,%%xmm3 \n" + "movdqa %5,%%xmm4 \n" + "movdqa %6,%%xmm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "m"(kARGBToV), // %4 - "m"(kARGBToU), // %5 - "m"(kAddUV128) // %6 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "lea 0x40(%0),%0 \n" + "movdqu %%xmm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 
\n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "m"(kARGBToV), // %4 + "m"(kARGBToU), // %5 + "m"(kAddUV128) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6"); } #endif // HAS_ARGBTOUV444ROW_SSSE3 -void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" +void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kBGRAToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kBGRAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void BGRAToUVRow_SSSE3(const uint8* src_bgra0, +void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, int src_stride_bgra, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - 
"psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_bgra0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_bgra)), // %4 - "m"(kBGRAToV), // %5 - "m"(kBGRAToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_bgra0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_bgra)), // %4 + "m"(kBGRAToV), // %5 + "m"(kBGRAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } -void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" +void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kABGRToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : 
"+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" +void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kRGBAToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kRGBAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void ABGRToUVRow_SSSE3(const uint8* src_abgr0, +void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, int src_stride_abgr, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb 
%%xmm5,%%xmm0 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_abgr0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_abgr)), // %4 - "m"(kABGRToV), // %5 - "m"(kABGRToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_abgr0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kABGRToV), // %5 + "m"(kABGRToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } -void RGBAToUVRow_SSSE3(const uint8* src_rgba0, +void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, int src_stride_rgba, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " 
MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_rgba0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_rgba)), // %4 - "m"(kRGBAToV), // %5 - "m"(kRGBAToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_rgba0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_rgba)), // %4 + "m"(kRGBAToV), // %5 + "m"(kRGBAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) // Read 8 UV from 444 -#define READYUV444 \ - "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" +#define READYUV444 \ + "movq (%[u_buf]),%%xmm0 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV -#define READYUV422 \ - "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" +#define READYUV422 \ + "movd (%[u_buf]),%%xmm0 \n" \ + "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x4(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" + +// Read 4 UV from 422 10 bit, upsample to 8 UV +// TODO(fbarchard): Consider shufb to replace pack/unpack +// TODO(fbarchard): Consider pmulhuw to replace psraw +// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits. 
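READYUV210 below consumes 10-bit samples stored in 16-bit lanes: U and V are shifted right by 2 to give 8-bit chroma for the pmaddubsw stage (and then duplicated for the 422-to-444 upsample), while Y is shifted left by 6 so the 10-bit value sits in the high bits of the 16-bit lane that pmulhuw expects. A hedged scalar sketch of that preparation (helper names are illustrative, not libyuv API):

#include <stdint.h>

static inline uint8_t UV10To8_Sketch(uint16_t uv10) {
  return (uint8_t)(uv10 >> 2);   // matches the psraw $0x2 / packuswb pair
}
static inline uint16_t Y10To16_Sketch(uint16_t y10) {
  return (uint16_t)(y10 << 6);   // matches the psllw $0x6 on the Y register
}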
+#define READYUV210 \ + "movq (%[u_buf]),%%xmm0 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklwd %%xmm1,%%xmm0 \n" \ + "psraw $0x2,%%xmm0 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $0x6,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 \ - "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ - "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \ - "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n" +#define READYUVA422 \ + "movd (%[u_buf]),%%xmm0 \n" \ + "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x4(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" \ + "movq (%[a_buf]),%%xmm5 \n" \ + "lea 0x8(%[a_buf]),%[a_buf] \n" // Read 4 UV from NV12, upsample to 8 UV -#define READNV12 \ - "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ - "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" +#define READNV12 \ + "movq (%[uv_buf]),%%xmm0 \n" \ + "lea 0x8(%[uv_buf]),%[uv_buf] \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 VU from NV21, upsample to 8 UV -#define READNV21 \ - "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ - "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \ - "pshufb %[kShuffleNV21], %%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" +#define READNV21 \ + "movq (%[vu_buf]),%%xmm0 \n" \ + "lea 0x8(%[vu_buf]),%[vu_buf] \n" \ + "pshufb %[kShuffleNV21], %%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. -#define READYUY2 \ - "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \ - "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ - "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \ - "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ - "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n" +#define READYUY2 \ + "movdqu (%[yuy2_buf]),%%xmm4 \n" \ + "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ + "movdqu (%[yuy2_buf]),%%xmm0 \n" \ + "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ + "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n" // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. 
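The READYUY2/READUYVY macros handle packed 4:2:2 data, where each 4-byte group carries two Y samples and one U/V pair (YUY2: Y0 U0 Y1 V0; UYVY: U0 Y0 V0 Y1). The two pshufb shuffle constants simply split those bytes into a Y register and a UV register. A hedged scalar sketch of the UYVY case (helper name is illustrative):

#include <stdint.h>

static inline void ReadUYVY2_Sketch(const uint8_t* uyvy,  // 4 bytes, 2 pixels
                                    uint8_t y[2], uint8_t* u, uint8_t* v) {
  *u = uyvy[0];
  y[0] = uyvy[1];
  *v = uyvy[2];
  y[1] = uyvy[3];
}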
-#define READUYVY \ - "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \ - "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ - "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \ - "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ - "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n" +#define READUYVY \ + "movdqu (%[uyvy_buf]),%%xmm4 \n" \ + "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ + "movdqu (%[uyvy_buf]),%%xmm0 \n" \ + "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ + "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n" #if defined(__x86_64__) -#define YUVTORGB_SETUP(yuvconstants) \ - "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \ - "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \ - "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \ - "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \ - "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n" \ - "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \ - "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n" +#define YUVTORGB_SETUP(yuvconstants) \ + "movdqa (%[yuvconstants]),%%xmm8 \n" \ + "movdqa 32(%[yuvconstants]),%%xmm9 \n" \ + "movdqa 64(%[yuvconstants]),%%xmm10 \n" \ + "movdqa 96(%[yuvconstants]),%%xmm11 \n" \ + "movdqa 128(%[yuvconstants]),%%xmm12 \n" \ + "movdqa 160(%[yuvconstants]),%%xmm13 \n" \ + "movdqa 192(%[yuvconstants]),%%xmm14 \n" // Convert 8 pixels: 8 UV and 8 Y -#define YUVTORGB(yuvconstants) \ +#define YUVTORGB16(yuvconstants) \ "movdqa %%xmm0,%%xmm1 \n" \ "movdqa %%xmm0,%%xmm2 \n" \ "movdqa %%xmm0,%%xmm3 \n" \ @@ -1591,72 +1919,95 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, "pmulhuw %%xmm14,%%xmm4 \n" \ "paddsw %%xmm4,%%xmm0 \n" \ "paddsw %%xmm4,%%xmm1 \n" \ - "paddsw %%xmm4,%%xmm2 \n" \ - "psraw $0x6,%%xmm0 \n" \ - "psraw $0x6,%%xmm1 \n" \ - "psraw $0x6,%%xmm2 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "packuswb %%xmm1,%%xmm1 \n" \ - "packuswb %%xmm2,%%xmm2 \n" + "paddsw %%xmm4,%%xmm2 \n" #define YUVTORGB_REGS \ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", #else #define YUVTORGB_SETUP(yuvconstants) // Convert 8 pixels: 8 UV and 8 Y -#define YUVTORGB(yuvconstants) \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm3 \n" \ - "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \ - "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \ - "psubw %%xmm1,%%xmm0 \n" \ - "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \ - "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \ - "psubw %%xmm2,%%xmm1 \n" \ - "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \ - "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \ - "psubw %%xmm3,%%xmm2 \n" \ - "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \ - "paddsw %%xmm4,%%xmm0 \n" \ - "paddsw %%xmm4,%%xmm1 \n" \ - "paddsw %%xmm4,%%xmm2 \n" \ - "psraw $0x6,%%xmm0 \n" \ - "psraw $0x6,%%xmm1 \n" \ - "psraw $0x6,%%xmm2 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "packuswb %%xmm1,%%xmm1 \n" \ - "packuswb %%xmm2,%%xmm2 \n" +#define YUVTORGB16(yuvconstants) \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "movdqa 96(%[yuvconstants]),%%xmm0 \n" \ + "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \ + "psubw %%xmm1,%%xmm0 \n" \ + "movdqa 128(%[yuvconstants]),%%xmm1 \n" \ + "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \ + "psubw %%xmm2,%%xmm1 \n" \ + "movdqa 160(%[yuvconstants]),%%xmm2 \n" \ + "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \ + "psubw %%xmm3,%%xmm2 \n" \ + "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \ + "paddsw %%xmm4,%%xmm0 \n" \ + "paddsw %%xmm4,%%xmm1 \n" \ + "paddsw %%xmm4,%%xmm2 \n" #define YUVTORGB_REGS 
#endif +#define YUVTORGB(yuvconstants) \ + YUVTORGB16(yuvconstants) \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" + // Store 8 ARGB values. -#define STOREARGB \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklbw %%xmm5,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm1 \n" \ - "punpcklwd %%xmm2,%%xmm0 \n" \ - "punpckhwd %%xmm2,%%xmm1 \n" \ - "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ - "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ - "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" +#define STOREARGB \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklbw %%xmm5,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "movdqu %%xmm0,(%[dst_argb]) \n" \ + "movdqu %%xmm1,0x10(%[dst_argb]) \n" \ + "lea 0x20(%[dst_argb]), %[dst_argb] \n" // Store 8 RGBA values. -#define STORERGBA \ - "pcmpeqb %%xmm5,%%xmm5 \n" \ - "punpcklbw %%xmm2,%%xmm1 \n" \ - "punpcklbw %%xmm0,%%xmm5 \n" \ - "movdqa %%xmm5,%%xmm0 \n" \ - "punpcklwd %%xmm1,%%xmm5 \n" \ - "punpckhwd %%xmm1,%%xmm0 \n" \ - "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ - "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ - "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" - -void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +#define STORERGBA \ + "pcmpeqb %%xmm5,%%xmm5 \n" \ + "punpcklbw %%xmm2,%%xmm1 \n" \ + "punpcklbw %%xmm0,%%xmm5 \n" \ + "movdqa %%xmm5,%%xmm0 \n" \ + "punpcklwd %%xmm1,%%xmm5 \n" \ + "punpckhwd %%xmm1,%%xmm0 \n" \ + "movdqu %%xmm5,(%[dst_rgba]) \n" \ + "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \ + "lea 0x20(%[dst_rgba]),%[dst_rgba] \n" + +// Store 8 AR30 values. 
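The macro that follows takes the 16-bit YUVTORGB16 results, shifts them down 4 bits, clamps each channel to [0, 1023] with pminsw/pmaxsw, and packs 2:10:10:10 pixels with both alpha bits set. A hedged scalar sketch of the packing step, with the bit layout read off the register comments (B in bits 0-9, G in 10-19, R in 20-29, A in 30-31):

#include <stdint.h>

static inline uint32_t PackAR30_Sketch(int r10, int g10, int b10) {
  // Clamp each channel to 10 bits, then pack with opaque 2-bit alpha.
  if (r10 < 0) r10 = 0; else if (r10 > 1023) r10 = 1023;
  if (g10 < 0) g10 = 0; else if (g10 > 1023) g10 = 1023;
  if (b10 < 0) b10 = 0; else if (b10 > 1023) b10 = 1023;
  return (uint32_t)b10 | ((uint32_t)g10 << 10) | ((uint32_t)r10 << 20) |
         (3u << 30);
}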
+#define STOREAR30 \ + "psraw $0x4,%%xmm0 \n" \ + "psraw $0x4,%%xmm1 \n" \ + "psraw $0x4,%%xmm2 \n" \ + "pminsw %%xmm7,%%xmm0 \n" \ + "pminsw %%xmm7,%%xmm1 \n" \ + "pminsw %%xmm7,%%xmm2 \n" \ + "pmaxsw %%xmm6,%%xmm0 \n" \ + "pmaxsw %%xmm6,%%xmm1 \n" \ + "pmaxsw %%xmm6,%%xmm2 \n" \ + "psllw $0x4,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm3 \n" \ + "movdqa %%xmm1,%%xmm2 \n" \ + "punpcklwd %%xmm5,%%xmm1 \n" \ + "punpckhwd %%xmm5,%%xmm2 \n" \ + "pslld $0xa,%%xmm1 \n" \ + "pslld $0xa,%%xmm2 \n" \ + "por %%xmm1,%%xmm0 \n" \ + "por %%xmm2,%%xmm3 \n" \ + "movdqu %%xmm0,(%[dst_ar30]) \n" \ + "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \ + "lea 0x20(%[dst_ar30]), %[dst_ar30] \n" + +void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -1677,15 +2028,15 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } -void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgb24, +void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -1706,9 +2057,9 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm6,%%xmm1 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" - "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n" - "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n" + "movq %%xmm0,(%[dst_rgb24]) \n" + "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" + "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" "subl $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -1723,15 +2074,15 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) - : "memory", "cc", NACL_R14 YUVTORGB_REGS + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" ); } -void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -1752,17 +2103,116 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 
1023 for max + + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +// 10 bit YUV to ARGB +void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV210 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } +// 10 bit YUV to AR30 +void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READYUV210 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + #ifdef HAS_I422ALPHATOARGBROW_SSSE3 -void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, +void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { // clang-format off @@ -1788,16 +2238,16 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] #endif : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); // clang-format on } #endif // HAS_I422ALPHATOARGBROW_SSSE3 -void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, +void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { // clang-format off @@ -1817,15 +2267,15 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS // Does not use r14. 
+ : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); // clang-format on } -void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* vu_buf, - uint8* dst_argb, +void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { // clang-format off @@ -1846,14 +2296,14 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleNV21]"m"(kShuffleNV21) - : "memory", "cc", YUVTORGB_REGS // Does not use r14. + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); // clang-format on } -void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf, - uint8* dst_argb, +void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { // clang-format off @@ -1874,14 +2324,14 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf, : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleYUY2Y]"m"(kShuffleYUY2Y), [kShuffleYUY2UV]"m"(kShuffleYUY2UV) - : "memory", "cc", YUVTORGB_REGS // Does not use r14. + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); // clang-format on } -void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf, - uint8* dst_argb, +void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { // clang-format off @@ -1902,16 +2352,16 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf, : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleUYVYY]"m"(kShuffleUYVYY), [kShuffleUYVYUV]"m"(kShuffleUYVYUV) - : "memory", "cc", YUVTORGB_REGS // Does not use r14. 
+ : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); // clang-format on } -void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgba, +void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -1932,7 +2382,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -1940,96 +2390,113 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, #endif // HAS_I422TOARGBROW_SSSE3 // Read 16 UV from 444 -#define READYUV444_AVX2 \ - "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" +#define READYUV444_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 8 UV from 422, upsample to 16 UV. -#define READYUV422_AVX2 \ - "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" +#define READYUV422_AVX2 \ + "vmovq (%[u_buf]),%%xmm0 \n" \ + "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 210 10 bit, upsample to 16 UV +// TODO(fbarchard): Consider vshufb to replace pack/unpack +// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1. +#define READYUV210_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x2,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $0x6,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. 
-#define READYUVA422_AVX2 \ - "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ - "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \ - "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ - "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n" +#define READYUVA422_AVX2 \ + "vmovq (%[u_buf]),%%xmm0 \n" \ + "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" \ + "vmovdqu (%[a_buf]),%%xmm5 \n" \ + "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ + "lea 0x10(%[a_buf]),%[a_buf] \n" // Read 8 UV from NV12, upsample to 16 UV. -#define READNV12_AVX2 \ - "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ - "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" +#define READNV12_AVX2 \ + "vmovdqu (%[uv_buf]),%%xmm0 \n" \ + "lea 0x10(%[uv_buf]),%[uv_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 8 VU from NV21, upsample to 16 UV. -#define READNV21_AVX2 \ - "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ - "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" +#define READNV21_AVX2 \ + "vmovdqu (%[vu_buf]),%%xmm0 \n" \ + "lea 0x10(%[vu_buf]),%[vu_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. -#define READYUY2_AVX2 \ - "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \ - "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ - "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \ - "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ - "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n" +#define READYUY2_AVX2 \ + "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \ + "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ + "vmovdqu (%[yuy2_buf]),%%ymm0 \n" \ + "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ + "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n" // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. 
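// For reference, the packed 4:2:2 byte layouts handled here are:
//   YUY2: Y0 U0 Y1 V0 | Y2 U1 Y3 V1 ...
//   UYVY: U0 Y0 V0 Y1 | U1 Y2 V1 Y3 ...
// One vpshufb per load separates the Y bytes (kShuffleYUY2Y / kShuffleUYVYY)
// from the interleaved UV bytes (kShuffleYUY2UV / kShuffleUYVYUV), so both
// packed formats reuse the same YUVTORGB math afterwards.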
-#define READUYVY_AVX2 \ - "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \ - "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ - "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \ - "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ - "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n" +#define READUYVY_AVX2 \ + "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \ + "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ + "vmovdqu (%[uyvy_buf]),%%ymm0 \n" \ + "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ + "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n" #if defined(__x86_64__) -#define YUVTORGB_SETUP_AVX2(yuvconstants) \ - "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \ - "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \ - "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \ - "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \ - "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \ - "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \ - "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n" - -#define YUVTORGB_AVX2(yuvconstants) \ +#define YUVTORGB_SETUP_AVX2(yuvconstants) \ + "vmovdqa (%[yuvconstants]),%%ymm8 \n" \ + "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \ + "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \ + "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \ + "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \ + "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \ + "vmovdqa 192(%[yuvconstants]),%%ymm14 \n" + +#define YUVTORGB16_AVX2(yuvconstants) \ "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \ "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \ @@ -2039,13 +2506,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ - "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ - "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ - "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ - "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ - "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" + "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" #define YUVTORGB_REGS_AVX2 \ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", @@ -2053,48 +2514,78 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, #else // Convert 16 pixels: 16 UV and 16 Y. 
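// On x86_64 the seven YuvConstants vectors are preloaded once into
// ymm8..ymm14 (registers that do not exist in 32 bit mode), so the per-pixel
// math stays register-only; the 32 bit fallback below leaves the setup empty
// and re-reads each constant from (%[yuvconstants]) inside the loop.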
#define YUVTORGB_SETUP_AVX2(yuvconstants) -#define YUVTORGB_AVX2(yuvconstants) \ - "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \ - "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \ - "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \ - "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ - "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \ - "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ - "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \ - "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ - "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \ - "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ - "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ - "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ - "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ - "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ - "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ - "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" +#define YUVTORGB16_AVX2(yuvconstants) \ + "vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \ + "vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \ + "vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \ + "vmovdqu 160(%[yuvconstants]),%%ymm3 \n" \ + "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ + "vmovdqu 128(%[yuvconstants]),%%ymm3 \n" \ + "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ + "vmovdqu 96(%[yuvconstants]),%%ymm3 \n" \ + "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ + "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \ + "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ + "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ + "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" #define YUVTORGB_REGS_AVX2 #endif +#define YUVTORGB_AVX2(yuvconstants) \ + YUVTORGB16_AVX2(yuvconstants) \ + "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ + "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ + "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" + // Store 16 ARGB values. -#define STOREARGB_AVX2 \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ - "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ - "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ - "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ - "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ - "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \ - "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n" +#define STOREARGB_AVX2 \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ + "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ + "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ + "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ + "vmovdqu %%ymm1,(%[dst_argb]) \n" \ + "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \ + "lea 0x40(%[dst_argb]), %[dst_argb] \n" + +// Store 16 AR30 values. 
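// As a scalar sketch of what the STOREAR30 / STOREAR30_AVX2 macros produce
// (illustrative only, assuming the usual xmm0/ymm0 = B, 1 = G, 2 = R register
// convention): each channel is clamped to the 10 bit range and packed with
// blue in the low bits and the two alpha bits forced opaque.
static inline uint32_t PackAR30Pixel(int b, int g, int r) {
  // min/max clamp to [0, 1023], as the pmaxsw/pminsw pair does.
  b = b < 0 ? 0 : (b > 1023 ? 1023 : b);
  g = g < 0 ? 0 : (g > 1023 ? 1023 : g);
  r = r < 0 ? 0 : (r > 1023 ? 1023 : r);
  return (uint32_t)b | ((uint32_t)g << 10) | ((uint32_t)r << 20) | (3u << 30);
}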
+#define STOREAR30_AVX2 \ + "vpsraw $0x4,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x4,%%ymm1,%%ymm1 \n" \ + "vpsraw $0x4,%%ymm2,%%ymm2 \n" \ + "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \ + "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \ + "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \ + "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \ + "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \ + "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \ + "vpsllw $0x4,%%ymm2,%%ymm2 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ + "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \ + "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \ + "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \ + "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \ + "vpslld $0xa,%%ymm1,%%ymm1 \n" \ + "vpslld $0xa,%%ymm2,%%ymm2 \n" \ + "vpor %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpor %%ymm2,%%ymm3,%%ymm3 \n" \ + "vmovdqu %%ymm0,(%[dst_ar30]) \n" \ + "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \ + "lea 0x40(%[dst_ar30]), %[dst_ar30] \n" #ifdef HAS_I444TOARGBROW_AVX2 // 16 pixels // 16 UV values with 16 Y producing 16 ARGB (64 bytes). -void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -2116,7 +2607,7 @@ void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -2125,10 +2616,10 @@ void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf, #if defined(HAS_I422TOARGBROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -2151,20 +2642,135 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } #endif // HAS_I422TOARGBROW_AVX2 +#if defined(HAS_I422TOAR30ROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). 
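// The AR30 variants keep the YUVTORGB16 result in 16 bit registers and shift
// right by 4 instead of the 6 used for 8 bit output, so two extra fraction
// bits survive and each channel lands in the 0..1023 range before packing
// (a value that would saturate at 255 after ">> 6" comes out near 1023 after
// ">> 4").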
+void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_I422TOAR30ROW_AVX2 + +#if defined(HAS_I210TOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV210_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I210TOARGBROW_AVX2 + +#if defined(HAS_I210TOAR30ROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). +void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READYUV210_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I210TOAR30ROW_AVX2 + #if defined(HAS_I422ALPHATOARGBROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. 
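// Unlike the opaque variants, which fill alpha with 0xff via vpcmpeqb on
// ymm5, this path carries a per-pixel alpha plane: READYUVA422_AVX2 loads 16
// bytes from a_buf into ymm5 and STOREARGB_AVX2 interleaves them as the A
// channel of the output.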
-void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, +void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { // clang-format off @@ -2191,7 +2797,7 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, [width]"+rm"(width) // %[width] #endif : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); // clang-format on @@ -2201,10 +2807,10 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, #if defined(HAS_I422TORGBAROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). -void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -2224,11 +2830,11 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, "vpermq $0xd8,%%ymm2,%%ymm2 \n" "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" - "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" - "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" - "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" - "sub $0x10,%[width] \n" - "jg 1b \n" + "vmovdqu %%ymm0,(%[dst_argb]) \n" + "vmovdqu %%ymm1,0x20(%[dst_argb]) \n" + "lea 0x40(%[dst_argb]),%[dst_argb] \n" + "sub $0x10,%[width] \n" + "jg 1b \n" "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] @@ -2236,7 +2842,7 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -2245,9 +2851,9 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, #if defined(HAS_NV12TOARGBROW_AVX2) // 16 pixels. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, +void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { // clang-format off @@ -2268,7 +2874,7 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); // clang-format on @@ -2278,9 +2884,9 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, #if defined(HAS_NV21TOARGBROW_AVX2) // 16 pixels. // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
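// NV21 differs from NV12 only in the byte order of the interleaved chroma
// plane (VU pairs rather than UV pairs); READNV21_AVX2 swaps each pair back
// with a vpshufb against kShuffleNV21, after which the same YUVTORGB math
// applies.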
-void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, - const uint8* vu_buf, - uint8* dst_argb, +void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { // clang-format off @@ -2302,7 +2908,7 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleNV21]"m"(kShuffleNV21) - : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); // clang-format on @@ -2312,8 +2918,8 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, #if defined(HAS_YUY2TOARGBROW_AVX2) // 16 pixels. // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). -void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, - uint8* dst_argb, +void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { // clang-format off @@ -2335,7 +2941,7 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleYUY2Y]"m"(kShuffleYUY2Y), [kShuffleYUY2UV]"m"(kShuffleYUY2UV) - : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); // clang-format on @@ -2345,8 +2951,8 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, #if defined(HAS_UYVYTOARGBROW_AVX2) // 16 pixels. // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). -void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, - uint8* dst_argb, +void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { // clang-format off @@ -2368,7 +2974,7 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleUYVYY]"m"(kShuffleUYVYY), [kShuffleUYVYUV]"m"(kShuffleUYVYUV) - : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); // clang-format on @@ -2376,552 +2982,957 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, #endif // HAS_UYVYTOARGBROW_AVX2 #ifdef HAS_I400TOARGBROW_SSE2 -void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { - asm volatile ( - "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 - "movd %%eax,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 - "movd %%eax,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" +void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { + asm volatile( + "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 + "movd %%eax,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * + // 16 + "movd %%eax,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" - LABELALIGN - "1: \n" - // Step 1: Scale Y contribution to 8 G values. 
G = (y - 16) * 1.164 - "movq " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "psubusw %%xmm3,%%xmm0 \n" - "psrlw $6, %%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - - // Step 2: Weave into ARGB - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "por %%xmm4,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(dst_argb), // %1 - "+rm"(width) // %2 - : - : "memory", "cc", "eax" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" - ); + LABELALIGN + "1: \n" + // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "psubusw %%xmm3,%%xmm0 \n" + "psrlw $6, %%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + + // Step 2: Weave into ARGB + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "por %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_I400TOARGBROW_SSE2 #ifdef HAS_I400TOARGBROW_AVX2 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). // note: vpunpcklbw mutates and vpackuswb unmutates. -void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) { - asm volatile ( - "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16 - "vmovd %%eax,%%xmm2 \n" - "vbroadcastss %%xmm2,%%ymm2 \n" - "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164 - "vmovd %%eax,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpslld $0x18,%%ymm4,%%ymm4 \n" +void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { + asm volatile( + "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * + // 16 + "vmovd %%eax,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" + "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164 + "vmovd %%eax,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpslld $0x18,%%ymm4,%%ymm4 \n" - LABELALIGN - "1: \n" - // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 - "vmovdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n" - "vpsrlw $0x6,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n" - "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n" - "vpor %%ymm4,%%ymm0,%%ymm0 \n" - "vpor %%ymm4,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(y_buf), // %0 - "+r"(dst_argb), // %1 - "+rm"(width) // %2 - : - : "memory", "cc", "eax" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" - ); + LABELALIGN + "1: \n" + // Step 1: Scale Y contribution to 16 G values. 
G = (y - 16) * 1.164 + "vmovdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n" + "vpsrlw $0x6,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n" + "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n" + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_I400TOARGBROW_AVX2 #ifdef HAS_MIRRORROW_SSSE3 // Shuffle table for reversing the bytes. -static uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, - 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; +static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; -void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { +void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "movdqa %3,%%xmm5 \n" + asm volatile( - LABELALIGN - "1: \n" - MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm5" - ); + "movdqa %3,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu -0x10(%0,%2,1),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_AVX2 -void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { +void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "vbroadcastf128 %3,%%ymm5 \n" + asm volatile( - LABELALIGN - "1: \n" - MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0 - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpermq $0x4e,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm5" - ); + "vbroadcastf128 %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORROW_AVX2 #ifdef HAS_MIRRORUVROW_SSSE3 // Shuffle table for reversing the bytes of UV channels. 
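// A minimal scalar model of the mirror-and-split that MirrorUVRow_SSSE3
// below performs (illustrative only; the helper name is invented and width
// is assumed to count UV pairs): the interleaved row is read back to front
// and the U and V bytes of each pair land in separate, mirrored planes.
static void MirrorUVRowSketch(const uint8_t* src_uv, uint8_t* dst_u,
                              uint8_t* dst_v, int width) {
  for (int i = 0; i < width; ++i) {
    dst_u[i] = src_uv[(width - 1 - i) * 2 + 0];
    dst_v[i] = src_uv[(width - 1 - i) * 2 + 1];
  }
}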
-static uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, - 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; -void MirrorUVRow_SSSE3(const uint8* src, - uint8* dst_u, - uint8* dst_v, +static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, + 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; +void MirrorUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "movdqa %4,%%xmm1 \n" - "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n" - "sub %1,%2 \n" + asm volatile( + "movdqa %4,%%xmm1 \n" + "lea -0x10(%0,%3,2),%0 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(-0x10,0) ",%0 \n" - "pshufb %%xmm1,%%xmm0 \n" - "movlpd %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $8,%3 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(temp_width) // %3 - : "m"(kShuffleMirrorUV) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" + "movhpd %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $8,%3 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(temp_width) // %3 + : "m"(kShuffleMirrorUV) // %4 + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_MIRRORUVROW_SSSE3 #ifdef HAS_ARGBMIRRORROW_SSE2 -void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { +void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n" + asm volatile( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "pshufd $0x1b,%%xmm0,%%xmm0 \n" - "lea " MEMLEA(-0x10,0) ",%0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : - : "memory", "cc" - , "xmm0" - ); + "lea -0x10(%0,%2,4),%0 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "pshufd $0x1b,%%xmm0,%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : + : "memory", "cc", "xmm0"); } #endif // HAS_ARGBMIRRORROW_SSE2 #ifdef HAS_ARGBMIRRORROW_AVX2 // Shuffle table for reversing the bytes. 
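// ARGB mirroring works on whole 32 bit pixels rather than bytes: the SSE2
// version above reverses 4 pixels at a time with pshufd $0x1b, while the
// AVX2 version below reverses 8 pixels with one vpermd using the 7..0 lane
// order in kARGBShuffleMirror_AVX2; both walk the source row from its far
// end.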
static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; -void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { +void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "vmovdqu %3,%%ymm5 \n" + asm volatile( - LABELALIGN - "1: \n" - VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0 - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kARGBShuffleMirror_AVX2) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm5" - ); + "vmovdqu %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kARGBShuffleMirror_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_ARGBMIRRORROW_AVX2 #ifdef HAS_SPLITUVROW_AVX2 -void SplitUVRow_AVX2(const uint8* src_uv, - uint8* dst_u, - uint8* dst_v, +void SplitUVRow_AVX2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm2 \n" - "vpsrlw $0x8,%%ymm1,%%ymm3 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2) - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm2 \n" + "vpsrlw $0x8,%%ymm1,%%ymm3 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm2,0x00(%1,%2,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SPLITUVROW_AVX2 #ifdef HAS_SPLITUVROW_SSE2 -void SplitUVRow_SSE2(const uint8* src_uv, - uint8* dst_u, - uint8* dst_v, +void SplitUVRow_SSE2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 
\n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm2,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SPLITUVROW_SSE2 #ifdef HAS_MERGEUVROW_AVX2 -void MergeUVRow_AVX2(const uint8* src_u, - const uint8* src_v, - uint8* dst_uv, +void MergeUVRow_AVX2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + asm volatile( + + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x00(%0,%1,1),%%ymm1 \n" + "lea 0x20(%0),%0 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm2,(%2) \n" + "vextractf128 $0x0,%%ymm0,0x10(%2) \n" + "vextractf128 $0x1,%%ymm2,0x20(%2) \n" + "vextractf128 $0x1,%%ymm0,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGEUVROW_AVX2 + +#ifdef HAS_MERGEUVROW_SSE2 +void MergeUVRow_SSE2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width) { + asm volatile( + + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGEUVROW_SSE2 + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits +#ifdef HAS_MERGEUVROW_16_AVX2 +void MergeUVRow_16_AVX2(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int scale, + int width) { + // clang-format off asm volatile ( + "vmovd %4,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" "sub %0,%1 \n" + // 16 pixels per loop. 
LABELALIGN "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1 - "lea " MEMLEA(0x20,0) ",%0 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n" - "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n" - "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n" - "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n" - "lea " MEMLEA(0x40,2) ",%2 \n" - "sub $0x20,%3 \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu (%0,%1,1),%%ymm1 \n" + "add $0x20,%0 \n" + + "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates + "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm2,(%2) \n" + "vextractf128 $0x0,%%ymm0,0x10(%2) \n" + "vextractf128 $0x1,%%ymm2,0x20(%2) \n" + "vextractf128 $0x1,%%ymm0,0x30(%2) \n" + "add $0x40,%2 \n" + "sub $0x10,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2" - ); + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"(scale) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + // clang-format on } #endif // HAS_MERGEUVROW_AVX2 -#ifdef HAS_MERGEUVROW_SSE2 -void MergeUVRow_SSE2(const uint8* src_u, - const uint8* src_v, - uint8* dst_uv, - int width) { +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits +#ifdef HAS_MULTIPLYROW_16_AVX2 +void MultiplyRow_16_AVX2(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off asm volatile ( + "vmovd %3,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" "sub %0,%1 \n" + // 16 pixels per loop. LABELALIGN "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm2 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" - "lea " MEMLEA(0x20,2) ",%2 \n" - "sub $0x10,%3 \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%0,%1) \n" + "vmovdqu %%ymm1,0x20(%0,%1) \n" + "add $0x40,%0 \n" + "sub $0x20,%2 \n" "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2" - ); + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm3"); + // clang-format on +} +#endif // HAS_MULTIPLYROW_16_AVX2 + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +void Convert16To8Row_SSSE3(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "movd %3,%%xmm2 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + + // 32 pixels per loop. 
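      // pmulhuw keeps the high 16 bits of (sample * scale), i.e. a right
      // shift: with 10 bit input and scale 16384 == 1 << 14,
      // (v * 16384) >> 16 == v >> 2, mapping 0..1023 onto 0..255 before
      // packuswb narrows to bytes.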
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "add $0x20,%0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "add $0x10,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on } -#endif // HAS_MERGEUVROW_SSE2 -#ifdef HAS_COPYROW_SSE2 -void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { +#ifdef HAS_CONVERT16TO8ROW_AVX2 +void Convert16To8Row_AVX2(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + // clang-format off asm volatile ( - "test $0xf,%0 \n" - "jne 2f \n" - "test $0xf,%1 \n" - "jne 2f \n" + "vmovd %3,%%xmm2 \n" + "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" + // 32 pixels per loop. LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "add $0x40,%0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "add $0x20,%1 \n" "sub $0x20,%2 \n" "jg 1b \n" - "jmp 9f \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} +#endif // HAS_CONVERT16TO8ROW_AVX2 + +// Use scale to convert to lsb formats depending how many bits there are: +// 512 = 9 bits +// 1024 = 10 bits +// 4096 = 12 bits +// TODO(fbarchard): reduce to SSE2 +void Convert8To16Row_SSE2(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "movd %3,%%xmm2 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "add $0x10,%0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "add $0x20,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} +#ifdef HAS_CONVERT8TO16ROW_AVX2 +void Convert8To16Row_AVX2(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm2 \n" + "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" + + // 32 pixels per loop. 
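      // Each byte is first duplicated into both halves of a 16 bit lane
      // (v * 257), then vpmulhuw by 'scale' keeps the high 16 bits: with
      // scale 1024 a full-scale byte maps exactly to 1023
      // ((255 * 257 * 1024) >> 16 == 1023), i.e. full 10 bit range from
      // 8 bit input.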
LABELALIGN - "2: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "add $0x40,%1 \n" "sub $0x20,%2 \n" - "jg 2b \n" - "9: \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(count) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} +#endif // HAS_CONVERT8TO16ROW_AVX2 + +#ifdef HAS_SPLITRGBROW_SSSE3 + +// Shuffle table for converting RGB to Planar. +static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u, + 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u, + 2u, 5u, 8u, 11u, 14u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 1u, + 4u, 7u, 10u, 13u}; + +static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u, + 3u, 6u, 9u, 12u, 15u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 2u, + 5u, 8u, 11u, 14u}; + +static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u, + 4u, 7u, 10u, 13u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 0u, 3u, + 6u, 9u, 12u, 15u}; + +void SplitRGBRow_SSSE3(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %5, %%xmm0 \n" + "pshufb %6, %%xmm1 \n" + "pshufb %7, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %8, %%xmm0 \n" + "pshufb %9, %%xmm1 \n" + "pshufb %10, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %11, %%xmm0 \n" + "pshufb %12, %%xmm1 \n" + "pshufb %13, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%3) \n" + "lea 0x10(%3),%3 \n" + "lea 0x30(%0),%0 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "m"(kShuffleMaskRGBToR0), // %5 + "m"(kShuffleMaskRGBToR1), // %6 + "m"(kShuffleMaskRGBToR2), // %7 + "m"(kShuffleMaskRGBToG0), // %8 + "m"(kShuffleMaskRGBToG1), // %9 + 
"m"(kShuffleMaskRGBToG2), // %10 + "m"(kShuffleMaskRGBToB0), // %11 + "m"(kShuffleMaskRGBToB1), // %12 + "m"(kShuffleMaskRGBToB2) // %13 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_SPLITRGBROW_SSSE3 + +#ifdef HAS_MERGERGBROW_SSSE3 + +// Shuffle table for converting RGB to Planar. +static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u, + 2u, 128u, 128u, 3u, 128u, 128u, + 4u, 128u, 128u, 5u}; +static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u, + 128u, 2u, 128u, 128u, 3u, 128u, + 128u, 4u, 128u, 128u}; +static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u, + 128u, 128u, 2u, 128u, 128u, 3u, + 128u, 128u, 4u, 128u}; + +static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u, + 7u, 128u, 128u, 8u, 128u, 128u, + 9u, 128u, 128u, 10u}; +static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u, + 128u, 7u, 128u, 128u, 8u, 128u, + 128u, 9u, 128u, 128u}; +static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u, + 128u, 128u, 8u, 128u, 128u, 9u, + 128u, 128u, 10u, 128u}; + +static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u, + 12u, 128u, 128u, 13u, 128u, 128u, + 14u, 128u, 128u, 15u}; +static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u, + 128u, 13u, 128u, 128u, 14u, 128u, + 128u, 15u, 128u, 128u}; +static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u, + 128u, 128u, 13u, 128u, 128u, 14u, + 128u, 128u, 15u, 128u}; + +void MergeRGBRow_SSSE3(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %5, %%xmm0 \n" + "pshufb %6, %%xmm1 \n" + "pshufb %7, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%3) \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %8, %%xmm0 \n" + "pshufb %9, %%xmm1 \n" + "pshufb %10, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,16(%3) \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %11, %%xmm0 \n" + "pshufb %12, %%xmm1 \n" + "pshufb %13, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,32(%3) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x10(%1),%1 \n" + "lea 0x10(%2),%2 \n" + "lea 0x30(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : "m"(kShuffleMaskRToRGB0), // %5 + "m"(kShuffleMaskGToRGB0), // %6 + "m"(kShuffleMaskBToRGB0), // %7 + "m"(kShuffleMaskRToRGB1), // %8 + "m"(kShuffleMaskGToRGB1), // %9 + "m"(kShuffleMaskBToRGB1), // %10 + "m"(kShuffleMaskRToRGB2), // %11 + "m"(kShuffleMaskGToRGB2), // %12 + "m"(kShuffleMaskBToRGB2) // %13 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGERGBROW_SSSE3 + +#ifdef HAS_COPYROW_SSE2 +void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "test $0xf,%0 \n" + "jne 2f \n" + "test $0xf,%1 \n" + "jne 2f \n" + + LABELALIGN + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 9f \n" + + LABELALIGN + "2: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqu %%xmm0,(%1) 
\n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 2b \n" + + LABELALIGN "9: \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_COPYROW_SSE2 #ifdef HAS_COPYROW_AVX -void CopyRow_AVX(const uint8* src, uint8* dst, int count) { - asm volatile ( - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x40,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(count) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); +void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_COPYROW_AVX #ifdef HAS_COPYROW_ERMS // Multiple of 1. -void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { +void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { size_t width_tmp = (size_t)(width); - asm volatile("rep movsb " MEMMOVESTRING(0, 1) " \n" - : "+S"(src), // %0 - "+D"(dst), // %1 - "+c"(width_tmp) // %2 - : - : "memory", "cc"); + asm volatile( + + "rep movsb \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc"); } #endif // HAS_COPYROW_ERMS #ifdef HAS_ARGBCOPYALPHAROW_SSE2 // width in pixels -void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" +void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqu " MEMACCESS(1) ",%%xmm4 \n" - "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand %%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2," MEMACCESS(1) " \n" - "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm2 \n" + "movdqu 0x10(%0),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "movdqu (%1),%%xmm4 \n" + "movdqu 0x10(%1),%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBCOPYALPHAROW_SSE2 #ifdef HAS_ARGBCOPYALPHAROW_AVX2 // width in pixels -void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" - 
"vpsrld $0x8,%%ymm0,%%ymm0 \n" +void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" - "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm1," MEMACCESS(1) " \n" - "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm1 \n" + "vmovdqu 0x20(%0),%%ymm2 \n" + "lea 0x40(%0),%0 \n" + "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_ARGBCOPYALPHAROW_AVX2 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 // width in pixels -void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ", %%xmm0 \n" - "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n" - "lea " MEMLEA(0x20, 0) ", %0 \n" - "psrld $0x18, %%xmm0 \n" - "psrld $0x18, %%xmm1 \n" - "packssdw %%xmm1, %%xmm0 \n" - "packuswb %%xmm0, %%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8, 1) ", %1 \n" - "sub $0x8, %2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+rm"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); +void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0), %%xmm0 \n" + "movdqu 0x10(%0), %%xmm1 \n" + "lea 0x20(%0), %0 \n" + "psrld $0x18, %%xmm0 \n" + "psrld $0x18, %%xmm1 \n" + "packssdw %%xmm1, %%xmm0 \n" + "packuswb %%xmm0, %%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1), %1 \n" + "sub $0x8, %2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_ARGBEXTRACTALPHAROW_SSE2 @@ -2930,657 +3941,636 @@ static const uvec8 kShuffleAlphaShort_AVX2 = { 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u, 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u}; -void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) { - asm volatile ( - "vmovdqa %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" +void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + "vmovdqa %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ", %%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20, 0) ", %%ymm1 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu " MEMACCESS2(0x40, 0) ", %%ymm2 \n" - "vmovdqu " MEMACCESS2(0x60, 0) ", %%ymm3 \n" - "lea " MEMLEA(0x80, 0) ", %0 \n" - "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates - "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" - "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. - "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate. 
- "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20, %2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+rm"(width) // %2 - : "m"(kPermdARGBToY_AVX), // %3 - "m"(kShuffleAlphaShort_AVX2) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0), %%ymm0 \n" + "vmovdqu 0x20(%0), %%ymm1 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu 0x40(%0), %%ymm2 \n" + "vmovdqu 0x60(%0), %%ymm3 \n" + "lea 0x80(%0), %0 \n" + "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate. + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20, %2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+rm"(width) // %2 + : "m"(kPermdARGBToY_AVX), // %3 + "m"(kShuffleAlphaShort_AVX2) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBEXTRACTALPHAROW_AVX2 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 // width in pixels -void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" +void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm2 \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpckhwd %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "movdqu " MEMACCESS(1) ",%%xmm4 \n" - "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand %%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2," MEMACCESS(1) " \n" - "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movq (%0),%%xmm2 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpckhwd %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "movdqu (%1),%%xmm4 \n" + "movdqu 0x10(%1),%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 // width in pixels -void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" +void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" - LABELALIGN - "1: \n" - "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n" - "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n" - "lea " 
MEMLEA(0x10,0) ",%0 \n" - "vpslld $0x18,%%ymm1,%%ymm1 \n" - "vpslld $0x18,%%ymm2,%%ymm2 \n" - "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" - "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm1," MEMACCESS(1) " \n" - "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2" - ); + LABELALIGN + "1: \n" + "vpmovzxbd (%0),%%ymm1 \n" + "vpmovzxbd 0x8(%0),%%ymm2 \n" + "lea 0x10(%0),%0 \n" + "vpslld $0x18,%%ymm1,%%ymm1 \n" + "vpslld $0x18,%%ymm2,%%ymm2 \n" + "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 #ifdef HAS_SETROW_X86 -void SetRow_X86(uint8* dst, uint8 v8, int width) { +void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width >> 2); - const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. - asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); + const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. + asm volatile( + + "rep stosl \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); } -void SetRow_ERMS(uint8* dst, uint8 v8, int width) { +void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width); - asm volatile("rep stosb " MEMSTORESTRING(al, 0) " \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v8) // %2 - : "memory", "cc"); + asm volatile( + + "rep stosb \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v8) // %2 + : "memory", "cc"); } -void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) { +void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { size_t width_tmp = (size_t)(width); - asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n" - : "+D"(dst_argb), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); + asm volatile( + + "rep stosl \n" + : "+D"(dst_argb), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); } #endif // HAS_SETROW_X86 #ifdef HAS_YUY2TOYROW_SSE2 -void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" +void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + 
"+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void YUY2ToUVRow_SSE2(const uint8* src_yuy2, +void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, int stride_yuy2, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_yuy2)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } -void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, - uint8* dst_u, - uint8* dst_v, +void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b 
\n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); +void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } -void UYVYToUVRow_SSE2(const uint8* src_uyvy, +void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, int stride_uyvy, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_uyvy)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } -void UYVYToUV422Row_SSE2(const uint8* src_uyvy, - uint8* dst_u, - uint8* dst_v, +void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " 
MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_YUY2TOYROW_SSE2 #ifdef HAS_YUY2TOYROW_AVX2 -void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" +void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void YUY2ToUVRow_AVX2(const uint8* src_yuy2, +void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, int stride_yuy2, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 - VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq 
$0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_yuy2)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, - uint8* dst_u, - uint8* dst_v, +void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) { - asm volatile ( - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - 
"vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); +void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void UYVYToUVRow_AVX2(const uint8* src_uyvy, +void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, int stride_uyvy, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 - VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_uyvy)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void UYVYToUV422Row_AVX2(const uint8* src_uyvy, - uint8* dst_u, - uint8* dst_v, +void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + 
"sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_YUY2TOYROW_AVX2 #ifdef HAS_ARGBBLENDROW_SSSE3 // Shuffle table for isolating alpha. -static uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, - 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; +static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; // Blend 8 pixels at a time -void ARGBBlendRow_SSSE3(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0xf,%%xmm7 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - - // 4 pixel loop. - LABELALIGN - "40: \n" - "movdqu " MEMACCESS(0) ",%%xmm3 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movdqu " MEMACCESS(1) ",%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movdqu " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%3 \n" - "jl 99f \n" - - // 1 pixel loop. 
- "91: \n" - "movd " MEMACCESS(0) ",%%xmm3 \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movd " MEMACCESS(1) ",%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movd " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x4,1) ",%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movd %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x4,2) ",%2 \n" - "sub $0x1,%3 \n" - "jge 91b \n" - "99: \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : "m"(kShuffleAlpha) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + asm volatile( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0xf,%%xmm7 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + + // 4 pixel loop. + LABELALIGN + "40: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqu (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 99f \n" + + // 1 pixel loop. 
+ "91: \n" + "movd (%0),%%xmm3 \n" + "lea 0x4(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd (%1),%%xmm1 \n" + "lea 0x4(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 91b \n" + "99: \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "m"(kShuffleAlpha) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBBLENDROW_SSSE3 @@ -3590,10 +4580,10 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, // =((A2*C2)+(B2*(255-C2))+255)/256 // signed version of math // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -void BlendPlaneRow_SSSE3(const uint8* src0, - const uint8* src1, - const uint8* alpha, - uint8* dst, +void BlendPlaneRow_SSSE3(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, int width) { asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" @@ -3642,10 +4632,10 @@ void BlendPlaneRow_SSSE3(const uint8* src0, // =((A2*C2)+(B2*(255-C2))+255)/256 // signed version of math // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -void BlendPlaneRow_AVX2(const uint8* src0, - const uint8* src1, - const uint8* alpha, - uint8* dst, +void BlendPlaneRow_AVX2(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, int width) { asm volatile( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" @@ -3699,50 +4689,50 @@ void BlendPlaneRow_AVX2(const uint8* src0, #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha -static uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, - 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u}; -static uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, - 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u}; +static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, + 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u}; +static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u}; // Attenuate 4 pixels at a time. -void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm3,%%xmm3 \n" - "pslld $0x18,%%xmm3 \n" - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" +void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + "pcmpeqb %%xmm3,%%xmm3 \n" + "pslld $0x18,%%xmm3 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" - // 4 pixel loop. 
- LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "punpcklbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm1,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "punpckhbw %%xmm2,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "pand %%xmm3,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAlpha0), // %3 - "m"(kShuffleAlpha1) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpcklbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm1,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "punpckhbw %%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "pand %%xmm3,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha0), // %3 + "m"(kShuffleAlpha1) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBATTENUATEROW_SSSE3 @@ -3752,87 +4742,85 @@ static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u}; // Attenuate 8 pixels at a time. -void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "vbroadcastf128 %3,%%ymm4 \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpslld $0x18,%%ymm5,%%ymm5 \n" - "sub %0,%1 \n" +void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpslld $0x18,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" - "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" - "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpand %%ymm5,%%ymm6,%%ymm6 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpor %%ymm6,%%ymm0,%%ymm0 \n" - MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) - "lea " MEMLEA(0x20,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAlpha_AVX2) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + // 8 pixel loop. 
+ LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm6,%%ymm6 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpor %%ymm6,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBATTENUATEROW_AVX2 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 // Unattenuate 4 pixels at a time. -void ARGBUnattenuateRow_SSE2(const uint8* src_argb, - uint8* dst_argb, +void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, int width) { uintptr_t alpha; - asm volatile ( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movzb " MEMACCESS2(0x03,0) ",%3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 - "movzb " MEMACCESS2(0x07,0) ",%3 \n" - MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "movzb " MEMACCESS2(0x0b,0) ",%3 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 - "movzb " MEMACCESS2(0x0f,0) ",%3 \n" - MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width), // %2 - "=&r"(alpha) // %3 - : "r"(fixed_invtbl8) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movzb 0x03(%0),%3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x07(%0),%3 \n" + "movd 0x00(%4,%3,4),%%xmm3 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "movzb 0x0b(%0),%3 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x0f(%0),%3 \n" + "movd 0x00(%4,%3,4),%%xmm3 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "=&r"(alpha) // %3 + : "r"(fixed_invtbl8) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBUNATTENUATEROW_SSE2 @@ -3841,114 +4829,111 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, static const uvec8 kUnattenShuffleAlpha_AVX2 = { 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; // Unattenuate 8 pixels at a time. 
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, - uint8* dst_argb, +void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, int width) { uintptr_t alpha; - asm volatile ( - "sub %0,%1 \n" - "vbroadcastf128 %5,%%ymm5 \n" + asm volatile( + "sub %0,%1 \n" + "vbroadcastf128 %5,%%ymm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - // replace VPGATHER - "movzb " MEMACCESS2(0x03,0) ",%3 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0 - "movzb " MEMACCESS2(0x07,0) ",%3 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1 - "movzb " MEMACCESS2(0x0b,0) ",%3 \n" - "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2 - "movzb " MEMACCESS2(0x0f,0) ",%3 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3 - "movzb " MEMACCESS2(0x13,0) ",%3 \n" - "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0 - "movzb " MEMACCESS2(0x17,0) ",%3 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1 - "movzb " MEMACCESS2(0x1b,0) ",%3 \n" - "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2 - "movzb " MEMACCESS2(0x1f,0) ",%3 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3 - "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" - "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" - "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" - "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" - // end of VPGATHER - - "vmovdqu " MEMACCESS(0) ",%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" - "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" - "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" - "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) - "lea " MEMLEA(0x20,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width), // %2 - "=&r"(alpha) // %3 - : "r"(fixed_invtbl8), // %4 - "m"(kUnattenShuffleAlpha_AVX2) // %5 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 8 pixel loop. 
+ LABELALIGN + "1: \n" + // replace VPGATHER + "movzb 0x03(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm0 \n" + "movzb 0x07(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm1 \n" + "movzb 0x0b(%0),%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" + "vmovd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x0f(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm3 \n" + "movzb 0x13(%0),%3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" + "vmovd 0x00(%4,%3,4),%%xmm0 \n" + "movzb 0x17(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm1 \n" + "movzb 0x1b(%0),%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" + "vmovd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x1f(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" + "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" + "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" + "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" + // end of VPGATHER + + "vmovdqu (%0),%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" + "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "=&r"(alpha) // %3 + : "r"(fixed_invtbl8), // %4 + "m"(kUnattenShuffleAlpha_AVX2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBUNATTENUATEROW_AVX2 #ifdef HAS_ARGBGRAYROW_SSSE3 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels -void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" +void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "phaddw %%xmm1,%%xmm0 \n" - "paddw %%xmm5,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrld $0x18,%%xmm2 \n" - "psrld $0x18,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpcklbw %%xmm2,%%xmm3 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm3,%%xmm0 \n" - "punpckhwd %%xmm3,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + // 8 pixel loop. 
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu 0x10(%0),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "psrld $0x18,%%xmm2 \n" + "psrld $0x18,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm3 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBGRAYROW_SSSE3 @@ -3957,306 +4942,301 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 // Constant for ARGB color to sepia tone -static vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, - 17, 68, 35, 0, 17, 68, 35, 0}; +static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, + 17, 68, 35, 0, 17, 68, 35, 0}; -static vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, - 22, 88, 45, 0, 22, 88, 45, 0}; +static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, + 22, 88, 45, 0, 22, 88, 45, 0}; -static vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, - 24, 98, 50, 0, 24, 98, 50, 0}; +static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, + 24, 98, 50, 0, 24, 98, 50, 0}; // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. -void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { - asm volatile ( - "movdqa %2,%%xmm2 \n" - "movdqa %3,%%xmm3 \n" - "movdqa %4,%%xmm4 \n" +void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { + asm volatile( + "movdqa %2,%%xmm2 \n" + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm6 \n" - "phaddw %%xmm6,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm5 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm5 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm5 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm5 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "movdqu " MEMACCESS(0) ",%%xmm6 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "psrld $0x18,%%xmm6 \n" - "psrld $0x18,%%xmm1 \n" - "packuswb %%xmm1,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm5 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "punpckhwd %%xmm5,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(0) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "sub $0x8,%1 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "m"(kARGBToSepiaB), // %2 - "m"(kARGBToSepiaG), // %3 - "m"(kARGBToSepiaR) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + // 8 pixel loop. 
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm6 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm6 \n" + "phaddw %%xmm6,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqu (%0),%%xmm5 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm5 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqu (%0),%%xmm5 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm5 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psrld $0x18,%%xmm6 \n" + "psrld $0x18,%%xmm1 \n" + "packuswb %%xmm1,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm5 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "punpckhwd %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%1 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "m"(kARGBToSepiaB), // %2 + "m"(kARGBToSepiaG), // %3 + "m"(kARGBToSepiaR) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBSEPIAROW_SSSE3 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 // Tranform 8 ARGB pixels (32 bytes) with color matrix. // Same as Sepia except matrix is provided. -void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, - uint8* dst_argb, - const int8* matrix_argb, +void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, int width) { - asm volatile ( - "movdqu " MEMACCESS(3) ",%%xmm5 \n" - "pshufd $0x00,%%xmm5,%%xmm2 \n" - "pshufd $0x55,%%xmm5,%%xmm3 \n" - "pshufd $0xaa,%%xmm5,%%xmm4 \n" - "pshufd $0xff,%%xmm5,%%xmm5 \n" + asm volatile( + "movdqu (%3),%%xmm5 \n" + "pshufd $0x00,%%xmm5,%%xmm2 \n" + "pshufd $0x55,%%xmm5,%%xmm3 \n" + "pshufd $0xaa,%%xmm5,%%xmm4 \n" + "pshufd $0xff,%%xmm5,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm7 \n" - "movdqu " MEMACCESS(0) ",%%xmm6 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddsw %%xmm7,%%xmm0 \n" - "phaddsw %%xmm1,%%xmm6 \n" - "psraw $0x6,%%xmm0 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS(0) ",%%xmm6 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm6 \n" - "psraw $0x6,%%xmm1 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm1,%%xmm1 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "punpcklwd %%xmm1,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm6 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 8 pixel loop. 
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm7 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddsw %%xmm7,%%xmm0 \n" + "phaddsw %%xmm1,%%xmm6 \n" + "psraw $0x6,%%xmm0 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm1 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm6 \n" + "psraw $0x6,%%xmm1 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm1,%%xmm1 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "punpcklwd %%xmm1,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm6 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm6,0x10(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 #ifdef HAS_ARGBQUANTIZEROW_SSE2 // Quantize 4 ARGB pixels (16 bytes). -void ARGBQuantizeRow_SSE2(uint8* dst_argb, +void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, int scale, int interval_size, int interval_offset, int width) { - asm volatile ( - "movd %2,%%xmm2 \n" - "movd %3,%%xmm3 \n" - "movd %4,%%xmm4 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshufd $0x44,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "pshufd $0x44,%%xmm3,%%xmm3 \n" - "pshuflw $0x40,%%xmm4,%%xmm4 \n" - "pshufd $0x44,%%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "pslld $0x18,%%xmm6 \n" - - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "pmullw %%xmm3,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm7 \n" - "pmullw %%xmm3,%%xmm1 \n" - "pand %%xmm6,%%xmm7 \n" - "paddw %%xmm4,%%xmm0 \n" - "paddw %%xmm4,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(0) " \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "sub $0x4,%1 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + asm volatile( + "movd %2,%%xmm2 \n" + "movd %3,%%xmm3 \n" + "movd %4,%%xmm4 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshufd $0x44,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "pshufd $0x44,%%xmm3,%%xmm3 \n" + "pshuflw $0x40,%%xmm4,%%xmm4 \n" + "pshufd $0x44,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "pslld $0x18,%%xmm6 \n" + + // 4 pixel loop. 
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "pmullw %%xmm3,%%xmm0 \n" + "movdqu (%0),%%xmm7 \n" + "pmullw %%xmm3,%%xmm1 \n" + "pand %%xmm6,%%xmm7 \n" + "paddw %%xmm4,%%xmm0 \n" + "paddw %%xmm4,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqu %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x4,%1 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBQUANTIZEROW_SSE2 #ifdef HAS_ARGBSHADEROW_SSE2 // Shade 4 pixels at a time by specified value. -void ARGBShadeRow_SSE2(const uint8* src_argb, - uint8* dst_argb, +void ARGBShadeRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, int width, - uint32 value) { - asm volatile ( - "movd %3,%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm2 \n" + uint32_t value) { + asm volatile( + "movd %3,%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm2 \n" - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2" - ); + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_ARGBSHADEROW_SSE2 #ifdef HAS_ARGBMULTIPLYROW_SSE2 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBMultiplyRow_SSE2(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { - asm volatile ( - "pxor %%xmm5,%%xmm5 \n" + asm volatile( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqu " MEMACCESS(1) ",%%xmm2 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "movdqu %%xmm0,%%xmm1 \n" - "movdqu %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + "pxor %%xmm5,%%xmm5 \n" + + // 4 pixel loop. 
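+ // Row 0 bytes are replicated into both halves of each 16 bit lane
+ // (punpcklbw with itself, i.e. a * 0x0101); pmulhuw against the zero
+ // extended row 1 bytes then approximates (a * b) / 255 per channel.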
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqu %%xmm0,%%xmm1 \n" + "movdqu %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_ARGBMULTIPLYROW_SSE2 #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_AVX2(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { - asm volatile ( - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile( - // 4 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "vmovdqu " MEMACCESS(1) ",%%ymm3 \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" - "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x20,2) ",%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm1 \n" + "lea 0x20(%0),%0 \n" + "vmovdqu (%1),%%ymm3 \n" + "lea 0x20(%1),%1 \n" + "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" + "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" #if defined(__AVX2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + , + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif ); } @@ -4264,121 +5244,113 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBAddRow_SSE2(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +void ARGBAddRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { - asm volatile ( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqu " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); + asm volatile( + // 4 pixel loop. 
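+ // paddusb adds the two source rows with per-byte unsigned saturation,
+ // 16 bytes (4 ARGB pixels) per iteration.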
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_ARGBADDROW_SSE2 #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBAddRow_AVX2(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +void ARGBAddRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { - asm volatile ( - // 4 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "vmovdqu %%ymm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x20,2) ",%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0" - ); + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vpaddusb (%1),%%ymm0,%%ymm0 \n" + "lea 0x20(%1),%1 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0"); } #endif // HAS_ARGBADDROW_AVX2 #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels, 4 pixels at a time. -void ARGBSubtractRow_SSE2(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { - asm volatile ( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqu " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "psubusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psubusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_ARGBSUBTRACTROW_SSE2 #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_AVX2(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { - asm volatile ( - // 4 pixel loop. 
- LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "vmovdqu %%ymm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x20,2) ",%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0" - ); + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vpsubusb (%1),%%ymm0,%%ymm0 \n" + "lea 0x20(%1),%1 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0"); } #endif // HAS_ARGBSUBTRACTROW_AVX2 @@ -4387,55 +5359,53 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, // -1 0 1 // -2 0 2 // -1 0 1 -void SobelXRow_SSE2(const uint8* src_y0, - const uint8* src_y1, - const uint8* src_y2, - uint8* dst_sobelx, +void SobelXRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, int width) { - asm volatile ( - "sub %0,%1 \n" - "sub %0,%2 \n" - "sub %0,%3 \n" - "pxor %%xmm5,%%xmm5 \n" + asm volatile( + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + "pxor %%xmm5,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 - MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2 - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2 - MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3 - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1) - "lea " MEMLEA(0x8,0) ",%0 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + // 8 pixel loop. 
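+ // For each of 8 pixels: sum the horizontal differences p[x] - p[x+2] of
+ // the three input rows, weighting the middle row by 2, then take the
+ // absolute value (psubw/pmaxsw) and saturate to 8 bits.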
+ LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq 0x2(%0),%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + "movq 0x00(%0,%1,1),%%xmm1 \n" + "movq 0x02(%0,%1,1),%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + "movq 0x00(%0,%2,1),%%xmm2 \n" + "movq 0x02(%0,%2,1),%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x00(%0,%3,1) \n" + "lea 0x8(%0),%0 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SOBELXROW_SSE2 @@ -4444,52 +5414,50 @@ void SobelXRow_SSE2(const uint8* src_y0, // -1 -2 -1 // 0 0 0 // 1 2 1 -void SobelYRow_SSE2(const uint8* src_y0, - const uint8* src_y1, - uint8* dst_sobely, +void SobelYRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, int width) { - asm volatile ( - "sub %0,%1 \n" - "sub %0,%2 \n" - "pxor %%xmm5,%%xmm5 \n" + asm volatile( + "sub %0,%1 \n" + "sub %0,%2 \n" + "pxor %%xmm5,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n" - MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2 - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n" - MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3 - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1) - "lea " MEMLEA(0x8,0) ",%0 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + // 8 pixel loop. 
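+ // Same structure as SobelX, but with vertical differences row0[x] - row1[x]
+ // at column offsets 0, 1 and 2, the offset 1 term weighted by 2, followed
+ // by absolute value and saturation to 8 bits.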
+ LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq 0x00(%0,%1,1),%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + "movq 0x1(%0),%%xmm1 \n" + "movq 0x01(%0,%1,1),%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + "movq 0x2(%0),%%xmm2 \n" + "movq 0x02(%0,%1,1),%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x00(%0,%2,1) \n" + "lea 0x8(%0),%0 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SOBELYROW_SSE2 @@ -4499,83 +5467,79 @@ void SobelYRow_SSE2(const uint8* src_y0, // R = Sobel // G = Sobel // B = Sobel -void SobelRow_SSE2(const uint8* src_sobelx, - const uint8* src_sobely, - uint8* dst_argb, +void SobelRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, int width) { - asm volatile ( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm2 \n" - "punpckhbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm1 \n" - "punpckhwd %%xmm2,%%xmm2 \n" - "por %%xmm5,%%xmm1 \n" - "por %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklwd %%xmm0,%%xmm3 \n" - "punpckhwd %%xmm0,%%xmm0 \n" - "por %%xmm5,%%xmm3 \n" - "por %%xmm5,%%xmm0 \n" - "movdqu %%xmm1," MEMACCESS(2) " \n" - "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" - "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n" - "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n" - "lea " MEMLEA(0x40,2) ",%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm2 \n" + "punpckhbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm1 \n" + "punpckhwd %%xmm2,%%xmm2 \n" + "por %%xmm5,%%xmm1 \n" + "por %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklwd %%xmm0,%%xmm3 \n" + "punpckhwd %%xmm0,%%xmm0 \n" + "por %%xmm5,%%xmm3 \n" + "por %%xmm5,%%xmm0 \n" + "movdqu %%xmm1,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "movdqu %%xmm3,0x20(%2) \n" + "movdqu %%xmm0,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SOBELROW_SSE2 #ifdef HAS_SOBELTOPLANEROW_SSE2 // Adds Sobel X and Sobel Y and stores Sobel into a plane. 
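For reference, the SobelToPlane row below is simply a saturating byte-wise add
of the Sobel X and Sobel Y planes. A minimal scalar sketch (the _Ref name is
illustrative only, not part of libyuv or of this patch):

#include <stdint.h>

// Saturated per-byte add of the Sobel X and Sobel Y planes into one plane.
static void SobelToPlaneRow_Ref(const uint8_t* src_sobelx,
                                const uint8_t* src_sobely,
                                uint8_t* dst_y,
                                int width) {
  for (int i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    dst_y[i] = (uint8_t)(s > 255 ? 255 : s);
  }
}

The paddusb in the SSE2 loop below performs the same clamp-at-255 add on
16 bytes per iteration.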
-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, - const uint8* src_sobely, - uint8* dst_y, +void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, int width) { - asm volatile ( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_SOBELTOPLANEROW_SSE2 @@ -4585,1168 +5549,1247 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, // R = Sobel X // G = Sobel // B = Sobel Y -void SobelXYRow_SSE2(const uint8* src_sobelx, - const uint8* src_sobely, - uint8* dst_argb, +void SobelXYRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, int width) { - asm volatile ( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "paddusb %%xmm1,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "punpckhbw %%xmm5,%%xmm0 \n" - "movdqa %%xmm1,%%xmm4 \n" - "punpcklbw %%xmm2,%%xmm4 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqa %%xmm4,%%xmm6 \n" - "punpcklwd %%xmm3,%%xmm6 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "movdqa %%xmm1,%%xmm7 \n" - "punpcklwd %%xmm0,%%xmm7 \n" - "punpckhwd %%xmm0,%%xmm1 \n" - "movdqu %%xmm6," MEMACCESS(2) " \n" - "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n" - "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n" - "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n" - "lea " MEMLEA(0x40,2) ",%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 8 pixel loop. 
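+ // G (the combined Sobel value) is formed with a saturated add (paddusb) of
+ // the X and Y planes; the punpcklbw/punpcklwd sequence then interleaves
+ // SobelY, Sobel, SobelX and 255 into B, G, R, A byte order.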
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "paddusb %%xmm1,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "punpckhbw %%xmm5,%%xmm0 \n" + "movdqa %%xmm1,%%xmm4 \n" + "punpcklbw %%xmm2,%%xmm4 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqa %%xmm4,%%xmm6 \n" + "punpcklwd %%xmm3,%%xmm6 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "movdqa %%xmm1,%%xmm7 \n" + "punpcklwd %%xmm0,%%xmm7 \n" + "punpckhwd %%xmm0,%%xmm1 \n" + "movdqu %%xmm6,(%2) \n" + "movdqu %%xmm4,0x10(%2) \n" + "movdqu %%xmm7,0x20(%2) \n" + "movdqu %%xmm1,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_SOBELXYROW_SSE2 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 // Creates a table of cumulative sums where each value is a sum of all values // above and to the left of the value, inclusive of the value. -void ComputeCumulativeSumRow_SSE2(const uint8* row, - int32* cumsum, - const int32* previous_cumsum, +void ComputeCumulativeSumRow_SSE2(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, int width) { - asm volatile ( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "test $0xf,%1 \n" - "jne 49f \n" - - // 4 pixel loop. - LABELALIGN - "40: \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm3 \n" - "punpckhbw %%xmm1,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "punpcklwd %%xmm1,%%xmm4 \n" - "punpckhwd %%xmm1,%%xmm5 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu " MEMACCESS(2) ",%%xmm2 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n" - "paddd %%xmm0,%%xmm3 \n" - "paddd %%xmm4,%%xmm0 \n" - "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n" - "paddd %%xmm0,%%xmm4 \n" - "paddd %%xmm5,%%xmm0 \n" - "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n" - "lea " MEMLEA(0x40,2) ",%2 \n" - "paddd %%xmm0,%%xmm5 \n" - "movdqu %%xmm2," MEMACCESS(1) " \n" - "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" - "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n" - "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x4,%3 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" - - // 1 pixel loop. - LABELALIGN - "10: \n" - "movd " MEMACCESS(0) ",%%xmm2 \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu " MEMACCESS(2) ",%%xmm2 \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "paddd %%xmm0,%%xmm2 \n" - "movdqu %%xmm2," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x1,%3 \n" - "jge 10b \n" - - "19: \n" - : "+r"(row), // %0 - "+r"(cumsum), // %1 - "+r"(previous_cumsum), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + asm volatile( + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "test $0xf,%1 \n" + "jne 49f \n" + + // 4 pixel loop. 
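+ // xmm0 accumulates the running sums of this row's B, G, R and A channels
+ // as four 32 bit lanes; each output is that running sum plus the value at
+ // the same position in previous_cumsum, i.e. a 2D inclusive prefix sum.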
+ LABELALIGN + "40: \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm3 \n" + "punpckhbw %%xmm1,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "punpcklwd %%xmm1,%%xmm4 \n" + "punpckhwd %%xmm1,%%xmm5 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%2),%%xmm2 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm0 \n" + "movdqu 0x10(%2),%%xmm3 \n" + "paddd %%xmm0,%%xmm3 \n" + "paddd %%xmm4,%%xmm0 \n" + "movdqu 0x20(%2),%%xmm4 \n" + "paddd %%xmm0,%%xmm4 \n" + "paddd %%xmm5,%%xmm0 \n" + "movdqu 0x30(%2),%%xmm5 \n" + "lea 0x40(%2),%2 \n" + "paddd %%xmm0,%%xmm5 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "movdqu %%xmm4,0x20(%1) \n" + "movdqu %%xmm5,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" + + // 1 pixel loop. + LABELALIGN + "10: \n" + "movd (%0),%%xmm2 \n" + "lea 0x4(%0),%0 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%2),%%xmm2 \n" + "lea 0x10(%2),%2 \n" + "paddd %%xmm0,%%xmm2 \n" + "movdqu %%xmm2,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + + "19: \n" + : "+r"(row), // %0 + "+r"(cumsum), // %1 + "+r"(previous_cumsum), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 -void CumulativeSumToAverageRow_SSE2(const int32* topleft, - const int32* botleft, +void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, + const int32_t* botleft, int width, int area, - uint8* dst, + uint8_t* dst, int count) { - asm volatile ( - "movd %5,%%xmm5 \n" - "cvtdq2ps %%xmm5,%%xmm5 \n" - "rcpss %%xmm5,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "cmpl $0x80,%5 \n" - "ja 40f \n" - - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrld $0x10,%%xmm6 \n" - "cvtdq2ps %%xmm6,%%xmm6 \n" - "addps %%xmm6,%%xmm5 \n" - "mulps %%xmm4,%%xmm5 \n" - "cvtps2dq %%xmm5,%%xmm5 \n" - "packssdw %%xmm5,%%xmm5 \n" - - // 4 pixel small loop. 
- LABELALIGN - "4: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 - MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 - MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 - MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 - "lea " MEMLEA(0x40,0) ",%0 \n" - "psubd " MEMACCESS(1) ",%%xmm0 \n" - "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" - "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" - "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" - MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 - MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 - MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 - MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 - "lea " MEMLEA(0x40,1) ",%1 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jge 4b \n" - "jmp 49f \n" - - // 4 pixel loop \n" - LABELALIGN - "40: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 - MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 - MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 - MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 - "lea " MEMLEA(0x40,0) ",%0 \n" - "psubd " MEMACCESS(1) ",%%xmm0 \n" - "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" - "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" - "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" - MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 - MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 - MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 - MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 - "lea " MEMLEA(0x40,1) ",%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm1,%%xmm1 \n" - "mulps %%xmm4,%%xmm0 \n" - "mulps %%xmm4,%%xmm1 \n" - "cvtdq2ps %%xmm2,%%xmm2 \n" - "cvtdq2ps %%xmm3,%%xmm3 \n" - "mulps %%xmm4,%%xmm2 \n" - "mulps %%xmm4,%%xmm3 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "cvtps2dq %%xmm1,%%xmm1 \n" - "cvtps2dq %%xmm2,%%xmm2 \n" - "cvtps2dq %%xmm3,%%xmm3 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" - - // 1 pixel loop \n" - LABELALIGN - "10: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 - "lea " MEMLEA(0x10,0) ",%0 \n" - "psubd " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 - "lea " MEMLEA(0x10,1) ",%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "mulps %%xmm4,%%xmm0 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x4,2) ",%2 \n" - "sub $0x1,%3 \n" - "jge 10b \n" - "19: \n" - : "+r"(topleft), // %0 - "+r"(botleft), // %1 - "+r"(dst), // %2 - "+rm"(count) // %3 - : "r"((intptr_t)(width)), // %4 - "rm"(area) // %5 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", 
"xmm6" - ); + asm volatile( + "movd %5,%%xmm5 \n" + "cvtdq2ps %%xmm5,%%xmm5 \n" + "rcpss %%xmm5,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "cmpl $0x80,%5 \n" + "ja 40f \n" + + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrld $0x10,%%xmm6 \n" + "cvtdq2ps %%xmm6,%%xmm6 \n" + "addps %%xmm6,%%xmm5 \n" + "mulps %%xmm4,%%xmm5 \n" + "cvtps2dq %%xmm5,%%xmm5 \n" + "packssdw %%xmm5,%%xmm5 \n" + + // 4 pixel small loop. + LABELALIGN + "4: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 4b \n" + "jmp 49f \n" + + // 4 pixel loop + LABELALIGN + "40: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm1,%%xmm1 \n" + "mulps %%xmm4,%%xmm0 \n" + "mulps %%xmm4,%%xmm1 \n" + "cvtdq2ps %%xmm2,%%xmm2 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "cvtps2dq %%xmm1,%%xmm1 \n" + "cvtps2dq %%xmm2,%%xmm2 \n" + "cvtps2dq %%xmm3,%%xmm3 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" + + // 1 pixel loop + LABELALIGN + "10: \n" + "movdqu (%0),%%xmm0 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "mulps %%xmm4,%%xmm0 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + "19: \n" + : "+r"(topleft), // %0 + "+r"(botleft), // %1 + "+r"(dst), // %2 + "+rm"(count) // %3 + : "r"((intptr_t)(width)), // %4 + "rm"(area) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 #ifdef HAS_ARGBAFFINEROW_SSE2 // Copy ARGB pixels from source image with slope to a row of destination. 
LIBYUV_API -void ARGBAffineRow_SSE2(const uint8* src_argb, +void ARGBAffineRow_SSE2(const uint8_t* src_argb, int src_argb_stride, - uint8* dst_argb, + uint8_t* dst_argb, const float* src_dudv, int width) { intptr_t src_argb_stride_temp = src_argb_stride; intptr_t temp; - asm volatile ( - "movq " MEMACCESS(3) ",%%xmm2 \n" - "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n" - "shl $0x10,%1 \n" - "add $0x4,%1 \n" - "movd %1,%%xmm5 \n" - "sub $0x4,%4 \n" - "jl 49f \n" - - "pshufd $0x44,%%xmm7,%%xmm7 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "movdqa %%xmm2,%%xmm0 \n" - "addps %%xmm7,%%xmm0 \n" - "movlhps %%xmm0,%%xmm2 \n" - "movdqa %%xmm7,%%xmm4 \n" - "addps %%xmm4,%%xmm4 \n" - "movdqa %%xmm2,%%xmm3 \n" - "addps %%xmm4,%%xmm3 \n" - "addps %%xmm4,%%xmm4 \n" - - // 4 pixel loop \n" - LABELALIGN - "40: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2 - "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2 - "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts - "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 - MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 - "punpckldq %%xmm6,%%xmm1 \n" - "addps %%xmm4,%%xmm2 \n" - "movq %%xmm1," MEMACCESS(2) " \n" - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 - MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 - "punpckldq %%xmm6,%%xmm0 \n" - "addps %%xmm4,%%xmm3 \n" - "movq %%xmm0," MEMACCESS2(0x08,2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%4 \n" - "jl 19f \n" - - // 1 pixel loop \n" - LABELALIGN - "10: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "pmaddwd %%xmm5,%%xmm0 \n" - "addps %%xmm7,%%xmm2 \n" - "movd %%xmm0,%k1 \n" - MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 - "movd %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x04,2) ",%2 \n" - "sub $0x1,%4 \n" - "jge 10b \n" - "19: \n" - : "+r"(src_argb), // %0 - "+r"(src_argb_stride_temp), // %1 - "+r"(dst_argb), // %2 - "+r"(src_dudv), // %3 - "+rm"(width), // %4 - "=&r"(temp) // %5 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + asm volatile( + "movq (%3),%%xmm2 \n" + "movq 0x08(%3),%%xmm7 \n" + "shl $0x10,%1 \n" + "add $0x4,%1 \n" + "movd %1,%%xmm5 \n" + "sub $0x4,%4 \n" + "jl 49f \n" + + "pshufd $0x44,%%xmm7,%%xmm7 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "movdqa %%xmm2,%%xmm0 \n" + "addps %%xmm7,%%xmm0 \n" + "movlhps %%xmm0,%%xmm2 \n" + "movdqa %%xmm7,%%xmm4 \n" + "addps %%xmm4,%%xmm4 \n" + "movdqa %%xmm2,%%xmm3 \n" + "addps %%xmm4,%%xmm3 \n" + "addps %%xmm4,%%xmm4 \n" + + // 4 pixel loop + LABELALIGN + "40: \n" + "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2 + "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2 + "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts + "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd 0x00(%0,%1,1),%%xmm1 \n" + "movd 0x00(%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm1 \n" + "addps %%xmm4,%%xmm2 \n" + "movq %%xmm1,(%2) \n" + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + "movd 0x00(%0,%1,1),%%xmm0 \n" + "movd 0x00(%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm0 \n" + "addps %%xmm4,%%xmm3 \n" + 
"movq %%xmm0,0x08(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%4 \n" + "jl 19f \n" + + // 1 pixel loop + LABELALIGN + "10: \n" + "cvttps2dq %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "pmaddwd %%xmm5,%%xmm0 \n" + "addps %%xmm7,%%xmm2 \n" + "movd %%xmm0,%k1 \n" + "movd 0x00(%0,%1,1),%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x04(%2),%2 \n" + "sub $0x1,%4 \n" + "jge 10b \n" + "19: \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_stride_temp), // %1 + "+r"(dst_argb), // %2 + "+r"(src_dudv), // %3 + "+rm"(width), // %4 + "=&r"(temp) // %5 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBAFFINEROW_SSE2 #ifdef HAS_INTERPOLATEROW_SSSE3 // Bilinear filter 16x2 -> 16x1 -void InterpolateRow_SSSE3(uint8* dst_ptr, - const uint8* src_ptr, +void InterpolateRow_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { - asm volatile ( - "sub %1,%0 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x80,%3 \n" - "je 50f \n" - - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - - // General purpose row blend. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm2) - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "psubb %%xmm4,%%xmm0 \n" - "psubb %%xmm4,%%xmm1 \n" - "movdqa %%xmm5,%%xmm2 \n" - "movdqa %%xmm5,%%xmm3 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "pmaddubsw %%xmm1,%%xmm3 \n" - "paddw %%xmm4,%%xmm2 \n" - "paddw %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - MEMOPMEM(movdqu,xmm2,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "jmp 99f \n" + asm volatile( + "sub %1,%0 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x80,%3 \n" + "je 50f \n" + + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "psubb %%xmm4,%%xmm0 \n" + "psubb %%xmm4,%%xmm1 \n" + "movdqa %%xmm5,%%xmm2 \n" + "movdqa %%xmm5,%%xmm3 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "pmaddubsw %%xmm1,%%xmm3 \n" + "paddw %%xmm4,%%xmm2 \n" + "paddw %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm2,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "jmp 99f \n" - // Blend 50 / 50. - LABELALIGN - "50: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm1) - "pavgb %%xmm1,%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. 
- LABELALIGN - "100: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+rm"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", "eax", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + // Blend 50 / 50. + LABELALIGN + "50: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+rm"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_INTERPOLATEROW_SSSE3 #ifdef HAS_INTERPOLATEROW_AVX2 // Bilinear filter 32x2 -> 32x1 -void InterpolateRow_AVX2(uint8* dst_ptr, - const uint8* src_ptr, +void InterpolateRow_AVX2(uint8_t* dst_ptr, + const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { - asm volatile ( - "cmp $0x0,%3 \n" - "je 100f \n" - "sub %1,%0 \n" - "cmp $0x80,%3 \n" - "je 50f \n" - - "vmovd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "vmovd %3,%%xmm5 \n" - "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" - "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" - "vbroadcastss %%xmm5,%%ymm5 \n" - "mov $0x80808080,%%eax \n" - "vmovd %%eax,%%xmm4 \n" - "vbroadcastss %%xmm4,%%ymm4 \n" - - // General purpose row blend. - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" - MEMOPREG(vmovdqu,0x00,1,4,1,ymm2) - "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" - "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" - "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" - "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" - "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "jmp 99f \n" + asm volatile( + "cmp $0x0,%3 \n" + "je 100f \n" + "sub %1,%0 \n" + "cmp $0x80,%3 \n" + "je 50f \n" + + "vmovd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "vmovd %3,%%xmm5 \n" + "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" + "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" + "vbroadcastss %%xmm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm4 \n" + "vbroadcastss %%xmm4,%%ymm4 \n" - // Blend 50 / 50. - LABELALIGN - "50: \n" - "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" - VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0) // vpavgb (%1,%4,1),%%ymm0,%%ymm0 - MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 50b \n" - "jmp 99f \n" + // General purpose row blend. 
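+ // Blend = (a * (256 - f) + b * f + 128) >> 8. The pixels are biased by
+ // -128 (vpsubb) so pmaddubsw's signed operand stays in range; the 0x8080
+ // word bias added afterwards cancels that and provides the rounding term.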
+ LABELALIGN + "1: \n" + "vmovdqu (%1),%%ymm0 \n" + "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" + "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 99f \n" - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "rep movsb " MEMMOVESTRING(1,0) " \n" - "jmp 999f \n" + // Blend 50 / 50. + LABELALIGN + "50: \n" + "vmovdqu (%1),%%ymm0 \n" + "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "rep movsb \n" + "jmp 999f \n" - "99: \n" - "vzeroupper \n" - "999: \n" - : "+D"(dst_ptr), // %0 - "+S"(src_ptr), // %1 - "+cm"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", "eax", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm4", "xmm5" - ); + "99: \n" + "vzeroupper \n" + "999: \n" + : "+D"(dst_ptr), // %0 + "+S"(src_ptr), // %1 + "+cm"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"); } #endif // HAS_INTERPOLATEROW_AVX2 #ifdef HAS_ARGBSHUFFLEROW_SSSE3 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_SSSE3(const uint8* src_argb, - uint8* dst_argb, - const uint8* shuffler, +void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, int width) { - asm volatile ( - "movdqu " MEMACCESS(3) ",%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); + asm volatile( + + "movdqu (%3),%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_ARGBSHUFFLEROW_SSSE3 #ifdef HAS_ARGBSHUFFLEROW_AVX2 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
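Conceptually these shuffle rows only permute the four channel bytes within
each pixel according to the shuffler table. A scalar sketch (illustrative
_Ref name; it assumes, as the pshufb masks do, that each shuffler entry
selects a byte within the same pixel):

#include <stdint.h>

static void ARGBShuffleRow_Ref(const uint8_t* src_argb,
                               uint8_t* dst_argb,
                               const uint8_t* shuffler,
                               int width) {
  for (int i = 0; i < width; ++i) {
    for (int j = 0; j < 4; ++j) {
      // Low two bits pick the source channel within the current pixel.
      dst_argb[i * 4 + j] = src_argb[i * 4 + (shuffler[j] & 3)];
    }
  }
}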
-void ARGBShuffleRow_AVX2(const uint8* src_argb, - uint8* dst_argb, - const uint8* shuffler, +void ARGBShuffleRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, int width) { - asm volatile ( - "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); + asm volatile( + + "vbroadcastf128 (%3),%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_ARGBSHUFFLEROW_AVX2 -#ifdef HAS_ARGBSHUFFLEROW_SSE2 -// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_SSE2(const uint8* src_argb, - uint8* dst_argb, - const uint8* shuffler, - int width) { - uintptr_t pixel_temp; - asm volatile ( - "pxor %%xmm5,%%xmm5 \n" - "mov " MEMACCESS(4) ",%k2 \n" - "cmp $0x3000102,%k2 \n" - "je 3012f \n" - "cmp $0x10203,%k2 \n" - "je 123f \n" - "cmp $0x30201,%k2 \n" - "je 321f \n" - "cmp $0x2010003,%k2 \n" - "je 2103f \n" +#ifdef HAS_I422TOYUY2ROW_SSE2 +void I422ToYUY2Row_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( - LABELALIGN - "1: \n" - "movzb " MEMACCESS(4) ",%2 \n" - MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 - "mov %b2," MEMACCESS(1) " \n" - "movzb " MEMACCESS2(0x1,4) ",%2 \n" - MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 - "mov %b2," MEMACCESS2(0x1,1) " \n" - "movzb " MEMACCESS2(0x2,4) ",%2 \n" - MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 - "mov %b2," MEMACCESS2(0x2,1) " \n" - "movzb " MEMACCESS2(0x3,4) ",%2 \n" - MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 - "mov %b2," MEMACCESS2(0x3,1) " \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - "lea " MEMLEA(0x4,1) ",%1 \n" - "sub $0x1,%3 \n" - "jg 1b \n" - "jmp 99f \n" + "sub %1,%2 \n" - LABELALIGN - "123: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pshufhw $0x1b,%%xmm0,%%xmm0 \n" - "pshuflw $0x1b,%%xmm0,%%xmm0 \n" - "pshufhw $0x1b,%%xmm1,%%xmm1 \n" - "pshuflw $0x1b,%%xmm1,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%3 \n" - "jg 123b \n" - "jmp 99f \n" + LABELALIGN + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "add $0x10,%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%3) \n" + "movdqu %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + 
"+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_I422TOYUY2ROW_SSE2 - LABELALIGN - "321: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pshufhw $0x39,%%xmm0,%%xmm0 \n" - "pshuflw $0x39,%%xmm0,%%xmm0 \n" - "pshufhw $0x39,%%xmm1,%%xmm1 \n" - "pshuflw $0x39,%%xmm1,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%3 \n" - "jg 321b \n" - "jmp 99f \n" +#ifdef HAS_I422TOUYVYROW_SSE2 +void I422ToUYVYRow_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( - LABELALIGN - "2103: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pshufhw $0x93,%%xmm0,%%xmm0 \n" - "pshuflw $0x93,%%xmm0,%%xmm0 \n" - "pshufhw $0x93,%%xmm1,%%xmm1 \n" - "pshuflw $0x93,%%xmm1,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%3 \n" - "jg 2103b \n" - "jmp 99f \n" + "sub %1,%2 \n" - LABELALIGN - "3012: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pshufhw $0xc6,%%xmm0,%%xmm0 \n" - "pshuflw $0xc6,%%xmm0,%%xmm0 \n" - "pshufhw $0xc6,%%xmm1,%%xmm1 \n" - "pshuflw $0xc6,%%xmm1,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%3 \n" - "jg 3012b \n" - - "99: \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "=&d"(pixel_temp), // %2 - "+r"(width) // %3 - : "r"(shuffler) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); + LABELALIGN + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "add $0x10,%0 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,(%3) \n" + "movdqu %%xmm2,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } -#endif // HAS_ARGBSHUFFLEROW_SSE2 +#endif // HAS_I422TOUYVYROW_SSE2 -#ifdef HAS_I422TOYUY2ROW_SSE2 -void I422ToYUY2Row_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, +#ifdef HAS_I422TOYUY2ROW_AVX2 +void I422ToYUY2Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, int width) { - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movq " MEMACCESS(1) ",%%xmm2 \n" - MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 - "lea " MEMLEA(0x8,1) ",%1 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(3) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n" - "lea " MEMLEA(0x20,3) ",%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_frame), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", 
"xmm2", "xmm3" - ); + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n" + "vextractf128 $0x0,%%ymm1,(%3) \n" + "vextractf128 $0x0,%%ymm2,0x10(%3) \n" + "vextractf128 $0x1,%%ymm1,0x20(%3) \n" + "vextractf128 $0x1,%%ymm2,0x30(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } -#endif // HAS_I422TOYUY2ROW_SSE2 +#endif // HAS_I422TOYUY2ROW_AVX2 -#ifdef HAS_I422TOUYVYROW_SSE2 -void I422ToUYVYRow_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, +#ifdef HAS_I422TOUYVYROW_AVX2 +void I422ToUYVYRow_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, int width) { - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movq " MEMACCESS(1) ",%%xmm2 \n" - MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 - "lea " MEMLEA(0x8,1) ",%1 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1," MEMACCESS(3) " \n" - "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n" - "lea " MEMLEA(0x20,3) ",%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_frame), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3" - ); + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" + "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" + "vextractf128 $0x0,%%ymm1,(%3) \n" + "vextractf128 $0x0,%%ymm2,0x10(%3) \n" + "vextractf128 $0x1,%%ymm1,0x20(%3) \n" + "vextractf128 $0x1,%%ymm2,0x30(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } -#endif // HAS_I422TOUYVYROW_SSE2 +#endif // HAS_I422TOUYVYROW_AVX2 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 -void ARGBPolynomialRow_SSE2(const uint8* src_argb, - uint8* dst_argb, +void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, const float* poly, int width) { - asm volatile ( - "pxor %%xmm3,%%xmm3 \n" + asm volatile( - // 2 pixel loop. 
- LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "punpcklbw %%xmm3,%%xmm0 \n" - "movdqa %%xmm0,%%xmm4 \n" - "punpcklwd %%xmm3,%%xmm0 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm4,%%xmm4 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm4,%%xmm5 \n" - "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n" - "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n" - "addps " MEMACCESS(3) ",%%xmm0 \n" - "addps " MEMACCESS(3) ",%%xmm4 \n" - "movdqa %%xmm1,%%xmm2 \n" - "movdqa %%xmm5,%%xmm6 \n" - "mulps %%xmm1,%%xmm2 \n" - "mulps %%xmm5,%%xmm6 \n" - "mulps %%xmm2,%%xmm1 \n" - "mulps %%xmm6,%%xmm5 \n" - "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n" - "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n" - "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n" - "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n" - "addps %%xmm2,%%xmm0 \n" - "addps %%xmm6,%%xmm4 \n" - "addps %%xmm1,%%xmm0 \n" - "addps %%xmm5,%%xmm4 \n" - "cvttps2dq %%xmm0,%%xmm0 \n" - "cvttps2dq %%xmm4,%%xmm4 \n" - "packuswb %%xmm4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x2,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(poly) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + "pxor %%xmm3,%%xmm3 \n" + + // 2 pixel loop. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm3,%%xmm0 \n" + "movdqa %%xmm0,%%xmm4 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm4,%%xmm4 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm4,%%xmm5 \n" + "mulps 0x10(%3),%%xmm0 \n" + "mulps 0x10(%3),%%xmm4 \n" + "addps (%3),%%xmm0 \n" + "addps (%3),%%xmm4 \n" + "movdqa %%xmm1,%%xmm2 \n" + "movdqa %%xmm5,%%xmm6 \n" + "mulps %%xmm1,%%xmm2 \n" + "mulps %%xmm5,%%xmm6 \n" + "mulps %%xmm2,%%xmm1 \n" + "mulps %%xmm6,%%xmm5 \n" + "mulps 0x20(%3),%%xmm2 \n" + "mulps 0x20(%3),%%xmm6 \n" + "mulps 0x30(%3),%%xmm1 \n" + "mulps 0x30(%3),%%xmm5 \n" + "addps %%xmm2,%%xmm0 \n" + "addps %%xmm6,%%xmm4 \n" + "addps %%xmm1,%%xmm0 \n" + "addps %%xmm5,%%xmm4 \n" + "cvttps2dq %%xmm0,%%xmm0 \n" + "cvttps2dq %%xmm4,%%xmm4 \n" + "packuswb %%xmm4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x2,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBPOLYNOMIALROW_SSE2 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 -void ARGBPolynomialRow_AVX2(const uint8* src_argb, - uint8* dst_argb, +void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, const float* poly, int width) { - asm volatile ( - "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n" - "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n" - "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n" - "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n" + asm volatile( + "vbroadcastf128 (%3),%%ymm4 \n" + "vbroadcastf128 0x10(%3),%%ymm5 \n" + "vbroadcastf128 0x20(%3),%%ymm6 \n" + "vbroadcastf128 0x30(%3),%%ymm7 \n" - // 2 pixel loop. 
- LABELALIGN - "1: \n" - "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels - "lea " MEMLEA(0x8,0) ",%0 \n" - "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats - "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X - "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X - "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X - "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X - "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X - "vcvttps2dq %%ymm0,%%ymm0 \n" - "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" - "vmovq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x2,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(poly) // %3 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 2 pixel loop. + LABELALIGN + "1: \n" + "vpmovzxbd (%0),%%ymm0 \n" // 2 ARGB pixels + "lea 0x8(%0),%0 \n" + "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats + "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X + "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X + "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X + "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X + "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * + // X + "vcvttps2dq %%ymm0,%%ymm0 \n" + "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" + "vmovq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x2,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBPOLYNOMIALROW_AVX2 #ifdef HAS_HALFFLOATROW_SSE2 static float kScaleBias = 1.9259299444e-34f; -void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) { - asm volatile ( - "pshufd $0x0,%3,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - "sub %0,%1 \n" +void HalfFloatRow_SSE2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + scale *= kScaleBias; + asm volatile( + "movd %3,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "sub %0,%1 \n" - // 16 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" // 8 shorts - "add $0x10,%0 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1 - "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats - "punpckhwd %%xmm5,%%xmm3 \n" - "cvtdq2ps %%xmm3,%%xmm3 \n" - "mulps %%xmm4,%%xmm2 \n" - "mulps %%xmm4,%%xmm3 \n" - "psrld $0xd,%%xmm2 \n" - "psrld $0xd,%%xmm3 \n" - "packssdw %%xmm3,%%xmm2 \n" - MEMOPMEM(movdqu,xmm2,-0x10,0,1,1) - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "x"(scale * kScaleBias) // %3 - : "memory", "cc", - "xmm2", "xmm3", "xmm4", "xmm5" - ); + // 16 pixel loop. 
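// Illustrative scalar sketch (not part of the committed change) of the
// kScaleBias trick the SSE2/AVX2 loops rely on: kScaleBias is 2^-112, the
// exponent-bias difference between binary32 (127) and binary16 (15); after
// multiplying by it, dropping the 13 low mantissa bits (the psrld $0xd)
// leaves the binary16 encoding in the low 16 bits.  Assumes the scaled value
// stays in normal half range; no rounding or NaN handling.  Needs <string.h>.
static uint16_t HalfFloatSample_Ref(uint16_t v, float scale) {
  float f = (float)v * scale * 1.9259299444e-34f;  // scale, then rebias by 2^-112
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));
  return (uint16_t)(bits >> 13);  // truncate 13 mantissa bits -> half float
}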
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm2 \n" // 8 shorts + "add $0x10,%0 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1 + "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats + "punpckhwd %%xmm5,%%xmm3 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "psrld $0xd,%%xmm2 \n" + "psrld $0xd,%%xmm3 \n" + "packssdw %%xmm3,%%xmm2 \n" + "movdqu %%xmm2,-0x10(%0,%1,1) \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(scale) // %3 + : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_HALFFLOATROW_SSE2 #ifdef HAS_HALFFLOATROW_AVX2 -void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { - asm volatile ( - "vbroadcastss %3, %%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - "sub %0,%1 \n" +void HalfFloatRow_AVX2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + scale *= kScaleBias; + asm volatile( + "vbroadcastss %3, %%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" - // 16 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts - "add $0x20,%0 \n" - "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates - "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n" - "vcvtdq2ps %%ymm3,%%ymm3 \n" - "vcvtdq2ps %%ymm2,%%ymm2 \n" - "vmulps %%ymm3,%%ymm4,%%ymm3 \n" - "vmulps %%ymm2,%%ymm4,%%ymm2 \n" - "vpsrld $0xd,%%ymm3,%%ymm3 \n" - "vpsrld $0xd,%%ymm2,%%ymm2 \n" - "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates - MEMOPMEM(vmovdqu,ymm2,-0x20,0,1,1) - "sub $0x10,%2 \n" - "jg 1b \n" + // 16 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm2 \n" // 16 shorts + "add $0x20,%0 \n" + "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates + "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vmulps %%ymm3,%%ymm4,%%ymm3 \n" + "vmulps %%ymm2,%%ymm4,%%ymm2 \n" + "vpsrld $0xd,%%ymm3,%%ymm3 \n" + "vpsrld $0xd,%%ymm2,%%ymm2 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates + "vmovdqu %%ymm2,-0x20(%0,%1,1) \n" + "sub $0x10,%2 \n" + "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "x"(scale * kScaleBias) // %3 - : "memory", "cc", - "xmm2", "xmm3", "xmm4", "xmm5" - ); + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 +#if defined(__x86_64__) + : "x"(scale) // %3 +#else + : "m"(scale) // %3 +#endif + : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_HALFFLOATROW_AVX2 #ifdef HAS_HALFFLOATROW_F16C -void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) { - asm volatile ( - "vbroadcastss %3, %%ymm4 \n" - "sub %0,%1 \n" +void HalfFloatRow_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + asm volatile( + "vbroadcastss %3, %%ymm4 \n" + "sub %0,%1 \n" - // 16 pixel loop. - LABELALIGN - "1: \n" - "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints - "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" - "vcvtdq2ps %%ymm2,%%ymm2 \n" - "vcvtdq2ps %%ymm3,%%ymm3 \n" - "vmulps %%ymm2,%%ymm4,%%ymm2 \n" - "vmulps %%ymm3,%%ymm4,%%ymm3 \n" - "vcvtps2ph $3, %%ymm2, %%xmm2 \n" - "vcvtps2ph $3, %%ymm3, %%xmm3 \n" - MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1) - MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1) - "add $0x20,%0 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "x"(scale) // %3 - : "memory", "cc", - "xmm2", "xmm3", "xmm4" - ); + // 16 pixel loop. 
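// Illustrative scalar sketch (not part of the committed change): with F16C
// the hardware conversion replaces the manual scale-bias trick above.
// Assumes the _cvtss_sh intrinsic from <immintrin.h>; immediate 3 selects
// round-toward-zero, matching the $3 passed to vcvtps2ph in the loop below.
static uint16_t HalfFloatSample_F16C_Ref(uint16_t v, float scale) {
  return (uint16_t)_cvtss_sh((float)v * scale, 3);
}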
+ LABELALIGN + "1: \n" + "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints + "vpmovzxwd 0x10(%0),%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vmulps %%ymm2,%%ymm4,%%ymm2 \n" + "vmulps %%ymm3,%%ymm4,%%ymm3 \n" + "vcvtps2ph $3, %%ymm2, %%xmm2 \n" + "vcvtps2ph $3, %%ymm3, %%xmm3 \n" + "vmovdqu %%xmm2,0x00(%0,%1,1) \n" + "vmovdqu %%xmm3,0x10(%0,%1,1) \n" + "add $0x20,%0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 +#if defined(__x86_64__) + : "x"(scale) // %3 +#else + : "m"(scale) // %3 +#endif + : "memory", "cc", "xmm2", "xmm3", "xmm4"); } #endif // HAS_HALFFLOATROW_F16C #ifdef HAS_HALFFLOATROW_F16C -void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) { - asm volatile ( - "sub %0,%1 \n" - // 16 pixel loop. - LABELALIGN - "1: \n" - "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints - "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" - "vcvtdq2ps %%ymm2,%%ymm2 \n" - "vcvtdq2ps %%ymm3,%%ymm3 \n" - "vcvtps2ph $3, %%ymm2, %%xmm2 \n" - "vcvtps2ph $3, %%ymm3, %%xmm3 \n" - MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1) - MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1) - "add $0x20,%0 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", - "xmm2", "xmm3" - ); +void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) { + asm volatile( + "sub %0,%1 \n" + // 16 pixel loop. + LABELALIGN + "1: \n" + "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints + "vpmovzxwd 0x10(%0),%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vcvtps2ph $3, %%ymm2, %%xmm2 \n" + "vcvtps2ph $3, %%ymm3, %%xmm3 \n" + "vmovdqu %%xmm2,0x00(%0,%1,1) \n" + "vmovdqu %%xmm3,0x10(%0,%1,1) \n" + "add $0x20,%0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm2", "xmm3"); } #endif // HAS_HALFFLOATROW_F16C #ifdef HAS_ARGBCOLORTABLEROW_X86 // Tranform ARGB pixels with color table. -void ARGBColorTableRow_X86(uint8* dst_argb, - const uint8* table_argb, +void ARGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, int width) { uintptr_t pixel_temp; - asm volatile ( - // 1 pixel loop. - LABELALIGN - "1: \n" - "movzb " MEMACCESS(0) ",%1 \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x4,0) " \n" - "movzb " MEMACCESS2(-0x3,0) ",%1 \n" - MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x3,0) " \n" - "movzb " MEMACCESS2(-0x2,0) ",%1 \n" - MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x2,0) " \n" - "movzb " MEMACCESS2(-0x1,0) ",%1 \n" - MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x1,0) " \n" - "dec %2 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "=&d"(pixel_temp), // %1 - "+r"(width) // %2 - : "r"(table_argb) // %3 - : "memory", "cc"); + asm volatile( + // 1 pixel loop. 
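// Illustrative scalar sketch (not part of the committed change) of the
// in-place lookup done by the 1-pixel loop below: table_argb is assumed to
// hold 256 interleaved ARGB entries, so each channel indexes its own column.
static void ARGBColorTableRow_Ref(uint8_t* dst_argb,
                                  const uint8_t* table_argb,
                                  int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}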
+ LABELALIGN + "1: \n" + "movzb (%0),%1 \n" + "lea 0x4(%0),%0 \n" + "movzb 0x00(%3,%1,4),%1 \n" + "mov %b1,-0x4(%0) \n" + "movzb -0x3(%0),%1 \n" + "movzb 0x01(%3,%1,4),%1 \n" + "mov %b1,-0x3(%0) \n" + "movzb -0x2(%0),%1 \n" + "movzb 0x02(%3,%1,4),%1 \n" + "mov %b1,-0x2(%0) \n" + "movzb -0x1(%0),%1 \n" + "movzb 0x03(%3,%1,4),%1 \n" + "mov %b1,-0x1(%0) \n" + "dec %2 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "=&d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 + : "memory", "cc"); } #endif // HAS_ARGBCOLORTABLEROW_X86 #ifdef HAS_RGBCOLORTABLEROW_X86 // Tranform RGB pixels with color table. -void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { +void RGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { uintptr_t pixel_temp; - asm volatile ( - // 1 pixel loop. - LABELALIGN - "1: \n" - "movzb " MEMACCESS(0) ",%1 \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x4,0) " \n" - "movzb " MEMACCESS2(-0x3,0) ",%1 \n" - MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x3,0) " \n" - "movzb " MEMACCESS2(-0x2,0) ",%1 \n" - MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x2,0) " \n" - "dec %2 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "=&d"(pixel_temp), // %1 - "+r"(width) // %2 - : "r"(table_argb) // %3 - : "memory", "cc"); + asm volatile( + // 1 pixel loop. + LABELALIGN + "1: \n" + "movzb (%0),%1 \n" + "lea 0x4(%0),%0 \n" + "movzb 0x00(%3,%1,4),%1 \n" + "mov %b1,-0x4(%0) \n" + "movzb -0x3(%0),%1 \n" + "movzb 0x01(%3,%1,4),%1 \n" + "mov %b1,-0x3(%0) \n" + "movzb -0x2(%0),%1 \n" + "movzb 0x02(%3,%1,4),%1 \n" + "mov %b1,-0x2(%0) \n" + "dec %2 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "=&d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 + : "memory", "cc"); } #endif // HAS_RGBCOLORTABLEROW_X86 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 // Tranform RGB pixels with luma table. -void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, - uint8* dst_argb, +void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, int width, - const uint8* luma, - uint32 lumacoeff) { + const uint8_t* luma, + uint32_t lumacoeff) { uintptr_t pixel_temp; uintptr_t table_temp; - asm volatile ( - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0x8,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" + asm volatile( + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0x8,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" - // 4 pixel loop. 
- LABELALIGN - "1: \n" - "movdqu " MEMACCESS(2) ",%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "phaddw %%xmm0,%%xmm0 \n" - "pand %%xmm4,%%xmm0 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb " MEMACCESS(2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS(3) " \n" - "movzb " MEMACCESS2(0x1,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x1,3) " \n" - "movzb " MEMACCESS2(0x2,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x2,3) " \n" - "movzb " MEMACCESS2(0x3,2) ",%0 \n" - "mov %b0," MEMACCESS2(0x3,3) " \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb " MEMACCESS2(0x4,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x4,3) " \n" - "movzb " MEMACCESS2(0x5,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x5,3) " \n" - "movzb " MEMACCESS2(0x6,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x6,3) " \n" - "movzb " MEMACCESS2(0x7,2) ",%0 \n" - "mov %b0," MEMACCESS2(0x7,3) " \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb " MEMACCESS2(0x8,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x8,3) " \n" - "movzb " MEMACCESS2(0x9,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x9,3) " \n" - "movzb " MEMACCESS2(0xa,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0xa,3) " \n" - "movzb " MEMACCESS2(0xb,2) ",%0 \n" - "mov %b0," MEMACCESS2(0xb,3) " \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - - "movzb " MEMACCESS2(0xc,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0xc,3) " \n" - "movzb " MEMACCESS2(0xd,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0xd,3) " \n" - "movzb " MEMACCESS2(0xe,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0xe,3) " \n" - "movzb " MEMACCESS2(0xf,2) ",%0 \n" - "mov %b0," MEMACCESS2(0xf,3) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "lea " MEMLEA(0x10,3) ",%3 \n" - "sub $0x4,%4 \n" - "jg 1b \n" - : "=&d"(pixel_temp), // %0 - "=&a"(table_temp), // %1 - "+r"(src_argb), // %2 - "+r"(dst_argb), // %3 - "+rm"(width) // %4 - : "r"(luma), // %5 - "rm"(lumacoeff) // %6 - : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5" - ); + // 4 pixel loop. 
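// Illustrative scalar sketch (not part of the committed change) of the
// 4-pixel loop below: a weighted sum of the channels, rounded down to a
// multiple of 256 (the pand with 0xff00), selects one 256-byte row of 'luma';
// B, G and R are remapped through that row and A is copied.  The coefficient
// packing (low byte weighting B) is an assumption read off the pmaddubsw
// setup and holds only for small, non-negative coefficient bytes.
static void ARGBLumaColorTableRow_Ref(const uint8_t* src_argb,
                                      uint8_t* dst_argb,
                                      int width,
                                      const uint8_t* luma,
                                      uint32_t lumacoeff) {
  int i;
  for (i = 0; i < width; ++i) {
    uint32_t sum = src_argb[0] * (lumacoeff & 0xff) +
                   src_argb[1] * ((lumacoeff >> 8) & 0xff) +
                   src_argb[2] * ((lumacoeff >> 16) & 0xff) +
                   src_argb[3] * ((lumacoeff >> 24) & 0xff);
    const uint8_t* table = luma + (sum & 0xff00);  // one table per luma level
    dst_argb[0] = table[src_argb[0]];
    dst_argb[1] = table[src_argb[1]];
    dst_argb[2] = table[src_argb[2]];
    dst_argb[3] = src_argb[3];  // alpha passes through unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}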
+ LABELALIGN + "1: \n" + "movdqu (%2),%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "phaddw %%xmm0,%%xmm0 \n" + "pand %%xmm4,%%xmm0 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb (%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,(%3) \n" + "movzb 0x1(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x1(%3) \n" + "movzb 0x2(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x2(%3) \n" + "movzb 0x3(%2),%0 \n" + "mov %b0,0x3(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb 0x4(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x4(%3) \n" + "movzb 0x5(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x5(%3) \n" + "movzb 0x6(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x6(%3) \n" + "movzb 0x7(%2),%0 \n" + "mov %b0,0x7(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb 0x8(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x8(%3) \n" + "movzb 0x9(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x9(%3) \n" + "movzb 0xa(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xa(%3) \n" + "movzb 0xb(%2),%0 \n" + "mov %b0,0xb(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + + "movzb 0xc(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xc(%3) \n" + "movzb 0xd(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xd(%3) \n" + "movzb 0xe(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xe(%3) \n" + "movzb 0xf(%2),%0 \n" + "mov %b0,0xf(%3) \n" + "lea 0x10(%2),%2 \n" + "lea 0x10(%3),%3 \n" + "sub $0x4,%4 \n" + "jg 1b \n" + : "=&d"(pixel_temp), // %0 + "=&a"(table_temp), // %1 + "+r"(src_argb), // %2 + "+r"(dst_argb), // %3 + "+rm"(width) // %4 + : "r"(luma), // %5 + "rm"(lumacoeff) // %6 + : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 +#ifdef HAS_NV21TOYUV24ROW_AVX2 + +// begin NV21ToYUV24Row_C avx2 constants +static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00, + 0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, + 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, + 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00}; + +static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, + 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80}; + +static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, + 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, + 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00}; + +static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d, + 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05, + 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d, + 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05}; + +static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, + 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, + 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, + 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80}; + +static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, + 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, + 0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, + 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f}; + +static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80, + 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 
0x80, + 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80, + 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80}; + +static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, + 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, + 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, + 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a}; + +static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, + 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, + 0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, + 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80}; + +// NV21ToYUV24Row_AVX2 +void NV21ToYUV24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + uint8_t* src_y_ptr; + uint64_t src_offset = 0; + uint64_t width64; + + width64 = width; + src_y_ptr = (uint8_t*)src_y; + + asm volatile( + "vmovdqu %5, %%ymm0 \n" // init blend value + "vmovdqu %6, %%ymm1 \n" // init blend value + "vmovdqu %7, %%ymm2 \n" // init blend value + // "sub $0x20, %3 \n" //sub 32 from width for final loop + + LABELALIGN + "1: \n" // label 1 + "vmovdqu (%0,%4), %%ymm3 \n" // src_y + "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1 + "vmovdqu (%1), %%ymm5 \n" // src_uv + "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf + "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for + // shuf + "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for + // shuf + "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf + "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for + // shuf + "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0 + "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0 + "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2 + "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1 + "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const + "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results + "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h + "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results + "add $0x20, %4 \n" // add to src buffer + // ptr + "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert + "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert + "vmovdqu %%ymm4, (%2) \n" // store dst_yuv + "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h + "add $0x60,%2 \n" // add to dst buffer + // ptr + // "cmp %3, %4 \n" //(width64 - + // 32 bytes) and src_offset + "sub $0x20,%3 \n" // 32 pixels per loop + "jg 1b \n" + "vzeroupper \n" // sse-avx2 + // transistions + + : "+r"(src_y), //%0 + "+r"(src_vu), //%1 + "+r"(dst_yuv24), //%2 + "+r"(width64), //%3 + "+r"(src_offset) //%4 + : "m"(kBLEND0), //%5 + "m"(kBLEND1), //%6 + "m"(kBLEND2), //%7 + "m"(kSHUF0), //%8 + "m"(kSHUF1), //%9 + "m"(kSHUF2), //%10 + "m"(kSHUF3), //%11 + "m"(kSHUF4), //%12 + "m"(kSHUF5) //%13 + : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12", + "xmm13", "xmm14", "xmm15"); +} +#endif // HAS_NV21TOYUV24ROW_AVX2 + #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/files/source/row_mmi.cc b/files/source/row_mmi.cc new file mode 100644 index 00000000..d8726d09 --- /dev/null +++ b/files/source/row_mmi.cc @@ -0,0 +1,6042 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#include "libyuv/row.h" + +#include <string.h> // For memcpy and memset. + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for Mips MMI. +#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) + +void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + uint64_t src0, src1, dest; + const uint64_t mask = 0xff000000ULL; + + __asm__ volatile( + "1: \n\t" + "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" + "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" + "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" + + "or %[src0], %[src0], %[mask] \n\t" + "or %[src1], %[src1], %[mask] \n\t" + "punpcklwd %[dest], %[src0], %[src1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t" + "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t" + "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t" + "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t" + + "or %[src0], %[src0], %[mask] \n\t" + "or %[src1], %[src1], %[mask] \n\t" + "punpcklwd %[dest], %[src0], %[src1] \n\t" + "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) + : [src_ptr] "r"(src_rgb24), [dst_ptr] "r"(dst_argb), [width] "r"(width), + [mask] "f"(mask) + : "memory"); +} + +void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + uint64_t src0, src1, dest; + const uint64_t mask0 = 0x0; + const uint64_t mask1 = 0xff000000ULL; + const uint64_t mask2 = 0xc6; + + __asm__ volatile( + "1: \n\t" + "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" + "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" + "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" + + "or %[src0], %[src0], %[mask1] \n\t" + "punpcklbh %[src0], %[src0], %[mask0] \n\t" + "pshufh %[src0], %[src0], %[mask2] \n\t" + "or %[src1], %[src1], %[mask1] \n\t" + "punpcklbh %[src1], %[src1], %[mask0] \n\t" + "pshufh %[src1], %[src1], %[mask2] \n\t" + "packushb %[dest], %[src0], %[src1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t" + "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t" + "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t" + "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t" + + "or %[src0], %[src0], %[mask1] \n\t" + "punpcklbh %[src0], %[src0], %[mask0] \n\t" + "pshufh %[src0], %[src0], %[mask2] \n\t" + "or %[src1], %[src1], %[mask1] \n\t" + "punpcklbh %[src1], %[src1], %[mask0] \n\t" + "pshufh %[src1], %[src1], %[mask2] \n\t" + "packushb %[dest], %[src0], %[src1] \n\t" + "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) + : [src_ptr] "r"(src_raw), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), + [mask1] "f"(mask1), [mask2] "f"(mask2), [width] "r"(width) + : "memory"); +} + +void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + uint64_t src0, src1; + uint64_t ftmp[4]; + uint64_t mask0 = 0xc6; + uint64_t mask1 = 0x6c; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 
0x00(%[src_raw]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_raw]) \n\t" + "gslwrc1 %[src1], 0x08(%[src_raw]) \n\t" + "gslwlc1 %[src1], 0x0b(%[src_raw]) \n\t" + + "punpcklbh %[ftmp0], %[src0], %[zero] \n\t" + "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t" + "punpckhbh %[ftmp1], %[src0], %[zero] \n\t" + "punpcklbh %[src1], %[src1], %[zero] \n\t" + "pextrh %[ftmp2], %[ftmp0], %[three] \n\t" + "pextrh %[ftmp3], %[ftmp1], %[one] \n\t" + "pinsrh_3 %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "pextrh %[ftmp3], %[ftmp1], %[two] \n\t" + "pinsrh_1 %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pshufh %[src1], %[src1], %[mask1] \n\t" + "pextrh %[ftmp2], %[src1], %[zero] \n\t" + "pinsrh_2 %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pinsrh_0 %[src1], %[src1], %[ftmp3] \n\t" + "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "packushb %[src1], %[src1], %[zero] \n\t" + + "gssdrc1 %[ftmp0], 0x00(%[dst_rgb24]) \n\t" + "gssdlc1 %[ftmp0], 0x07(%[dst_rgb24]) \n\t" + "gsswrc1 %[src1], 0x08(%[dst_rgb24]) \n\t" + "gsswlc1 %[src1], 0x0b(%[dst_rgb24]) \n\t" + + "daddiu %[src_raw], %[src_raw], 0x0c \n\t" + "daddiu %[dst_rgb24], %[dst_rgb24], 0x0c \n\t" + "daddiu %[width], %[width], -0x04 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]), + [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]) + : [src_raw] "r"(src_raw), [dst_rgb24] "r"(dst_rgb24), [width] "r"(width), + [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00), + [one] "f"(0x01), [two] "f"(0x02), [three] "f"(0x03) + : "memory"); +} + +void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + uint64_t ftmp[5]; + uint64_t c0 = 0x001f001f001f001f; + uint64_t c1 = 0x00ff00ff00ff00ff; + uint64_t c2 = 0x0007000700070007; + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" + "psrlh %[src1], %[src0], %[eight] \n\t" + "and %[b], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[src0], %[src0], %[five] \n\t" + "and %[g], %[src1], %[c2] \n\t" + "psllh %[g], %[g], %[three] \n\t" + "or %[g], %[src0], %[g] \n\t" + "psrlh %[r], %[src1], %[three] \n\t" + "psllh %[src0], %[b], %[three] \n\t" + "psrlh %[src1], %[b], %[two] \n\t" + "or %[b], %[src0], %[src1] \n\t" + "psllh %[src0], %[g], %[two] \n\t" + "psrlh %[src1], %[g], %[four] \n\t" + "or %[g], %[src0], %[src1] \n\t" + "psllh %[src0], %[r], %[three] \n\t" + "psrlh %[src1], %[r], %[two] \n\t" + "or %[r], %[src0], %[src1] \n\t" + "packushb %[b], %[b], %[r] \n\t" + "packushb %[g], %[g], %[c1] \n\t" + "punpcklbh %[src0], %[b], %[g] \n\t" + "punpckhbh %[src1], %[b], %[g] \n\t" + "punpcklhw %[r], %[src0], %[src1] \n\t" + "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" + "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" + "punpckhhw %[r], %[src0], %[src1] \n\t" + "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" + "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" + "daddiu %[src_rgb565], %[src_rgb565], 0x08 \n\t" + "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" + "daddiu %[width], %[width], -0x04 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), + [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]) + : [src_rgb565] "r"(src_rgb565), [dst_argb] "r"(dst_argb), + [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), + [eight] "f"(0x08), [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), + [four] "f"(0x04) + : "memory"); +} + +void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + uint64_t ftmp[6]; + uint64_t c0 = 
0x001f001f001f001f; + uint64_t c1 = 0x00ff00ff00ff00ff; + uint64_t c2 = 0x0003000300030003; + uint64_t c3 = 0x007c007c007c007c; + uint64_t c4 = 0x0001000100010001; + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" + "psrlh %[src1], %[src0], %[eight] \n\t" + "and %[b], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[src0], %[src0], %[five] \n\t" + "and %[g], %[src1], %[c2] \n\t" + "psllh %[g], %[g], %[three] \n\t" + "or %[g], %[src0], %[g] \n\t" + "and %[r], %[src1], %[c3] \n\t" + "psrlh %[r], %[r], %[two] \n\t" + "psrlh %[a], %[src1], %[seven] \n\t" + "psllh %[src0], %[b], %[three] \n\t" + "psrlh %[src1], %[b], %[two] \n\t" + "or %[b], %[src0], %[src1] \n\t" + "psllh %[src0], %[g], %[three] \n\t" + "psrlh %[src1], %[g], %[two] \n\t" + "or %[g], %[src0], %[src1] \n\t" + "psllh %[src0], %[r], %[three] \n\t" + "psrlh %[src1], %[r], %[two] \n\t" + "or %[r], %[src0], %[src1] \n\t" + "xor %[a], %[a], %[c1] \n\t" + "paddb %[a], %[a], %[c4] \n\t" + "packushb %[b], %[b], %[r] \n\t" + "packushb %[g], %[g], %[a] \n\t" + "punpcklbh %[src0], %[b], %[g] \n\t" + "punpckhbh %[src1], %[b], %[g] \n\t" + "punpcklhw %[r], %[src0], %[src1] \n\t" + "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" + "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" + "punpckhhw %[r], %[src0], %[src1] \n\t" + "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" + "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" + "daddiu %[src_argb1555], %[src_argb1555], 0x08 \n\t" + "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" + "daddiu %[width], %[width], -0x04 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), + [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5]) + : [src_argb1555] "r"(src_argb1555), [dst_argb] "r"(dst_argb), + [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), + [c3] "f"(c3), [c4] "f"(c4), [eight] "f"(0x08), [five] "f"(0x05), + [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07) + : "memory"); +} + +void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + uint64_t ftmp[6]; + uint64_t c0 = 0x000f000f000f000f; + uint64_t c1 = 0x00ff00ff00ff00ff; + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" + "psrlh %[src1], %[src0], %[eight] \n\t" + "and %[b], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[g], %[src0], %[four] \n\t" + "and %[r], %[src1], %[c0] \n\t" + "psrlh %[a], %[src1], %[four] \n\t" + "psllh %[src0], %[b], %[four] \n\t" + "or %[b], %[src0], %[b] \n\t" + "psllh %[src0], %[g], %[four] \n\t" + "or %[g], %[src0], %[g] \n\t" + "psllh %[src0], %[r], %[four] \n\t" + "or %[r], %[src0], %[r] \n\t" + "psllh %[src0], %[a], %[four] \n\t" + "or %[a], %[src0], %[a] \n\t" + "packushb %[b], %[b], %[r] \n\t" + "packushb %[g], %[g], %[a] \n\t" + "punpcklbh %[src0], %[b], %[g] \n\t" + "punpckhbh %[src1], %[b], %[g] \n\t" + "punpcklhw %[r], %[src0], %[src1] \n\t" + "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" + "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" + "punpckhhw %[r], %[src0], %[src1] \n\t" + "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" + "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" + "daddiu %[src_argb4444], %[src_argb4444], 0x08 \n\t" + "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" + "daddiu %[width], %[width], -0x04 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), + [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5]) + : [src_argb4444] 
"r"(src_argb4444), [dst_argb] "r"(dst_argb), + [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [eight] "f"(0x08), + [four] "f"(0x04) + : "memory"); +} + +void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + uint64_t src; + + __asm__ volatile( + "1: \n\t" + "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t" + "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t" + "gsswlc1 %[src], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[src], 0x00(%[dst_ptr]) \n\t" + + "gslwlc1 %[src], 0x07(%[src_ptr]) \n\t" + "gslwrc1 %[src], 0x04(%[src_ptr]) \n\t" + "gsswlc1 %[src], 0x06(%[dst_ptr]) \n\t" + "gsswrc1 %[src], 0x03(%[dst_ptr]) \n\t" + + "gslwlc1 %[src], 0x0b(%[src_ptr]) \n\t" + "gslwrc1 %[src], 0x08(%[src_ptr]) \n\t" + "gsswlc1 %[src], 0x09(%[dst_ptr]) \n\t" + "gsswrc1 %[src], 0x06(%[dst_ptr]) \n\t" + + "gslwlc1 %[src], 0x0f(%[src_ptr]) \n\t" + "gslwrc1 %[src], 0x0c(%[src_ptr]) \n\t" + "gsswlc1 %[src], 0x0c(%[dst_ptr]) \n\t" + "gsswrc1 %[src], 0x09(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src) + : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_rgb), [width] "r"(width) + : "memory"); +} + +void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + uint64_t src0, src1; + uint64_t ftmp[3]; + uint64_t mask0 = 0xc6; + uint64_t mask1 = 0x18; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" + + "punpcklbh %[ftmp0], %[src0], %[zero] \n\t" + "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t" + "punpckhbh %[ftmp1], %[src0], %[zero] \n\t" + "punpcklbh %[ftmp2], %[src1], %[zero] \n\t" + "punpckhbh %[src1], %[src1], %[zero] \n\t" + + "pextrh %[src0], %[ftmp1], %[two] \n\t" + "pinsrh_3 %[ftmp0], %[ftmp0], %[src0] \n\t" + "pshufh %[ftmp1], %[ftmp1], %[one] \n\t" + + "pextrh %[src0], %[ftmp2], %[two] \n\t" + "pinsrh_2 %[ftmp1], %[ftmp1], %[src0] \n\t" + "pextrh %[src0], %[ftmp2], %[one] \n\t" + "pinsrh_3 %[ftmp1], %[ftmp1], %[src0] \n\t" + "pextrh %[src0], %[ftmp2], %[zero] \n\t" + "pshufh %[src1], %[src1], %[mask1] \n\t" + "pinsrh_0 %[src1], %[src1], %[src0] \n\t" + "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "packushb %[src1], %[src1], %[zero] \n\t" + + "gssdrc1 %[ftmp0], 0x00(%[dst_rgb]) \n\t" + "gssdlc1 %[ftmp0], 0x07(%[dst_rgb]) \n\t" + "gsswrc1 %[src1], 0x08(%[dst_rgb]) \n\t" + "gsswlc1 %[src1], 0x0b(%[dst_rgb]) \n\t" + + "daddiu %[src_argb], %[src_argb], 0x10 \n\t" + "daddiu %[dst_rgb], %[dst_rgb], 0x0c \n\t" + "daddiu %[width], %[width], -0x04 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]), + [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]) + : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), + [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00), + [one] "f"(0x01), [two] "f"(0x02) + : "memory"); +} + +void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + uint64_t src0, src1; + uint64_t ftmp[3]; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" + + "punpcklbh %[b], %[src0], %[src1] \n\t" + "punpckhbh %[g], %[src0], %[src1] \n\t" + "punpcklbh %[src0], %[b], %[g] \n\t" + "punpckhbh %[src1], %[b], %[g] \n\t" + "punpcklbh %[b], %[src0], 
%[zero] \n\t" + "punpckhbh %[g], %[src0], %[zero] \n\t" + "punpcklbh %[r], %[src1], %[zero] \n\t" + + "psrlh %[b], %[b], %[three] \n\t" + "psrlh %[g], %[g], %[two] \n\t" + "psrlh %[r], %[r], %[three] \n\t" + + "psllh %[g], %[g], %[five] \n\t" + "psllh %[r], %[r], %[eleven] \n\t" + "or %[b], %[b], %[g] \n\t" + "or %[b], %[b], %[r] \n\t" + + "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" + "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" + + "daddiu %[src_argb], %[src_argb], 0x10 \n\t" + "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" + "daddiu %[width], %[width], -0x04 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), + [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]) + : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), + [zero] "f"(0x00), [two] "f"(0x02), [three] "f"(0x03), [five] "f"(0x05), + [eleven] "f"(0x0b) + : "memory"); +} + +// dither4 is a row of 4 values from 4x4 dither matrix. +// The 4x4 matrix contains values to increase RGB. When converting to +// fewer bits (565) this provides an ordered dither. +// The order in the 4x4 matrix in first byte is upper left. +// The 4 values are passed as an int, then referenced as an array, so +// endian will not affect order of the original matrix. But the dither4 +// will containing the first pixel in the lower byte for little endian +// or the upper byte for big endian. +void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { + uint64_t src0, src1; + uint64_t ftmp[3]; + uint64_t c0 = 0x00ff00ff00ff00ff; + + __asm__ volatile( + "punpcklbh %[dither], %[dither], %[zero] \n\t" + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" + + "punpcklbh %[b], %[src0], %[src1] \n\t" + "punpckhbh %[g], %[src0], %[src1] \n\t" + "punpcklbh %[src0], %[b], %[g] \n\t" + "punpckhbh %[src1], %[b], %[g] \n\t" + "punpcklbh %[b], %[src0], %[zero] \n\t" + "punpckhbh %[g], %[src0], %[zero] \n\t" + "punpcklbh %[r], %[src1], %[zero] \n\t" + + "paddh %[b], %[b], %[dither] \n\t" + "paddh %[g], %[g], %[dither] \n\t" + "paddh %[r], %[r], %[dither] \n\t" + "pcmpgth %[src0], %[b], %[c0] \n\t" + "or %[src0], %[src0], %[b] \n\t" + "and %[b], %[src0], %[c0] \n\t" + "pcmpgth %[src0], %[g], %[c0] \n\t" + "or %[src0], %[src0], %[g] \n\t" + "and %[g], %[src0], %[c0] \n\t" + "pcmpgth %[src0], %[r], %[c0] \n\t" + "or %[src0], %[src0], %[r] \n\t" + "and %[r], %[src0], %[c0] \n\t" + + "psrlh %[b], %[b], %[three] \n\t" + "psrlh %[g], %[g], %[two] \n\t" + "psrlh %[r], %[r], %[three] \n\t" + + "psllh %[g], %[g], %[five] \n\t" + "psllh %[r], %[r], %[eleven] \n\t" + "or %[b], %[b], %[g] \n\t" + "or %[b], %[b], %[r] \n\t" + + "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" + "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" + + "daddiu %[src_argb], %[src_argb], 0x10 \n\t" + "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" + "daddiu %[width], %[width], -0x04 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), + [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]) + : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), + [dither] "f"(dither4), [c0] "f"(c0), [zero] "f"(0x00), [two] "f"(0x02), + [three] "f"(0x03), [five] "f"(0x05), [eleven] "f"(0x0b) + : "memory"); +} + +void ARGBToARGB1555Row_MMI(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + uint64_t src0, src1; + uint64_t ftmp[4]; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 
0x00(%[src_argb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" + + "punpcklbh %[b], %[src0], %[src1] \n\t" + "punpckhbh %[g], %[src0], %[src1] \n\t" + "punpcklbh %[src0], %[b], %[g] \n\t" + "punpckhbh %[src1], %[b], %[g] \n\t" + "punpcklbh %[b], %[src0], %[zero] \n\t" + "punpckhbh %[g], %[src0], %[zero] \n\t" + "punpcklbh %[r], %[src1], %[zero] \n\t" + "punpckhbh %[a], %[src1], %[zero] \n\t" + + "psrlh %[b], %[b], %[three] \n\t" + "psrlh %[g], %[g], %[three] \n\t" + "psrlh %[r], %[r], %[three] \n\t" + "psrlh %[a], %[a], %[seven] \n\t" + + "psllh %[g], %[g], %[five] \n\t" + "psllh %[r], %[r], %[ten] \n\t" + "psllh %[a], %[a], %[fifteen] \n\t" + "or %[b], %[b], %[g] \n\t" + "or %[b], %[b], %[r] \n\t" + "or %[b], %[b], %[a] \n\t" + + "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" + "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" + + "daddiu %[src_argb], %[src_argb], 0x10 \n\t" + "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" + "daddiu %[width], %[width], -0x04 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), + [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3]) + : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), + [zero] "f"(0x00), [three] "f"(0x03), [five] "f"(0x05), + [seven] "f"(0x07), [ten] "f"(0x0a), [fifteen] "f"(0x0f) + : "memory"); +} + +void ARGBToARGB4444Row_MMI(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + uint64_t src0, src1; + uint64_t ftmp[4]; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" + + "punpcklbh %[b], %[src0], %[src1] \n\t" + "punpckhbh %[g], %[src0], %[src1] \n\t" + "punpcklbh %[src0], %[b], %[g] \n\t" + "punpckhbh %[src1], %[b], %[g] \n\t" + "punpcklbh %[b], %[src0], %[zero] \n\t" + "punpckhbh %[g], %[src0], %[zero] \n\t" + "punpcklbh %[r], %[src1], %[zero] \n\t" + "punpckhbh %[a], %[src1], %[zero] \n\t" + + "psrlh %[b], %[b], %[four] \n\t" + "psrlh %[g], %[g], %[four] \n\t" + "psrlh %[r], %[r], %[four] \n\t" + "psrlh %[a], %[a], %[four] \n\t" + + "psllh %[g], %[g], %[four] \n\t" + "psllh %[r], %[r], %[eight] \n\t" + "psllh %[a], %[a], %[twelve] \n\t" + "or %[b], %[b], %[g] \n\t" + "or %[b], %[b], %[r] \n\t" + "or %[b], %[b], %[a] \n\t" + + "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" + "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" + + "daddiu %[src_argb], %[src_argb], 0x10 \n\t" + "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" + "daddiu %[width], %[width], -0x04 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), + [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3]) + : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), + [zero] "f"(0x00), [four] "f"(0x04), [eight] "f"(0x08), + [twelve] "f"(0x0c) + : "memory"); +} + +void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + uint64_t src, src_hi, src_lo; + uint64_t dest0, dest1, dest2, dest3; + const uint64_t value = 0x1080; + const uint64_t mask = 0x0001004200810019; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + 
"pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" + "paddw %[dest0], %[dest0], %[src] \n\t" + "psrlw %[dest0], %[dest0], %[eight] \n\t" + + "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest1], %[dest1], %[src] \n\t" + "psrlw %[dest1], %[dest1], %[eight] \n\t" + + "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" + "paddw %[dest2], %[dest2], %[src] \n\t" + "psrlw %[dest2], %[dest2], %[eight] \n\t" + + "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" + "paddw %[dest3], %[dest3], %[src] \n\t" + "psrlw %[dest3], %[dest3], %[eight] \n\t" + + "packsswh %[src_lo], %[dest0], %[dest1] \n\t" + "packsswh %[src_hi], %[dest2], %[dest3] \n\t" + "packushb %[dest0], %[src_lo], %[src_hi] \n\t" + "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" + "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" + + "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[dst_y], %[dst_y], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), + [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), + [dest3] "=&f"(dest3) + : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), + [zero] "f"(0x00) + : "memory"); +} + +void ARGBToUVRow_MMI(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + uint64_t src_rgb1; + uint64_t ftmp[12]; + const uint64_t value = 0x4040; + const uint64_t mask_u = 0x0026004a00700002; + const uint64_t mask_v = 0x00020070005e0012; + + __asm__ volatile( + "1: \n\t" + "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll 
%[dest0_u], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" + "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" + "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" + "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" + "psubw %[dest0_u], %[src0], %[src1] \n\t" + "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" + "psubw %[dest0_v], %[src1], %[src0] \n\t" + "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll %[dest1_u], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" + "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" + "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" + "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" + "psubw %[dest1_u], %[src0], %[src1] \n\t" + "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" + "psubw %[dest1_v], %[src1], %[src0] \n\t" + "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], 
%[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll %[dest2_u], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" + "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" + "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" + "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" + "psubw %[dest2_u], %[src0], %[src1] \n\t" + "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" + "psubw %[dest2_v], %[src1], %[src0] \n\t" + "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll %[dest3_u], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" + "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" + "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" + "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" + "psubw %[dest3_u], %[src0], %[src1] \n\t" + "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" + 
"punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" + "psubw %[dest3_v], %[src1], %[src0] \n\t" + "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" + + "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" + "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" + "packushb %[dest0_u], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" + "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" + + "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" + "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" + "packushb %[dest0_v], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" + "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" + + "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[dst_u], %[dst_u], 0x08 \n\t" + "daddiu %[dst_v], %[dst_v], 0x08 \n\t" + "daddi %[width], %[width], -0x10 \n\t" + "bgtz %[width], 1b \n\t" + : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), + [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), + [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), + [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), + [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) + : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), + [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), + [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [sixteen] "f"(0x10) + : "memory"); +} + +void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + uint64_t src, src_hi, src_lo; + uint64_t dest0, dest1, dest2, dest3; + const uint64_t value = 0x1080; + const uint64_t mask = 0x0019008100420001; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" + "paddw %[dest0], %[dest0], %[src] \n\t" + "psrlw %[dest0], %[dest0], %[eight] \n\t" + + "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest1], %[dest1], %[src] \n\t" + "psrlw %[dest1], %[dest1], %[eight] \n\t" + + "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" + "paddw %[dest2], %[dest2], %[src] \n\t" + "psrlw %[dest2], %[dest2], %[eight] \n\t" + + "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], 
%[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" + "paddw %[dest3], %[dest3], %[src] \n\t" + "psrlw %[dest3], %[dest3], %[eight] \n\t" + + "packsswh %[src_lo], %[dest0], %[dest1] \n\t" + "packsswh %[src_hi], %[dest2], %[dest3] \n\t" + "packushb %[dest0], %[src_lo], %[src_hi] \n\t" + "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" + "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" + + "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[dst_y], %[dst_y], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), + [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), + [dest3] "=&f"(dest3) + : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), + [zero] "f"(0x00) + : "memory"); +} + +void BGRAToUVRow_MMI(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + uint64_t src_rgb1; + uint64_t ftmp[12]; + const uint64_t value = 0x4040; + const uint64_t mask_u = 0x00020070004a0026; + const uint64_t mask_v = 0x0012005e00700002; + + __asm__ volatile( + "1: \n\t" + "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsrl %[dest0_u], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[dest0_u], %[dest0_u], %[value] \n\t" + "pinsrh_0 %[dest0_v], %[src0], %[value] \n\t" + "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" + "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsrl %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" + "psubw %[dest0_u], %[src1], %[src0] \n\t" + "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" + "psubw %[dest0_v], %[src0], %[src1] \n\t" + "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 
0x10(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsrl %[dest1_u], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[dest1_u], %[dest1_u], %[value] \n\t" + "pinsrh_0 %[dest1_v], %[src0], %[value] \n\t" + "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" + "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsrl %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" + "psubw %[dest1_u], %[src1], %[src0] \n\t" + "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" + "psubw %[dest1_v], %[src0], %[src1] \n\t" + "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsrl %[dest2_u], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[dest2_u], %[dest2_u], %[value] \n\t" + "pinsrh_0 %[dest2_v], %[src0], %[value] \n\t" + "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" + "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsrl %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" + "psubw %[dest2_u], %[src1], %[src0] \n\t" + "psraw 
%[dest2_u], %[dest2_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" + "psubw %[dest2_v], %[src0], %[src1] \n\t" + "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsrl %[dest3_u], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[dest3_u], %[dest3_u], %[value] \n\t" + "pinsrh_0 %[dest3_v], %[src0], %[value] \n\t" + "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" + "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsrl %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" + "psubw %[dest3_u], %[src1], %[src0] \n\t" + "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" + "psubw %[dest3_v], %[src0], %[src1] \n\t" + "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" + + "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" + "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" + "packushb %[dest0_u], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" + "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" + + "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" + "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" + "packushb %[dest0_v], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" + "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" + + "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[dst_u], %[dst_u], 0x08 \n\t" + "daddiu %[dst_v], %[dst_v], 0x08 \n\t" + "daddi %[width], %[width], -0x10 \n\t" + "bgtz %[width], 1b \n\t" + : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), + [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), + [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), + [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), + [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) + : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), + [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), + [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [sixteen] "f"(0x10) + : "memory"); +} + +void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, 
int width) { + uint64_t src, src_hi, src_lo; + uint64_t dest0, dest1, dest2, dest3; + const uint64_t value = 0x1080; + const uint64_t mask = 0x0001001900810042; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" + "paddw %[dest0], %[dest0], %[src] \n\t" + "psrlw %[dest0], %[dest0], %[eight] \n\t" + + "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest1], %[dest1], %[src] \n\t" + "psrlw %[dest1], %[dest1], %[eight] \n\t" + + "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" + "paddw %[dest2], %[dest2], %[src] \n\t" + "psrlw %[dest2], %[dest2], %[eight] \n\t" + + "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" + "paddw %[dest3], %[dest3], %[src] \n\t" + "psrlw %[dest3], %[dest3], %[eight] \n\t" + + "packsswh %[src_lo], %[dest0], %[dest1] \n\t" + "packsswh %[src_hi], %[dest2], %[dest3] \n\t" + "packushb %[dest0], %[src_lo], %[src_hi] \n\t" + "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" + "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" + + "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[dst_y], %[dst_y], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), + [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), + [dest3] "=&f"(dest3) + : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), + [zero] "f"(0x00) + : "memory"); +} + +void ABGRToUVRow_MMI(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + uint64_t src_rgb1; + uint64_t ftmp[12]; + const uint64_t value = 0x4040; + const uint64_t mask_u = 0x00020070004a0026; + const uint64_t mask_v = 0x0012005e00700002; + + __asm__ volatile( + "1: \n\t" + "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 
0x00(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" + "dsll %[dest0_v], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" + "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" + "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" + "dsll %[src_hi], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" + "psubw %[dest0_u], %[src1], %[src0] \n\t" + "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" + "psubw %[dest0_v], %[src0], %[src1] \n\t" + "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" + "dsll %[dest1_v], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" + "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" + "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" + "dsll %[src_hi], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" + "punpckhwd 
%[src1], %[dest1_u], %[src_lo] \n\t" + "psubw %[dest1_u], %[src1], %[src0] \n\t" + "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" + "psubw %[dest1_v], %[src0], %[src1] \n\t" + "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" + "dsll %[dest2_v], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" + "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" + "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" + "dsll %[src_hi], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" + "psubw %[dest2_u], %[src1], %[src0] \n\t" + "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" + "psubw %[dest2_v], %[src0], %[src1] \n\t" + "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" + "dsll %[dest3_v], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" + "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" + "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + 
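      /* src0 now holds the per-channel sums of one 2x2 ABGR block (two
         pixels from this row plus the two below).  The psrlh by 2 that
         follows averages them; pinsrh_3 then swaps the alpha lane for the
         0x4040 bias, and the two pmaddhw apply the usual 112/74/38 (U) and
         112/94/18 (V) weights, i.e. an effective +0x8080 rounding offset. */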
"psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" + "dsll %[src_hi], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" + "psubw %[dest3_u], %[src1], %[src0] \n\t" + "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" + "psubw %[dest3_v], %[src0], %[src1] \n\t" + "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" + + "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" + "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" + "packushb %[dest0_u], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" + "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" + + "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" + "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" + "packushb %[dest0_v], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" + "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" + + "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[dst_u], %[dst_u], 0x08 \n\t" + "daddiu %[dst_v], %[dst_v], 0x08 \n\t" + "daddi %[width], %[width], -0x10 \n\t" + "bgtz %[width], 1b \n\t" + : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), + [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), + [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), + [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), + [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) + : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), + [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), + [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [sixteen] "f"(0x10) + : "memory"); +} + +void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + uint64_t src, src_hi, src_lo; + uint64_t dest0, dest1, dest2, dest3; + const uint64_t value = 0x1080; + const uint64_t mask = 0x0042008100190001; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" + "paddw %[dest0], %[dest0], %[src] \n\t" + "psrlw %[dest0], %[dest0], %[eight] \n\t" + + "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest1], %[dest1], %[src] \n\t" + "psrlw %[dest1], %[dest1], %[eight] \n\t" + + "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw 
%[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" + "paddw %[dest2], %[dest2], %[src] \n\t" + "psrlw %[dest2], %[dest2], %[eight] \n\t" + + "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" + "paddw %[dest3], %[dest3], %[src] \n\t" + "psrlw %[dest3], %[dest3], %[eight] \n\t" + + "packsswh %[src_lo], %[dest0], %[dest1] \n\t" + "packsswh %[src_hi], %[dest2], %[dest3] \n\t" + "packushb %[dest0], %[src_lo], %[src_hi] \n\t" + "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" + "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" + + "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[dst_y], %[dst_y], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), + [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), + [dest3] "=&f"(dest3) + : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), + [zero] "f"(0x00) + : "memory"); +} + +void RGBAToUVRow_MMI(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + uint64_t src_rgb1; + uint64_t ftmp[12]; + const uint64_t value = 0x4040; + const uint64_t mask_u = 0x0026004a00700002; + const uint64_t mask_v = 0x00020070005e0012; + + __asm__ volatile( + "1: \n\t" + "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_0 %[dest0_u], %[src0], %[value] \n\t" + "dsrl %[dest0_v], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[dest0_v], %[dest0_v], %[value] \n\t" + "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" + "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" + "dsrl %[src_hi], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], 
%[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" + "psubw %[dest0_u], %[src0], %[src1] \n\t" + "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" + "psubw %[dest0_v], %[src1], %[src0] \n\t" + "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_0 %[dest1_u], %[src0], %[value] \n\t" + "dsrl %[dest1_v], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[dest1_v], %[dest1_v], %[value] \n\t" + "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" + "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" + "dsrl %[src_hi], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" + "psubw %[dest1_u], %[src0], %[src1] \n\t" + "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" + "psubw %[dest1_v], %[src1], %[src0] \n\t" + "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_0 %[dest2_u], %[src0], %[value] \n\t" + "dsrl %[dest2_v], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[dest2_v], %[dest2_v], %[value] \n\t" + "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" + "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + 
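      /* Finish the 2x2 sum with the second pixel of the lower row, then
         average with psrlh.  RGBA keeps alpha in the lowest byte, so the
         0x4040 bias is inserted into lane 0 for the U product, while the V
         operand is first shifted right 16 bits so the bias lands in the top
         lane; pmaddhw then applies the 112/74/38 and 112/94/18 weights. */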
"punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" + "dsrl %[src_hi], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" + "psubw %[dest2_u], %[src0], %[src1] \n\t" + "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" + "psubw %[dest2_v], %[src1], %[src0] \n\t" + "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_0 %[dest3_u], %[src0], %[value] \n\t" + "dsrl %[dest3_v], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[dest3_v], %[dest3_v], %[value] \n\t" + "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" + "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" + "dsrl %[src_hi], %[src0], %[sixteen] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" + "psubw %[dest3_u], %[src0], %[src1] \n\t" + "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" + "psubw %[dest3_v], %[src1], %[src0] \n\t" + "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" + + "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" + "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" + "packushb %[dest0_u], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" + "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" + + "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" + "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" + "packushb %[dest0_v], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" + "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" + + "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[dst_u], %[dst_u], 0x08 \n\t" + "daddiu %[dst_v], %[dst_v], 0x08 \n\t" + "daddi %[width], %[width], -0x10 \n\t" + "bgtz %[width], 1b \n\t" + : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), + [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), + [dest0_u] "=&f"(ftmp[4]), [dest0_v] 
"=&f"(ftmp[5]), + [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), + [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) + : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), + [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), + [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [sixteen] "f"(0x10) + : "memory"); +} + +void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + uint64_t src, src_hi, src_lo; + uint64_t dest0, dest1, dest2, dest3; + const uint64_t value = 0x1080; + const uint64_t mask = 0x0001004200810019; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "dsll %[src], %[src], %[eight] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" + "paddw %[dest0], %[dest0], %[src] \n\t" + "psrlw %[dest0], %[dest0], %[eight] \n\t" + + "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "dsll %[src], %[src], %[eight] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest1], %[dest1], %[src] \n\t" + "psrlw %[dest1], %[dest1], %[eight] \n\t" + + "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "dsll %[src], %[src], %[eight] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" + "paddw %[dest2], %[dest2], %[src] \n\t" + "psrlw %[dest2], %[dest2], %[eight] \n\t" + + "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "dsll %[src], %[src], %[eight] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" + "paddw %[dest3], %[dest3], %[src] \n\t" + "psrlw %[dest3], %[dest3], %[eight] \n\t" + + "packsswh %[src_lo], %[dest0], %[dest1] \n\t" + "packsswh %[src_hi], %[dest2], %[dest3] \n\t" + "packushb %[dest0], %[src_lo], %[src_hi] \n\t" + "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" + "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" + + "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t" + "daddiu %[dst_y], %[dst_y], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src), [src_hi] "=&f"(src_hi), 
[src_lo] "=&f"(src_lo), + [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), + [dest3] "=&f"(dest3) + : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), + [zero] "f"(0x00) + : "memory"); +} + +void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + uint64_t src_rgb1; + uint64_t ftmp[12]; + const uint64_t value = 0x4040; + const uint64_t mask_u = 0x0026004a00700002; + const uint64_t mask_v = 0x00020070005e0012; + + __asm__ volatile( + "1: \n\t" + "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "dsll %[src0], %[src0], %[eight] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "dsll %[src1], %[src1], %[eight] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll %[dest0_u], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" + "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" + "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" + "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "dsll %[src0], %[src0], %[eight] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "dsll %[src1], %[src1], %[eight] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" + "psubw %[dest0_u], %[src0], %[src1] \n\t" + "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" + "psubw %[dest0_v], %[src1], %[src0] \n\t" + "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "dsll %[src0], %[src0], %[eight] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "dsll %[src1], %[src1], %[eight] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll %[dest1_u], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" + "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" + "pmaddhw 
%[dest1_u], %[dest1_u], %[mask_u] \n\t" + "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "dsll %[src0], %[src0], %[eight] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "dsll %[src1], %[src1], %[eight] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" + "psubw %[dest1_u], %[src0], %[src1] \n\t" + "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" + "psubw %[dest1_v], %[src1], %[src0] \n\t" + "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "dsll %[src0], %[src0], %[eight] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "dsll %[src1], %[src1], %[eight] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll %[dest2_u], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" + "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" + "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" + "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "dsll %[src0], %[src0], %[eight] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "dsll %[src1], %[src1], %[eight] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" + "psubw %[dest2_u], %[src0], %[src1] \n\t" + "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" + "psubw %[dest2_v], %[src1], %[src0] \n\t" + "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 
0x24(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "dsll %[src0], %[src0], %[eight] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "dsll %[src1], %[src1], %[eight] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll %[dest3_u], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" + "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" + "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" + "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "dsll %[src0], %[src0], %[eight] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "dsll %[src1], %[src1], %[eight] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "dsll %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" + "psubw %[dest3_u], %[src0], %[src1] \n\t" + "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" + "psubw %[dest3_v], %[src1], %[src0] \n\t" + "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" + + "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" + "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" + "packushb %[dest0_u], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" + "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" + + "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" + "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" + "packushb %[dest0_v], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" + "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" + + "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t" + "daddiu %[dst_u], %[dst_u], 0x08 \n\t" + "daddiu %[dst_v], %[dst_v], 0x08 \n\t" + "daddi %[width], %[width], -0x10 \n\t" + "bgtz %[width], 1b \n\t" + : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), + [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), + [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), + [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), + [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) + : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), + [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), + [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [sixteen] "f"(0x10) + : "memory"); +} + +void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + uint64_t src, src_hi, src_lo; + uint64_t dest0, dest1, dest2, dest3; + const uint64_t value = 0x1080; + const uint64_t mask = 0x0001001900810042; + + __asm__ 
volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "dsll %[src], %[src], %[eight] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" + "paddw %[dest0], %[dest0], %[src] \n\t" + "psrlw %[dest0], %[dest0], %[eight] \n\t" + + "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "dsll %[src], %[src], %[eight] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest1], %[dest1], %[src] \n\t" + "psrlw %[dest1], %[dest1], %[eight] \n\t" + + "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "dsll %[src], %[src], %[eight] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" + "paddw %[dest2], %[dest2], %[src] \n\t" + "psrlw %[dest2], %[dest2], %[eight] \n\t" + + "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t" + "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t" + "punpcklbh %[src_lo], %[src], %[zero] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "dsll %[src], %[src], %[eight] \n\t" + "punpckhbh %[src_hi], %[src], %[zero] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" + "paddw %[dest3], %[dest3], %[src] \n\t" + "psrlw %[dest3], %[dest3], %[eight] \n\t" + + "packsswh %[src_lo], %[dest0], %[dest1] \n\t" + "packsswh %[src_hi], %[dest2], %[dest3] \n\t" + "packushb %[dest0], %[src_lo], %[src_hi] \n\t" + "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" + "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" + + "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t" + "daddiu %[dst_y], %[dst_y], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), + [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), + [dest3] "=&f"(dest3) + : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), + [zero] "f"(0x00) + : "memory"); +} + +void RAWToUVRow_MMI(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + uint64_t src_rgb1; + uint64_t ftmp[12]; + const uint64_t value = 0x4040; + const uint64_t mask_u = 0x00020070004a0026; + const uint64_t mask_v = 0x0012005e00700002; + + __asm__ volatile( + "1: \n\t" + "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" + 
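      /* Each gsldrc1/gsldlc1 pair performs one unaligned 64-bit load into an
         FP register.  RAW stores 3 bytes per pixel, so each 8-byte load
         covers two pixels (plus two spare bytes) and successive loads step
         forward by 6 bytes; the dsll by 8 re-aligns the second pixel's bytes
         for the high-half unpack. */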
"gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "dsll %[src0], %[src0], %[eight] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "dsll %[src1], %[src1], %[eight] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" + "dsll %[dest0_v], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" + "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" + "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "dsll %[src0], %[src0], %[eight] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "dsll %[src1], %[src1], %[eight] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" + "dsll %[src_hi], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" + "psubw %[dest0_u], %[src1], %[src0] \n\t" + "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" + "psubw %[dest0_v], %[src0], %[src1] \n\t" + "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "dsll %[src0], %[src0], %[eight] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "dsll %[src1], %[src1], %[eight] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" + "dsll %[dest1_v], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" + "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" + "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "dsll %[src0], %[src0], %[eight] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "dsll %[src1], %[src1], %[eight] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + 
"pinsrh_3 %[src_lo], %[src0], %[value] \n\t" + "dsll %[src_hi], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" + "psubw %[dest1_u], %[src1], %[src0] \n\t" + "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" + "psubw %[dest1_v], %[src0], %[src1] \n\t" + "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "dsll %[src0], %[src0], %[eight] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "dsll %[src1], %[src1], %[eight] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" + "dsll %[dest2_v], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" + "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" + "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "dsll %[src0], %[src0], %[eight] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "dsll %[src1], %[src1], %[eight] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" + "dsll %[src_hi], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" + "psubw %[dest2_u], %[src1], %[src0] \n\t" + "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" + "psubw %[dest2_v], %[src0], %[src1] \n\t" + "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "dsll %[src0], %[src0], %[eight] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "dsll %[src1], %[src1], %[eight] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" + "dsll %[dest3_v], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" + "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" + 
"pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "dsll %[src0], %[src0], %[eight] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "paddh %[src0], %[src_lo], %[src_hi] \n\t" + "punpcklbh %[src_lo], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_lo] \n\t" + "dsll %[src1], %[src1], %[eight] \n\t" + "punpckhbh %[src_hi], %[src1], %[zero] \n\t" + "paddh %[src0], %[src0], %[src_hi] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" + "dsll %[src_hi], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" + "psubw %[dest3_u], %[src1], %[src0] \n\t" + "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" + "psubw %[dest3_v], %[src0], %[src1] \n\t" + "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" + + "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" + "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" + "packushb %[dest0_u], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" + "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" + + "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" + "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" + "packushb %[dest0_v], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" + "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" + + "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t" + "daddiu %[dst_u], %[dst_u], 0x08 \n\t" + "daddiu %[dst_v], %[dst_v], 0x08 \n\t" + "daddi %[width], %[width], -0x10 \n\t" + "bgtz %[width], 1b \n\t" + : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), + [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), + [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), + [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), + [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) + : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), + [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), + [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [sixteen] "f"(0x10) + : "memory"); +} + +void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + uint64_t src, src_hi, src_lo; + uint64_t dest, dest0, dest1, dest2, dest3; + uint64_t tmp0, tmp1; + const uint64_t shift = 0x07; + const uint64_t value = 0x0040; + const uint64_t mask0 = 0x0; + const uint64_t mask1 = 0x00010026004B000FULL; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" + "punpcklbh %[src_lo], %[src], %[mask0] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" + "punpckhbh %[src_hi], %[src], %[mask0] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" + "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest0], %[tmp0], %[tmp1] \n\t" + "psrlw %[dest0], %[dest0], %[shift] \n\t" + + "gsldlc1 %[src], 0x0f(%[src_ptr]) 
\n\t" + "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t" + "punpcklbh %[src_lo], %[src], %[mask0] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" + "punpckhbh %[src_hi], %[src], %[mask0] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" + "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest1], %[tmp0], %[tmp1] \n\t" + "psrlw %[dest1], %[dest1], %[shift] \n\t" + + "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t" + "punpcklbh %[src_lo], %[src], %[mask0] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" + "punpckhbh %[src_hi], %[src], %[mask0] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" + "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest2], %[tmp0], %[tmp1] \n\t" + "psrlw %[dest2], %[dest2], %[shift] \n\t" + + "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t" + "punpcklbh %[src_lo], %[src], %[mask0] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" + "punpckhbh %[src_hi], %[src], %[mask0] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" + "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest3], %[tmp0], %[tmp1] \n\t" + "psrlw %[dest3], %[dest3], %[shift] \n\t" + + "packsswh %[tmp0], %[dest0], %[dest1] \n\t" + "packsswh %[tmp1], %[dest2], %[dest3] \n\t" + "packushb %[dest], %[tmp0], %[tmp1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), + [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), + [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0), + [tmp1] "=&f"(tmp1) + : [src_ptr] "r"(src_argb0), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0), + [mask1] "f"(mask1), [shift] "f"(shift), [value] "f"(value), + [width] "r"(width) + : "memory"); +} + +void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + uint64_t src_rgb1; + uint64_t ftmp[12]; + const uint64_t value = 0x4040; + const uint64_t mask_u = 0x002b0054007f0002; + const uint64_t mask_v = 0x0002007f006b0014; + + __asm__ volatile( + "1: \n\t" + "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "punpcklbh %[src0], %[src1], %[zero] \n\t" + "punpckhbh %[src1], %[src1], %[zero] \n\t" + "pavgh %[src0], %[src_lo], %[src0] \n\t" + "pavgh %[src1], %[src_hi], %[src1] \n\t" + "pavgh %[src0], %[src0], %[src1] \n\t" + "dsll %[dest0_u], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" + "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" + "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" + "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 
0x08(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "punpcklbh %[src0], %[src1], %[zero] \n\t" + "punpckhbh %[src1], %[src1], %[zero] \n\t" + "pavgh %[src0], %[src_lo], %[src0] \n\t" + "pavgh %[src1], %[src_hi], %[src1] \n\t" + "pavgh %[src0], %[src0], %[src1] \n\t" + "dsll %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" + "psubw %[dest0_u], %[src0], %[src1] \n\t" + "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" + "psubw %[dest0_v], %[src1], %[src0] \n\t" + "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "punpcklbh %[src0], %[src1], %[zero] \n\t" + "punpckhbh %[src1], %[src1], %[zero] \n\t" + "pavgh %[src0], %[src_lo], %[src0] \n\t" + "pavgh %[src1], %[src_hi], %[src1] \n\t" + "pavgh %[src0], %[src0], %[src1] \n\t" + "dsll %[dest1_u], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" + "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" + "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" + "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "punpcklbh %[src0], %[src1], %[zero] \n\t" + "punpckhbh %[src1], %[src1], %[zero] \n\t" + "pavgh %[src0], %[src_lo], %[src0] \n\t" + "pavgh %[src1], %[src_hi], %[src1] \n\t" + "pavgh %[src0], %[src0], %[src1] \n\t" + "dsll %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" + "psubw %[dest1_u], %[src0], %[src1] \n\t" + "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" + "psubw %[dest1_v], %[src1], %[src0] \n\t" + "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "punpcklbh %[src0], %[src1], %[zero] \n\t" + "punpckhbh %[src1], %[src1], %[zero] \n\t" + "pavgh %[src0], %[src_lo], %[src0] \n\t" + "pavgh %[src1], %[src_hi], %[src1] \n\t" + "pavgh %[src0], %[src0], %[src1] \n\t" + "dsll %[dest2_u], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" + "pinsrh_3 %[dest2_v], 
%[src0], %[value] \n\t" + "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" + "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "punpcklbh %[src0], %[src1], %[zero] \n\t" + "punpckhbh %[src1], %[src1], %[zero] \n\t" + "pavgh %[src0], %[src_lo], %[src0] \n\t" + "pavgh %[src1], %[src_hi], %[src1] \n\t" + "pavgh %[src0], %[src0], %[src1] \n\t" + "dsll %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" + "psubw %[dest2_u], %[src0], %[src1] \n\t" + "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" + "psubw %[dest2_v], %[src1], %[src0] \n\t" + "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "punpcklbh %[src0], %[src1], %[zero] \n\t" + "punpckhbh %[src1], %[src1], %[zero] \n\t" + "pavgh %[src0], %[src_lo], %[src0] \n\t" + "pavgh %[src1], %[src_hi], %[src1] \n\t" + "pavgh %[src0], %[src0], %[src1] \n\t" + "dsll %[dest3_u], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" + "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" + "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" + "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" + + "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" + "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "punpcklbh %[src0], %[src1], %[zero] \n\t" + "punpckhbh %[src1], %[src1], %[zero] \n\t" + "pavgh %[src0], %[src_lo], %[src0] \n\t" + "pavgh %[src1], %[src_hi], %[src1] \n\t" + "pavgh %[src0], %[src0], %[src1] \n\t" + "dsll %[src_lo], %[src0], %[sixteen] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" + "psubw %[dest3_u], %[src0], %[src1] \n\t" + "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" + "psubw %[dest3_v], %[src1], %[src0] \n\t" + "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" + + "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" + "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" + "packushb %[dest0_u], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" + "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" + + "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" + "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" + "packushb %[dest0_v], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" + "gssdrc1 %[dest0_v], 
0x00(%[dst_v]) \n\t" + + "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[dst_u], %[dst_u], 0x08 \n\t" + "daddiu %[dst_v], %[dst_v], 0x08 \n\t" + "daddi %[width], %[width], -0x10 \n\t" + "bgtz %[width], 1b \n\t" + : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), + [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), + [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), + [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), + [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) + : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), + [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), + [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [sixteen] "f"(0x10) + : "memory"); +} + +void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + uint64_t ftmp[11]; + const uint64_t value = 0x1080108010801080; + const uint64_t mask = 0x0001004200810019; + uint64_t c0 = 0x001f001f001f001f; + uint64_t c1 = 0x00ff00ff00ff00ff; + uint64_t c2 = 0x0007000700070007; + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" + "psrlh %[src1], %[src0], %[eight] \n\t" + "and %[b], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[src0], %[src0], %[five] \n\t" + "and %[g], %[src1], %[c2] \n\t" + "psllh %[g], %[g], %[three] \n\t" + "or %[g], %[src0], %[g] \n\t" + "psrlh %[r], %[src1], %[three] \n\t" + "psllh %[src0], %[b], %[three] \n\t" + "psrlh %[src1], %[b], %[two] \n\t" + "or %[b], %[src0], %[src1] \n\t" + "psllh %[src0], %[g], %[two] \n\t" + "psrlh %[src1], %[g], %[four] \n\t" + "or %[g], %[src0], %[src1] \n\t" + "psllh %[src0], %[r], %[three] \n\t" + "psrlh %[src1], %[r], %[two] \n\t" + "or %[r], %[src0], %[src1] \n\t" + "punpcklhw %[src0], %[b], %[r] \n\t" + "punpcklhw %[src1], %[g], %[value] \n\t" + "punpcklhw %[src_lo], %[src0], %[src1] \n\t" + "punpckhhw %[src_hi], %[src0], %[src1] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest0], %[src0], %[src1] \n\t" + "psrlw %[dest0], %[dest0], %[eight] \n\t" + + "punpckhhw %[src0], %[b], %[r] \n\t" + "punpckhhw %[src1], %[g], %[value] \n\t" + "punpcklhw %[src_lo], %[src0], %[src1] \n\t" + "punpckhhw %[src_hi], %[src0], %[src1] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest1], %[src0], %[src1] \n\t" + "psrlw %[dest1], %[dest1], %[eight] \n\t" + + "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t" + "psrlh %[src1], %[src0], %[eight] \n\t" + "and %[b], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[src0], %[src0], %[five] \n\t" + "and %[g], %[src1], %[c2] \n\t" + "psllh %[g], %[g], %[three] \n\t" + "or %[g], %[src0], %[g] \n\t" + "psrlh %[r], %[src1], %[three] \n\t" + "psllh %[src0], %[b], %[three] \n\t" + "psrlh %[src1], %[b], %[two] \n\t" + "or %[b], %[src0], %[src1] \n\t" + "psllh %[src0], %[g], %[two] \n\t" + "psrlh %[src1], %[g], %[four] \n\t" + "or %[g], %[src0], %[src1] \n\t" + "psllh %[src0], %[r], %[three] \n\t" + "psrlh %[src1], %[r], %[two] \n\t" + "or %[r], %[src0], %[src1] \n\t" + "punpcklhw 
%[src0], %[b], %[r] \n\t" + "punpcklhw %[src1], %[g], %[value] \n\t" + "punpcklhw %[src_lo], %[src0], %[src1] \n\t" + "punpckhhw %[src_hi], %[src0], %[src1] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest2], %[src0], %[src1] \n\t" + "psrlw %[dest2], %[dest2], %[eight] \n\t" + + "punpckhhw %[src0], %[b], %[r] \n\t" + "punpckhhw %[src1], %[g], %[value] \n\t" + "punpcklhw %[src_lo], %[src0], %[src1] \n\t" + "punpckhhw %[src_hi], %[src0], %[src1] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest3], %[src0], %[src1] \n\t" + "psrlw %[dest3], %[dest3], %[eight] \n\t" + + "packsswh %[src_lo], %[dest0], %[dest1] \n\t" + "packsswh %[src_hi], %[dest2], %[dest3] \n\t" + "packushb %[dest0], %[src_lo], %[src_hi] \n\t" + "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" + "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" + + "daddiu %[src_rgb565], %[src_rgb565], 0x10 \n\t" + "daddiu %[dst_y], %[dst_y], 0x08 \n\t" + "daddiu %[width], %[width], -0x08 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), + [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), + [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), + [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) + : [src_rgb565] "r"(src_rgb565), [dst_y] "r"(dst_y), [value] "f"(value), + [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), + [mask] "f"(mask), [eight] "f"(0x08), [five] "f"(0x05), + [three] "f"(0x03), [two] "f"(0x02), [four] "f"(0x04) + : "memory"); +} + +void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + uint64_t ftmp[11]; + const uint64_t value = 0x1080108010801080; + const uint64_t mask = 0x0001004200810019; + uint64_t c0 = 0x001f001f001f001f; + uint64_t c1 = 0x00ff00ff00ff00ff; + uint64_t c2 = 0x0003000300030003; + uint64_t c3 = 0x007c007c007c007c; + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" + "psrlh %[src1], %[src0], %[eight] \n\t" + "and %[b], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[src0], %[src0], %[five] \n\t" + "and %[g], %[src1], %[c2] \n\t" + "psllh %[g], %[g], %[three] \n\t" + "or %[g], %[src0], %[g] \n\t" + "and %[r], %[src1], %[c3] \n\t" + "psrlh %[r], %[r], %[two] \n\t" + "psllh %[src0], %[b], %[three] \n\t" + "psrlh %[src1], %[b], %[two] \n\t" + "or %[b], %[src0], %[src1] \n\t" + "psllh %[src0], %[g], %[three] \n\t" + "psrlh %[src1], %[g], %[two] \n\t" + "or %[g], %[src0], %[src1] \n\t" + "psllh %[src0], %[r], %[three] \n\t" + "psrlh %[src1], %[r], %[two] \n\t" + "or %[r], %[src0], %[src1] \n\t" + "punpcklhw %[src0], %[b], %[r] \n\t" + "punpcklhw %[src1], %[g], %[value] \n\t" + "punpcklhw %[src_lo], %[src0], %[src1] \n\t" + "punpckhhw %[src_hi], %[src0], %[src1] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest0], %[src0], %[src1] \n\t" + "psrlw %[dest0], %[dest0], %[eight] \n\t" + + "punpckhhw %[src0], %[b], %[r] \n\t" + "punpckhhw %[src1], %[g], %[value] \n\t" + "punpcklhw %[src_lo], %[src0], %[src1] \n\t" + "punpckhhw 
%[src_hi], %[src0], %[src1] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest1], %[src0], %[src1] \n\t" + "psrlw %[dest1], %[dest1], %[eight] \n\t" + + "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t" + "psrlh %[src1], %[src0], %[eight] \n\t" + "and %[b], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[src0], %[src0], %[five] \n\t" + "and %[g], %[src1], %[c2] \n\t" + "psllh %[g], %[g], %[three] \n\t" + "or %[g], %[src0], %[g] \n\t" + "and %[r], %[src1], %[c3] \n\t" + "psrlh %[r], %[r], %[two] \n\t" + "psllh %[src0], %[b], %[three] \n\t" + "psrlh %[src1], %[b], %[two] \n\t" + "or %[b], %[src0], %[src1] \n\t" + "psllh %[src0], %[g], %[three] \n\t" + "psrlh %[src1], %[g], %[two] \n\t" + "or %[g], %[src0], %[src1] \n\t" + "psllh %[src0], %[r], %[three] \n\t" + "psrlh %[src1], %[r], %[two] \n\t" + "or %[r], %[src0], %[src1] \n\t" + "punpcklhw %[src0], %[b], %[r] \n\t" + "punpcklhw %[src1], %[g], %[value] \n\t" + "punpcklhw %[src_lo], %[src0], %[src1] \n\t" + "punpckhhw %[src_hi], %[src0], %[src1] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest2], %[src0], %[src1] \n\t" + "psrlw %[dest2], %[dest2], %[eight] \n\t" + + "punpckhhw %[src0], %[b], %[r] \n\t" + "punpckhhw %[src1], %[g], %[value] \n\t" + "punpcklhw %[src_lo], %[src0], %[src1] \n\t" + "punpckhhw %[src_hi], %[src0], %[src1] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest3], %[src0], %[src1] \n\t" + "psrlw %[dest3], %[dest3], %[eight] \n\t" + + "packsswh %[src_lo], %[dest0], %[dest1] \n\t" + "packsswh %[src_hi], %[dest2], %[dest3] \n\t" + "packushb %[dest0], %[src_lo], %[src_hi] \n\t" + "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" + "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" + + "daddiu %[src_argb1555], %[src_argb1555], 0x10 \n\t" + "daddiu %[dst_y], %[dst_y], 0x08 \n\t" + "daddiu %[width], %[width], -0x08 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), + [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), + [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), + [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) + : [src_argb1555] "r"(src_argb1555), [dst_y] "r"(dst_y), + [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0), + [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [eight] "f"(0x08), + [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07) + : "memory"); +} + +void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width) { + uint64_t ftmp[11]; + uint64_t value = 0x1080108010801080; + uint64_t mask = 0x0001004200810019; + uint64_t c0 = 0x000f000f000f000f; + uint64_t c1 = 0x00ff00ff00ff00ff; + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" + "psrlh %[src1], %[src0], %[eight] \n\t" + "and %[b], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[g], %[src0], %[four] \n\t" + "and %[r], %[src1], %[c0] \n\t" + "psllh %[src0], %[b], %[four] \n\t" + "or %[b], %[src0], 
%[b] \n\t" + "psllh %[src0], %[g], %[four] \n\t" + "or %[g], %[src0], %[g] \n\t" + "psllh %[src0], %[r], %[four] \n\t" + "or %[r], %[src0], %[r] \n\t" + "punpcklhw %[src0], %[b], %[r] \n\t" + "punpcklhw %[src1], %[g], %[value] \n\t" + "punpcklhw %[src_lo], %[src0], %[src1] \n\t" + "punpckhhw %[src_hi], %[src0], %[src1] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest0], %[src0], %[src1] \n\t" + "psrlw %[dest0], %[dest0], %[eight] \n\t" + + "punpckhhw %[src0], %[b], %[r] \n\t" + "punpckhhw %[src1], %[g], %[value] \n\t" + "punpcklhw %[src_lo], %[src0], %[src1] \n\t" + "punpckhhw %[src_hi], %[src0], %[src1] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest1], %[src0], %[src1] \n\t" + "psrlw %[dest1], %[dest1], %[eight] \n\t" + + "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t" + "psrlh %[src1], %[src0], %[eight] \n\t" + "and %[b], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[g], %[src0], %[four] \n\t" + "and %[r], %[src1], %[c0] \n\t" + "psllh %[src0], %[b], %[four] \n\t" + "or %[b], %[src0], %[b] \n\t" + "psllh %[src0], %[g], %[four] \n\t" + "or %[g], %[src0], %[g] \n\t" + "psllh %[src0], %[r], %[four] \n\t" + "or %[r], %[src0], %[r] \n\t" + "punpcklhw %[src0], %[b], %[r] \n\t" + "punpcklhw %[src1], %[g], %[value] \n\t" + "punpcklhw %[src_lo], %[src0], %[src1] \n\t" + "punpckhhw %[src_hi], %[src0], %[src1] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest2], %[src0], %[src1] \n\t" + "psrlw %[dest2], %[dest2], %[eight] \n\t" + + "punpckhhw %[src0], %[b], %[r] \n\t" + "punpckhhw %[src1], %[g], %[value] \n\t" + "punpcklhw %[src_lo], %[src0], %[src1] \n\t" + "punpckhhw %[src_hi], %[src0], %[src1] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" + "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" + "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" + "paddw %[dest3], %[src0], %[src1] \n\t" + "psrlw %[dest3], %[dest3], %[eight] \n\t" + + "packsswh %[src_lo], %[dest0], %[dest1] \n\t" + "packsswh %[src_hi], %[dest2], %[dest3] \n\t" + "packushb %[dest0], %[src_lo], %[src_hi] \n\t" + "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" + "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" + + "daddiu %[src_argb4444], %[src_argb4444], 0x10 \n\t" + "daddiu %[dst_y], %[dst_y], 0x08 \n\t" + "daddiu %[width], %[width], -0x08 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), + [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), + [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), + [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) + : [src_argb4444] "r"(src_argb4444), [dst_y] "r"(dst_y), + [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0), + [c1] "f"(c1), [eight] "f"(0x08), [four] "f"(0x04) + : "memory"); +} + +void RGB565ToUVRow_MMI(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + uint64_t ftmp[13]; + uint64_t value = 0x2020202020202020; + uint64_t mask_u = 0x0026004a00700002; 
+ uint64_t mask_v = 0x00020070005e0012; + uint64_t mask = 0x93; + uint64_t c0 = 0x001f001f001f001f; + uint64_t c1 = 0x00ff00ff00ff00ff; + uint64_t c2 = 0x0007000700070007; + __asm__ volatile( + "daddu %[next_rgb565], %[src_rgb565], %[next_rgb565] \n\t" + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" + "gsldrc1 %[src1], 0x00(%[next_rgb565]) \n\t" + "gsldlc1 %[src1], 0x07(%[next_rgb565]) \n\t" + "psrlh %[dest0_u], %[src0], %[eight] \n\t" + "and %[b0], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[src0], %[src0], %[five] \n\t" + "and %[g0], %[dest0_u], %[c2] \n\t" + "psllh %[g0], %[g0], %[three] \n\t" + "or %[g0], %[src0], %[g0] \n\t" + "psrlh %[r0], %[dest0_u], %[three] \n\t" + "psrlh %[src0], %[src1], %[eight] \n\t" + "and %[dest0_u], %[src1], %[c0] \n\t" + "and %[src1], %[src1], %[c1] \n\t" + "psrlh %[src1], %[src1], %[five] \n\t" + "and %[dest0_v], %[src0], %[c2] \n\t" + "psllh %[dest0_v], %[dest0_v], %[three] \n\t" + "or %[dest0_v], %[src1], %[dest0_v] \n\t" + "psrlh %[src0], %[src0], %[three] \n\t" + "paddh %[b0], %[b0], %[dest0_u] \n\t" + "paddh %[g0], %[g0], %[dest0_v] \n\t" + "paddh %[r0], %[r0], %[src0] \n\t" + "punpcklhw %[src0], %[b0], %[r0] \n\t" + "punpckhhw %[src1], %[b0], %[r0] \n\t" + "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" + "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" + "psrlh %[b0], %[src0], %[six] \n\t" + "psllh %[r0], %[src0], %[one] \n\t" + "or %[b0], %[b0], %[r0] \n\t" + "punpcklhw %[src0], %[g0], %[value] \n\t" + "punpckhhw %[src1], %[g0], %[value] \n\t" + "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" + "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" + "punpcklhw %[src0], %[b0], %[g0] \n\t" + "punpckhhw %[src1], %[b0], %[g0] \n\t" + + "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" + "pshufh %[dest0_u], %[src0], %[mask] \n\t" + "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" + "pmaddhw %[g0], %[src1], %[mask_v] \n\t" + "pshufh %[b0], %[src1], %[mask] \n\t" + "pmaddhw %[b0], %[b0], %[mask_u] \n\t" + + "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" + "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" + "psubw %[dest0_u], %[src0], %[src1] \n\t" + "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" + "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" + "psubw %[dest0_v], %[src1], %[src0] \n\t" + "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t" + "gsldrc1 %[src1], 0x08(%[next_rgb565]) \n\t" + "gsldlc1 %[src1], 0x0f(%[next_rgb565]) \n\t" + "psrlh %[dest1_u], %[src0], %[eight] \n\t" + "and %[b0], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[src0], %[src0], %[five] \n\t" + "and %[g0], %[dest1_u], %[c2] \n\t" + "psllh %[g0], %[g0], %[three] \n\t" + "or %[g0], %[src0], %[g0] \n\t" + "psrlh %[r0], %[dest1_u], %[three] \n\t" + "psrlh %[src0], %[src1], %[eight] \n\t" + "and %[dest1_u], %[src1], %[c0] \n\t" + "and %[src1], %[src1], %[c1] \n\t" + "psrlh %[src1], %[src1], %[five] \n\t" + "and %[dest1_v], %[src0], %[c2] \n\t" + "psllh %[dest1_v], %[dest1_v], %[three] \n\t" + "or %[dest1_v], %[src1], %[dest1_v] \n\t" + "psrlh %[src0], %[src0], %[three] \n\t" + "paddh %[b0], %[b0], %[dest1_u] \n\t" + "paddh %[g0], %[g0], %[dest1_v] \n\t" + "paddh %[r0], %[r0], %[src0] \n\t" + "punpcklhw %[src0], %[b0], %[r0] \n\t" + "punpckhhw %[src1], %[b0], %[r0] \n\t" + "punpcklwd %[dest1_u], 
%[src0], %[src1] \n\t" + "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" + "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" + "psrlh %[b0], %[src0], %[six] \n\t" + "psllh %[r0], %[src0], %[one] \n\t" + "or %[b0], %[b0], %[r0] \n\t" + "punpcklhw %[src0], %[g0], %[value] \n\t" + "punpckhhw %[src1], %[g0], %[value] \n\t" + "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" + "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" + "punpcklhw %[src0], %[b0], %[g0] \n\t" + "punpckhhw %[src1], %[b0], %[g0] \n\t" + + "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" + "pshufh %[dest1_u], %[src0], %[mask] \n\t" + "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" + "pmaddhw %[g0], %[src1], %[mask_v] \n\t" + "pshufh %[b0], %[src1], %[mask] \n\t" + "pmaddhw %[b0], %[b0], %[mask_u] \n\t" + + "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" + "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" + "psubw %[dest1_u], %[src0], %[src1] \n\t" + "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" + "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" + "psubw %[dest1_v], %[src1], %[src0] \n\t" + "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x10(%[src_rgb565]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb565]) \n\t" + "gsldrc1 %[src1], 0x10(%[next_rgb565]) \n\t" + "gsldlc1 %[src1], 0x17(%[next_rgb565]) \n\t" + "psrlh %[dest2_u], %[src0], %[eight] \n\t" + "and %[b0], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[src0], %[src0], %[five] \n\t" + "and %[g0], %[dest2_u], %[c2] \n\t" + "psllh %[g0], %[g0], %[three] \n\t" + "or %[g0], %[src0], %[g0] \n\t" + "psrlh %[r0], %[dest2_u], %[three] \n\t" + "psrlh %[src0], %[src1], %[eight] \n\t" + "and %[dest2_u], %[src1], %[c0] \n\t" + "and %[src1], %[src1], %[c1] \n\t" + "psrlh %[src1], %[src1], %[five] \n\t" + "and %[dest2_v], %[src0], %[c2] \n\t" + "psllh %[dest2_v], %[dest2_v], %[three] \n\t" + "or %[dest2_v], %[src1], %[dest2_v] \n\t" + "psrlh %[src0], %[src0], %[three] \n\t" + "paddh %[b0], %[b0], %[dest2_u] \n\t" + "paddh %[g0], %[g0], %[dest2_v] \n\t" + "paddh %[r0], %[r0], %[src0] \n\t" + "punpcklhw %[src0], %[b0], %[r0] \n\t" + "punpckhhw %[src1], %[b0], %[r0] \n\t" + "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" + "paddh %[src0], %[dest2_u], %[dest2_v] \n\t" + "psrlh %[b0], %[src0], %[six] \n\t" + "psllh %[r0], %[src0], %[one] \n\t" + "or %[b0], %[b0], %[r0] \n\t" + "punpcklhw %[src0], %[g0], %[value] \n\t" + "punpckhhw %[src1], %[g0], %[value] \n\t" + "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" + "paddh %[g0], %[dest2_u], %[dest2_v] \n\t" + "punpcklhw %[src0], %[b0], %[g0] \n\t" + "punpckhhw %[src1], %[b0], %[g0] \n\t" + + "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t" + "pshufh %[dest2_u], %[src0], %[mask] \n\t" + "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" + "pmaddhw %[g0], %[src1], %[mask_v] \n\t" + "pshufh %[b0], %[src1], %[mask] \n\t" + "pmaddhw %[b0], %[b0], %[mask_u] \n\t" + + "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" + "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" + "psubw %[dest2_u], %[src0], %[src1] \n\t" + "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest2_v], %[g0] \n\t" + "punpckhwd %[src1], %[dest2_v], %[g0] \n\t" + "psubw %[dest2_v], %[src1], %[src0] \n\t" + "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x18(%[src_rgb565]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb565]) \n\t" + "gsldrc1 %[src1], 0x18(%[next_rgb565]) \n\t" + 
"gsldlc1 %[src1], 0x1f(%[next_rgb565]) \n\t" + "psrlh %[dest3_u], %[src0], %[eight] \n\t" + "and %[b0], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[src0], %[src0], %[five] \n\t" + "and %[g0], %[dest3_u], %[c2] \n\t" + "psllh %[g0], %[g0], %[three] \n\t" + "or %[g0], %[src0], %[g0] \n\t" + "psrlh %[r0], %[dest3_u], %[three] \n\t" + "psrlh %[src0], %[src1], %[eight] \n\t" + "and %[dest3_u], %[src1], %[c0] \n\t" + "and %[src1], %[src1], %[c1] \n\t" + "psrlh %[src1], %[src1], %[five] \n\t" + "and %[dest3_v], %[src0], %[c2] \n\t" + "psllh %[dest3_v], %[dest3_v], %[three] \n\t" + "or %[dest3_v], %[src1], %[dest3_v] \n\t" + "psrlh %[src0], %[src0], %[three] \n\t" + "paddh %[b0], %[b0], %[dest3_u] \n\t" + "paddh %[g0], %[g0], %[dest3_v] \n\t" + "paddh %[r0], %[r0], %[src0] \n\t" + "punpcklhw %[src0], %[b0], %[r0] \n\t" + "punpckhhw %[src1], %[b0], %[r0] \n\t" + "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" + "paddh %[src0], %[dest3_u], %[dest3_v] \n\t" + "psrlh %[b0], %[src0], %[six] \n\t" + "psllh %[r0], %[src0], %[one] \n\t" + "or %[b0], %[b0], %[r0] \n\t" + "punpcklhw %[src0], %[g0], %[value] \n\t" + "punpckhhw %[src1], %[g0], %[value] \n\t" + "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" + "paddh %[g0], %[dest3_u], %[dest3_v] \n\t" + "punpcklhw %[src0], %[b0], %[g0] \n\t" + "punpckhhw %[src1], %[b0], %[g0] \n\t" + + "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t" + "pshufh %[dest3_u], %[src0], %[mask] \n\t" + "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" + "pmaddhw %[g0], %[src1], %[mask_v] \n\t" + "pshufh %[b0], %[src1], %[mask] \n\t" + "pmaddhw %[b0], %[b0], %[mask_u] \n\t" + + "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" + "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" + "psubw %[dest3_u], %[src0], %[src1] \n\t" + "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest3_v], %[g0] \n\t" + "punpckhwd %[src1], %[dest3_v], %[g0] \n\t" + "psubw %[dest3_v], %[src1], %[src0] \n\t" + "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" + + "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" + "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" + "packushb %[dest0_u], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" + "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" + "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" + "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" + "packushb %[dest0_v], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" + "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" + + "daddiu %[src_rgb565], %[src_rgb565], 0x20 \n\t" + "daddiu %[next_rgb565], %[next_rgb565], 0x20 \n\t" + "daddiu %[dst_u], %[dst_u], 0x08 \n\t" + "daddiu %[dst_v], %[dst_v], 0x08 \n\t" + "daddiu %[width], %[width], -0x10 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), + [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), + [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), + [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), + [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]), + [dest3_v] "=&f"(ftmp[12]) + : [src_rgb565] "r"(src_rgb565), [next_rgb565] "r"(src_stride_rgb565), + [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), + [value] "f"(value), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), + [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), + [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03), + [one] "f"(0x01) + : "memory"); +} + +void 
ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + uint64_t ftmp[11]; + uint64_t value = 0x2020202020202020; + uint64_t mask_u = 0x0026004a00700002; + uint64_t mask_v = 0x00020070005e0012; + uint64_t mask = 0x93; + uint64_t c0 = 0x001f001f001f001f; + uint64_t c1 = 0x00ff00ff00ff00ff; + uint64_t c2 = 0x0003000300030003; + uint64_t c3 = 0x007c007c007c007c; + __asm__ volatile( + "daddu %[next_argb1555], %[src_argb1555], %[next_argb1555] \n\t" + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" + "gsldrc1 %[src1], 0x00(%[next_argb1555]) \n\t" + "gsldlc1 %[src1], 0x07(%[next_argb1555]) \n\t" + "psrlh %[dest0_u], %[src0], %[eight] \n\t" + "and %[b0], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[src0], %[src0], %[five] \n\t" + "and %[g0], %[dest0_u], %[c2] \n\t" + "psllh %[g0], %[g0], %[three] \n\t" + "or %[g0], %[src0], %[g0] \n\t" + "and %[r0], %[dest0_u], %[c3] \n\t" + "psrlh %[r0], %[r0], %[two] \n\t" + "psrlh %[src0], %[src1], %[eight] \n\t" + "and %[dest0_u], %[src1], %[c0] \n\t" + "and %[src1], %[src1], %[c1] \n\t" + "psrlh %[src1], %[src1], %[five] \n\t" + "and %[dest0_v], %[src0], %[c2] \n\t" + "psllh %[dest0_v], %[dest0_v], %[three] \n\t" + "or %[dest0_v], %[src1], %[dest0_v] \n\t" + "and %[src0], %[src0], %[c3] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[b0], %[b0], %[dest0_u] \n\t" + "paddh %[g0], %[g0], %[dest0_v] \n\t" + "paddh %[r0], %[r0], %[src0] \n\t" + "punpcklhw %[src0], %[b0], %[r0] \n\t" + "punpckhhw %[src1], %[b0], %[r0] \n\t" + "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" + "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" + "psrlh %[b0], %[src0], %[six] \n\t" + "psllh %[r0], %[src0], %[one] \n\t" + "or %[b0], %[b0], %[r0] \n\t" + "psrlh %[r0], %[g0], %[six] \n\t" + "psllh %[g0], %[g0], %[one] \n\t" + "or %[g0], %[g0], %[r0] \n\t" + "punpcklhw %[src0], %[g0], %[value] \n\t" + "punpckhhw %[src1], %[g0], %[value] \n\t" + "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" + "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" + "punpcklhw %[src0], %[b0], %[g0] \n\t" + "punpckhhw %[src1], %[b0], %[g0] \n\t" + + "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" + "pshufh %[dest0_u], %[src0], %[mask] \n\t" + "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" + "pmaddhw %[g0], %[src1], %[mask_v] \n\t" + "pshufh %[b0], %[src1], %[mask] \n\t" + "pmaddhw %[b0], %[b0], %[mask_u] \n\t" + + "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" + "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" + "psubw %[dest0_u], %[src0], %[src1] \n\t" + "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" + "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" + "psubw %[dest0_v], %[src1], %[src0] \n\t" + "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t" + "gsldrc1 %[src1], 0x08(%[next_argb1555]) \n\t" + "gsldlc1 %[src1], 0x0f(%[next_argb1555]) \n\t" + "psrlh %[dest1_u], %[src0], %[eight] \n\t" + "and %[b0], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[src0], %[src0], %[five] \n\t" + "and %[g0], %[dest1_u], %[c2] \n\t" + "psllh %[g0], %[g0], %[three] \n\t" + "or %[g0], %[src0], %[g0] \n\t" + "and %[r0], %[dest1_u], %[c3] \n\t" + "psrlh %[r0], %[r0], %[two] \n\t" + "psrlh %[src0], %[src1], %[eight] \n\t" + "and %[dest1_u], %[src1], 
%[c0] \n\t" + "and %[src1], %[src1], %[c1] \n\t" + "psrlh %[src1], %[src1], %[five] \n\t" + "and %[dest1_v], %[src0], %[c2] \n\t" + "psllh %[dest1_v], %[dest1_v], %[three] \n\t" + "or %[dest1_v], %[src1], %[dest1_v] \n\t" + "and %[src0], %[src0], %[c3] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[b0], %[b0], %[dest1_u] \n\t" + "paddh %[g0], %[g0], %[dest1_v] \n\t" + "paddh %[r0], %[r0], %[src0] \n\t" + "punpcklhw %[src0], %[b0], %[r0] \n\t" + "punpckhhw %[src1], %[b0], %[r0] \n\t" + "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" + "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" + "psrlh %[b0], %[src0], %[six] \n\t" + "psllh %[r0], %[src0], %[one] \n\t" + "or %[b0], %[b0], %[r0] \n\t" + "psrlh %[r0], %[g0], %[six] \n\t" + "psllh %[g0], %[g0], %[one] \n\t" + "or %[g0], %[g0], %[r0] \n\t" + "punpcklhw %[src0], %[g0], %[value] \n\t" + "punpckhhw %[src1], %[g0], %[value] \n\t" + "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" + "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" + "punpcklhw %[src0], %[b0], %[g0] \n\t" + "punpckhhw %[src1], %[b0], %[g0] \n\t" + + "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" + "pshufh %[dest1_u], %[src0], %[mask] \n\t" + "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" + "pmaddhw %[g0], %[src1], %[mask_v] \n\t" + "pshufh %[b0], %[src1], %[mask] \n\t" + "pmaddhw %[b0], %[b0], %[mask_u] \n\t" + + "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" + "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" + "psubw %[dest1_u], %[src0], %[src1] \n\t" + "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" + "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" + "psubw %[dest1_v], %[src1], %[src0] \n\t" + "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" + + "packsswh %[dest0_u], %[dest0_u], %[dest1_u] \n\t" + "packsswh %[dest1_u], %[dest0_v], %[dest1_v] \n\t" + + "gsldrc1 %[src0], 0x10(%[src_argb1555]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_argb1555]) \n\t" + "gsldrc1 %[src1], 0x10(%[next_argb1555]) \n\t" + "gsldlc1 %[src1], 0x17(%[next_argb1555]) \n\t" + "psrlh %[dest2_u], %[src0], %[eight] \n\t" + "and %[b0], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[src0], %[src0], %[five] \n\t" + "and %[g0], %[dest2_u], %[c2] \n\t" + "psllh %[g0], %[g0], %[three] \n\t" + "or %[g0], %[src0], %[g0] \n\t" + "and %[r0], %[dest2_u], %[c3] \n\t" + "psrlh %[r0], %[r0], %[two] \n\t" + "psrlh %[src0], %[src1], %[eight] \n\t" + "and %[dest2_u], %[src1], %[c0] \n\t" + "and %[src1], %[src1], %[c1] \n\t" + "psrlh %[src1], %[src1], %[five] \n\t" + "and %[dest0_v], %[src0], %[c2] \n\t" + "psllh %[dest0_v], %[dest0_v], %[three] \n\t" + "or %[dest0_v], %[src1], %[dest0_v] \n\t" + "and %[src0], %[src0], %[c3] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[b0], %[b0], %[dest2_u] \n\t" + "paddh %[g0], %[g0], %[dest0_v] \n\t" + "paddh %[r0], %[r0], %[src0] \n\t" + "punpcklhw %[src0], %[b0], %[r0] \n\t" + "punpckhhw %[src1], %[b0], %[r0] \n\t" + "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" + "paddh %[src0], %[dest2_u], %[dest0_v] \n\t" + "psrlh %[b0], %[src0], %[six] \n\t" + "psllh %[r0], %[src0], %[one] \n\t" + "or %[b0], %[b0], %[r0] \n\t" + "psrlh %[r0], %[g0], %[six] \n\t" + "psllh %[g0], %[g0], %[one] \n\t" + "or %[g0], %[g0], %[r0] \n\t" + "punpcklhw %[src0], %[g0], %[value] \n\t" + "punpckhhw %[src1], %[g0], %[value] \n\t" + "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest0_v], %[src0], %[src1] 
\n\t" + "paddh %[g0], %[dest2_u], %[dest0_v] \n\t" + "punpcklhw %[src0], %[b0], %[g0] \n\t" + "punpckhhw %[src1], %[b0], %[g0] \n\t" + + "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" + "pshufh %[dest2_u], %[src0], %[mask] \n\t" + "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" + "pmaddhw %[g0], %[src1], %[mask_v] \n\t" + "pshufh %[b0], %[src1], %[mask] \n\t" + "pmaddhw %[b0], %[b0], %[mask_u] \n\t" + + "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" + "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" + "psubw %[dest2_u], %[src0], %[src1] \n\t" + "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" + "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" + "psubw %[dest0_v], %[src1], %[src0] \n\t" + "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x18(%[src_argb1555]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_argb1555]) \n\t" + "gsldrc1 %[src1], 0x18(%[next_argb1555]) \n\t" + "gsldlc1 %[src1], 0x1f(%[next_argb1555]) \n\t" + "psrlh %[dest3_u], %[src0], %[eight] \n\t" + "and %[b0], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[src0], %[src0], %[five] \n\t" + "and %[g0], %[dest3_u], %[c2] \n\t" + "psllh %[g0], %[g0], %[three] \n\t" + "or %[g0], %[src0], %[g0] \n\t" + "and %[r0], %[dest3_u], %[c3] \n\t" + "psrlh %[r0], %[r0], %[two] \n\t" + "psrlh %[src0], %[src1], %[eight] \n\t" + "and %[dest3_u], %[src1], %[c0] \n\t" + "and %[src1], %[src1], %[c1] \n\t" + "psrlh %[src1], %[src1], %[five] \n\t" + "and %[dest1_v], %[src0], %[c2] \n\t" + "psllh %[dest1_v], %[dest1_v], %[three] \n\t" + "or %[dest1_v], %[src1], %[dest1_v] \n\t" + "and %[src0], %[src0], %[c3] \n\t" + "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[b0], %[b0], %[dest3_u] \n\t" + "paddh %[g0], %[g0], %[dest1_v] \n\t" + "paddh %[r0], %[r0], %[src0] \n\t" + "punpcklhw %[src0], %[b0], %[r0] \n\t" + "punpckhhw %[src1], %[b0], %[r0] \n\t" + "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" + "paddh %[src0], %[dest3_u], %[dest1_v] \n\t" + "psrlh %[b0], %[src0], %[six] \n\t" + "psllh %[r0], %[src0], %[one] \n\t" + "or %[b0], %[b0], %[r0] \n\t" + "psrlh %[r0], %[g0], %[six] \n\t" + "psllh %[g0], %[g0], %[one] \n\t" + "or %[g0], %[g0], %[r0] \n\t" + "punpcklhw %[src0], %[g0], %[value] \n\t" + "punpckhhw %[src1], %[g0], %[value] \n\t" + "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" + "paddh %[g0], %[dest3_u], %[dest1_v] \n\t" + "punpcklhw %[src0], %[b0], %[g0] \n\t" + "punpckhhw %[src1], %[b0], %[g0] \n\t" + + "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" + "pshufh %[dest3_u], %[src0], %[mask] \n\t" + "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" + "pmaddhw %[g0], %[src1], %[mask_v] \n\t" + "pshufh %[b0], %[src1], %[mask] \n\t" + "pmaddhw %[b0], %[b0], %[mask_u] \n\t" + + "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" + "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" + "psubw %[dest3_u], %[src0], %[src1] \n\t" + "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" + "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" + "psubw %[dest1_v], %[src1], %[src0] \n\t" + "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" + + "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" + "packushb %[dest0_u], %[dest0_u], %[src1] \n\t" + "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" + "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" + "packsswh %[src1], %[dest0_v], %[dest1_v] \n\t" + "packushb %[dest0_v], %[dest1_u], %[src1] \n\t" + "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" + "gssdrc1 %[dest0_v], 
0x00(%[dst_v]) \n\t" + + "daddiu %[src_argb1555], %[src_argb1555], 0x20 \n\t" + "daddiu %[next_argb1555], %[next_argb1555], 0x20 \n\t" + "daddiu %[dst_u], %[dst_u], 0x08 \n\t" + "daddiu %[dst_v], %[dst_v], 0x08 \n\t" + "daddiu %[width], %[width], -0x10 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), + [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), + [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), + [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), + [dest1_v] "=&f"(ftmp[10]) + : [src_argb1555] "r"(src_argb1555), + [next_argb1555] "r"(src_stride_argb1555), [dst_u] "r"(dst_u), + [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value), + [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), + [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), + [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03), + [two] "f"(0x02), [one] "f"(0x01) + : "memory"); +} + +void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + uint64_t ftmp[13]; + uint64_t value = 0x2020202020202020; + uint64_t mask_u = 0x0026004a00700002; + uint64_t mask_v = 0x00020070005e0012; + uint64_t mask = 0x93; + uint64_t c0 = 0x000f000f000f000f; + uint64_t c1 = 0x00ff00ff00ff00ff; + __asm__ volatile( + "daddu %[next_argb4444], %[src_argb4444], %[next_argb4444] \n\t" + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" + "gsldrc1 %[src1], 0x00(%[next_argb4444]) \n\t" + "gsldlc1 %[src1], 0x07(%[next_argb4444]) \n\t" + "psrlh %[dest0_u], %[src0], %[eight] \n\t" + "and %[b0], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[g0], %[src0], %[four] \n\t" + "and %[r0], %[dest0_u], %[c0] \n\t" + "psrlh %[src0], %[src1], %[eight] \n\t" + "and %[dest0_u], %[src1], %[c0] \n\t" + "and %[src1], %[src1], %[c1] \n\t" + "psrlh %[dest0_v], %[src1], %[four] \n\t" + "and %[src0], %[src0], %[c0] \n\t" + "paddh %[b0], %[b0], %[dest0_u] \n\t" + "paddh %[g0], %[g0], %[dest0_v] \n\t" + "paddh %[r0], %[r0], %[src0] \n\t" + "punpcklhw %[src0], %[b0], %[r0] \n\t" + "punpckhhw %[src1], %[b0], %[r0] \n\t" + "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" + "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" + "psrlh %[b0], %[src0], %[four] \n\t" + "psllh %[r0], %[src0], %[two] \n\t" + "or %[b0], %[b0], %[r0] \n\t" + "psrlh %[r0], %[g0], %[four] \n\t" + "psllh %[g0], %[g0], %[two] \n\t" + "or %[g0], %[g0], %[r0] \n\t" + "punpcklhw %[src0], %[g0], %[value] \n\t" + "punpckhhw %[src1], %[g0], %[value] \n\t" + "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" + "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" + "punpcklhw %[src0], %[b0], %[g0] \n\t" + "punpckhhw %[src1], %[b0], %[g0] \n\t" + + "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" + "pshufh %[dest0_u], %[src0], %[mask] \n\t" + "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" + "pmaddhw %[g0], %[src1], %[mask_v] \n\t" + "pshufh %[b0], %[src1], %[mask] \n\t" + "pmaddhw %[b0], %[b0], %[mask_u] \n\t" + + "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" + "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" + "psubw %[dest0_u], %[src0], %[src1] \n\t" + "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" + "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" + "psubw %[dest0_v], %[src1], %[src0] \n\t" + "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" + + "gsldrc1 
%[src0], 0x08(%[src_argb4444]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t" + "gsldrc1 %[src1], 0x08(%[next_argb4444]) \n\t" + "gsldlc1 %[src1], 0x0f(%[next_argb4444]) \n\t" + "psrlh %[dest1_u], %[src0], %[eight] \n\t" + "and %[b0], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[g0], %[src0], %[four] \n\t" + "and %[r0], %[dest1_u], %[c0] \n\t" + "psrlh %[src0], %[src1], %[eight] \n\t" + "and %[dest1_u], %[src1], %[c0] \n\t" + "and %[src1], %[src1], %[c1] \n\t" + "psrlh %[dest1_v], %[src1], %[four] \n\t" + "and %[src0], %[src0], %[c0] \n\t" + "paddh %[b0], %[b0], %[dest1_u] \n\t" + "paddh %[g0], %[g0], %[dest1_v] \n\t" + "paddh %[r0], %[r0], %[src0] \n\t" + "punpcklhw %[src0], %[b0], %[r0] \n\t" + "punpckhhw %[src1], %[b0], %[r0] \n\t" + "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" + "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" + "psrlh %[b0], %[src0], %[four] \n\t" + "psllh %[r0], %[src0], %[two] \n\t" + "or %[b0], %[b0], %[r0] \n\t" + "psrlh %[r0], %[g0], %[four] \n\t" + "psllh %[g0], %[g0], %[two] \n\t" + "or %[g0], %[g0], %[r0] \n\t" + "punpcklhw %[src0], %[g0], %[value] \n\t" + "punpckhhw %[src1], %[g0], %[value] \n\t" + "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" + "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" + "punpcklhw %[src0], %[b0], %[g0] \n\t" + "punpckhhw %[src1], %[b0], %[g0] \n\t" + + "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" + "pshufh %[dest1_u], %[src0], %[mask] \n\t" + "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" + "pmaddhw %[g0], %[src1], %[mask_v] \n\t" + "pshufh %[b0], %[src1], %[mask] \n\t" + "pmaddhw %[b0], %[b0], %[mask_u] \n\t" + + "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" + "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" + "psubw %[dest1_u], %[src0], %[src1] \n\t" + "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" + "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" + "psubw %[dest1_v], %[src1], %[src0] \n\t" + "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x10(%[src_argb4444]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_argb4444]) \n\t" + "gsldrc1 %[src1], 0x10(%[next_argb4444]) \n\t" + "gsldlc1 %[src1], 0x17(%[next_argb4444]) \n\t" + "psrlh %[dest2_u], %[src0], %[eight] \n\t" + "and %[b0], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[g0], %[src0], %[four] \n\t" + "and %[r0], %[dest2_u], %[c0] \n\t" + "psrlh %[src0], %[src1], %[eight] \n\t" + "and %[dest2_u], %[src1], %[c0] \n\t" + "and %[src1], %[src1], %[c1] \n\t" + "psrlh %[dest2_v], %[src1], %[four] \n\t" + "and %[src0], %[src0], %[c0] \n\t" + "paddh %[b0], %[b0], %[dest2_u] \n\t" + "paddh %[g0], %[g0], %[dest2_v] \n\t" + "paddh %[r0], %[r0], %[src0] \n\t" + "punpcklhw %[src0], %[b0], %[r0] \n\t" + "punpckhhw %[src1], %[b0], %[r0] \n\t" + "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" + "paddh %[src0], %[dest2_u], %[dest2_v] \n\t" + "psrlh %[b0], %[src0], %[four] \n\t" + "psllh %[r0], %[src0], %[two] \n\t" + "or %[b0], %[b0], %[r0] \n\t" + "psrlh %[r0], %[g0], %[four] \n\t" + "psllh %[g0], %[g0], %[two] \n\t" + "or %[g0], %[g0], %[r0] \n\t" + "punpcklhw %[src0], %[g0], %[value] \n\t" + "punpckhhw %[src1], %[g0], %[value] \n\t" + "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" + "paddh %[g0], %[dest2_u], %[dest2_v] \n\t" + "punpcklhw %[src0], %[b0], %[g0] \n\t" + "punpckhhw %[src1], %[b0], %[g0] \n\t" + + "pmaddhw 
%[dest2_v], %[src0], %[mask_v] \n\t" + "pshufh %[dest2_u], %[src0], %[mask] \n\t" + "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" + "pmaddhw %[g0], %[src1], %[mask_v] \n\t" + "pshufh %[b0], %[src1], %[mask] \n\t" + "pmaddhw %[b0], %[b0], %[mask_u] \n\t" + + "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" + "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" + "psubw %[dest2_u], %[src0], %[src1] \n\t" + "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest2_v], %[g0] \n\t" + "punpckhwd %[src1], %[dest2_v], %[g0] \n\t" + "psubw %[dest2_v], %[src1], %[src0] \n\t" + "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x18(%[src_argb4444]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_argb4444]) \n\t" + "gsldrc1 %[src1], 0x18(%[next_argb4444]) \n\t" + "gsldlc1 %[src1], 0x1f(%[next_argb4444]) \n\t" + "psrlh %[dest3_u], %[src0], %[eight] \n\t" + "and %[b0], %[src0], %[c0] \n\t" + "and %[src0], %[src0], %[c1] \n\t" + "psrlh %[g0], %[src0], %[four] \n\t" + "and %[r0], %[dest3_u], %[c0] \n\t" + "psrlh %[src0], %[src1], %[eight] \n\t" + "and %[dest3_u], %[src1], %[c0] \n\t" + "and %[src1], %[src1], %[c1] \n\t" + "psrlh %[dest3_v], %[src1], %[four] \n\t" + "and %[src0], %[src0], %[c0] \n\t" + "paddh %[b0], %[b0], %[dest3_u] \n\t" + "paddh %[g0], %[g0], %[dest3_v] \n\t" + "paddh %[r0], %[r0], %[src0] \n\t" + "punpcklhw %[src0], %[b0], %[r0] \n\t" + "punpckhhw %[src1], %[b0], %[r0] \n\t" + "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" + "paddh %[src0], %[dest3_u], %[dest3_v] \n\t" + "psrlh %[b0], %[src0], %[four] \n\t" + "psllh %[r0], %[src0], %[two] \n\t" + "or %[b0], %[b0], %[r0] \n\t" + "psrlh %[r0], %[g0], %[four] \n\t" + "psllh %[g0], %[g0], %[two] \n\t" + "or %[g0], %[g0], %[r0] \n\t" + "punpcklhw %[src0], %[g0], %[value] \n\t" + "punpckhhw %[src1], %[g0], %[value] \n\t" + "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" + "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" + "paddh %[g0], %[dest3_u], %[dest3_v] \n\t" + "punpcklhw %[src0], %[b0], %[g0] \n\t" + "punpckhhw %[src1], %[b0], %[g0] \n\t" + + "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t" + "pshufh %[dest3_u], %[src0], %[mask] \n\t" + "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" + "pmaddhw %[g0], %[src1], %[mask_v] \n\t" + "pshufh %[b0], %[src1], %[mask] \n\t" + "pmaddhw %[b0], %[b0], %[mask_u] \n\t" + + "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" + "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" + "psubw %[dest3_u], %[src0], %[src1] \n\t" + "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest3_v], %[g0] \n\t" + "punpckhwd %[src1], %[dest3_v], %[g0] \n\t" + "psubw %[dest3_v], %[src1], %[src0] \n\t" + "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" + + "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" + "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" + "packushb %[dest0_u], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" + "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" + "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" + "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" + "packushb %[dest0_v], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" + "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" + + "daddiu %[src_argb4444], %[src_argb4444], 0x20 \n\t" + "daddiu %[next_argb4444], %[next_argb4444], 0x20 \n\t" + "daddiu %[dst_u], %[dst_u], 0x08 \n\t" + "daddiu %[dst_v], %[dst_v], 0x08 \n\t" + "daddiu %[width], %[width], -0x10 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), + [g0] "=&f"(ftmp[3]), 
[r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), + [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), + [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), + [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]), + [dest3_v] "=&f"(ftmp[12]) + : [src_argb4444] "r"(src_argb4444), + [next_argb4444] "r"(src_stride_argb4444), [dst_u] "r"(dst_u), + [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value), + [c0] "f"(c0), [c1] "f"(c1), [mask] "f"(mask), [mask_u] "f"(mask_u), + [mask_v] "f"(mask_v), [eight] "f"(0x08), [four] "f"(0x04), + [two] "f"(0x02) + : "memory"); +} + +void ARGBToUV444Row_MMI(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + uint64_t ftmp[12]; + const uint64_t value = 0x4040; + const uint64_t mask_u = 0x0026004a00700002; + const uint64_t mask_v = 0x00020070005e0012; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "dsll %[dest0_u], %[src_lo], %[sixteen] \n\t" + "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" + "pinsrh_3 %[dest0_v], %[src_lo], %[value] \n\t" + "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" + "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" + + "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" + "psubw %[dest0_u], %[src0], %[src1] \n\t" + "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" + "psubw %[dest0_v], %[src1], %[src0] \n\t" + "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x08(%[src_argb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_argb]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "dsll %[dest1_u], %[src_lo], %[sixteen] \n\t" + "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" + "pinsrh_3 %[dest1_v], %[src_lo], %[value] \n\t" + "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" + "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" + "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" + "psubw %[dest1_u], %[src0], %[src1] \n\t" + "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" + "psubw %[dest1_v], %[src1], %[src0] \n\t" + "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x10(%[src_argb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_argb]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "dsll %[dest2_u], %[src_lo], %[sixteen] \n\t" + "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" + "pinsrh_3 %[dest2_v], %[src_lo], %[value] \n\t" + "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" + "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" + "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] 
\n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" + "psubw %[dest2_u], %[src0], %[src1] \n\t" + "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" + "psubw %[dest2_v], %[src1], %[src0] \n\t" + "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" + + "gsldrc1 %[src0], 0x18(%[src_argb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_argb]) \n\t" + "punpcklbh %[src_lo], %[src0], %[zero] \n\t" + "punpckhbh %[src_hi], %[src0], %[zero] \n\t" + "dsll %[dest3_u], %[src_lo], %[sixteen] \n\t" + "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" + "pinsrh_3 %[dest3_v], %[src_lo], %[value] \n\t" + "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" + "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" + "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" + "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" + "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" + "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" + + "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" + "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" + "psubw %[dest3_u], %[src0], %[src1] \n\t" + "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" + "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" + "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" + "psubw %[dest3_v], %[src1], %[src0] \n\t" + "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" + + "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" + "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" + "packushb %[dest0_u], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" + "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" + + "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" + "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" + "packushb %[dest0_v], %[src0], %[src1] \n\t" + "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" + "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" + + "daddiu %[src_argb], %[src_argb], 0x20 \n\t" + "daddiu %[dst_u], %[dst_u], 0x08 \n\t" + "daddiu %[dst_v], %[dst_v], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bgtz %[width], 1b \n\t" + : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), + [src_hi] "=&f"(ftmp[3]), [dest0_u] "=&f"(ftmp[4]), + [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), + [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), + [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), + [dest3_v] "=&f"(ftmp[11]) + : [src_argb] "r"(src_argb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), + [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), + [value] "f"(value), [zero] "f"(0x00), [sixteen] "f"(0x10), + [eight] "f"(0x08) + : "memory"); +} + +void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + uint64_t src, src_lo, src_hi, src37, dest, dest_lo, dest_hi; + uint64_t tmp0, tmp1; + const uint64_t mask0 = 0x0; + const uint64_t mask1 = 0x01; + const uint64_t mask2 = 0x00400026004B000FULL; + const uint64_t mask3 = 0xFF000000FF000000ULL; + const uint64_t mask4 = ~mask3; + const uint64_t shift = 0x07; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" + + "and %[src37], %[src], %[mask3] \n\t" + + "punpcklbh %[src_lo], %[src], %[mask0] \n\t" + "pinsrh_3 %[src_lo], %[src_lo], %[mask1] \n\t" + "pmaddhw %[dest_lo], %[src_lo], %[mask2] \n\t" + "punpcklwd %[tmp0], %[dest_lo], %[dest_lo] \n\t" + "punpckhwd %[tmp1], 
%[dest_lo], %[dest_lo] \n\t" + "paddw %[dest_lo], %[tmp0], %[tmp1] \n\t" + "psrlw %[dest_lo], %[dest_lo], %[shift] \n\t" + "packsswh %[dest_lo], %[dest_lo], %[dest_lo] \n\t" + + "punpckhbh %[src_hi], %[src], %[mask0] \n\t" + "pinsrh_3 %[src_hi], %[src_hi], %[mask1] \n\t" + "pmaddhw %[dest_hi], %[src_hi], %[mask2] \n\t" + "punpcklwd %[tmp0], %[dest_hi], %[dest_hi] \n\t" + "punpckhwd %[tmp1], %[dest_hi], %[dest_hi] \n\t" + "paddw %[dest_hi], %[tmp0], %[tmp1] \n\t" + "psrlw %[dest_hi], %[dest_hi], %[shift] \n\t" + "packsswh %[dest_hi], %[dest_hi], %[dest_hi] \n\t" + + "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" + "and %[dest], %[dest], %[mask4] \n\t" + "or %[dest], %[dest], %[src37] \n\t" + + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), + [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [tmp0] "=&f"(tmp0), + [tmp1] "=&f"(tmp1), [src] "=&f"(src), [dest] "=&f"(dest), + [src37] "=&f"(src37) + : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width), + [shift] "f"(shift), [mask0] "f"(mask0), [mask1] "f"(mask1), + [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4) + : "memory"); +} + +// Convert a row of image to Sepia tone. +void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width) { + uint64_t dest, dest_lo, dest_hi, dest37, dest0, dest1, dest2; + uint64_t tmp0, tmp1; + const uint64_t mask0 = 0x0; + const uint64_t mask1 = 0x002300440011ULL; + const uint64_t mask2 = 0x002D00580016ULL; + const uint64_t mask3 = 0x003200620018ULL; + const uint64_t mask4 = 0xFF000000FF000000ULL; + const uint64_t shift = 0x07; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "and %[dest37], %[dest], %[mask4] \n\t" + + "punpcklbh %[dest_lo], %[dest], %[mask0] \n\t" + "pmaddhw %[dest0], %[dest_lo], %[mask1] \n\t" + "pmaddhw %[dest1], %[dest_lo], %[mask2] \n\t" + "pmaddhw %[dest2], %[dest_lo], %[mask3] \n\t" + "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t" + "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t" + "paddw %[dest0], %[tmp0], %[tmp1] \n\t" + "psrlw %[dest0], %[dest0], %[shift] \n\t" + "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t" + "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t" + "paddw %[dest1], %[tmp0], %[tmp1] \n\t" + "psrlw %[dest1], %[dest1], %[shift] \n\t" + "packsswh %[dest_lo], %[dest0], %[dest1] \n\t" + + "punpckhbh %[dest_hi], %[dest], %[mask0] \n\t" + "pmaddhw %[dest0], %[dest_hi], %[mask1] \n\t" + "pmaddhw %[dest1], %[dest_hi], %[mask2] \n\t" + "pmaddhw %[dest2], %[dest_hi], %[mask3] \n\t" + "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t" + "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t" + "paddw %[dest0], %[tmp0], %[tmp1] \n\t" + "psrlw %[dest0], %[dest0], %[shift] \n\t" + "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t" + "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t" + "paddw %[dest1], %[tmp0], %[tmp1] \n\t" + "psrlw %[dest1], %[dest1], %[shift] \n\t" + "packsswh %[dest_hi], %[dest0], %[dest1] \n\t" + + "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" + "or %[dest], %[dest], %[dest37] \n\t" + + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), + [dest0] "=&f"(dest0), [dest1] 
"=&f"(dest1), [dest2] "=&f"(dest2), + [dest37] "=&f"(dest37), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), + [dest] "=&f"(dest) + : [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask0] "f"(mask0), + [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3), + [mask4] "f"(mask4), [shift] "f"(shift) + : "memory"); +} + +// Apply color matrix to a row of image. Matrix is signed. +// TODO(fbarchard): Consider adding rounding (+32). +void ARGBColorMatrixRow_MMI(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi, dest0, dest1, dest2, + dest3; + uint64_t matrix, matrix_hi, matrix_lo; + uint64_t tmp0, tmp1; + const uint64_t shift0 = 0x06; + const uint64_t shift1 = 0x08; + const uint64_t mask0 = 0x0; + const uint64_t mask1 = 0x08; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[src_lo], %[src], %[mask0] \n\t" + + "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t" + "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t" + "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" + "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" + "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" + "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" + "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" + "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" + "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t" + "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t" + "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" + "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" + "paddw %[dest0], %[tmp0], %[tmp1] \n\t" + "psraw %[dest0], %[dest0], %[shift0] \n\t" + + "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t" + "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t" + "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" + "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" + "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" + "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" + "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" + "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" + "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t" + "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t" + "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" + "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" + "paddw %[dest1], %[tmp0], %[tmp1] \n\t" + "psraw %[dest1], %[dest1], %[shift0] \n\t" + + "punpckhbh %[src_hi], %[src], %[mask0] \n\t" + + "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t" + "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t" + "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" + "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" + "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" + "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" + "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" + "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" + "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t" + "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t" + "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" + "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" + "paddw %[dest2], %[tmp0], %[tmp1] \n\t" + "psraw %[dest2], %[dest2], %[shift0] \n\t" + + "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t" + "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t" + "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" + "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" + "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" + "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" + "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" + "psrah 
%[matrix_hi], %[matrix_hi], %[shift1] \n\t" + "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t" + "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t" + "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" + "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" + "paddw %[dest3], %[tmp0], %[tmp1] \n\t" + "psraw %[dest3], %[dest3], %[shift0] \n\t" + + "packsswh %[tmp0], %[dest0], %[dest1] \n\t" + "packsswh %[tmp1], %[dest2], %[dest3] \n\t" + "packushb %[dest], %[tmp0], %[tmp1] \n\t" + + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), + [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), + [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), + [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest), + [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [matrix_hi] "=&f"(matrix_hi), + [matrix_lo] "=&f"(matrix_lo), [matrix] "=&f"(matrix) + : [src_ptr] "r"(src_argb), [matrix_ptr] "r"(matrix_argb), + [dst_ptr] "r"(dst_argb), [width] "r"(width), [shift0] "f"(shift0), + [shift1] "f"(shift1), [mask0] "f"(mask0), [mask1] "f"(mask1) + : "memory"); +} + +void ARGBShadeRow_MMI(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi; + const uint64_t shift = 0x08; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" + "punpcklbh %[src_lo], %[src], %[src] \n\t" + "punpckhbh %[src_hi], %[src], %[src] \n\t" + + "punpcklbh %[value], %[value], %[value] \n\t" + + "pmulhuh %[dest_lo], %[src_lo], %[value] \n\t" + "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" + "pmulhuh %[dest_hi], %[src_hi], %[value] \n\t" + "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" + "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" + + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), + [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src] "=&f"(src), + [dest] "=&f"(dest) + : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width), + [value] "f"(value), [shift] "f"(shift) + : "memory"); +} + +void ARGBMultiplyRow_MMI(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + uint64_t src0, src0_hi, src0_lo, src1, src1_hi, src1_lo; + uint64_t dest, dest_lo, dest_hi; + const uint64_t mask = 0x0; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" + "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" + "punpcklbh %[src0_lo], %[src0], %[src0] \n\t" + "punpckhbh %[src0_hi], %[src0], %[src0] \n\t" + + "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" + "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" + "punpcklbh %[src1_lo], %[src1], %[mask] \n\t" + "punpckhbh %[src1_hi], %[src1], %[mask] \n\t" + + "pmulhuh %[dest_lo], %[src0_lo], %[src1_lo] \n\t" + "pmulhuh %[dest_hi], %[src0_hi], %[src1_hi] \n\t" + "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" + + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" + "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + 
"daddi %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), + [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), + [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0), + [src1] "=&f"(src1), [dest] "=&f"(dest) + : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask) + : "memory"); +} + +void ARGBAddRow_MMI(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + uint64_t src0, src1, dest; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" + "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" + "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" + "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" + "paddusb %[dest], %[src0], %[src1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" + "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) + : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + [dst_ptr] "r"(dst_argb), [width] "r"(width) + : "memory"); +} + +void ARGBSubtractRow_MMI(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + uint64_t src0, src1, dest; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" + "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" + "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" + "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" + "psubusb %[dest], %[src0], %[src1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" + "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) + : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + [dst_ptr] "r"(dst_argb), [width] "r"(width) + : "memory"); +} + +// Sobel functions which mimics SSSE3. 
+void SobelXRow_MMI(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + uint64_t y00 = 0, y10 = 0, y20 = 0; + uint64_t y02 = 0, y12 = 0, y22 = 0; + uint64_t zero = 0x0; + uint64_t sobel = 0x0; + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i] + "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t" + "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // a_sub=src_y0[i+2] + "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t" + + "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // b=src_y1[i] + "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t" + "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // b_sub=src_y1[i+2] + "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t" + + "gsldlc1 %[y20], 0x07(%[src_y2]) \n\t" // c=src_y2[i] + "gsldrc1 %[y20], 0x00(%[src_y2]) \n\t" + "gsldlc1 %[y22], 0x09(%[src_y2]) \n\t" // c_sub=src_y2[i+2] + "gsldrc1 %[y22], 0x02(%[src_y2]) \n\t" + + "punpcklbh %[y00], %[y00], %[zero] \n\t" + "punpcklbh %[y10], %[y10], %[zero] \n\t" + "punpcklbh %[y20], %[y20], %[zero] \n\t" + + "punpcklbh %[y02], %[y02], %[zero] \n\t" + "punpcklbh %[y12], %[y12], %[zero] \n\t" + "punpcklbh %[y22], %[y22], %[zero] \n\t" + + "paddh %[y00], %[y00], %[y10] \n\t" // a+b + "paddh %[y20], %[y20], %[y10] \n\t" // c+b + "paddh %[y00], %[y00], %[y20] \n\t" // a+2b+c + + "paddh %[y02], %[y02], %[y12] \n\t" // a_sub+b_sub + "paddh %[y22], %[y22], %[y12] \n\t" // c_sub+b_sub + "paddh %[y02], %[y02], %[y22] \n\t" // a_sub+2b_sub+c_sub + + "pmaxsh %[y10], %[y00], %[y02] \n\t" + "pminsh %[y20], %[y00], %[y02] \n\t" + "psubh %[sobel], %[y10], %[y20] \n\t" // Abs + + "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t" + "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t" + "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t" + "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t" + + "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t" + "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t" + "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t" + "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t" + + "gsldlc1 %[y20], 0x0B(%[src_y2]) \n\t" + "gsldrc1 %[y20], 0x04(%[src_y2]) \n\t" + "gsldlc1 %[y22], 0x0D(%[src_y2]) \n\t" + "gsldrc1 %[y22], 0x06(%[src_y2]) \n\t" + + "punpcklbh %[y00], %[y00], %[zero] \n\t" + "punpcklbh %[y10], %[y10], %[zero] \n\t" + "punpcklbh %[y20], %[y20], %[zero] \n\t" + + "punpcklbh %[y02], %[y02], %[zero] \n\t" + "punpcklbh %[y12], %[y12], %[zero] \n\t" + "punpcklbh %[y22], %[y22], %[zero] \n\t" + + "paddh %[y00], %[y00], %[y10] \n\t" + "paddh %[y20], %[y20], %[y10] \n\t" + "paddh %[y00], %[y00], %[y20] \n\t" + + "paddh %[y02], %[y02], %[y12] \n\t" + "paddh %[y22], %[y22], %[y12] \n\t" + "paddh %[y02], %[y02], %[y22] \n\t" + + "pmaxsh %[y10], %[y00], %[y02] \n\t" + "pminsh %[y20], %[y00], %[y02] \n\t" + "psubh %[y00], %[y10], %[y20] \n\t" + + "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255 + "gssdrc1 %[sobel], 0(%[dst_sobelx]) \n\t" + "gssdlc1 %[sobel], 7(%[dst_sobelx]) \n\t" + + "daddiu %[src_y0], %[src_y0], 8 \n\t" + "daddiu %[src_y1], %[src_y1], 8 \n\t" + "daddiu %[src_y2], %[src_y2], 8 \n\t" + "daddiu %[dst_sobelx], %[dst_sobelx], 8 \n\t" + "daddiu %[width], %[width], -8 \n\t" + "bgtz %[width], 1b \n\t" + "nop \n\t" + : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y10] "=&f"(y10), + [y20] "=&f"(y20), [y02] "=&f"(y02), [y12] "=&f"(y12), [y22] "=&f"(y22) + : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), [src_y2] "r"(src_y2), + [dst_sobelx] "r"(dst_sobelx), [width] "r"(width), [zero] "f"(zero) + : "memory"); +} + +void SobelYRow_MMI(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + uint64_t y00 = 0, y01 = 0, y02 = 0; + uint64_t y10 = 0, y11 
= 0, y12 = 0; + uint64_t zero = 0x0; + uint64_t sobel = 0x0; + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i] + "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t" + "gsldlc1 %[y01], 0x08(%[src_y0]) \n\t" // b=src_y0[i+1] + "gsldrc1 %[y01], 0x01(%[src_y0]) \n\t" + "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // c=src_y0[i+2] + "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t" + + "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // a_sub=src_y1[i] + "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t" + "gsldlc1 %[y11], 0x08(%[src_y1]) \n\t" // b_sub=src_y1[i+1] + "gsldrc1 %[y11], 0x01(%[src_y1]) \n\t" + "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // c_sub=src_y1[i+2] + "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t" + + "punpcklbh %[y00], %[y00], %[zero] \n\t" + "punpcklbh %[y01], %[y01], %[zero] \n\t" + "punpcklbh %[y02], %[y02], %[zero] \n\t" + + "punpcklbh %[y10], %[y10], %[zero] \n\t" + "punpcklbh %[y11], %[y11], %[zero] \n\t" + "punpcklbh %[y12], %[y12], %[zero] \n\t" + + "paddh %[y00], %[y00], %[y01] \n\t" // a+b + "paddh %[y02], %[y02], %[y01] \n\t" // c+b + "paddh %[y00], %[y00], %[y02] \n\t" // a+2b+c + + "paddh %[y10], %[y10], %[y11] \n\t" // a_sub+b_sub + "paddh %[y12], %[y12], %[y11] \n\t" // c_sub+b_sub + "paddh %[y10], %[y10], %[y12] \n\t" // a_sub+2b_sub+c_sub + + "pmaxsh %[y02], %[y00], %[y10] \n\t" + "pminsh %[y12], %[y00], %[y10] \n\t" + "psubh %[sobel], %[y02], %[y12] \n\t" // Abs + + "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t" + "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t" + "gsldlc1 %[y01], 0x0C(%[src_y0]) \n\t" + "gsldrc1 %[y01], 0x05(%[src_y0]) \n\t" + "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t" + "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t" + + "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t" + "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t" + "gsldlc1 %[y11], 0x0C(%[src_y1]) \n\t" + "gsldrc1 %[y11], 0x05(%[src_y1]) \n\t" + "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t" + "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t" + + "punpcklbh %[y00], %[y00], %[zero] \n\t" + "punpcklbh %[y01], %[y01], %[zero] \n\t" + "punpcklbh %[y02], %[y02], %[zero] \n\t" + + "punpcklbh %[y10], %[y10], %[zero] \n\t" + "punpcklbh %[y11], %[y11], %[zero] \n\t" + "punpcklbh %[y12], %[y12], %[zero] \n\t" + + "paddh %[y00], %[y00], %[y01] \n\t" + "paddh %[y02], %[y02], %[y01] \n\t" + "paddh %[y00], %[y00], %[y02] \n\t" + + "paddh %[y10], %[y10], %[y11] \n\t" + "paddh %[y12], %[y12], %[y11] \n\t" + "paddh %[y10], %[y10], %[y12] \n\t" + + "pmaxsh %[y02], %[y00], %[y10] \n\t" + "pminsh %[y12], %[y00], %[y10] \n\t" + "psubh %[y00], %[y02], %[y12] \n\t" + + "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255 + "gssdrc1 %[sobel], 0(%[dst_sobely]) \n\t" + "gssdlc1 %[sobel], 7(%[dst_sobely]) \n\t" + + "daddiu %[src_y0], %[src_y0], 8 \n\t" + "daddiu %[src_y1], %[src_y1], 8 \n\t" + "daddiu %[dst_sobely], %[dst_sobely], 8 \n\t" + "daddiu %[width], %[width], -8 \n\t" + "bgtz %[width], 1b \n\t" + "nop \n\t" + : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y01] "=&f"(y01), + [y02] "=&f"(y02), [y10] "=&f"(y10), [y11] "=&f"(y11), [y12] "=&f"(y12) + : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), + [dst_sobely] "r"(dst_sobely), [width] "r"(width), [zero] "f"(zero) + : "memory"); +} + +void SobelRow_MMI(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + double temp[3]; + uint64_t c1 = 0xff000000ff000000; + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[t0], 0x07(%[src_sobelx]) \n\t" // a=src_sobelx[i] + "gsldrc1 %[t0], 0x00(%[src_sobelx]) \n\t" + "gsldlc1 %[t1], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i] + "gsldrc1 %[t1], 0x00(%[src_sobely]) \n\t" + // s7 s6 s5 
s4 s3 s2 s1 s0 = a+b + "paddusb %[t2] , %[t0], %[t1] \n\t" + + // s3 s2 s1 s0->s3 s3 s2 s2 s1 s1 s0 s0 + "punpcklbh %[t0], %[t2], %[t2] \n\t" + + // s1 s1 s0 s0->s1 s2 s1 s1 s0 s0 s0 s0 + "punpcklbh %[t1], %[t0], %[t0] \n\t" + "or %[t1], %[t1], %[c1] \n\t" + // 255 s1 s1 s1 s55 s0 s0 s0 + "gssdrc1 %[t1], 0x00(%[dst_argb]) \n\t" + "gssdlc1 %[t1], 0x07(%[dst_argb]) \n\t" + + // s3 s3 s2 s2->s3 s3 s3 s3 s2 s2 s2 s2 + "punpckhbh %[t1], %[t0], %[t0] \n\t" + "or %[t1], %[t1], %[c1] \n\t" + // 255 s3 s3 s3 255 s2 s2 s2 + "gssdrc1 %[t1], 0x08(%[dst_argb]) \n\t" + "gssdlc1 %[t1], 0x0f(%[dst_argb]) \n\t" + + // s7 s6 s5 s4->s7 s7 s6 s6 s5 s5 s4 s4 + "punpckhbh %[t0], %[t2], %[t2] \n\t" + + // s5 s5 s4 s4->s5 s5 s5 s5 s4 s4 s4 s4 + "punpcklbh %[t1], %[t0], %[t0] \n\t" + "or %[t1], %[t1], %[c1] \n\t" + "gssdrc1 %[t1], 0x10(%[dst_argb]) \n\t" + "gssdlc1 %[t1], 0x17(%[dst_argb]) \n\t" + + // s7 s7 s6 s6->s7 s7 s7 s7 s6 s6 s6 s6 + "punpckhbh %[t1], %[t0], %[t0] \n\t" + "or %[t1], %[t1], %[c1] \n\t" + "gssdrc1 %[t1], 0x18(%[dst_argb]) \n\t" + "gssdlc1 %[t1], 0x1f(%[dst_argb]) \n\t" + + "daddiu %[dst_argb], %[dst_argb], 32 \n\t" + "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" + "daddiu %[src_sobely], %[src_sobely], 8 \n\t" + "daddiu %[width], %[width], -8 \n\t" + "bgtz %[width], 1b \n\t" + "nop \n\t" + : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]) + : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely), + [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1) + : "memory"); +} + +void SobelToPlaneRow_MMI(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + uint64_t tr = 0; + uint64_t tb = 0; + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[tr], 0x0(%[src_sobelx]) \n\t" + "gsldlc1 %[tr], 0x7(%[src_sobelx]) \n\t" // r=src_sobelx[i] + "gsldrc1 %[tb], 0x0(%[src_sobely]) \n\t" + "gsldlc1 %[tb], 0x7(%[src_sobely]) \n\t" // b=src_sobely[i] + "paddusb %[tr], %[tr], %[tb] \n\t" // g + "gssdrc1 %[tr], 0x0(%[dst_y]) \n\t" + "gssdlc1 %[tr], 0x7(%[dst_y]) \n\t" + + "daddiu %[dst_y], %[dst_y], 8 \n\t" + "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" + "daddiu %[src_sobely], %[src_sobely], 8 \n\t" + "daddiu %[width], %[width], -8 \n\t" + "bgtz %[width], 1b \n\t" + "nop \n\t" + : [tr] "=&f"(tr), [tb] "=&f"(tb) + : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely), + [dst_y] "r"(dst_y), [width] "r"(width) + : "memory"); +} + +void SobelXYRow_MMI(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + uint64_t temp[3]; + uint64_t result = 0; + uint64_t gb = 0; + uint64_t cr = 0; + uint64_t c1 = 0xffffffffffffffff; + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[tr], 0x07(%[src_sobelx]) \n\t" // r=src_sobelx[i] + "gsldrc1 %[tr], 0x00(%[src_sobelx]) \n\t" + "gsldlc1 %[tb], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i] + "gsldrc1 %[tb], 0x00(%[src_sobely]) \n\t" + "paddusb %[tg] , %[tr], %[tb] \n\t" // g + + // g3 b3 g2 b2 g1 b1 g0 b0 + "punpcklbh %[gb], %[tb], %[tg] \n\t" + // c3 r3 r2 r2 c1 r1 c0 r0 + "punpcklbh %[cr], %[tr], %[c1] \n\t" + // c1 r1 g1 b1 c0 r0 g0 b0 + "punpcklhw %[result], %[gb], %[cr] \n\t" + "gssdrc1 %[result], 0x00(%[dst_argb]) \n\t" + "gssdlc1 %[result], 0x07(%[dst_argb]) \n\t" + // c3 r3 g3 b3 c2 r2 g2 b2 + "punpckhhw %[result], %[gb], %[cr] \n\t" + "gssdrc1 %[result], 0x08(%[dst_argb]) \n\t" + "gssdlc1 %[result], 0x0f(%[dst_argb]) \n\t" + + // g7 b7 g6 b6 g5 b5 g4 b4 + "punpckhbh %[gb], %[tb], %[tg] \n\t" + // c7 r7 c6 r6 c5 r5 c4 r4 + "punpckhbh %[cr], %[tr], %[c1] \n\t" + // c5 r5 g5 b5 c4 r4 g4 
b4 + "punpcklhw %[result], %[gb], %[cr] \n\t" + "gssdrc1 %[result], 0x10(%[dst_argb]) \n\t" + "gssdlc1 %[result], 0x17(%[dst_argb]) \n\t" + // c7 r7 g7 b7 c6 r6 g6 b6 + "punpckhhw %[result], %[gb], %[cr] \n\t" + "gssdrc1 %[result], 0x18(%[dst_argb]) \n\t" + "gssdlc1 %[result], 0x1f(%[dst_argb]) \n\t" + + "daddiu %[dst_argb], %[dst_argb], 32 \n\t" + "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" + "daddiu %[src_sobely], %[src_sobely], 8 \n\t" + "daddiu %[width], %[width], -8 \n\t" + "bgtz %[width], 1b \n\t" + "nop \n\t" + : [tr] "=&f"(temp[0]), [tb] "=&f"(temp[1]), [tg] "=&f"(temp[2]), + [gb] "=&f"(gb), [cr] "=&f"(cr), [result] "=&f"(result) + : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely), + [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1) + : "memory"); +} + +void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) { + // Copy a Y to RGB. + uint64_t src, dest; + const uint64_t mask0 = 0x00ffffff00ffffffULL; + const uint64_t mask1 = ~mask0; + + __asm__ volatile( + "1: \n\t" + "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t" + "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t" + "punpcklbh %[src], %[src], %[src] \n\t" + "punpcklhw %[dest], %[src], %[src] \n\t" + "and %[dest], %[dest], %[mask0] \n\t" + "or %[dest], %[dest], %[mask1] \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + + "punpckhhw %[dest], %[src], %[src] \n\t" + "and %[dest], %[dest], %[mask0] \n\t" + "or %[dest], %[dest], %[mask1] \n\t" + "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" + "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src), [dest] "=&f"(dest) + : [src_ptr] "r"(src_y), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), + [mask1] "f"(mask1), [width] "r"(width) + : "memory"); +} + +void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, int width) { + uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi; + const uint64_t mask0 = 0x0; + const uint64_t mask1 = 0x55; + const uint64_t mask2 = 0xAA; + const uint64_t mask3 = 0xFF; + const uint64_t mask4 = 0x4A354A354A354A35ULL; + const uint64_t mask5 = 0x0488048804880488ULL; + const uint64_t shift0 = 0x08; + const uint64_t shift1 = 0x06; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" + "punpcklbh %[src_lo], %[src], %[mask0] \n\t" + "punpckhbh %[src_hi], %[src], %[mask0] \n\t" + + "pshufh %[src], %[src_lo], %[mask0] \n\t" + "psllh %[dest_lo], %[src], %[shift0] \n\t" + "paddush %[dest_lo], %[dest_lo], %[src] \n\t" + "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" + "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" + "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" + "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" + "pshufh %[src], %[src_lo], %[mask1] \n\t" + "psllh %[dest_hi], %[src], %[shift0] \n\t" + "paddush %[dest_hi], %[dest_hi], %[src] \n\t" + "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" + "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" + "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" + "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" + "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "pshufh %[src], %[src_lo], %[mask2] \n\t" + "psllh %[dest_lo], %[src], %[shift0] \n\t" + "paddush %[dest_lo], %[dest_lo], %[src] \n\t" + "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" + "psubh %[dest_lo], 
%[dest_lo], %[mask5] \n\t" + "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" + "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" + "pshufh %[src], %[src_lo], %[mask3] \n\t" + "psllh %[dest_hi], %[src], %[shift0] \n\t" + "paddush %[dest_hi], %[dest_hi], %[src] \n\t" + "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" + "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" + "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" + "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" + "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" + "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" + + "pshufh %[src], %[src_hi], %[mask0] \n\t" + "psllh %[dest_lo], %[src], %[shift0] \n\t" + "paddush %[dest_lo], %[dest_lo], %[src] \n\t" + "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" + "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" + "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" + "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" + "pshufh %[src], %[src_hi], %[mask1] \n\t" + "psllh %[dest_hi], %[src], %[shift0] \n\t" + "paddush %[dest_hi], %[dest_hi], %[src] \n\t" + "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" + "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" + "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" + "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" + "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" + "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t" + + "pshufh %[src], %[src_hi], %[mask2] \n\t" + "psllh %[dest_lo], %[src], %[shift0] \n\t" + "paddush %[dest_lo], %[dest_lo], %[src] \n\t" + "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" + "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" + "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" + "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" + "pshufh %[src], %[src_hi], %[mask3] \n\t" + "psllh %[dest_hi], %[src], %[shift0] \n\t" + "paddush %[dest_hi], %[dest_hi], %[src] \n\t" + "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" + "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" + "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" + "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" + "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" + "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t" + + "daddi %[src_ptr], %[src_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), + [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi), + [dest_lo] "=&f"(dest_lo) + : [src_ptr] "r"(src_y), [dst_ptr] "r"(rgb_buf), [mask0] "f"(mask0), + [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3), + [mask4] "f"(mask4), [mask5] "f"(mask5), [shift0] "f"(shift0), + [shift1] "f"(shift1), [width] "r"(width) + : "memory"); +} + +void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) { + uint64_t source, src0, src1, dest; + const uint64_t mask0 = 0x0; + const uint64_t mask1 = 0x1b; + + src += width - 1; + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[source], 0(%[src_ptr]) \n\t" + "gsldrc1 %[source], -7(%[src_ptr]) \n\t" + "punpcklbh %[src0], %[source], %[mask0] \n\t" + "pshufh %[src0], %[src0], %[mask1] \n\t" + "punpckhbh %[src1], %[source], %[mask0] \n\t" + "pshufh %[src1], %[src1], %[mask1] \n\t" + "packushb %[dest], %[src1], %[src0] \n\t" + + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddi %[src_ptr], %[src_ptr], -0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" 
+ : [source] "=&f"(source), [dest] "=&f"(dest), [src0] "=&f"(src0), + [src1] "=&f"(src1) + : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), + [mask1] "f"(mask1), [width] "r"(width) + : "memory"); +} + +void MirrorUVRow_MMI(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + uint64_t src0, src1, dest0, dest1; + const uint64_t mask0 = 0x00ff00ff00ff00ffULL; + const uint64_t mask1 = 0x1b; + const uint64_t shift = 0x08; + + src_uv += (width - 1) << 1; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src0], 1(%[src_ptr]) \n\t" + "gsldrc1 %[src0], -6(%[src_ptr]) \n\t" + "gsldlc1 %[src1], -7(%[src_ptr]) \n\t" + "gsldrc1 %[src1], -14(%[src_ptr]) \n\t" + + "and %[dest0], %[src0], %[mask0] \n\t" + "pshufh %[dest0], %[dest0], %[mask1] \n\t" + "and %[dest1], %[src1], %[mask0] \n\t" + "pshufh %[dest1], %[dest1], %[mask1] \n\t" + "packushb %[dest0], %[dest0], %[dest1] \n\t" + "gssdlc1 %[dest0], 0x07(%[dstu_ptr]) \n\t" + "gssdrc1 %[dest0], 0x00(%[dstu_ptr]) \n\t" + + "psrlh %[dest0], %[src0], %[shift] \n\t" + "pshufh %[dest0], %[dest0], %[mask1] \n\t" + "psrlh %[dest1], %[src1], %[shift] \n\t" + "pshufh %[dest1], %[dest1], %[mask1] \n\t" + "packushb %[dest0], %[dest0], %[dest1] \n\t" + "gssdlc1 %[dest0], 0x07(%[dstv_ptr]) \n\t" + "gssdrc1 %[dest0], 0x00(%[dstv_ptr]) \n\t" + + "daddi %[src_ptr], %[src_ptr], -0x10 \n\t" + "daddiu %[dstu_ptr], %[dstu_ptr], 0x08 \n\t" + "daddiu %[dstv_ptr], %[dstv_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0), + [src1] "=&f"(src1) + : [src_ptr] "r"(src_uv), [dstu_ptr] "r"(dst_u), [dstv_ptr] "r"(dst_v), + [width] "r"(width), [mask0] "f"(mask0), [mask1] "f"(mask1), + [shift] "f"(shift) + : "memory"); +} + +void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) { + src += (width - 1) * 4; + uint64_t temp = 0x0; + uint64_t shuff = 0x4e; // 01 00 11 10 + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[temp], 3(%[src]) \n\t" + "gsldrc1 %[temp], -4(%[src]) \n\t" + "pshufh %[temp], %[temp], %[shuff] \n\t" + "gssdrc1 %[temp], 0x0(%[dst]) \n\t" + "gssdlc1 %[temp], 0x7(%[dst]) \n\t" + + "daddiu %[src], %[src], -0x08 \n\t" + "daddiu %[dst], %[dst], 0x08 \n\t" + "daddiu %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [temp] "=&f"(temp) + : [src] "r"(src), [dst] "r"(dst), [width] "r"(width), [shuff] "f"(shuff) + : "memory"); +} + +void SplitUVRow_MMI(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + uint64_t c0 = 0x00ff00ff00ff00ff; + uint64_t temp[4]; + uint64_t shift = 0x08; + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[t0], 0x00(%[src_uv]) \n\t" + "gsldlc1 %[t0], 0x07(%[src_uv]) \n\t" + "gsldrc1 %[t1], 0x08(%[src_uv]) \n\t" + "gsldlc1 %[t1], 0x0f(%[src_uv]) \n\t" + + "and %[t2], %[t0], %[c0] \n\t" + "and %[t3], %[t1], %[c0] \n\t" + "packushb %[t2], %[t2], %[t3] \n\t" + "gssdrc1 %[t2], 0x0(%[dst_u]) \n\t" + "gssdlc1 %[t2], 0x7(%[dst_u]) \n\t" + + "psrlh %[t2], %[t0], %[shift] \n\t" + "psrlh %[t3], %[t1], %[shift] \n\t" + "packushb %[t2], %[t2], %[t3] \n\t" + "gssdrc1 %[t2], 0x0(%[dst_v]) \n\t" + "gssdlc1 %[t2], 0x7(%[dst_v]) \n\t" + + "daddiu %[src_uv], %[src_uv], 16 \n\t" + "daddiu %[dst_u], %[dst_u], 8 \n\t" + "daddiu %[dst_v], %[dst_v], 8 \n\t" + "daddiu %[width], %[width], -8 \n\t" + "bgtz %[width], 1b \n\t" + "nop \n\t" + : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), + [t3] "=&f"(temp[3]) + : [src_uv] "r"(src_uv), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), + [width] "r"(width), 
[c0] "f"(c0), [shift] "f"(shift) + : "memory"); +} + +void MergeUVRow_MMI(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + uint64_t temp[3]; + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[t0], 0x0(%[src_u]) \n\t" + "gsldlc1 %[t0], 0x7(%[src_u]) \n\t" + "gsldrc1 %[t1], 0x0(%[src_v]) \n\t" + "gsldlc1 %[t1], 0x7(%[src_v]) \n\t" + "punpcklbh %[t2], %[t0], %[t1] \n\t" + "gssdrc1 %[t2], 0x0(%[dst_uv]) \n\t" + "gssdlc1 %[t2], 0x7(%[dst_uv]) \n\t" + "punpckhbh %[t2], %[t0], %[t1] \n\t" + "gssdrc1 %[t2], 0x8(%[dst_uv]) \n\t" + "gssdlc1 %[t2], 0xf(%[dst_uv]) \n\t" + + "daddiu %[src_u], %[src_u], 8 \n\t" + "daddiu %[src_v], %[src_v], 8 \n\t" + "daddiu %[dst_uv], %[dst_uv], 16 \n\t" + "daddiu %[width], %[width], -8 \n\t" + "bgtz %[width], 1b \n\t" + "nop \n\t" + : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]) + : [dst_uv] "r"(dst_uv), [src_u] "r"(src_u), [src_v] "r"(src_v), + [width] "r"(width) + : "memory"); +} + +void SplitRGBRow_MMI(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + uint64_t src[4]; + uint64_t dest_hi, dest_lo, dest; + + __asm__ volatile( + "1: \n\t" + "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" + "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" + "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" + "punpcklbh %[dest_lo], %[src0], %[src1] \n\t" + "gslwlc1 %[src2], 0x09(%[src_ptr]) \n\t" + "gslwrc1 %[src2], 0x06(%[src_ptr]) \n\t" + "gslwlc1 %[src3], 0x0c(%[src_ptr]) \n\t" + "gslwrc1 %[src3], 0x09(%[src_ptr]) \n\t" + "punpcklbh %[dest_hi], %[src2], %[src3] \n\t" + + "punpcklhw %[dest], %[dest_lo], %[dest_hi] \n\t" + "gsswlc1 %[dest], 0x03(%[dstr_ptr]) \n\t" + "gsswrc1 %[dest], 0x00(%[dstr_ptr]) \n\t" + "punpckhwd %[dest], %[dest], %[dest] \n\t" + "gsswlc1 %[dest], 0x03(%[dstg_ptr]) \n\t" + "gsswrc1 %[dest], 0x00(%[dstg_ptr]) \n\t" + "punpckhhw %[dest], %[dest_lo], %[dest_hi] \n\t" + "gsswlc1 %[dest], 0x03(%[dstb_ptr]) \n\t" + "gsswrc1 %[dest], 0x00(%[dstb_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" + "daddiu %[dstr_ptr], %[dstr_ptr], 0x04 \n\t" + "daddiu %[dstg_ptr], %[dstg_ptr], 0x04 \n\t" + "daddiu %[dstb_ptr], %[dstb_ptr], 0x04 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src[0]), [src1] "=&f"(src[1]), [src2] "=&f"(src[2]), + [src3] "=&f"(src[3]), [dest_hi] "=&f"(dest_hi), + [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) + : [src_ptr] "r"(src_rgb), [dstr_ptr] "r"(dst_r), [dstg_ptr] "r"(dst_g), + [dstb_ptr] "r"(dst_b), [width] "r"(width) + : "memory"); +} + +void MergeRGBRow_MMI(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + uint64_t srcr, srcg, srcb, dest; + uint64_t srcrg_hi, srcrg_lo, srcbz_hi, srcbz_lo; + const uint64_t temp = 0x0; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[srcr], 0x07(%[srcr_ptr]) \n\t" + "gsldrc1 %[srcr], 0x00(%[srcr_ptr]) \n\t" + "gsldlc1 %[srcg], 0x07(%[srcg_ptr]) \n\t" + "gsldrc1 %[srcg], 0x00(%[srcg_ptr]) \n\t" + "punpcklbh %[srcrg_lo], %[srcr], %[srcg] \n\t" + "punpckhbh %[srcrg_hi], %[srcr], %[srcg] \n\t" + + "gsldlc1 %[srcb], 0x07(%[srcb_ptr]) \n\t" + "gsldrc1 %[srcb], 0x00(%[srcb_ptr]) \n\t" + "punpcklbh %[srcbz_lo], %[srcb], %[temp] \n\t" + "punpckhbh %[srcbz_hi], %[srcb], %[temp] \n\t" + + "punpcklhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t" + "gsswlc1 %[dest], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + "punpckhwd %[dest], %[dest], %[dest] \n\t" + "gsswlc1 %[dest], 0x06(%[dst_ptr]) \n\t" + 
"gsswrc1 %[dest], 0x03(%[dst_ptr]) \n\t" + "punpckhhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t" + "gsswlc1 %[dest], 0x09(%[dst_ptr]) \n\t" + "gsswrc1 %[dest], 0x06(%[dst_ptr]) \n\t" + "punpckhwd %[dest], %[dest], %[dest] \n\t" + "gsswlc1 %[dest], 0x0c(%[dst_ptr]) \n\t" + "gsswrc1 %[dest], 0x09(%[dst_ptr]) \n\t" + "punpcklhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t" + "gsswlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" + "gsswrc1 %[dest], 0x0c(%[dst_ptr]) \n\t" + "punpckhwd %[dest], %[dest], %[dest] \n\t" + "gsswlc1 %[dest], 0x12(%[dst_ptr]) \n\t" + "gsswrc1 %[dest], 0x0f(%[dst_ptr]) \n\t" + "punpckhhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t" + "gsswlc1 %[dest], 0x15(%[dst_ptr]) \n\t" + "gsswrc1 %[dest], 0x12(%[dst_ptr]) \n\t" + "punpckhwd %[dest], %[dest], %[dest] \n\t" + "gsswlc1 %[dest], 0x18(%[dst_ptr]) \n\t" + "gsswrc1 %[dest], 0x15(%[dst_ptr]) \n\t" + + "daddiu %[srcr_ptr], %[srcr_ptr], 0x08 \n\t" + "daddiu %[srcg_ptr], %[srcg_ptr], 0x08 \n\t" + "daddiu %[srcb_ptr], %[srcb_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x18 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [srcr] "=&f"(srcr), [srcg] "=&f"(srcg), [srcb] "=&f"(srcb), + [dest] "=&f"(dest), [srcrg_hi] "=&f"(srcrg_hi), + [srcrg_lo] "=&f"(srcrg_lo), [srcbz_hi] "=&f"(srcbz_hi), + [srcbz_lo] "=&f"(srcbz_lo) + : [srcr_ptr] "r"(src_r), [srcg_ptr] "r"(src_g), [srcb_ptr] "r"(src_b), + [dst_ptr] "r"(dst_rgb), [width] "r"(width), [temp] "f"(temp) + : "memory"); +} + +// Filter 2 rows of YUY2 UV's (422) into U and V (420). +void YUY2ToUVRow_MMI(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + uint64_t c0 = 0xff00ff00ff00ff00; + uint64_t c1 = 0x00ff00ff00ff00ff; + uint64_t temp[3]; + uint64_t data[4]; + uint64_t shift = 0x08; + uint64_t src_stride = 0x0; + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" + "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" + "daddu %[src_stride], %[src_yuy2], %[src_stride_yuy2] \n\t" + "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t" + "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t" + "pavgb %[t0], %[t0], %[t1] \n\t" + + "gsldrc1 %[t2], 0x08(%[src_yuy2]) \n\t" + "gsldlc1 %[t2], 0x0f(%[src_yuy2]) \n\t" + "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t" + "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t" + "pavgb %[t1], %[t2], %[t1] \n\t" + + "and %[t0], %[t0], %[c0] \n\t" + "and %[t1], %[t1], %[c0] \n\t" + "psrlh %[t0], %[t0], %[shift] \n\t" + "psrlh %[t1], %[t1], %[shift] \n\t" + "packushb %[t0], %[t0], %[t1] \n\t" + "mov.s %[t1], %[t0] \n\t" + "and %[d0], %[t0], %[c1] \n\t" + "psrlh %[d1], %[t1], %[shift] \n\t" + + "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t" + "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t" + "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t" + "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t" + "pavgb %[t0], %[t0], %[t1] \n\t" + + "gsldrc1 %[t2], 0x18(%[src_yuy2]) \n\t" + "gsldlc1 %[t2], 0x1f(%[src_yuy2]) \n\t" + "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t" + "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t" + "pavgb %[t1], %[t2], %[t1] \n\t" + + "and %[t0], %[t0], %[c0] \n\t" + "and %[t1], %[t1], %[c0] \n\t" + "psrlh %[t0], %[t0], %[shift] \n\t" + "psrlh %[t1], %[t1], %[shift] \n\t" + "packushb %[t0], %[t0], %[t1] \n\t" + "mov.s %[t1], %[t0] \n\t" + "and %[d2], %[t0], %[c1] \n\t" + "psrlh %[d3], %[t1], %[shift] \n\t" + + "packushb %[d0], %[d0], %[d2] \n\t" + "packushb %[d1], %[d1], %[d3] \n\t" + "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" + "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" + "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" + "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" + "daddiu %[src_yuy2], 
%[src_yuy2], 32 \n\t" + "daddiu %[dst_u], %[dst_u], 8 \n\t" + "daddiu %[dst_v], %[dst_v], 8 \n\t" + "daddiu %[width], %[width], -16 \n\t" + "bgtz %[width], 1b \n\t" + "nop \n\t" + : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), + [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), + [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride) + : [src_yuy2] "r"(src_yuy2), [src_stride_yuy2] "r"(src_stride_yuy2), + [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), + [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift) + : "memory"); +} + +// Copy row of YUY2 UV's (422) into U and V (422). +void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + uint64_t c0 = 0xff00ff00ff00ff00; + uint64_t c1 = 0x00ff00ff00ff00ff; + uint64_t temp[2]; + uint64_t data[4]; + uint64_t shift = 0x08; + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" + "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" + "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t" + "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t" + "and %[t0], %[t0], %[c0] \n\t" + "and %[t1], %[t1], %[c0] \n\t" + "psrlh %[t0], %[t0], %[shift] \n\t" + "psrlh %[t1], %[t1], %[shift] \n\t" + "packushb %[t0], %[t0], %[t1] \n\t" + "mov.s %[t1], %[t0] \n\t" + "and %[d0], %[t0], %[c1] \n\t" + "psrlh %[d1], %[t1], %[shift] \n\t" + + "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t" + "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t" + "gsldrc1 %[t1], 0x18(%[src_yuy2]) \n\t" + "gsldlc1 %[t1], 0x1f(%[src_yuy2]) \n\t" + "and %[t0], %[t0], %[c0] \n\t" + "and %[t1], %[t1], %[c0] \n\t" + "psrlh %[t0], %[t0], %[shift] \n\t" + "psrlh %[t1], %[t1], %[shift] \n\t" + "packushb %[t0], %[t0], %[t1] \n\t" + "mov.s %[t1], %[t0] \n\t" + "and %[d2], %[t0], %[c1] \n\t" + "psrlh %[d3], %[t1], %[shift] \n\t" + + "packushb %[d0], %[d0], %[d2] \n\t" + "packushb %[d1], %[d1], %[d3] \n\t" + "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" + "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" + "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" + "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" + "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t" + "daddiu %[dst_u], %[dst_u], 8 \n\t" + "daddiu %[dst_v], %[dst_v], 8 \n\t" + "daddiu %[width], %[width], -16 \n\t" + "bgtz %[width], 1b \n\t" + "nop \n\t" + : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]), + [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) + : [src_yuy2] "r"(src_yuy2), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), + [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift) + : "memory"); +} + +// Copy row of YUY2 Y's (422) into Y (420/422). +void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + uint64_t c0 = 0x00ff00ff00ff00ff; + uint64_t temp[2]; + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" + "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" + "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t" + "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t" + "and %[t0], %[t0], %[c0] \n\t" + "and %[t1], %[t1], %[c0] \n\t" + "packushb %[t0], %[t0], %[t1] \n\t" + "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t" + "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t" + "daddiu %[src_yuy2], %[src_yuy2], 16 \n\t" + "daddiu %[dst_y], %[dst_y], 8 \n\t" + "daddiu %[width], %[width], -8 \n\t" + "bgtz %[width], 1b \n\t" + "nop \n\t" + : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]) + : [src_yuy2] "r"(src_yuy2), [dst_y] "r"(dst_y), [width] "r"(width), + [c0] "f"(c0) + : "memory"); +} + +// Filter 2 rows of UYVY UV's (422) into U and V (420). 
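UYVYToUVRow_MMI below, like the YUY2 kernels above, operates on packed 4:2:2 data: YUY2 stores Y0 U Y1 V per two pixels, UYVY stores U Y0 V Y1, and the *ToUVRow variants average the chroma of two adjacent source rows (the pavgb step) to produce 4:2:0 output. A rough scalar sketch of that extraction, assuming the standard byte layouts; names are illustrative and this is not part of the patch:

#include <stdint.h>

/* For each pair of pixels, average the U and V bytes of this row and the
 * next row (rounded, as pavgb does) and write one U and one V sample. */
static void UYVYToUVRow_Scalar(const uint8_t* src_uyvy, int src_stride_uyvy,
                               uint8_t* dst_u, uint8_t* dst_v, int width) {
  const uint8_t* next = src_uyvy + src_stride_uyvy;
  for (int x = 0; x < width; x += 2) {
    dst_u[0] = (uint8_t)((src_uyvy[0] + next[0] + 1) >> 1);  /* U at byte 0 */
    dst_v[0] = (uint8_t)((src_uyvy[2] + next[2] + 1) >> 1);  /* V at byte 2 */
    src_uyvy += 4;
    next += 4;
    dst_u += 1;
    dst_v += 1;
  }
}

The YUY2 variants read the chroma from bytes 1 and 3 instead, and the UV422/Y row kernels skip the two-row average and simply demultiplex one row.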
+void UYVYToUVRow_MMI(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + // Output a row of UV values. + uint64_t c0 = 0x00ff00ff00ff00ff; + uint64_t temp[3]; + uint64_t data[4]; + uint64_t shift = 0x08; + uint64_t src_stride = 0x0; + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" + "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" + "daddu %[src_stride], %[src_uyvy], %[src_stride_uyvy] \n\t" + "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t" + "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t" + "pavgb %[t0], %[t0], %[t1] \n\t" + + "gsldrc1 %[t2], 0x08(%[src_uyvy]) \n\t" + "gsldlc1 %[t2], 0x0f(%[src_uyvy]) \n\t" + "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t" + "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t" + "pavgb %[t1], %[t2], %[t1] \n\t" + + "and %[t0], %[t0], %[c0] \n\t" + "and %[t1], %[t1], %[c0] \n\t" + "packushb %[t0], %[t0], %[t1] \n\t" + "mov.s %[t1], %[t0] \n\t" + "and %[d0], %[t0], %[c0] \n\t" + "psrlh %[d1], %[t1], %[shift] \n\t" + + "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t" + "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t" + "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t" + "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t" + "pavgb %[t0], %[t0], %[t1] \n\t" + + "gsldrc1 %[t2], 0x18(%[src_uyvy]) \n\t" + "gsldlc1 %[t2], 0x1f(%[src_uyvy]) \n\t" + "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t" + "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t" + "pavgb %[t1], %[t2], %[t1] \n\t" + + "and %[t0], %[t0], %[c0] \n\t" + "and %[t1], %[t1], %[c0] \n\t" + "packushb %[t0], %[t0], %[t1] \n\t" + "mov.s %[t1], %[t0] \n\t" + "and %[d2], %[t0], %[c0] \n\t" + "psrlh %[d3], %[t1], %[shift] \n\t" + + "packushb %[d0], %[d0], %[d2] \n\t" + "packushb %[d1], %[d1], %[d3] \n\t" + "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" + "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" + "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" + "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" + "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t" + "daddiu %[dst_u], %[dst_u], 8 \n\t" + "daddiu %[dst_v], %[dst_v], 8 \n\t" + "daddiu %[width], %[width], -16 \n\t" + "bgtz %[width], 1b \n\t" + "nop \n\t" + : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), + [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), + [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride) + : [src_uyvy] "r"(src_uyvy), [src_stride_uyvy] "r"(src_stride_uyvy), + [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), + [c0] "f"(c0), [shift] "f"(shift) + : "memory"); +} + +// Copy row of UYVY UV's (422) into U and V (422). +void UYVYToUV422Row_MMI(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + // Output a row of UV values. 
+ uint64_t c0 = 0x00ff00ff00ff00ff; + uint64_t temp[2]; + uint64_t data[4]; + uint64_t shift = 0x08; + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" + "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" + "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t" + "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t" + "and %[t0], %[t0], %[c0] \n\t" + "and %[t1], %[t1], %[c0] \n\t" + "packushb %[t0], %[t0], %[t1] \n\t" + "mov.s %[t1], %[t0] \n\t" + "and %[d0], %[t0], %[c0] \n\t" + "psrlh %[d1], %[t1], %[shift] \n\t" + + "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t" + "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t" + "gsldrc1 %[t1], 0x18(%[src_uyvy]) \n\t" + "gsldlc1 %[t1], 0x1f(%[src_uyvy]) \n\t" + "and %[t0], %[t0], %[c0] \n\t" + "and %[t1], %[t1], %[c0] \n\t" + "packushb %[t0], %[t0], %[t1] \n\t" + "mov.s %[t1], %[t0] \n\t" + "and %[d2], %[t0], %[c0] \n\t" + "psrlh %[d3], %[t1], %[shift] \n\t" + + "packushb %[d0], %[d0], %[d2] \n\t" + "packushb %[d1], %[d1], %[d3] \n\t" + "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" + "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" + "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" + "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" + "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t" + "daddiu %[dst_u], %[dst_u], 8 \n\t" + "daddiu %[dst_v], %[dst_v], 8 \n\t" + "daddiu %[width], %[width], -16 \n\t" + "bgtz %[width], 1b \n\t" + "nop \n\t" + : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]), + [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) + : [src_uyvy] "r"(src_uyvy), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), + [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift) + : "memory"); +} + +// Copy row of UYVY Y's (422) into Y (420/422). +void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + // Output a row of Y values. + uint64_t c0 = 0x00ff00ff00ff00ff; + uint64_t shift = 0x08; + uint64_t temp[2]; + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" + "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" + "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t" + "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t" + "dsrl %[t0], %[t0], %[shift] \n\t" + "dsrl %[t1], %[t1], %[shift] \n\t" + "and %[t0], %[t0], %[c0] \n\t" + "and %[t1], %[t1], %[c0] \n\t" + "and %[t1], %[t1], %[c0] \n\t" + "packushb %[t0], %[t0], %[t1] \n\t" + "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t" + "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t" + "daddiu %[src_uyvy], %[src_uyvy], 16 \n\t" + "daddiu %[dst_y], %[dst_y], 8 \n\t" + "daddiu %[width], %[width], -8 \n\t" + "bgtz %[width], 1b \n\t" + "nop \n\t" + : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]) + : [src_uyvy] "r"(src_uyvy), [dst_y] "r"(dst_y), [width] "r"(width), + [c0] "f"(c0), [shift] "f"(shift) + : "memory"); +} + +// Blend src_argb0 over src_argb1 and store to dst_argb. +// dst_argb may be src_argb0 or src_argb1. +// This code mimics the SSSE3 version for better testability. 
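ARGBBlendRow_MMI below performs an ordinary "source over" composite on non-premultiplied input. Read straight from the assembly: each foreground channel is added to the background channel scaled by (255 - foreground alpha) and shifted right by 8, the sum saturates at 255, and the output alpha is forced to opaque. A hedged scalar sketch of that formula; the clamp helper and names are illustrative and not part of the patch:

#include <stdint.h>

static inline uint8_t clamp255(int v) { return (uint8_t)(v > 255 ? 255 : v); }

/* Blend one row: dst = fg + bg * (255 - fg_alpha) / 256 per B/G/R channel,
 * saturated, with destination alpha set to 255. */
static void ARGBBlendRow_Scalar(const uint8_t* src_argb0,  /* foreground */
                                const uint8_t* src_argb1,  /* background */
                                uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int a = src_argb0[3];           /* alpha of the foreground pixel */
    for (int c = 0; c < 3; ++c) {   /* B, G, R */
      dst_argb[c] =
          clamp255(src_argb0[c] + ((src_argb1[c] * (255 - a)) >> 8));
    }
    dst_argb[3] = 255;              /* output is opaque */
    src_argb0 += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}

BlendPlaneRow_MMI, further down, does the same weighted mix on a single plane but with an explicit per-pixel alpha plane and rounding instead of the implicit ARGB alpha.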
+void ARGBBlendRow_MMI(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + uint64_t src0, src1, dest, alpha, src0_hi, src0_lo, src1_hi, src1_lo, dest_hi, + dest_lo; + const uint64_t mask0 = 0x0; + const uint64_t mask1 = 0x00FFFFFF00FFFFFFULL; + const uint64_t mask2 = 0x00FF00FF00FF00FFULL; + const uint64_t mask3 = 0xFF; + const uint64_t mask4 = ~mask1; + const uint64_t shift = 0x08; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" + "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" + "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t" + + "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" + "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" + "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t" + + "psubush %[alpha], %[mask2], %[src0_lo] \n\t" + "pshufh %[alpha], %[alpha], %[mask3] \n\t" + "pmullh %[dest_lo], %[src1_lo], %[alpha] \n\t" + "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" + "paddush %[dest_lo], %[dest_lo], %[src0_lo] \n\t" + + "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t" + "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t" + + "psubush %[alpha], %[mask2], %[src0_hi] \n\t" + "pshufh %[alpha], %[alpha], %[mask3] \n\t" + "pmullh %[dest_hi], %[src1_hi], %[alpha] \n\t" + "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" + "paddush %[dest_hi], %[dest_hi], %[src0_hi] \n\t" + + "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" + "and %[dest], %[dest], %[mask1] \n\t" + "or %[dest], %[dest], %[mask4] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" + "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [alpha] "=&f"(alpha), + [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), + [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), + [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo) + : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1), + [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4), + [shift] "f"(shift), [width] "r"(width) + : "memory"); +} + +void BlendPlaneRow_MMI(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + uint64_t source0, source1, dest, alph; + uint64_t src0_hi, src0_lo, src1_hi, src1_lo, alpha_hi, alpha_lo, dest_hi, + dest_lo; + uint64_t alpha_rev, alpha_rev_lo, alpha_rev_hi; + const uint64_t mask0 = 0x0; + const uint64_t mask1 = 0xFFFFFFFFFFFFFFFFULL; + const uint64_t mask2 = 0x00FF00FF00FF00FFULL; + const uint64_t shift = 0x08; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" + "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" + "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t" + "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t" + + "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" + "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" + "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t" + "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t" + + "gsldlc1 %[alpha], 0x07(%[alpha_ptr]) \n\t" + "gsldrc1 %[alpha], 0x00(%[alpha_ptr]) \n\t" + "psubusb %[alpha_r], %[mask1], %[alpha] \n\t" + "punpcklbh %[alpha_lo], %[alpha], %[mask0] \n\t" + "punpckhbh %[alpha_hi], %[alpha], %[mask0] \n\t" + "punpcklbh %[alpha_rlo], %[alpha_r], %[mask0] \n\t" + "punpckhbh %[alpha_rhi], %[alpha_r], %[mask0] \n\t" + + "pmullh %[dest_lo], %[src0_lo], %[alpha_lo] \n\t" + "pmullh %[dest], %[src1_lo], 
%[alpha_rlo] \n\t" + "paddush %[dest_lo], %[dest_lo], %[dest] \n\t" + "paddush %[dest_lo], %[dest_lo], %[mask2] \n\t" + "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" + + "pmullh %[dest_hi], %[src0_hi], %[alpha_hi] \n\t" + "pmullh %[dest], %[src1_hi], %[alpha_rhi] \n\t" + "paddush %[dest_hi], %[dest_hi], %[dest] \n\t" + "paddush %[dest_hi], %[dest_hi], %[mask2] \n\t" + "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" + + "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" + "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" + "daddiu %[alpha_ptr], %[alpha_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(source0), [src1] "=&f"(source1), [alpha] "=&f"(alph), + [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), + [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), + [alpha_hi] "=&f"(alpha_hi), [alpha_lo] "=&f"(alpha_lo), + [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), + [alpha_rlo] "=&f"(alpha_rev_lo), [alpha_rhi] "=&f"(alpha_rev_hi), + [alpha_r] "=&f"(alpha_rev) + : [src0_ptr] "r"(src0), [src1_ptr] "r"(src1), [alpha_ptr] "r"(alpha), + [dst_ptr] "r"(dst), [mask0] "f"(mask0), [mask1] "f"(mask1), + [mask2] "f"(mask2), [shift] "f"(shift), [width] "r"(width) + : "memory"); +} + +// Multiply source RGB by alpha and store to destination. +// This code mimics the SSSE3 version for better testability. +void ARGBAttenuateRow_MMI(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + uint64_t src, src_hi, src_lo, dest, dest_hi, dest_lo, alpha; + const uint64_t mask0 = 0xFF; + const uint64_t mask1 = 0xFF000000FF000000ULL; + const uint64_t mask2 = ~mask1; + const uint64_t shift = 0x08; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" + "punpcklbh %[src_lo], %[src], %[src] \n\t" + "punpckhbh %[src_hi], %[src], %[src] \n\t" + + "pshufh %[alpha], %[src_lo], %[mask0] \n\t" + "pmulhuh %[dest_lo], %[alpha], %[src_lo] \n\t" + "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" + "pshufh %[alpha], %[src_hi], %[mask0] \n\t" + "pmulhuh %[dest_hi], %[alpha], %[src_hi] \n\t" + "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" + + "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" + "and %[dest], %[dest], %[mask2] \n\t" + "and %[src], %[src], %[mask1] \n\t" + "or %[dest], %[dest], %[src] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), + [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi), + [dest_lo] "=&f"(dest_lo), [alpha] "=&f"(alpha) + : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), + [mask1] "f"(mask1), [mask2] "f"(mask2), [shift] "f"(shift), + [width] "r"(width) + : "memory"); +} + +void ComputeCumulativeSumRow_MMI(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width) { + int64_t row_sum[2] = {0, 0}; + uint64_t src, dest0, dest1, presrc0, presrc1, dest; + const uint64_t mask = 0x0; + + __asm__ volatile( + "xor %[row_sum0], %[row_sum0], %[row_sum0] \n\t" + "xor %[row_sum1], %[row_sum1], %[row_sum1] \n\t" + + "1: \n\t" + "gslwlc1 %[src], 0x03(%[row_ptr]) \n\t" + "gslwrc1 %[src], 
0x00(%[row_ptr]) \n\t" + + "punpcklbh %[src], %[src], %[mask] \n\t" + "punpcklhw %[dest0], %[src], %[mask] \n\t" + "punpckhhw %[dest1], %[src], %[mask] \n\t" + + "paddw %[row_sum0], %[row_sum0], %[dest0] \n\t" + "paddw %[row_sum1], %[row_sum1], %[dest1] \n\t" + + "gsldlc1 %[presrc0], 0x07(%[pre_ptr]) \n\t" + "gsldrc1 %[presrc0], 0x00(%[pre_ptr]) \n\t" + "gsldlc1 %[presrc1], 0x0f(%[pre_ptr]) \n\t" + "gsldrc1 %[presrc1], 0x08(%[pre_ptr]) \n\t" + + "paddw %[dest0], %[row_sum0], %[presrc0] \n\t" + "paddw %[dest1], %[row_sum1], %[presrc1] \n\t" + + "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" + "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" + + "daddiu %[row_ptr], %[row_ptr], 0x04 \n\t" + "daddiu %[pre_ptr], %[pre_ptr], 0x10 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x01 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0), + [dest1] "=&f"(dest1), [row_sum0] "+&f"(row_sum[0]), + [row_sum1] "+&f"(row_sum[1]), [presrc0] "=&f"(presrc0), + [presrc1] "=&f"(presrc1) + : [row_ptr] "r"(row), [pre_ptr] "r"(previous_cumsum), + [dst_ptr] "r"(cumsum), [width] "r"(width), [mask] "f"(mask) + : "memory"); +} + +// C version 2x2 -> 2x1. +void InterpolateRow_MMI(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction) { + if (source_y_fraction == 0) { + __asm__ volatile( + "1: \n\t" + "ld $t0, 0x0(%[src_ptr]) \n\t" + "sd $t0, 0x0(%[dst_ptr]) \n\t" + "daddiu %[src_ptr], %[src_ptr], 8 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" + "daddiu %[width], %[width], -8 \n\t" + "bgtz %[width], 1b \n\t" + "nop \n\t" + : + : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), [width] "r"(width) + : "memory"); + return; + } + if (source_y_fraction == 128) { + uint64_t uv = 0x0; + uint64_t uv_stride = 0x0; + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[uv], 0x0(%[src_ptr]) \n\t" + "gsldlc1 %[uv], 0x7(%[src_ptr]) \n\t" + "daddu $t0, %[src_ptr], %[stride] \n\t" + "gsldrc1 %[uv_stride], 0x0($t0) \n\t" + "gsldlc1 %[uv_stride], 0x7($t0) \n\t" + + "pavgb %[uv], %[uv], %[uv_stride] \n\t" + "gssdrc1 %[uv], 0x0(%[dst_ptr]) \n\t" + "gssdlc1 %[uv], 0x7(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 8 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" + "daddiu %[width], %[width], -8 \n\t" + "bgtz %[width], 1b \n\t" + "nop \n\t" + : [uv] "=&f"(uv), [uv_stride] "=&f"(uv_stride) + : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(width), + [stride] "r"((int64_t)src_stride) + : "memory"); + return; + } + const uint8_t* src_ptr1 = src_ptr + src_stride; + uint64_t temp; + uint64_t data[4]; + uint64_t zero = 0x0; + uint64_t c0 = 0x0080008000800080; + uint64_t fy0 = 0x0100010001000100; + uint64_t shift = 0x8; + __asm__ volatile( + "pshufh %[fy1], %[fy1], %[zero] \n\t" + "psubh %[fy0], %[fy0], %[fy1] \n\t" + "1: \n\t" + "gsldrc1 %[t0], 0x0(%[src_ptr]) \n\t" + "gsldlc1 %[t0], 0x7(%[src_ptr]) \n\t" + "punpcklbh %[d0], %[t0], %[zero] \n\t" + "punpckhbh %[d1], %[t0], %[zero] \n\t" + "gsldrc1 %[t0], 0x0(%[src_ptr1]) \n\t" + "gsldlc1 %[t0], 0x7(%[src_ptr1]) \n\t" + "punpcklbh %[d2], %[t0], %[zero] \n\t" + "punpckhbh %[d3], %[t0], %[zero] \n\t" + + "pmullh %[d0], %[d0], %[fy0] \n\t" + "pmullh %[d2], %[d2], %[fy1] \n\t" + "paddh %[d0], %[d0], %[d2] \n\t" + "paddh %[d0], %[d0], %[c0] \n\t" + "psrlh %[d0], %[d0], %[shift] \n\t" + + "pmullh %[d1], %[d1], %[fy0] \n\t" + "pmullh %[d3], %[d3], %[fy1] \n\t" + "paddh %[d1], %[d1], %[d3] 
\n\t" + "paddh %[d1], %[d1], %[c0] \n\t" + "psrlh %[d1], %[d1], %[shift] \n\t" + + "packushb %[d0], %[d0], %[d1] \n\t" + "gssdrc1 %[d0], 0x0(%[dst_ptr]) \n\t" + "gssdlc1 %[d0], 0x7(%[dst_ptr]) \n\t" + "daddiu %[src_ptr], %[src_ptr], 8 \n\t" + "daddiu %[src_ptr1], %[src_ptr1], 8 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" + "daddiu %[width], %[width], -8 \n\t" + "bgtz %[width], 1b \n\t" + "nop \n\t" + : [t0] "=&f"(temp), [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), + [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) + : [src_ptr] "r"(src_ptr), [src_ptr1] "r"(src_ptr1), + [dst_ptr] "r"(dst_ptr), [width] "r"(width), + [fy1] "f"(source_y_fraction), [fy0] "f"(fy0), [c0] "f"(c0), + [shift] "f"(shift), [zero] "f"(zero) + : "memory"); +} + +// Use first 4 shuffler values to reorder ARGB channels. +void ARGBShuffleRow_MMI(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + uint64_t source, dest0, dest1, dest; + const uint64_t mask0 = 0x0; + const uint64_t mask1 = (shuffler[0] & 0x03) | ((shuffler[1] & 0x03) << 2) | + ((shuffler[2] & 0x03) << 4) | + ((shuffler[3] & 0x03) << 6); + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[dest0], %[src], %[mask0] \n\t" + "pshufh %[dest0], %[dest0], %[mask1] \n\t" + "punpckhbh %[dest1], %[src], %[mask0] \n\t" + "pshufh %[dest1], %[dest1], %[mask1] \n\t" + "packushb %[dest], %[dest0], %[dest1] \n\t" + + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0), + [dest1] "=&f"(dest1) + : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), + [mask1] "f"(mask1), [width] "r"(width) + : "memory"); +} + +void I422ToYUY2Row_MMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { + uint64_t temp[3]; + uint64_t vu = 0x0; + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i] + "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i] + "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i] + "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i] + "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i] + "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i] + "punpcklbh %[vu], %[tu], %[tv] \n\t" // g + "punpcklbh %[tu], %[ty], %[vu] \n\t" // g + "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t" + "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t" + "punpckhbh %[tu], %[ty], %[vu] \n\t" // g + "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t" + "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t" + "daddiu %[src_y], %[src_y], 8 \n\t" + "daddiu %[src_u], %[src_u], 4 \n\t" + "daddiu %[src_v], %[src_v], 4 \n\t" + "daddiu %[dst_frame], %[dst_frame], 16 \n\t" + "daddiu %[width], %[width], -8 \n\t" + "bgtz %[width], 1b \n\t" + "nop \n\t" + : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]), + [vu] "=&f"(vu) + : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v), + [dst_frame] "r"(dst_frame), [width] "r"(width) + : "memory"); +} + +void I422ToUYVYRow_MMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { + uint64_t temp[3]; + uint64_t vu = 0x0; + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i] + "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // 
r=src_sobelx[i] + "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i] + "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i] + "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i] + "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i] + "punpcklbh %[vu], %[tu], %[tv] \n\t" // g + "punpcklbh %[tu], %[vu], %[ty] \n\t" // g + "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t" + "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t" + "punpckhbh %[tu], %[vu], %[ty] \n\t" // g + "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t" + "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t" + "daddiu %[src_y], %[src_y], 8 \n\t" + "daddiu %[src_u], %[src_u], 4 \n\t" + "daddiu %[src_v], %[src_v], 4 \n\t" + "daddiu %[dst_frame], %[dst_frame], 16 \n\t" + "daddiu %[width], %[width], -8 \n\t" + "bgtz %[width], 1b \n\t" + "nop \n\t" + : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]), + [vu] "=&f"(vu) + : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v), + [dst_frame] "r"(dst_frame), [width] "r"(width) + : "memory"); +} + +void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) { + uint64_t source, dest; + const uint64_t mask0 = 0xff000000ff000000ULL; + const uint64_t mask1 = ~mask0; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "and %[src], %[src], %[mask0] \n\t" + "and %[dest], %[dest], %[mask1] \n\t" + "or %[dest], %[src], %[dest] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(source), [dest] "=&f"(dest) + : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), + [mask1] "f"(mask1), [width] "r"(width) + : "memory"); +} + +void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + uint64_t src, dest0, dest1, dest_lo, dest_hi, dest; + const uint64_t mask = 0xff000000ff000000ULL; + const uint64_t shift = 0x18; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" + "and %[dest0], %[src], %[mask] \n\t" + "psrlw %[dest0], %[dest0], %[shift] \n\t" + "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t" + "and %[dest1], %[src], %[mask] \n\t" + "psrlw %[dest1], %[dest1], %[shift] \n\t" + "packsswh %[dest_lo], %[dest0], %[dest1] \n\t" + + "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t" + "and %[dest0], %[src], %[mask] \n\t" + "psrlw %[dest0], %[dest0], %[shift] \n\t" + "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t" + "and %[dest1], %[src], %[mask] \n\t" + "psrlw %[dest1], %[dest1], %[shift] \n\t" + "packsswh %[dest_hi], %[dest0], %[dest1] \n\t" + + "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" + + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0), + [dest1] "=&f"(dest1), [dest_lo] "=&f"(dest_lo), [dest_hi] "=&f"(dest_hi) + : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_a), [mask] "f"(mask), + [shift] "f"(shift), [width] "r"(width) + : "memory"); +} + +void ARGBCopyYToAlphaRow_MMI(const uint8_t* 
src, uint8_t* dst, int width) { + uint64_t source, dest0, dest1, dest; + const uint64_t mask0 = 0x0; + const uint64_t mask1 = 0x00ffffff00ffffffULL; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[dest0], %[mask0], %[src] \n\t" + "punpcklhw %[dest1], %[mask0], %[dest0] \n\t" + "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + "and %[dest], %[dest], %[mask1] \n\t" + "or %[dest], %[dest], %[dest1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + "punpckhhw %[dest1], %[mask0], %[dest0] \n\t" + "gsldlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" + "gsldrc1 %[dest], 0x08(%[dst_ptr]) \n\t" + "and %[dest], %[dest], %[mask1] \n\t" + "or %[dest], %[dest], %[dest1] \n\t" + "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" + + "punpckhbh %[dest0], %[mask0], %[src] \n\t" + "punpcklhw %[dest1], %[mask0], %[dest0] \n\t" + "gsldlc1 %[dest], 0x17(%[dst_ptr]) \n\t" + "gsldrc1 %[dest], 0x10(%[dst_ptr]) \n\t" + "and %[dest], %[dest], %[mask1] \n\t" + "or %[dest], %[dest], %[dest1] \n\t" + "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t" + "punpckhhw %[dest1], %[mask0], %[dest0] \n\t" + "gsldlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" + "gsldrc1 %[dest], 0x18(%[dst_ptr]) \n\t" + "and %[dest], %[dest], %[mask1] \n\t" + "or %[dest], %[dest], %[dest1] \n\t" + "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0), + [dest1] "=&f"(dest1) + : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), + [mask1] "f"(mask1), [width] "r"(width) + : "memory"); +} + +#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/files/source/row_msa.cc b/files/source/row_msa.cc index f79de1c7..5c0239a3 100644 --- a/files/source/row_msa.cc +++ b/files/source/row_msa.cc @@ -37,17 +37,17 @@ extern "C" { } // Load YUV 422 pixel data -#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ - { \ - uint64 y_m; \ - uint32 u_m, v_m; \ - v4i32 zero_m = {0}; \ - y_m = LD(psrc_y); \ - u_m = LW(psrc_u); \ - v_m = LW(psrc_v); \ - out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64)y_m); \ - out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32)u_m); \ - out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m); \ +#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ + { \ + uint64_t y_m; \ + uint32_t u_m, v_m; \ + v4i32 zero_m = {0}; \ + y_m = LD(psrc_y); \ + u_m = LW(psrc_u); \ + v_m = LW(psrc_v); \ + out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64_t)y_m); \ + out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)u_m); \ + out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)v_m); \ } // Clip input vector elements between 0 to 255 @@ -163,14 +163,14 @@ extern "C" { v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \ v8u16 reg8_m, reg9_m; \ \ - src0_m = (v16u8)__msa_ld_b((v16i8*)s, 0); \ - src1_m = (v16u8)__msa_ld_b((v16i8*)s, 16); \ - src2_m = (v16u8)__msa_ld_b((v16i8*)s, 32); \ - src3_m = (v16u8)__msa_ld_b((v16i8*)s, 48); \ - src4_m = (v16u8)__msa_ld_b((v16i8*)t, 0); \ - src5_m = (v16u8)__msa_ld_b((v16i8*)t, 16); \ - src6_m = 
(v16u8)__msa_ld_b((v16i8*)t, 32); \ - src7_m = (v16u8)__msa_ld_b((v16i8*)t, 48); \ + src0_m = (v16u8)__msa_ld_b((void*)s, 0); \ + src1_m = (v16u8)__msa_ld_b((void*)s, 16); \ + src2_m = (v16u8)__msa_ld_b((void*)s, 32); \ + src3_m = (v16u8)__msa_ld_b((void*)s, 48); \ + src4_m = (v16u8)__msa_ld_b((void*)t, 0); \ + src5_m = (v16u8)__msa_ld_b((void*)t, 16); \ + src6_m = (v16u8)__msa_ld_b((void*)t, 32); \ + src7_m = (v16u8)__msa_ld_b((void*)t, 48); \ vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ @@ -201,14 +201,14 @@ extern "C" { reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \ argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \ argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ - src0_m = (v16u8)__msa_ld_b((v16i8*)s, 64); \ - src1_m = (v16u8)__msa_ld_b((v16i8*)s, 80); \ - src2_m = (v16u8)__msa_ld_b((v16i8*)s, 96); \ - src3_m = (v16u8)__msa_ld_b((v16i8*)s, 112); \ - src4_m = (v16u8)__msa_ld_b((v16i8*)t, 64); \ - src5_m = (v16u8)__msa_ld_b((v16i8*)t, 80); \ - src6_m = (v16u8)__msa_ld_b((v16i8*)t, 96); \ - src7_m = (v16u8)__msa_ld_b((v16i8*)t, 112); \ + src0_m = (v16u8)__msa_ld_b((void*)s, 64); \ + src1_m = (v16u8)__msa_ld_b((void*)s, 80); \ + src2_m = (v16u8)__msa_ld_b((void*)s, 96); \ + src3_m = (v16u8)__msa_ld_b((void*)s, 112); \ + src4_m = (v16u8)__msa_ld_b((void*)t, 64); \ + src5_m = (v16u8)__msa_ld_b((void*)t, 80); \ + src6_m = (v16u8)__msa_ld_b((void*)t, 96); \ + src7_m = (v16u8)__msa_ld_b((void*)t, 112); \ vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ @@ -275,17 +275,17 @@ extern "C" { // Load I444 pixel data #define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ { \ - uint64 y_m, u_m, v_m; \ + uint64_t y_m, u_m, v_m; \ v2i64 zero_m = {0}; \ y_m = LD(psrc_y); \ u_m = LD(psrc_u); \ v_m = LD(psrc_v); \ - out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64)y_m); \ - out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64)u_m); \ - out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64)v_m); \ + out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)y_m); \ + out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)u_m); \ + out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \ } -void MirrorRow_MSA(const uint8* src, uint8* dst, int width) { +void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { int x; v16u8 src0, src1, src2, src3; v16u8 dst0, dst1, dst2, dst3; @@ -302,7 +302,7 @@ void MirrorRow_MSA(const uint8* src, uint8* dst, int width) { } } -void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width) { +void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { int x; v16u8 src0, src1, src2, src3; v16u8 dst0, dst1, dst2, dst3; @@ -319,10 +319,10 @@ void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width) { } } -void I422ToYUY2Row_MSA(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_yuy2, +void I422ToYUY2Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, int width) { int x; v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; @@ -343,10 +343,10 @@ void I422ToYUY2Row_MSA(const uint8* src_y, } } -void I422ToUYVYRow_MSA(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_uyvy, +void I422ToUYVYRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + 
uint8_t* dst_uyvy, int width) { int x; v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; @@ -367,10 +367,10 @@ void I422ToUYVYRow_MSA(const uint8* src_y, } } -void I422ToARGBRow_MSA(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I422ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { int x; @@ -390,18 +390,18 @@ void I422ToARGBRow_MSA(const uint8* src_y, src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2); - STOREARGB(vec0, vec1, vec2, alpha, rgb_buf); + STOREARGB(vec0, vec1, vec2, alpha, dst_argb); src_y += 8; src_u += 4; src_v += 4; - rgb_buf += 32; + dst_argb += 32; } } -void I422ToRGBARow_MSA(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I422ToRGBARow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { int x; @@ -421,23 +421,23 @@ void I422ToRGBARow_MSA(const uint8* src_y, src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2); - STOREARGB(alpha, vec0, vec1, vec2, rgb_buf); + STOREARGB(alpha, vec0, vec1, vec2, dst_argb); src_y += 8; src_u += 4; src_v += 4; - rgb_buf += 32; + dst_argb += 32; } } -void I422AlphaToARGBRow_MSA(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - const uint8* src_a, - uint8* rgb_buf, +void I422AlphaToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { int x; - int64 data_a; + int64_t data_a; v16u8 src0, src1, src2, src3; v8i16 vec0, vec1, vec2; v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; @@ -457,23 +457,23 @@ void I422AlphaToARGBRow_MSA(const uint8* src_y, YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2); src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3); - STOREARGB(vec0, vec1, vec2, src3, rgb_buf); + STOREARGB(vec0, vec1, vec2, src3, dst_argb); src_y += 8; src_u += 4; src_v += 4; src_a += 8; - rgb_buf += 32; + dst_argb += 32; } } -void I422ToRGB24Row_MSA(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I422ToRGB24Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, - int32 width) { + int32_t width) { int x; - int64 data_u, data_v; + int64_t data_u, data_v; v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; v8i16 vec0, vec1, vec2, vec3, vec4, vec5; v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; @@ -510,20 +510,20 @@ void I422ToRGB24Row_MSA(const uint8* src_y, dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0); dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1); dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2); - ST_UB2(dst0, dst1, rgb_buf, 16); - ST_UB(dst2, (rgb_buf + 32)); + ST_UB2(dst0, dst1, dst_argb, 16); + ST_UB(dst2, (dst_argb + 32)); src_y += 16; src_u += 8; src_v += 8; - rgb_buf += 48; + dst_argb += 48; } } // TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. 
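As context for the RGB565 rows touched below: whether the upper 5 bits of R are isolated with a shift or an AND (the TODO above), the packed layout is the same, with R in the top 5 bits, G in the middle 6, and B in the low 5. A minimal helper (illustrative, not part of libyuv) showing that packing:

  // Pack 8-bit R, G, B into a 16-bit RGB565 value.
  static inline uint16_t PackRGB565(uint8_t r, uint8_t g, uint8_t b) {
    return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
  }
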
-void I422ToRGB565Row_MSA(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { int x; @@ -558,10 +558,10 @@ void I422ToRGB565Row_MSA(const uint8* src_y, } // TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. -void I422ToARGB4444Row_MSA(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { int x; @@ -598,10 +598,10 @@ void I422ToARGB4444Row_MSA(const uint8* src_y, } } -void I422ToARGB1555Row_MSA(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, +void I422ToARGB1555Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { int x; @@ -638,7 +638,7 @@ void I422ToARGB1555Row_MSA(const uint8* src_y, } } -void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) { +void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1; @@ -652,12 +652,12 @@ void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) { } } -void YUY2ToUVRow_MSA(const uint8* src_yuy2, +void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, int src_stride_yuy2, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - const uint8* src_yuy2_next = src_yuy2 + src_stride_yuy2; + const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2; int x; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 vec0, vec1, dst0, dst1; @@ -682,9 +682,9 @@ void YUY2ToUVRow_MSA(const uint8* src_yuy2, } } -void YUY2ToUV422Row_MSA(const uint8* src_yuy2, - uint8* dst_u, - uint8* dst_v, +void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, int width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1; @@ -703,7 +703,7 @@ void YUY2ToUV422Row_MSA(const uint8* src_yuy2, } } -void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width) { +void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1; @@ -717,12 +717,12 @@ void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width) { } } -void UYVYToUVRow_MSA(const uint8* src_uyvy, +void UYVYToUVRow_MSA(const uint8_t* src_uyvy, int src_stride_uyvy, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - const uint8* src_uyvy_next = src_uyvy + src_stride_uyvy; + const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy; int x; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 vec0, vec1, dst0, dst1; @@ -747,9 +747,9 @@ void UYVYToUVRow_MSA(const uint8* src_uyvy, } } -void UYVYToUV422Row_MSA(const uint8* src_uyvy, - uint8* dst_u, - uint8* dst_v, +void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, int width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1; @@ -768,7 +768,7 @@ void UYVYToUV422Row_MSA(const uint8* src_uyvy, } } -void ARGBToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { +void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; v8u16 reg0, reg1, reg2, reg3, 
reg4, reg5; @@ -814,13 +814,13 @@ void ARGBToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { } } -void ARGBToUVRow_MSA(const uint8* src_argb0, +void ARGBToUVRow_MSA(const uint8_t* src_argb0, int src_stride_argb, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { int x; - const uint8* src_argb0_next = src_argb0 + src_stride_argb; + const uint8_t* src_argb0_next = src_argb0 + src_stride_argb; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; @@ -932,7 +932,7 @@ void ARGBToUVRow_MSA(const uint8* src_argb0, } } -void ARGBToRGB24Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) { +void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1, dst2; v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20}; @@ -942,10 +942,10 @@ void ARGBToRGB24Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) { 21, 22, 24, 25, 26, 28, 29, 30}; for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); @@ -956,7 +956,7 @@ void ARGBToRGB24Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) { } } -void ARGBToRAWRow_MSA(const uint8* src_argb, uint8* dst_rgb, int width) { +void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1, dst2; v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22}; @@ -966,10 +966,10 @@ void ARGBToRAWRow_MSA(const uint8* src_argb, uint8* dst_rgb, int width) { 21, 20, 26, 25, 24, 30, 29, 28}; for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); @@ -980,15 +980,15 @@ void ARGBToRAWRow_MSA(const uint8* src_argb, uint8* dst_rgb, int width) { } } -void ARGBToRGB565Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) { +void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; v16u8 src0, src1, dst0; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v16i8 zero = {0}; for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3); vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3); 
vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5); @@ -1014,15 +1014,17 @@ void ARGBToRGB565Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) { } } -void ARGBToARGB1555Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) { +void ARGBToARGB1555Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { int x; v16u8 src0, src1, dst0; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; v16i8 zero = {0}; for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3); vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2); vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3); @@ -1054,7 +1056,9 @@ void ARGBToARGB1555Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) { } } -void ARGBToARGB4444Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) { +void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { int x; v16u8 src0, src1; v16u8 vec0, vec1; @@ -1062,8 +1066,8 @@ void ARGBToARGB4444Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) { v16i8 zero = {0}; for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4); vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4); src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1); @@ -1077,11 +1081,11 @@ void ARGBToARGB4444Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) { } } -void ARGBToUV444Row_MSA(const uint8* src_argb, - uint8* dst_u, - uint8* dst_v, - int32 width) { - int32 x; +void ARGBToUV444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int32_t width) { + int32_t x; v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1; v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v8u16 vec8, vec9, vec10, vec11; @@ -1094,10 +1098,10 @@ void ARGBToUV444Row_MSA(const uint8* src_argb, v16i8 zero = {0}; for (x = width; x > 0; x -= 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); @@ -1149,9 +1153,9 @@ void ARGBToUV444Row_MSA(const uint8* src_argb, } } -void ARGBMultiplyRow_MSA(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { int x; v16u8 src0, src1, dst0; @@ -1160,8 +1164,8 @@ void ARGBMultiplyRow_MSA(const uint8* src_argb0, v8i16 zero = {0}; for (x = 0; x < width; x += 4) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0); + src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb1, 0); vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, 
(v16i8)src1); @@ -1188,18 +1192,18 @@ void ARGBMultiplyRow_MSA(const uint8* src_argb0, } } -void ARGBAddRow_MSA(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +void ARGBAddRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1; for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0); - src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16); + src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); + src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); dst0 = __msa_adds_u_b(src0, src2); dst1 = __msa_adds_u_b(src1, src3); ST_UB2(dst0, dst1, dst_argb, 16); @@ -1209,18 +1213,18 @@ void ARGBAddRow_MSA(const uint8* src_argb0, } } -void ARGBSubtractRow_MSA(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +void ARGBSubtractRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1; for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0); - src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16); + src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); + src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); dst0 = __msa_subs_u_b(src0, src2); dst1 = __msa_subs_u_b(src1, src3); ST_UB2(dst0, dst1, dst_argb, 16); @@ -1230,7 +1234,9 @@ void ARGBSubtractRow_MSA(const uint8* src_argb0, } } -void ARGBAttenuateRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) { +void ARGBAttenuateRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { int x; v16u8 src0, src1, dst0, dst1; v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; @@ -1239,8 +1245,8 @@ void ARGBAttenuateRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) { v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1); @@ -1295,9 +1301,9 @@ void ARGBAttenuateRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) { } } -void ARGBToRGB565DitherRow_MSA(const uint8* src_argb, - uint8* dst_rgb, - uint32 dither4, +void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + uint32_t dither4, int width) { int x; v16u8 src0, src1, dst0, vec0, vec1; @@ -1310,8 +1316,8 @@ void ARGBToRGB565DitherRow_MSA(const uint8* src_argb, vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0); for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0); @@ -1339,15 +1345,15 @@ void 
ARGBToRGB565DitherRow_MSA(const uint8* src_argb, } } -void ARGBShuffleRow_MSA(const uint8* src_argb, - uint8* dst_argb, - const uint8* shuffler, +void ARGBShuffleRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, int width) { int x; v16u8 src0, src1, dst0, dst1; v16i8 vec0; v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; - int32 val = LW((int32*)shuffler); + int32_t val = LW((int32_t*)shuffler); vec0 = (v16i8)__msa_fill_w(val); shuffler_vec += vec0; @@ -1363,10 +1369,10 @@ void ARGBShuffleRow_MSA(const uint8* src_argb, } } -void ARGBShadeRow_MSA(const uint8* src_argb, - uint8* dst_argb, +void ARGBShadeRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, int width, - uint32 value) { + uint32_t value) { int x; v16u8 src0, dst0; v8u16 vec0, vec1; @@ -1402,7 +1408,7 @@ void ARGBShadeRow_MSA(const uint8* src_argb, } } -void ARGBGrayRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) { +void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) { int x; v16u8 src0, src1, vec0, vec1, dst0, dst1; v8u16 reg0; @@ -1427,7 +1433,7 @@ void ARGBGrayRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) { } } -void ARGBSepiaRow_MSA(uint8* dst_argb, int width) { +void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width) { int x; v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5; v8u16 reg0, reg1, reg2; @@ -1468,8 +1474,8 @@ void ARGBSepiaRow_MSA(uint8* dst_argb, int width) { } } -void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444, - uint8* dst_argb, +void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444, + uint8_t* dst_argb, int width) { int x; v16u8 src0, src1; @@ -1497,8 +1503,8 @@ void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444, } } -void ARGB1555ToARGBRow_MSA(const uint8* src_argb1555, - uint8* dst_argb, +void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555, + uint8_t* dst_argb, int width) { int x; v8u16 src0, src1; @@ -1508,8 +1514,8 @@ void ARGB1555ToARGBRow_MSA(const uint8* src_argb1555, v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_h((v8u16*)src_argb1555, 0); - src1 = (v8u16)__msa_ld_h((v8u16*)src_argb1555, 16); + src0 = (v8u16)__msa_ld_h((void*)src_argb1555, 0); + src1 = (v8u16)__msa_ld_h((void*)src_argb1555, 16); vec0 = src0 & const_0x1F; vec1 = src1 & const_0x1F; src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); @@ -1547,7 +1553,9 @@ void ARGB1555ToARGBRow_MSA(const uint8* src_argb1555, } } -void RGB565ToARGBRow_MSA(const uint8* src_rgb565, uint8* dst_argb, int width) { +void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { int x; v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; v8u16 reg0, reg1, reg2, reg3, reg4, reg5; @@ -1558,8 +1566,8 @@ void RGB565ToARGBRow_MSA(const uint8* src_rgb565, uint8* dst_argb, int width) { v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_h((v8u16*)src_rgb565, 0); - src1 = (v8u16)__msa_ld_h((v8u16*)src_rgb565, 16); + src0 = (v8u16)__msa_ld_h((void*)src_rgb565, 0); + src1 = (v8u16)__msa_ld_h((void*)src_rgb565, 16); vec0 = src0 & const_0x1F; vec1 = src0 & const_0x7E0; vec2 = src0 & const_0xF800; @@ -1592,7 +1600,9 @@ void RGB565ToARGBRow_MSA(const uint8* src_rgb565, uint8* dst_argb, int width) { } } -void RGB24ToARGBRow_MSA(const uint8* src_rgb24, uint8* dst_argb, int width) { +void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { int x; v16u8 src0, src1, src2; v16u8 vec0, vec1, vec2; 
@@ -1601,9 +1611,9 @@ void RGB24ToARGBRow_MSA(const uint8* src_rgb24, uint8* dst_argb, int width) { v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 32); + src0 = (v16u8)__msa_ld_b((void*)src_rgb24, 0); + src1 = (v16u8)__msa_ld_b((void*)src_rgb24, 16); + src2 = (v16u8)__msa_ld_b((void*)src_rgb24, 32); vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); @@ -1617,7 +1627,7 @@ void RGB24ToARGBRow_MSA(const uint8* src_rgb24, uint8* dst_argb, int width) { } } -void RAWToARGBRow_MSA(const uint8* src_raw, uint8* dst_argb, int width) { +void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width) { int x; v16u8 src0, src1, src2; v16u8 vec0, vec1, vec2; @@ -1626,9 +1636,9 @@ void RAWToARGBRow_MSA(const uint8* src_raw, uint8* dst_argb, int width) { v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19}; for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_raw, 32); + src0 = (v16u8)__msa_ld_b((void*)src_raw, 0); + src1 = (v16u8)__msa_ld_b((void*)src_raw, 16); + src2 = (v16u8)__msa_ld_b((void*)src_raw, 32); vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); @@ -1642,7 +1652,9 @@ void RAWToARGBRow_MSA(const uint8* src_raw, uint8* dst_argb, int width) { } } -void ARGB1555ToYRow_MSA(const uint8* src_argb1555, uint8* dst_y, int width) { +void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { int x; v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; v8u16 reg0, reg1, reg2, reg3, reg4, reg5; @@ -1654,8 +1666,8 @@ void ARGB1555ToYRow_MSA(const uint8* src_argb1555, uint8* dst_y, int width) { v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_b((v8i16*)src_argb1555, 0); - src1 = (v8u16)__msa_ld_b((v8i16*)src_argb1555, 16); + src0 = (v8u16)__msa_ld_b((void*)src_argb1555, 0); + src1 = (v8u16)__msa_ld_b((void*)src_argb1555, 16); vec0 = src0 & const_0x1F; vec1 = src1 & const_0x1F; src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); @@ -1699,7 +1711,7 @@ void ARGB1555ToYRow_MSA(const uint8* src_argb1555, uint8* dst_y, int width) { } } -void RGB565ToYRow_MSA(const uint8* src_rgb565, uint8* dst_y, int width) { +void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { int x; v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v8u16 reg0, reg1, reg2, reg3, reg4, reg5; @@ -1713,8 +1725,8 @@ void RGB565ToYRow_MSA(const uint8* src_rgb565, uint8* dst_y, int width) { v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_b((v8i16*)src_rgb565, 0); - src1 = (v8u16)__msa_ld_b((v8i16*)src_rgb565, 16); + src0 = (v8u16)__msa_ld_b((void*)src_rgb565, 0); + src1 = (v8u16)__msa_ld_b((void*)src_rgb565, 16); vec0 = src0 & const_0x1F; vec1 = src0 & const_0x7E0; vec2 = src0 & const_0xF800; @@ -1762,7 +1774,7 @@ void RGB565ToYRow_MSA(const uint8* src_rgb565, uint8* dst_y, int width) { } } -void RGB24ToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { +void RGB24ToYRow_MSA(const 
uint8_t* src_argb0, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; v8u16 vec0, vec1, vec2, vec3; @@ -1777,9 +1789,9 @@ void RGB24ToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { v16i8 zero = {0}; for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); + src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); @@ -1803,7 +1815,7 @@ void RGB24ToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { } } -void RAWToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { +void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; v8u16 vec0, vec1, vec2, vec3; @@ -1818,9 +1830,9 @@ void RAWToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { v16i8 zero = {0}; for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); + src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); @@ -1844,14 +1856,14 @@ void RAWToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { } } -void ARGB1555ToUVRow_MSA(const uint8* src_argb1555, +void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, int src_stride_argb1555, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { int x; - const uint16* s = (const uint16*)src_argb1555; - const uint16* t = (const uint16*)(src_argb1555 + src_stride_argb1555); + const uint16_t* s = (const uint16_t*)src_argb1555; + const uint16_t* t = (const uint16_t*)(src_argb1555 + src_stride_argb1555); int64_t res0, res1; v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; @@ -1865,10 +1877,10 @@ void ARGB1555ToUVRow_MSA(const uint8* src_argb1555, v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_b((v8i16*)s, 0); - src1 = (v8u16)__msa_ld_b((v8i16*)s, 16); - src2 = (v8u16)__msa_ld_b((v8i16*)t, 0); - src3 = (v8u16)__msa_ld_b((v8i16*)t, 16); + src0 = (v8u16)__msa_ld_b((void*)s, 0); + src1 = (v8u16)__msa_ld_b((void*)s, 16); + src2 = (v8u16)__msa_ld_b((void*)t, 0); + src3 = (v8u16)__msa_ld_b((void*)t, 16); vec0 = src0 & const_0x1F; vec1 = src1 & const_0x1F; vec0 += src2 & const_0x1F; @@ -1925,14 +1937,14 @@ void ARGB1555ToUVRow_MSA(const uint8* src_argb1555, } } -void RGB565ToUVRow_MSA(const uint8* src_rgb565, +void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, int src_stride_rgb565, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { int x; - const uint16* s = (const uint16*)src_rgb565; - const uint16* t = (const uint16*)(src_rgb565 + src_stride_rgb565); + const uint16_t* s = (const uint16_t*)src_rgb565; + const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565); int64_t res0, res1; v8u16 src0, src1, 
src2, src3, reg0, reg1, reg2, reg3; v8u16 vec0, vec1, vec2, vec3, vec4, vec5; @@ -1947,10 +1959,10 @@ void RGB565ToUVRow_MSA(const uint8* src_rgb565, v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F); for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_b((v8i16*)s, 0); - src1 = (v8u16)__msa_ld_b((v8i16*)s, 16); - src2 = (v8u16)__msa_ld_b((v8i16*)t, 0); - src3 = (v8u16)__msa_ld_b((v8i16*)t, 16); + src0 = (v8u16)__msa_ld_b((void*)s, 0); + src1 = (v8u16)__msa_ld_b((void*)s, 16); + src2 = (v8u16)__msa_ld_b((void*)t, 0); + src3 = (v8u16)__msa_ld_b((void*)t, 16); vec0 = src0 & const_0x1F; vec1 = src1 & const_0x1F; vec0 += src2 & const_0x1F; @@ -2005,15 +2017,15 @@ void RGB565ToUVRow_MSA(const uint8* src_rgb565, } } -void RGB24ToUVRow_MSA(const uint8* src_rgb0, +void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, int src_stride_rgb, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { int x; - const uint8* s = src_rgb0; - const uint8* t = src_rgb0 + src_stride_rgb; - int64 res0, res1; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + int64_t res0, res1; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 inp0, inp1, inp2, inp3, inp4, inp5; v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; @@ -2029,12 +2041,12 @@ void RGB24ToUVRow_MSA(const uint8* src_rgb0, v16i8 zero = {0}; for (x = 0; x < width; x += 16) { - inp0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - inp1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - inp2 = (v16u8)__msa_ld_b((v16i8*)s, 32); - inp3 = (v16u8)__msa_ld_b((v16i8*)t, 0); - inp4 = (v16u8)__msa_ld_b((v16i8*)t, 16); - inp5 = (v16u8)__msa_ld_b((v16i8*)t, 32); + inp0 = (v16u8)__msa_ld_b((void*)s, 0); + inp1 = (v16u8)__msa_ld_b((void*)s, 16); + inp2 = (v16u8)__msa_ld_b((void*)s, 32); + inp3 = (v16u8)__msa_ld_b((void*)t, 0); + inp4 = (v16u8)__msa_ld_b((void*)t, 16); + inp5 = (v16u8)__msa_ld_b((void*)t, 32); src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); @@ -2110,15 +2122,15 @@ void RGB24ToUVRow_MSA(const uint8* src_rgb0, } } -void RAWToUVRow_MSA(const uint8* src_rgb0, +void RAWToUVRow_MSA(const uint8_t* src_rgb0, int src_stride_rgb, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { int x; - const uint8* s = src_rgb0; - const uint8* t = src_rgb0 + src_stride_rgb; - int64 res0, res1; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + int64_t res0, res1; v16u8 inp0, inp1, inp2, inp3, inp4, inp5; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; @@ -2134,12 +2146,12 @@ void RAWToUVRow_MSA(const uint8* src_rgb0, v16i8 zero = {0}; for (x = 0; x < width; x += 16) { - inp0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - inp1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - inp2 = (v16u8)__msa_ld_b((v16i8*)s, 32); - inp3 = (v16u8)__msa_ld_b((v16i8*)t, 0); - inp4 = (v16u8)__msa_ld_b((v16i8*)t, 16); - inp5 = (v16u8)__msa_ld_b((v16i8*)t, 32); + inp0 = (v16u8)__msa_ld_b((void*)s, 0); + inp1 = (v16u8)__msa_ld_b((void*)s, 16); + inp2 = (v16u8)__msa_ld_b((void*)s, 32); + inp3 = (v16u8)__msa_ld_b((void*)t, 0); + inp4 = (v16u8)__msa_ld_b((void*)t, 16); + inp5 = (v16u8)__msa_ld_b((void*)t, 32); src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); @@ -2215,13 +2227,13 @@ void RAWToUVRow_MSA(const uint8* src_rgb0, } } 
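The UV rows above first average a 2x2 block of pixels across the two source rows, then convert the averaged R, G, B to chroma. A C-level sketch of that conversion step (assumed here from libyuv's fixed-point BT.601 reference coefficients, not taken from this patch):

  // Averaged r, g, b in 0..255; results are biased around 128.
  static inline uint8_t RGBToU_Sketch(int r, int g, int b) {
    return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
  }
  static inline uint8_t RGBToV_Sketch(int r, int g, int b) {
    return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
  }
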
-void NV12ToARGBRow_MSA(const uint8* src_y, - const uint8* src_uv, - uint8* rgb_buf, +void NV12ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { int x; - uint64 val0, val1; + uint64_t val0, val1; v16u8 src0, src1, res0, res1, dst0, dst1; v8i16 vec0, vec1, vec2; v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; @@ -2245,20 +2257,20 @@ void NV12ToARGBRow_MSA(const uint8* src_y, res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); - ST_UB2(dst0, dst1, rgb_buf, 16); + ST_UB2(dst0, dst1, dst_argb, 16); src_y += 8; src_uv += 8; - rgb_buf += 32; + dst_argb += 32; } } -void NV12ToRGB565Row_MSA(const uint8* src_y, - const uint8* src_uv, - uint8* rgb_buf, +void NV12ToRGB565Row_MSA(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { int x; - uint64 val0, val1; + uint64_t val0, val1; v16u8 src0, src1, dst0; v8i16 vec0, vec1, vec2; v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; @@ -2281,20 +2293,20 @@ void NV12ToRGB565Row_MSA(const uint8* src_y, vec1 = (vec1 >> 2) << 5; vec2 = (vec2 >> 3) << 11; dst0 = (v16u8)(vec0 | vec1 | vec2); - ST_UB(dst0, rgb_buf); + ST_UB(dst0, dst_rgb565); src_y += 8; src_uv += 8; - rgb_buf += 16; + dst_rgb565 += 16; } } -void NV21ToARGBRow_MSA(const uint8* src_y, - const uint8* src_vu, - uint8* rgb_buf, +void NV21ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { int x; - uint64 val0, val1; + uint64_t val0, val1; v16u8 src0, src1, res0, res1, dst0, dst1; v8i16 vec0, vec1, vec2; v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; @@ -2320,16 +2332,16 @@ void NV21ToARGBRow_MSA(const uint8* src_y, res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); - ST_UB2(dst0, dst1, rgb_buf, 16); + ST_UB2(dst0, dst1, dst_argb, 16); src_y += 8; src_vu += 8; - rgb_buf += 32; + dst_argb += 32; } } -void SobelRow_MSA(const uint8* src_sobelx, - const uint8* src_sobely, - uint8* dst_argb, +void SobelRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, int width) { int x; v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3; @@ -2341,8 +2353,8 @@ void SobelRow_MSA(const uint8* src_sobelx, v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0); + src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((void*)src_sobely, 0); vec0 = __msa_adds_u_b(src0, src1); dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0); dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0); @@ -2355,18 +2367,18 @@ void SobelRow_MSA(const uint8* src_sobelx, } } -void SobelToPlaneRow_MSA(const uint8* src_sobelx, - const uint8* src_sobely, - uint8* dst_y, +void SobelToPlaneRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1; for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0); - src3 = 
(v16u8)__msa_ld_b((v16i8*)src_sobely, 16); + src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((void*)src_sobelx, 16); + src2 = (v16u8)__msa_ld_b((void*)src_sobely, 0); + src3 = (v16u8)__msa_ld_b((void*)src_sobely, 16); dst0 = __msa_adds_u_b(src0, src2); dst1 = __msa_adds_u_b(src1, src3); ST_UB2(dst0, dst1, dst_y, 16); @@ -2376,9 +2388,9 @@ void SobelToPlaneRow_MSA(const uint8* src_sobelx, } } -void SobelXYRow_MSA(const uint8* src_sobelx, - const uint8* src_sobely, - uint8* dst_argb, +void SobelXYRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, int width) { int x; v16u8 src0, src1, vec0, vec1, vec2; @@ -2386,8 +2398,8 @@ void SobelXYRow_MSA(const uint8* src_sobelx, v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0); + src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((void*)src_sobely, 0); vec0 = __msa_adds_u_b(src0, src1); vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1); vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1); @@ -2404,7 +2416,7 @@ void SobelXYRow_MSA(const uint8* src_sobelx, } } -void ARGBToYJRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { +void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F); @@ -2412,10 +2424,10 @@ void ARGBToYJRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48); + src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48); ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7, dst0); ST_UB(dst0, dst_y); @@ -2424,7 +2436,7 @@ void ARGBToYJRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { } } -void BGRAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { +void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200); @@ -2432,10 +2444,10 @@ void BGRAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48); + src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48); ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8, dst0); ST_UB(dst0, dst_y); @@ -2444,7 +2456,7 @@ void BGRAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { } } -void ABGRToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { +void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142); @@ -2452,10 +2464,10 @@ void 
ABGRToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48); + src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48); ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8, dst0); ST_UB(dst0, dst_y); @@ -2464,7 +2476,7 @@ void ABGRToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { } } -void RGBAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { +void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900); @@ -2472,10 +2484,10 @@ void RGBAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48); + src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48); ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8, dst0); ST_UB(dst0, dst_y); @@ -2484,14 +2496,14 @@ void RGBAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { } } -void ARGBToUVJRow_MSA(const uint8* src_rgb0, +void ARGBToUVJRow_MSA(const uint8_t* src_rgb0, int src_stride_rgb, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { int x; - const uint8* s = src_rgb0; - const uint8* t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 vec0, vec1, vec2, vec3; v16u8 dst0, dst1; @@ -2506,14 +2518,14 @@ void ARGBToUVJRow_MSA(const uint8* src_rgb0, v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); - src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); - src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); - src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); - src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + src0 = (v16u8)__msa_ld_b((void*)s, 0); + src1 = (v16u8)__msa_ld_b((void*)s, 16); + src2 = (v16u8)__msa_ld_b((void*)s, 32); + src3 = (v16u8)__msa_ld_b((void*)s, 48); + src4 = (v16u8)__msa_ld_b((void*)t, 0); + src5 = (v16u8)__msa_ld_b((void*)t, 16); + src6 = (v16u8)__msa_ld_b((void*)t, 32); + src7 = (v16u8)__msa_ld_b((void*)t, 48); src0 = __msa_aver_u_b(src0, src4); src1 = __msa_aver_u_b(src1, src5); src2 = __msa_aver_u_b(src2, src6); @@ -2524,14 +2536,14 @@ void ARGBToUVJRow_MSA(const uint8* src_rgb0, src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); vec0 = __msa_aver_u_b(src4, src6); vec1 = __msa_aver_u_b(src5, src7); - src0 = (v16u8)__msa_ld_b((v16i8*)s, 64); - src1 = (v16u8)__msa_ld_b((v16i8*)s, 80); - src2 = (v16u8)__msa_ld_b((v16i8*)s, 96); - src3 = (v16u8)__msa_ld_b((v16i8*)s, 112); - src4 = 
(v16u8)__msa_ld_b((v16i8*)t, 64); - src5 = (v16u8)__msa_ld_b((v16i8*)t, 80); - src6 = (v16u8)__msa_ld_b((v16i8*)t, 96); - src7 = (v16u8)__msa_ld_b((v16i8*)t, 112); + src0 = (v16u8)__msa_ld_b((void*)s, 64); + src1 = (v16u8)__msa_ld_b((void*)s, 80); + src2 = (v16u8)__msa_ld_b((void*)s, 96); + src3 = (v16u8)__msa_ld_b((void*)s, 112); + src4 = (v16u8)__msa_ld_b((void*)t, 64); + src5 = (v16u8)__msa_ld_b((void*)t, 80); + src6 = (v16u8)__msa_ld_b((void*)t, 96); + src7 = (v16u8)__msa_ld_b((void*)t, 112); src0 = __msa_aver_u_b(src0, src4); src1 = __msa_aver_u_b(src1, src5); src2 = __msa_aver_u_b(src2, src6); @@ -2554,14 +2566,14 @@ void ARGBToUVJRow_MSA(const uint8* src_rgb0, } } -void BGRAToUVRow_MSA(const uint8* src_rgb0, +void BGRAToUVRow_MSA(const uint8_t* src_rgb0, int src_stride_rgb, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { int x; - const uint8* s = src_rgb0; - const uint8* t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; v16u8 dst0, dst1, vec0, vec1, vec2, vec3; v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, @@ -2587,14 +2599,14 @@ void BGRAToUVRow_MSA(const uint8* src_rgb0, } } -void ABGRToUVRow_MSA(const uint8* src_rgb0, +void ABGRToUVRow_MSA(const uint8_t* src_rgb0, int src_stride_rgb, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { int x; - const uint8* s = src_rgb0; - const uint8* t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; v16u8 src0, src1, src2, src3; v16u8 dst0, dst1; v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; @@ -2621,14 +2633,14 @@ void ABGRToUVRow_MSA(const uint8* src_rgb0, } } -void RGBAToUVRow_MSA(const uint8* src_rgb0, +void RGBAToUVRow_MSA(const uint8_t* src_rgb0, int src_stride_rgb, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { int x; - const uint8* s = src_rgb0; - const uint8* t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; v16u8 dst0, dst1, vec0, vec1, vec2, vec3; v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, @@ -2654,10 +2666,10 @@ void RGBAToUVRow_MSA(const uint8* src_rgb0, } } -void I444ToARGBRow_MSA(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I444ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { int x; @@ -2714,15 +2726,15 @@ void I444ToARGBRow_MSA(const uint8* src_y, vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2); dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0); dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0); - ST_UB2(dst0, dst1, rgb_buf, 16); + ST_UB2(dst0, dst1, dst_argb, 16); src_y += 8; src_u += 8; src_v += 8; - rgb_buf += 32; + dst_argb += 32; } } -void I400ToARGBRow_MSA(const uint8* src_y, uint8* rgb_buf, int width) { +void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) { int x; v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3; v8i16 vec0, vec1; @@ -2734,7 +2746,7 @@ void I400ToARGBRow_MSA(const uint8* src_y, uint8* rgb_buf, int width) { v8i16 zero = {0}; for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_y, 0); + src0 = (v16u8)__msa_ld_b((void*)src_y, 
0); vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); reg0 = (v4i32)__msa_ilvr_h(zero, vec0); @@ -2768,19 +2780,19 @@ void I400ToARGBRow_MSA(const uint8* src_y, uint8* rgb_buf, int width) { dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1); dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2); dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2); - ST_UB4(dst0, dst1, dst2, dst3, rgb_buf, 16); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); src_y += 16; - rgb_buf += 64; + dst_argb += 64; } } -void J400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width) { +void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) { int x; v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_y, 0); + src0 = (v16u8)__msa_ld_b((void*)src_y, 0); vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0); vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0); vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0); @@ -2795,8 +2807,8 @@ void J400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width) { } } -void YUY2ToARGBRow_MSA(const uint8* src_yuy2, - uint8* rgb_buf, +void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { int x; @@ -2812,19 +2824,19 @@ void YUY2ToARGBRow_MSA(const uint8* src_yuy2, vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_yuy2, 0); + src0 = (v16u8)__msa_ld_b((void*)src_yuy2, 0); src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2); - STOREARGB(vec0, vec1, vec2, alpha, rgb_buf); + STOREARGB(vec0, vec1, vec2, alpha, dst_argb); src_yuy2 += 16; - rgb_buf += 32; + dst_argb += 32; } } -void UYVYToARGBRow_MSA(const uint8* src_uyvy, - uint8* rgb_buf, +void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { int x; @@ -2840,27 +2852,27 @@ void UYVYToARGBRow_MSA(const uint8* src_uyvy, vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_uyvy, 0); + src0 = (v16u8)__msa_ld_b((void*)src_uyvy, 0); src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2); - STOREARGB(vec0, vec1, vec2, alpha, rgb_buf); + STOREARGB(vec0, vec1, vec2, alpha, dst_argb); src_uyvy += 16; - rgb_buf += 32; + dst_argb += 32; } } -void InterpolateRow_MSA(uint8* dst_ptr, - const uint8* src_ptr, +void InterpolateRow_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, ptrdiff_t src_stride, int width, - int32 source_y_fraction) { - int32 y1_fraction = source_y_fraction; - int32 y0_fraction = 256 - y1_fraction; - uint16 y_fractions; - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; + int32_t source_y_fraction) { + int32_t y1_fraction = source_y_fraction; + int32_t y0_fraction = 256 - y1_fraction; + uint16_t y_fractions; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; int x; v16u8 src0, src1, src2, src3, dst0, dst1; v8u16 vec0, vec1, vec2, vec3, y_frac; @@ -2872,10 +2884,10 @@ void InterpolateRow_MSA(uint8* 
dst_ptr, if (128 == y1_fraction) { for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); - src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src0 = (v16u8)__msa_ld_b((void*)s, 0); + src1 = (v16u8)__msa_ld_b((void*)s, 16); + src2 = (v16u8)__msa_ld_b((void*)t, 0); + src3 = (v16u8)__msa_ld_b((void*)t, 16); dst0 = __msa_aver_u_b(src0, src2); dst1 = __msa_aver_u_b(src1, src3); ST_UB2(dst0, dst1, dst_ptr, 16); @@ -2886,14 +2898,14 @@ void InterpolateRow_MSA(uint8* dst_ptr, return; } - y_fractions = (uint16)(y0_fraction + (y1_fraction << 8)); + y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8)); y_frac = (v8u16)__msa_fill_h(y_fractions); for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); - src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src0 = (v16u8)__msa_ld_b((void*)s, 0); + src1 = (v16u8)__msa_ld_b((void*)s, 16); + src2 = (v16u8)__msa_ld_b((void*)t, 0); + src3 = (v16u8)__msa_ld_b((void*)t, 16); vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); @@ -2915,9 +2927,9 @@ void InterpolateRow_MSA(uint8* dst_ptr, } } -void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int width) { +void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width) { int x; - v16u8 dst0 = (v16u8)__msa_fill_w(v32); + v4i32 dst0 = __builtin_msa_fill_w(v32); for (x = 0; x < width; x += 4) { ST_UB(dst0, dst_argb); @@ -2925,7 +2937,7 @@ void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int width) { } } -void RAWToRGB24Row_MSA(const uint8* src_raw, uint8* dst_rgb24, int width) { +void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { int x; v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17}; @@ -2935,9 +2947,9 @@ void RAWToRGB24Row_MSA(const uint8* src_raw, uint8* dst_rgb24, int width) { 24, 23, 28, 27, 26, 31, 30, 29}; for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_raw, 32); + src0 = (v16u8)__msa_ld_b((void*)src_raw, 0); + src1 = (v16u8)__msa_ld_b((void*)src_raw, 16); + src2 = (v16u8)__msa_ld_b((void*)src_raw, 32); src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8); src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); @@ -2950,16 +2962,16 @@ void RAWToRGB24Row_MSA(const uint8* src_raw, uint8* dst_rgb24, int width) { } } -void MergeUVRow_MSA(const uint8* src_u, - const uint8* src_v, - uint8* dst_uv, +void MergeUVRow_MSA(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width) { int x; v16u8 src0, src1, dst0, dst1; for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_u, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_v, 0); + src0 = (v16u8)__msa_ld_b((void*)src_u, 0); + src1 = (v16u8)__msa_ld_b((void*)src_v, 0); dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0); dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0); ST_UB2(dst0, dst1, dst_uv, 16); @@ -2969,6 +2981,529 @@ void MergeUVRow_MSA(const uint8* src_u, } } +void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + int i; + v16u8 src0, src1, src2, src3, vec0, vec1, dst0; + + for (i = 0; i < width; i 
+= 16) { + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); + vec0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_a); + src_argb += 64; + dst_a += 16; + } +} + +void ARGBBlendRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 vec8, vec9, vec10, vec11, vec12, vec13; + v8u16 const_256 = (v8u16)__msa_ldi_h(256); + v16u8 const_255 = (v16u8)__msa_ldi_b(255); + v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); + src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); + vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); + vec6 = (v8u16)__msa_ilvr_b(zero, (v16i8)src3); + vec7 = (v8u16)__msa_ilvl_b(zero, (v16i8)src3); + vec8 = (v8u16)__msa_fill_h(vec0[3]); + vec9 = (v8u16)__msa_fill_h(vec0[7]); + vec10 = (v8u16)__msa_fill_h(vec1[3]); + vec11 = (v8u16)__msa_fill_h(vec1[7]); + vec8 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); + vec9 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); + vec10 = (v8u16)__msa_fill_h(vec2[3]); + vec11 = (v8u16)__msa_fill_h(vec2[7]); + vec12 = (v8u16)__msa_fill_h(vec3[3]); + vec13 = (v8u16)__msa_fill_h(vec3[7]); + vec10 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); + vec11 = (v8u16)__msa_pckev_d((v2i64)vec13, (v2i64)vec12); + vec8 = const_256 - vec8; + vec9 = const_256 - vec9; + vec10 = const_256 - vec10; + vec11 = const_256 - vec11; + vec8 *= vec4; + vec9 *= vec5; + vec10 *= vec6; + vec11 *= vec7; + vec8 = (v8u16)__msa_srai_h((v8i16)vec8, 8); + vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8); + vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8); + vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8); + vec0 += vec8; + vec1 += vec9; + vec2 += vec10; + vec3 += vec11; + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst0 = __msa_bmnz_v(dst0, const_255, mask); + dst1 = __msa_bmnz_v(dst1, const_255, mask); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb0 += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBQuantizeRow_MSA(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + v4i32 vec_scale = __msa_fill_w(scale); + v16u8 vec_int_sz = (v16u8)__msa_fill_b(interval_size); + v16u8 vec_int_ofst = (v16u8)__msa_fill_b(interval_offset); + v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((void*)dst_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)dst_argb, 16); 
+ src2 = (v16u8)__msa_ld_b((void*)dst_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)dst_argb, 48); + vec0 = (v8i16)__msa_ilvr_b(zero, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b(zero, (v16i8)src0); + vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); + vec6 = (v8i16)__msa_ilvr_b(zero, (v16i8)src3); + vec7 = (v8i16)__msa_ilvl_b(zero, (v16i8)src3); + tmp0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); + tmp1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); + tmp2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); + tmp3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); + tmp4 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec2); + tmp5 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec2); + tmp6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec3); + tmp7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec3); + tmp8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec4); + tmp9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec4); + tmp10 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec5); + tmp11 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec5); + tmp12 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec6); + tmp13 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec6); + tmp14 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec7); + tmp15 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec7); + tmp0 *= vec_scale; + tmp1 *= vec_scale; + tmp2 *= vec_scale; + tmp3 *= vec_scale; + tmp4 *= vec_scale; + tmp5 *= vec_scale; + tmp6 *= vec_scale; + tmp7 *= vec_scale; + tmp8 *= vec_scale; + tmp9 *= vec_scale; + tmp10 *= vec_scale; + tmp11 *= vec_scale; + tmp12 *= vec_scale; + tmp13 *= vec_scale; + tmp14 *= vec_scale; + tmp15 *= vec_scale; + tmp0 >>= 16; + tmp1 >>= 16; + tmp2 >>= 16; + tmp3 >>= 16; + tmp4 >>= 16; + tmp5 >>= 16; + tmp6 >>= 16; + tmp7 >>= 16; + tmp8 >>= 16; + tmp9 >>= 16; + tmp10 >>= 16; + tmp11 >>= 16; + tmp12 >>= 16; + tmp13 >>= 16; + tmp14 >>= 16; + tmp15 >>= 16; + vec0 = (v8i16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8i16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + vec2 = (v8i16)__msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); + vec3 = (v8i16)__msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); + vec4 = (v8i16)__msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); + vec5 = (v8i16)__msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); + vec6 = (v8i16)__msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); + vec7 = (v8i16)__msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + dst3 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + dst0 *= vec_int_sz; + dst1 *= vec_int_sz; + dst2 *= vec_int_sz; + dst3 *= vec_int_sz; + dst0 += vec_int_ofst; + dst1 += vec_int_ofst; + dst2 += vec_int_ofst; + dst3 += vec_int_ofst; + dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)src0, (v16i8)dst0); + dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)dst1); + dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)src2, (v16i8)dst2); + dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)src3, (v16i8)dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + dst_argb += 64; + } +} + +void ARGBColorMatrixRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + int32_t x; + v16i8 src0; + v16u8 src1, src2, dst0, dst1; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v8i16 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v4i32 
tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + v16i8 zero = {0}; + v8i16 max = __msa_ldi_h(255); + + src0 = __msa_ld_b((void*)matrix_argb, 0); + vec0 = (v8i16)__msa_ilvr_b(zero, src0); + vec1 = (v8i16)__msa_ilvl_b(zero, src0); + + for (x = 0; x < width; x += 8) { + src1 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 16); + vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); + vec6 = (v8i16)__msa_pckod_d((v2i64)vec2, (v2i64)vec2); + vec7 = (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec3); + vec8 = (v8i16)__msa_pckod_d((v2i64)vec4, (v2i64)vec4); + vec9 = (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec5); + vec2 = (v8i16)__msa_pckev_d((v2i64)vec2, (v2i64)vec2); + vec3 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec3); + vec4 = (v8i16)__msa_pckev_d((v2i64)vec4, (v2i64)vec4); + vec5 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec5); + vec10 = vec2 * vec0; + vec11 = vec2 * vec1; + vec12 = vec6 * vec0; + vec13 = vec6 * vec1; + tmp0 = __msa_hadd_s_w(vec10, vec10); + tmp1 = __msa_hadd_s_w(vec11, vec11); + tmp2 = __msa_hadd_s_w(vec12, vec12); + tmp3 = __msa_hadd_s_w(vec13, vec13); + vec14 = vec3 * vec0; + vec15 = vec3 * vec1; + vec16 = vec7 * vec0; + vec17 = vec7 * vec1; + tmp4 = __msa_hadd_s_w(vec14, vec14); + tmp5 = __msa_hadd_s_w(vec15, vec15); + tmp6 = __msa_hadd_s_w(vec16, vec16); + tmp7 = __msa_hadd_s_w(vec17, vec17); + vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); + vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); + tmp0 = __msa_hadd_s_w(vec10, vec10); + tmp1 = __msa_hadd_s_w(vec11, vec11); + tmp2 = __msa_hadd_s_w(vec12, vec12); + tmp3 = __msa_hadd_s_w(vec13, vec13); + tmp0 = __msa_srai_w(tmp0, 6); + tmp1 = __msa_srai_w(tmp1, 6); + tmp2 = __msa_srai_w(tmp2, 6); + tmp3 = __msa_srai_w(tmp3, 6); + vec2 = vec4 * vec0; + vec6 = vec4 * vec1; + vec3 = vec8 * vec0; + vec7 = vec8 * vec1; + tmp8 = __msa_hadd_s_w(vec2, vec2); + tmp9 = __msa_hadd_s_w(vec6, vec6); + tmp10 = __msa_hadd_s_w(vec3, vec3); + tmp11 = __msa_hadd_s_w(vec7, vec7); + vec4 = vec5 * vec0; + vec8 = vec5 * vec1; + vec5 = vec9 * vec0; + vec9 = vec9 * vec1; + tmp12 = __msa_hadd_s_w(vec4, vec4); + tmp13 = __msa_hadd_s_w(vec8, vec8); + tmp14 = __msa_hadd_s_w(vec5, vec5); + tmp15 = __msa_hadd_s_w(vec9, vec9); + vec14 = __msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); + vec15 = __msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); + vec16 = __msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); + vec17 = __msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); + tmp4 = __msa_hadd_s_w(vec14, vec14); + tmp5 = __msa_hadd_s_w(vec15, vec15); + tmp6 = __msa_hadd_s_w(vec16, vec16); + tmp7 = __msa_hadd_s_w(vec17, vec17); + tmp4 = __msa_srai_w(tmp4, 6); + tmp5 = __msa_srai_w(tmp5, 6); + tmp6 = __msa_srai_w(tmp6, 6); + tmp7 = __msa_srai_w(tmp7, 6); + vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); + vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); + vec10 = __msa_maxi_s_h(vec10, 0); + vec11 = __msa_maxi_s_h(vec11, 0); + vec12 = __msa_maxi_s_h(vec12, 0); + vec13 = __msa_maxi_s_h(vec13, 0); + vec10 = __msa_min_s_h(vec10, max); + vec11 = __msa_min_s_h(vec11, max); + vec12 = __msa_min_s_h(vec12, max); + vec13 = __msa_min_s_h(vec13, max); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10); + dst1 = 
(v16u8)__msa_pckev_b((v16i8)vec13, (v16i8)vec12); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void SplitUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((void*)src_uv, 0); + src1 = (v16u8)__msa_ld_b((void*)src_uv, 16); + src2 = (v16u8)__msa_ld_b((void*)src_uv, 32); + src3 = (v16u8)__msa_ld_b((void*)src_uv, 48); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_u, 16); + ST_UB2(dst2, dst3, dst_v, 16); + src_uv += 64; + dst_u += 32; + dst_v += 32; + } +} + +void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) { + int x; + v16u8 dst0 = (v16u8)__msa_fill_b(v8); + + for (x = 0; x < width; x += 16) { + ST_UB(dst0, dst); + dst += 16; + } +} + +void MirrorUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0}; + v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1}; + + src_uv += (2 * width); + + for (x = 0; x < width; x += 32) { + src_uv -= 64; + src2 = (v16u8)__msa_ld_b((void*)src_uv, 0); + src3 = (v16u8)__msa_ld_b((void*)src_uv, 16); + src0 = (v16u8)__msa_ld_b((void*)src_uv, 32); + src1 = (v16u8)__msa_ld_b((void*)src_uv, 48); + dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); + dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_v, 16); + ST_UB2(dst2, dst3, dst_u, 16); + dst_u += 32; + dst_v += 32; + } +} + +void SobelXRow_MSA(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int32_t width) { + int x; + v16u8 src0, src1, src2, src3, src4, src5, dst0; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9}; + v16i8 tmp = __msa_ldi_b(8); + v16i8 mask1 = mask0 + tmp; + v8i16 zero = {0}; + v8i16 max = __msa_ldi_h(255); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_y0, 0); + src1 = (v16u8)__msa_ld_b((void*)src_y0, 16); + src2 = (v16u8)__msa_ld_b((void*)src_y1, 0); + src3 = (v16u8)__msa_ld_b((void*)src_y1, 16); + src4 = (v16u8)__msa_ld_b((void*)src_y2, 0); + src5 = (v16u8)__msa_ld_b((void*)src_y2, 16); + vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); + vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); + vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4); + vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); + vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4); + vec5 = (v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5); + vec0 += vec2; + vec1 += vec3; + vec4 += vec2; + vec5 += vec3; + vec0 += vec4; + vec1 += vec5; + 
vec0 = __msa_add_a_h(zero, vec0); + vec1 = __msa_add_a_h(zero, vec1); + vec0 = __msa_maxi_s_h(vec0, 0); + vec1 = __msa_maxi_s_h(vec1, 0); + vec0 = __msa_min_s_h(max, vec0); + vec1 = __msa_min_s_h(max, vec1); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_sobelx); + src_y0 += 16; + src_y1 += 16; + src_y2 += 16; + dst_sobelx += 16; + } +} + +void SobelYRow_MSA(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int32_t width) { + int x; + v16u8 src0, src1, dst0; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; + v8i16 zero = {0}; + v8i16 max = __msa_ldi_h(255); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((void*)src_y0, 0); + src1 = (v16u8)__msa_ld_b((void*)src_y1, 0); + vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0); + vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); + vec0 -= vec2; + vec1 -= vec3; + vec6[0] = src_y0[16] - src_y1[16]; + vec6[1] = src_y0[17] - src_y1[17]; + vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2); + vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2); + vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4); + vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4); + vec0 += vec2; + vec1 += vec3; + vec4 += vec2; + vec5 += vec3; + vec0 += vec4; + vec1 += vec5; + vec0 = __msa_add_a_h(zero, vec0); + vec1 = __msa_add_a_h(zero, vec1); + vec0 = __msa_maxi_s_h(vec0, 0); + vec1 = __msa_maxi_s_h(vec1, 0); + vec0 = __msa_min_s_h(max, vec0); + vec1 = __msa_min_s_h(max, vec1); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_sobely); + src_y0 += 16; + src_y1 += 16; + dst_sobely += 16; + } +} + +void HalfFloatRow_MSA(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + int i; + v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4f32 fvec0, fvec1, fvec2, fvec3, fvec4, fvec5, fvec6, fvec7; + v4f32 mult_vec; + v8i16 zero = {0}; + mult_vec[0] = 1.9259299444e-34f * scale; + mult_vec = (v4f32)__msa_splati_w((v4i32)mult_vec, 0); + + for (i = 0; i < width; i += 32) { + src0 = (v8u16)__msa_ld_h((void*)src, 0); + src1 = (v8u16)__msa_ld_h((void*)src, 16); + src2 = (v8u16)__msa_ld_h((void*)src, 32); + src3 = (v8u16)__msa_ld_h((void*)src, 48); + vec0 = (v4u32)__msa_ilvr_h(zero, (v8i16)src0); + vec1 = (v4u32)__msa_ilvl_h(zero, (v8i16)src0); + vec2 = (v4u32)__msa_ilvr_h(zero, (v8i16)src1); + vec3 = (v4u32)__msa_ilvl_h(zero, (v8i16)src1); + vec4 = (v4u32)__msa_ilvr_h(zero, (v8i16)src2); + vec5 = (v4u32)__msa_ilvl_h(zero, (v8i16)src2); + vec6 = (v4u32)__msa_ilvr_h(zero, (v8i16)src3); + vec7 = (v4u32)__msa_ilvl_h(zero, (v8i16)src3); + fvec0 = __msa_ffint_u_w(vec0); + fvec1 = __msa_ffint_u_w(vec1); + fvec2 = __msa_ffint_u_w(vec2); + fvec3 = __msa_ffint_u_w(vec3); + fvec4 = __msa_ffint_u_w(vec4); + fvec5 = __msa_ffint_u_w(vec5); + fvec6 = __msa_ffint_u_w(vec6); + fvec7 = __msa_ffint_u_w(vec7); + fvec0 *= mult_vec; + fvec1 *= mult_vec; + fvec2 *= mult_vec; + fvec3 *= mult_vec; + fvec4 *= mult_vec; + fvec5 *= mult_vec; + fvec6 *= mult_vec; + fvec7 *= mult_vec; + vec0 = ((v4u32)fvec0) >> 13; + vec1 = ((v4u32)fvec1) >> 13; + vec2 = ((v4u32)fvec2) >> 13; + vec3 = ((v4u32)fvec3) >> 13; + vec4 = ((v4u32)fvec4) >> 13; + vec5 = ((v4u32)fvec5) >> 13; + vec6 = ((v4u32)fvec6) >> 13; + vec7 = ((v4u32)fvec7) >> 13; + dst0 = (v8u16)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); + dst1 = 
(v8u16)__msa_pckev_h((v8i16)vec3, (v8i16)vec2); + dst2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); + dst3 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); + ST_UH2(dst0, dst1, dst, 8); + ST_UH2(dst2, dst3, dst + 16, 8); + src += 32; + dst += 32; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/row_neon.cc b/files/source/row_neon.cc index bed14e07..a12fa790 100644 --- a/files/source/row_neon.cc +++ b/files/source/row_neon.cc @@ -22,54 +22,42 @@ extern "C" { !defined(__aarch64__) // Read 8 Y, 4 U and 4 V from 422 -#define READYUV422 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - MEMACCESS(1) \ - "vld1.32 {d2[0]}, [%1]! \n" \ - MEMACCESS(2) \ - "vld1.32 {d2[1]}, [%2]! \n" +#define READYUV422 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.32 {d2[0]}, [%1]! \n" \ + "vld1.32 {d2[1]}, [%2]! \n" // Read 8 Y, 8 U and 8 V from 444 -#define READYUV444 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - MEMACCESS(1) \ - "vld1.8 {d2}, [%1]! \n" \ - MEMACCESS(2) \ - "vld1.8 {d3}, [%2]! \n" \ - "vpaddl.u8 q1, q1 \n" \ - "vrshrn.u16 d2, q1, #1 \n" +#define READYUV444 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ + "vld1.8 {d3}, [%2]! \n" \ + "vpaddl.u8 q1, q1 \n" \ + "vrshrn.u16 d2, q1, #1 \n" // Read 8 Y, and set 4 U and 4 V to 128 #define READYUV400 \ - MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ "vmov.u8 d2, #128 \n" // Read 8 Y and 4 UV from NV12 -#define READNV12 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - MEMACCESS(1) \ - "vld1.8 {d2}, [%1]! \n" \ - "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" +#define READNV12 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" // Read 8 Y and 4 VU from NV21 -#define READNV21 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - MEMACCESS(1) \ - "vld1.8 {d2}, [%1]! \n" \ - "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ - "vuzp.u8 d3, d2 \n" \ - "vtrn.u32 d2, d3 \n" +#define READNV21 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \ + "vuzp.u8 d3, d2 \n" \ + "vtrn.u32 d2, d3 \n" // Read 8 YUY2 #define READYUY2 \ - MEMACCESS(0) \ "vld2.8 {d0, d2}, [%0]! \n" \ "vmov.u8 d3, d2 \n" \ "vuzp.u8 d2, d3 \n" \ @@ -77,26 +65,19 @@ extern "C" { // Read 8 UYVY #define READUYVY \ - MEMACCESS(0) \ "vld2.8 {d2, d3}, [%0]! \n" \ "vmov.u8 d0, d3 \n" \ "vmov.u8 d3, d2 \n" \ "vuzp.u8 d2, d3 \n" \ "vtrn.u32 d2, d3 \n" -#define YUVTORGB_SETUP \ - MEMACCESS([kUVToRB]) \ - "vld1.8 {d24}, [%[kUVToRB]] \n" \ - MEMACCESS([kUVToG]) \ - "vld1.8 {d25}, [%[kUVToG]] \n" \ - MEMACCESS([kUVBiasBGR]) \ - "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \ - MEMACCESS([kUVBiasBGR]) \ - "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \ - MEMACCESS([kUVBiasBGR]) \ - "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \ - MEMACCESS([kYToRgb]) \ - "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n" +#define YUVTORGB_SETUP \ + "vld1.8 {d24}, [%[kUVToRB]] \n" \ + "vld1.8 {d25}, [%[kUVToG]] \n" \ + "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \ + "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! 
\n" \ + "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \ + "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n" #define YUVTORGB \ "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */ \ @@ -125,156 +106,135 @@ extern "C" { "vqshrun.s16 d22, q9, #6 \n" /* R */ \ "vqshrun.s16 d21, q0, #6 \n" /* G */ -void I444ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I444ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READYUV444 - YUVTORGB - "subs %4, %4, #8 \n" - MEMACCESS(3) - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void I422ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUV444 YUVTORGB + "subs %4, %4, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void I422ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - MEMACCESS(3) - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void I422AlphaToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - const uint8* src_a, - uint8* dst_argb, + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void I422AlphaToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB - "subs %5, %5, #8 \n" - MEMACCESS(3) - "vld1.8 {d23}, [%3]! \n" - MEMACCESS(4) - "vst4.8 {d20, d21, d22, d23}, [%4]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void I422ToRGBARow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %5, %5, #8 \n" + "vld1.8 {d23}, [%3]! \n" + "vst4.8 {d20, d21, d22, d23}, [%4]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void I422ToRGBARow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d19, #255 \n" // d19 modified by YUVTORGB - MEMACCESS(3) - "vst4.8 {d19, d20, d21, d22}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgba), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void I422ToRGB24Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb24, + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vmov.u8 d19, #255 \n" // YUVTORGB modified d19 + "vst4.8 {d19, d20, d21, d22}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgba), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void I422ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - MEMACCESS(3) - "vst3.8 {d20, d21, d22}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb24), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vst3.8 {d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb24), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } #define ARGBTORGB565 \ @@ -284,34 +244,29 @@ void I422ToRGB24Row_NEON(const uint8* src_y, "vsri.16 q0, q8, #5 \n" /* RG */ \ "vsri.16 q0, q9, #11 \n" /* RGB */ -void I422ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - ARGBTORGB565 - MEMACCESS(3) - "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb565), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" ARGBTORGB565 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. 
+ "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb565), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } #define ARGBTOARGB1555 \ @@ -323,35 +278,30 @@ void I422ToRGB565Row_NEON(const uint8* src_y, "vsri.16 q0, q9, #6 \n" /* ARG */ \ "vsri.16 q0, q10, #11 \n" /* ARGB */ -void I422ToARGB1555Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, +void I422ToARGB1555Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" - ARGBTOARGB1555 - MEMACCESS(3) - "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb1555), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" ARGBTOARGB1555 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb1555), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } #define ARGBTOARGB4444 \ @@ -363,447 +313,488 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, "vorr d1, d22, d23 \n" /* RA */ \ "vzip.u8 d0, d1 \n" /* BGRA */ -void I422ToARGB4444Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" - ARGBTOARGB4444 - MEMACCESS(3) - "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb4444), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READYUV400 - YUVTORGB - "subs %2, %2, #8 \n" - MEMACCESS(1) - "vst4.8 {d20, d21, d22, d23}, [%1]! 
\n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB), - [kUVToG]"r"(&kYuvI601Constants.kUVToG), - [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR), - [kYToRgb]"r"(&kYuvI601Constants.kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d23, #255 \n" - "1: \n" - MEMACCESS(0) - "vld1.8 {d20}, [%0]! \n" - "vmov d21, d20 \n" - "vmov d22, d20 \n" - "subs %2, %2, #8 \n" - MEMACCESS(1) - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d20", "d21", "d22", "d23" - ); -} - -void NV12ToARGBRow_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d4, #0x0f \n" // vbic bits to clear + "1: \n" + + READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" ARGBTOARGB4444 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb4444), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUV400 YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB] "r"(&kYuvI601Constants.kUVToRB), + [kUVToG] "r"(&kYuvI601Constants.kUVToG), + [kUVBiasBGR] "r"(&kYuvI601Constants.kUVBiasBGR), + [kYToRgb] "r"(&kYuvI601Constants.kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d23, #255 \n" + "1: \n" + "vld1.8 {d20}, [%0]! \n" + "vmov d21, d20 \n" + "vmov d22, d20 \n" + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d20", "d21", "d22", "d23"); +} + +void NV12ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READNV12 - YUVTORGB - "subs %3, %3, #8 \n" - MEMACCESS(2) - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void NV21ToARGBRow_NEON(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READNV12 YUVTORGB + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +} + +void NV21ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READNV21 - YUVTORGB - "subs %3, %3, #8 \n" - MEMACCESS(2) - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void NV12ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READNV21 YUVTORGB + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +} + +void NV12ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + + YUVTORGB_SETUP + + "1: \n" + + READNV12 YUVTORGB + "subs %3, %3, #8 \n" + "vst3.8 {d20, d21, d22}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb24), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void NV21ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + + YUVTORGB_SETUP + + "1: \n" + + READNV21 YUVTORGB + "subs %3, %3, #8 \n" + "vst3.8 {d20, d21, d22}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_rgb24), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void NV12ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READNV12 - YUVTORGB - "subs %3, %3, #8 \n" - ARGBTORGB565 - MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. 
- "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_rgb565), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void YUY2ToARGBRow_NEON(const uint8* src_yuy2, - uint8* dst_argb, + asm volatile( + YUVTORGB_SETUP + "1: \n" READNV12 YUVTORGB + "subs %3, %3, #8 \n" ARGBTORGB565 + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READYUY2 - YUVTORGB - "subs %2, %2, #8 \n" - MEMACCESS(1) - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void UYVYToARGBRow_NEON(const uint8* src_uyvy, - uint8* dst_argb, + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUY2 YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +} + +void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READUYVY - YUVTORGB - "subs %2, %2, #8 \n" - MEMACCESS(1) - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READUYVY YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); } // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. 
-void SplitUVRow_NEON(const uint8* src_uv, - uint8* dst_u, - uint8* dst_v, +void SplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV - "subs %3, %3, #16 \n" // 16 processed per loop - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store U - MEMACCESS(2) - "vst1.8 {q1}, [%2]! \n" // store V - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV + "subs %3, %3, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%1]! \n" // store U + "vst1.8 {q1}, [%2]! \n" // store V + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List ); } // Reads 16 U's and V's and writes out 16 pairs of UV. -void MergeUVRow_NEON(const uint8* src_u, - const uint8* src_v, - uint8* dst_uv, +void MergeUVRow_NEON(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load U - MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" // load V - "subs %3, %3, #16 \n" // 16 processed per loop - MEMACCESS(2) - "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV - "bgt 1b \n" - : - "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load U + "vld1.8 {q1}, [%1]! \n" // load V + "subs %3, %3, #16 \n" // 16 processed per loop + "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV + "bgt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. +void SplitRGBRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB + "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB + "subs %4, %4, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%1]! \n" // store R + "vst1.8 {q1}, [%2]! \n" // store G + "vst1.8 {q2}, [%3]! \n" // store B + "bgt 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "d0", "d1", "d2" // Clobber List + ); +} + +// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time +void MergeRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load R + "vld1.8 {q1}, [%1]! \n" // load G + "vld1.8 {q2}, [%2]! \n" // load B + "subs %4, %4, #16 \n" // 16 processed per loop + "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB + "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "q0", "q1", "q2" // Clobber List ); } // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. 
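The patch also introduces SplitRGBRow_NEON and MergeRGBRow_NEON above; the following scalar sketch shows the packed-to-planar split and planar-to-packed merge they accelerate. The _Sketch names are illustrative and not part of the patch.

#include <stdint.h>

// Scalar sketch of the packed<->planar RGB conversion done by the new NEON
// SplitRGBRow / MergeRGBRow routines: one byte per channel per pixel.
static void SplitRGBRow_Sketch(const uint8_t* src_rgb,
                               uint8_t* dst_r,
                               uint8_t* dst_g,
                               uint8_t* dst_b,
                               int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_r[x] = src_rgb[0];
    dst_g[x] = src_rgb[1];
    dst_b[x] = src_rgb[2];
    src_rgb += 3;
  }
}

static void MergeRGBRow_Sketch(const uint8_t* src_r,
                               const uint8_t* src_g,
                               const uint8_t* src_b,
                               uint8_t* dst_rgb,
                               int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_rgb[0] = src_r[x];
    dst_rgb[1] = src_g[x];
    dst_rgb[2] = src_b[x];
    dst_rgb += 3;
  }
}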
-void CopyRow_NEON(const uint8* src, uint8* dst, int count) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 - "subs %2, %2, #32 \n" // 32 processed per loop - MEMACCESS(1) - "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(count) // %2 // Output registers - : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -// SetRow writes 'count' bytes using an 8 bit value repeated. -void SetRow_NEON(uint8* dst, uint8 v8, int count) { - asm volatile ( - "vdup.8 q0, %2 \n" // duplicate 16 bytes - "1: \n" - "subs %1, %1, #16 \n" // 16 bytes per loop - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" // store - "bgt 1b \n" - : "+r"(dst), // %0 - "+r"(count) // %1 - : "r"(v8) // %2 - : "cc", "memory", "q0" - ); -} - -// ARGBSetRow writes 'count' pixels using an 32 bit value repeated. -void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { - asm volatile ( - "vdup.u32 q0, %2 \n" // duplicate 4 ints - "1: \n" - "subs %1, %1, #4 \n" // 4 pixels per loop - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" // store - "bgt 1b \n" - : "+r"(dst), // %0 - "+r"(count) // %1 - : "r"(v32) // %2 - : "cc", "memory", "q0" - ); -} - -void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { - asm volatile ( - // Start at end of source row. - "mov r3, #-16 \n" - "add %0, %0, %2 \n" - "sub %0, #16 \n" - - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0], r3 \n" // src -= 16 - "subs %2, #16 \n" // 16 pixels per loop. - "vrev64.8 q0, q0 \n" - MEMACCESS(1) - "vst1.8 {d1}, [%1]! \n" // dst += 16 - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "r3", "q0" - ); -} - -void MirrorUVRow_NEON(const uint8* src_uv, - uint8* dst_u, - uint8* dst_v, +void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "1: \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 + "subs %2, %2, #32 \n" // 32 processed per loop + "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// SetRow writes 'width' bytes using an 8 bit value repeated. +void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { + asm volatile( + "vdup.8 q0, %2 \n" // duplicate 16 bytes + "1: \n" + "subs %1, %1, #16 \n" // 16 bytes per loop + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v8) // %2 + : "cc", "memory", "q0"); +} + +// ARGBSetRow writes 'width' pixels using an 32 bit value repeated. +void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { + asm volatile( + "vdup.u32 q0, %2 \n" // duplicate 4 ints + "1: \n" + "subs %1, %1, #4 \n" // 4 pixels per loop + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v32) // %2 + : "cc", "memory", "q0"); +} + +void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "mov r3, #-16 \n" + "add %0, %0, %2 \n" + "sub %0, #16 \n" + + "1: \n" + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #16 \n" // 16 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst1.8 {d1}, [%1]! \n" // dst += 16 + "vst1.8 {d0}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0"); +} + +void MirrorUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - // Start at end of source row. - "mov r12, #-16 \n" - "add %0, %0, %3, lsl #1 \n" - "sub %0, #16 \n" - - "1: \n" - MEMACCESS(0) - "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 - "subs %3, #8 \n" // 8 pixels per loop. - "vrev64.8 q0, q0 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // dst += 8 - MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "r12", "q0" - ); -} - -void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { - asm volatile ( - // Start at end of source row. - "mov r3, #-16 \n" - "add %0, %0, %2, lsl #2 \n" - "sub %0, #16 \n" - - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0], r3 \n" // src -= 16 - "subs %2, #4 \n" // 4 pixels per loop. - "vrev64.32 q0, q0 \n" - MEMACCESS(1) - "vst1.8 {d1}, [%1]! \n" // dst += 16 - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "r3", "q0" - ); -} - -void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d4, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - MEMACCESS(1) - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d4, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - MEMACCESS(1) - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - MEMACCESS(1) - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3" // Clobber List + asm volatile( + // Start at end of source row. + "mov r12, #-16 \n" + "add %0, %0, %3, lsl #1 \n" + "sub %0, #16 \n" + + "1: \n" + "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 + "subs %3, #8 \n" // 8 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst1.8 {d0}, [%1]! \n" // dst += 8 + "vst1.8 {d1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "r12", "q0"); +} + +void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "mov r3, #-16 \n" + "add %0, %0, %2, lsl #2 \n" + "sub %0, #16 \n" + + "1: \n" + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #4 \n" // 4 pixels per loop. + "vrev64.32 q0, q0 \n" + "vst1.8 {d1}, [%1]! 
\n" // dst += 16 + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0"); +} + +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d4, #255 \n" // Alpha + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d4, #255 \n" // Alpha + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + asm volatile( + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of + // RGB24. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3" // Clobber List ); } @@ -819,22 +810,22 @@ void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { "vorr.u8 d2, d1, d5 \n" /* R */ \ "vorr.u8 d1, d4, d6 \n" /* G */ -void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - RGB565TOARGB - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List +void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // Alpha + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List ); } @@ -865,24 +856,22 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { "vorr.u8 d2, d1, d5 \n" /* R */ \ "vorr.u8 d1, d4, d6 \n" /* G */ -void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, - uint8* dst_argb, +void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_argb, int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 
- "bgt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + asm volatile( + "vmov.u8 d3, #255 \n" // Alpha + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List ); } @@ -896,500 +885,447 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ -void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, - uint8* dst_argb, +void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_argb, int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); -} - -void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - MEMACCESS(1) - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - MEMACCESS(1) - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_raw), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. - "subs %2, %2, #16 \n" // 16 processed per loop. - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. - "subs %2, %2, #16 \n" // 16 processed per loop. - MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -void YUY2ToUV422Row_NEON(const uint8* src_yuy2, - uint8* dst_u, - uint8* dst_v, + asm volatile( + "vmov.u8 d3, #255 \n" // Alpha + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 
+ "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} + +void ARGBToRGB24Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. - "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "vst1.8 {d1}, [%1]! \n" // store 8 U. - MEMACCESS(2) - "vst1.8 {d3}, [%2]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); -} - -void UYVYToUV422Row_NEON(const uint8* src_uyvy, - uint8* dst_u, - uint8* dst_v, + asm volatile( + "1: \n" + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of + // RGB24. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + asm volatile( + "1: \n" + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. - "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 U. - MEMACCESS(2) - "vst1.8 {d2}, [%2]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); -} - -void YUY2ToUVRow_NEON(const uint8* src_yuy2, + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.8 {d1}, [%1]! \n" // store 8 U. + "vst1.8 {d3}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} + +void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! 
\n" // load 16 pixels of UYVY. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.8 {d0}, [%1]! \n" // store 8 U. + "vst1.8 {d2}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} + +void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, int stride_yuy2, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "add %1, %0, %1 \n" // stride + src_yuy2 - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. - "vrhadd.u8 d1, d1, d5 \n" // average rows of U - "vrhadd.u8 d3, d3, d7 \n" // average rows of V - MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" // store 8 U. - MEMACCESS(3) - "vst1.8 {d3}, [%3]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(stride_yuy2), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List - ); -} - -void UYVYToUVRow_NEON(const uint8* src_uyvy, + asm volatile( + "add %1, %0, %1 \n" // stride + src_yuy2 + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. + "vrhadd.u8 d1, d1, d5 \n" // average rows of U + "vrhadd.u8 d3, d3, d7 \n" // average rows of V + "vst1.8 {d1}, [%2]! \n" // store 8 U. + "vst1.8 {d3}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(stride_yuy2), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", + "d7" // Clobber List + ); +} + +void UYVYToUVRow_NEON(const uint8_t* src_uyvy, int stride_uyvy, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "add %1, %0, %1 \n" // stride + src_uyvy - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. - "vrhadd.u8 d0, d0, d4 \n" // average rows of U - "vrhadd.u8 d2, d2, d6 \n" // average rows of V - MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 U. - MEMACCESS(3) - "vst1.8 {d2}, [%3]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(stride_uyvy), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List + asm volatile( + "add %1, %0, %1 \n" // stride + src_uyvy + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. + "vrhadd.u8 d0, d0, d4 \n" // average rows of U + "vrhadd.u8 d2, d2, d6 \n" // average rows of V + "vst1.8 {d0}, [%2]! \n" // store 8 U. + "vst1.8 {d2}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(stride_uyvy), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", + "d7" // Clobber List ); } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
-void ARGBShuffleRow_NEON(const uint8* src_argb, - uint8* dst_argb, - const uint8* shuffler, +void ARGBShuffleRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, int width) { - asm volatile ( - MEMACCESS(3) - "vld1.8 {q2}, [%3] \n" // shuffler - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 4 pixels. - "subs %2, %2, #4 \n" // 4 processed per loop - "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels - "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels - MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" // store 4. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); -} - -void I422ToYUY2Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_yuy2, + asm volatile( + "vld1.8 {q2}, [%3] \n" // shuffler + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 4 pixels. + "subs %2, %2, #4 \n" // 4 processed per loop + "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels + "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + "vst1.8 {q1}, [%1]! \n" // store 4. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} + +void I422ToYUY2Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys - MEMACCESS(1) - "vld1.8 {d1}, [%1]! \n" // load 8 Us - MEMACCESS(2) - "vld1.8 {d3}, [%2]! \n" // load 8 Vs - "subs %4, %4, #16 \n" // 16 pixels - MEMACCESS(3) - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3" - ); -} - -void I422ToUYVYRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_uyvy, + asm volatile( + "1: \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + "vld1.8 {d1}, [%1]! \n" // load 8 Us + "vld1.8 {d3}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3"); +} + +void I422ToUYVYRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys - MEMACCESS(1) - "vld1.8 {d0}, [%1]! \n" // load 8 Us - MEMACCESS(2) - "vld1.8 {d2}, [%2]! \n" // load 8 Vs - "subs %4, %4, #16 \n" // 16 pixels - MEMACCESS(3) - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3" - ); -} - -void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGBTORGB565 - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. 
- "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb565), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11" - ); -} - -void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, - uint8* dst_rgb, - const uint32 dither4, + asm volatile( + "1: \n" + "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + "vld1.8 {d0}, [%1]! \n" // load 8 Us + "vld1.8 {d2}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3"); +} + +void ARGBToRGB565Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb565, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); +} + +void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, int width) { - asm volatile ( - "vdup.32 d2, %2 \n" // dither4 - "1: \n" - MEMACCESS(1) - "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d20, d20, d2 \n" - "vqadd.u8 d21, d21, d2 \n" - "vqadd.u8 d22, d22, d2 \n" - ARGBTORGB565 - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565. - "bgt 1b \n" - : "+r"(dst_rgb) // %0 - : "r"(src_argb), // %1 - "r"(dither4), // %2 - "r"(width) // %3 - : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11" - ); -} - -void ARGBToARGB1555Row_NEON(const uint8* src_argb, - uint8* dst_argb1555, + asm volatile( + "vdup.32 d2, %2 \n" // dither4 + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d20, d20, d2 \n" + "vqadd.u8 d21, d21, d2 \n" + "vqadd.u8 d22, d22, d2 \n" // add for dither + ARGBTORGB565 + "vst1.8 {q0}, [%0]! \n" // store 8 RGB565. + "bgt 1b \n" + : "+r"(dst_rgb) // %0 + : "r"(src_argb), // %1 + "r"(dither4), // %2 + "r"(width) // %3 + : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"); +} + +void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb1555, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGBTOARGB1555 - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb1555), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11" - ); -} - -void ARGBToARGB4444Row_NEON(const uint8* src_argb, - uint8* dst_argb4444, + asm volatile( + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); +} + +void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb4444, int width) { - asm volatile ( - "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. - "1: \n" - MEMACCESS(0) - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. 
- "subs %2, %2, #8 \n" // 8 processed per loop. - ARGBTOARGB4444 - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb4444), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11" - ); -} - -void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13" - ); -} - -void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels - "subs %2, %2, #16 \n" // 16 processed per loop - MEMACCESS(1) - "vst1.8 {q3}, [%1]! \n" // store 16 A's. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient - "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient - "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13" - ); + asm volatile( + "vmov.u8 d4, #0x0f \n" // bits to clear with + // vbic. + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); +} + +void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); +} + +void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q3}, [%1]! \n" // store 16 A's. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient + "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient + "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); } // 8x1 pixels. -void ARGBToUV444Row_NEON(const uint8* src_argb, - uint8* dst_u, - uint8* dst_v, +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient - "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient - "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient - "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient - "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlsl.u8 q2, d1, d25 \n" // G - "vmlsl.u8 q2, d2, d26 \n" // R - "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned - - "vmull.u8 q3, d2, d24 \n" // R - "vmlsl.u8 q3, d1, d28 \n" // G - "vmlsl.u8 q3, d0, d27 \n" // B - "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned - - "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V - - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. - MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15" - ); -} - + asm volatile( + "vmov.u8 d24, #112 \n" // UB / VR 0.875 + // coefficient + "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient + "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient + "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient + "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlsl.u8 q2, d1, d25 \n" // G + "vmlsl.u8 q2, d2, d26 \n" // R + "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned + + "vmull.u8 q3, d2, d24 \n" // R + "vmlsl.u8 q3, d1, d28 \n" // G + "vmlsl.u8 q3, d0, d27 \n" // B + "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned + + "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V + + "vst1.8 {d0}, [%1]! 
\n" // store 8 pixels U. + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", + "q15"); +} + +// clang-format off // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. #define RGBTOUV(QB, QG, QR) \ - "vmul.s16 q8, " #QB \ - ", q10 \n" /* B */ \ - "vmls.s16 q8, " #QG \ - ", q11 \n" /* G */ \ - "vmls.s16 q8, " #QR \ - ", q12 \n" /* R */ \ + "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ + "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ + "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ - "vmul.s16 q9, " #QR \ - ", q10 \n" /* R */ \ - "vmls.s16 q9, " #QG \ - ", q14 \n" /* G */ \ - "vmls.s16 q9, " #QB \ - ", q13 \n" /* B */ \ + "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ + "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ + "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ +// clang-format on // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. -void ARGBToUVRow_NEON(const uint8* src_argb, +void ARGBToUVRow_NEON(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb @@ -1399,17 +1335,13 @@ void ARGBToUVRow_NEON(const uint8* src_argb, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. - MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1421,9 +1353,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1438,10 +1368,10 @@ void ARGBToUVRow_NEON(const uint8* src_argb, } // TODO(fbarchard): Subsample match C code. -void ARGBToUVJRow_NEON(const uint8* src_argb, +void ARGBToUVJRow_NEON(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb @@ -1451,17 +1381,13 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. 
- MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1473,9 +1399,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1489,10 +1413,10 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, ); } -void BGRAToUVRow_NEON(const uint8* src_bgra, +void BGRAToUVRow_NEON(const uint8_t* src_bgra, int src_stride_bgra, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_bgra @@ -1502,17 +1426,13 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. - MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. - MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. @@ -1524,9 +1444,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q3, q2, q1) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_bgra), // %0 @@ -1540,10 +1458,10 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, ); } -void ABGRToUVRow_NEON(const uint8* src_abgr, +void ABGRToUVRow_NEON(const uint8_t* src_abgr, int src_stride_abgr, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_abgr @@ -1553,17 +1471,13 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. - MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. - MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1575,9 +1489,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q2, q1, q0) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
"bgt 1b \n" : "+r"(src_abgr), // %0 @@ -1591,10 +1503,10 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, ); } -void RGBAToUVRow_NEON(const uint8* src_rgba, +void RGBAToUVRow_NEON(const uint8_t* src_rgba, int src_stride_rgba, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_rgba @@ -1604,17 +1516,13 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. - MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. - MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. @@ -1626,9 +1534,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_rgba), // %0 @@ -1642,10 +1548,10 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, ); } -void RGB24ToUVRow_NEON(const uint8* src_rgb24, +void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, int src_stride_rgb24, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_rgb24 @@ -1655,17 +1561,13 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. - MEMACCESS(0) "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. - MEMACCESS(1) "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1677,9 +1579,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_rgb24), // %0 @@ -1693,10 +1593,10 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, ); } -void RAWToUVRow_NEON(const uint8* src_raw, +void RAWToUVRow_NEON(const uint8_t* src_raw, int src_stride_raw, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_raw @@ -1706,17 +1606,13 @@ void RAWToUVRow_NEON(const uint8* src_raw, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. - MEMACCESS(0) "vld3.8 {d1, d3, d5}, [%0]! 
\n" // load next 8 RAW pixels. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. - MEMACCESS(1) "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1728,9 +1624,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q2, q1, q0) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_raw), // %0 @@ -1745,901 +1639,815 @@ void RAWToUVRow_NEON(const uint8* src_raw, } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void RGB565ToUVRow_NEON(const uint8* src_rgb565, +void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, int src_stride_rgb565, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - RGB565TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. - RGB565TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. - RGB565TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. - RGB565TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
- "bgt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(src_stride_rgb565), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + // coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_stride_rgb565), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, +void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, int src_stride_argb1555, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. 
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(src_stride_argb1555), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + // coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_stride_argb1555), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, +void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, int src_stride_argb4444, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(src_stride_argb4444), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - RGB565TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
- "bgt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" - ); -} - -void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" - ); -} - -void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" - ); -} - -void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // R - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // B - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" - ); -} - -void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // R - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // B - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
- "bgt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" - ); -} - -void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // B - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // R - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" - ); -} - -void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" - ); -} - -void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" - ); + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + // coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. 
+ ARGB4444TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_stride_argb4444), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); +} + +void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); +} + +void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); +} + +void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); +} + +void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // R + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); +} + +void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // R + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); +} + +void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // B + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); +} + +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); +} + +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); } // Bilinear filter 16x2 -> 16x1 -void InterpolateRow_NEON(uint8* dst_ptr, - const uint8* src_ptr, +void InterpolateRow_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { int y1_fraction = source_y_fraction; - asm volatile ( - "cmp %4, #0 \n" - "beq 100f \n" - "add %2, %1 \n" - "cmp %4, #128 \n" - "beq 50f \n" - - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" - // General purpose row blend. - "1: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 1b \n" - "b 99f \n" - - // Blend 50 / 50. - "50: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 50b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - "subs %3, %3, #16 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(y1_fraction) // %4 - : - : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14" - ); + asm volatile( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #128 \n" + "beq 50f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + "vst1.8 {q0}, [%0]! 
\n" + "bgt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(y1_fraction) // %4 + : + : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"); } // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +void ARGBBlendRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { - asm volatile ( - "subs %3, #8 \n" - "blt 89f \n" - // Blend 8 pixels. - "8: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. - MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a - "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 - "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 - "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. - "bge 8b \n" - - "89: \n" - "adds %3, #8-1 \n" - "blt 99f \n" - - // Blend 1 pixels. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. - MEMACCESS(1) - "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. - "subs %3, %3, #1 \n" // 1 processed per loop. - "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a - "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 - "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 - "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 - MEMACCESS(2) - "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. - "bge 1b \n" - - "99: \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12" - ); + asm volatile( + "subs %3, #8 \n" + "blt 89f \n" + // Blend 8 pixels. + "8: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. + "bge 8b \n" + + "89: \n" + "adds %3, #8-1 \n" + "blt 99f \n" + + // Blend 1 pixels. + "1: \n" + "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. + "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. + "subs %3, %3, #1 \n" // 1 processed per loop. 
+ "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. + "bge 1b \n" + + "99: \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"); } // Attenuate 8 pixels at a time. -void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - // Attenuate 8 pixels. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q10, d0, d3 \n" // b * a - "vmull.u8 q11, d1, d3 \n" // g * a - "vmull.u8 q12, d2, d3 \n" // r * a - "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 - "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 - "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q10", "q11", "q12" - ); +void ARGBAttenuateRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + // Attenuate 8 pixels. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d0, d3 \n" // b * a + "vmull.u8 q11, d1, d3 \n" // g * a + "vmull.u8 q12, d2, d3 \n" // r * a + "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 + "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 + "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q10", "q11", "q12"); } // Quantize 8 ARGB pixels (32 bytes). // dst = (dst * scale >> 16) * interval_size + interval_offset; -void ARGBQuantizeRow_NEON(uint8* dst_argb, +void ARGBQuantizeRow_NEON(uint8_t* dst_argb, int scale, int interval_size, int interval_offset, int width) { - asm volatile ( - "vdup.u16 q8, %2 \n" - "vshr.u16 q8, q8, #1 \n" // scale >>= 1 - "vdup.u16 q9, %3 \n" // interval multiply. - "vdup.u16 q10, %4 \n" // interval add - - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. - "subs %1, %1, #8 \n" // 8 processed per loop. - "vmovl.u8 q0, d0 \n" // b (0 .. 255) - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q2, d4 \n" - "vqdmulh.s16 q0, q0, q8 \n" // b * scale - "vqdmulh.s16 q1, q1, q8 \n" // g - "vqdmulh.s16 q2, q2, q8 \n" // r - "vmul.u16 q0, q0, q9 \n" // b * interval_size - "vmul.u16 q1, q1, q9 \n" // g - "vmul.u16 q2, q2, q9 \n" // r - "vadd.u16 q0, q0, q10 \n" // b + interval_offset - "vadd.u16 q1, q1, q10 \n" // g - "vadd.u16 q2, q2, q10 \n" // r - "vqmovn.u16 d0, q0 \n" - "vqmovn.u16 d2, q1 \n" - "vqmovn.u16 d4, q2 \n" - MEMACCESS(0) - "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. 
- "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10" - ); + asm volatile( + "vdup.u16 q8, %2 \n" + "vshr.u16 q8, q8, #1 \n" // scale >>= 1 + "vdup.u16 q9, %3 \n" // interval multiply. + "vdup.u16 q10, %4 \n" // interval add + + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmovl.u8 q0, d0 \n" // b (0 .. 255) + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q2, d4 \n" + "vqdmulh.s16 q0, q0, q8 \n" // b * scale + "vqdmulh.s16 q1, q1, q8 \n" // g + "vqdmulh.s16 q2, q2, q8 \n" // r + "vmul.u16 q0, q0, q9 \n" // b * interval_size + "vmul.u16 q1, q1, q9 \n" // g + "vmul.u16 q2, q2, q9 \n" // r + "vadd.u16 q0, q0, q10 \n" // b + interval_offset + "vadd.u16 q1, q1, q10 \n" // g + "vadd.u16 q2, q2, q10 \n" // r + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d2, q1 \n" + "vqmovn.u16 d4, q2 \n" + "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"); } // Shade 8 pixels at a time by specified value. // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. -void ARGBShadeRow_NEON(const uint8* src_argb, - uint8* dst_argb, +void ARGBShadeRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, int width, - uint32 value) { - asm volatile ( - "vdup.u32 q0, %3 \n" // duplicate scale value. - "vzip.u8 d0, d1 \n" // d0 aarrggbb. - "vshr.u16 q0, q0, #1 \n" // scale / 2. - - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q10, d20 \n" // b (0 .. 255) - "vmovl.u8 q11, d22 \n" - "vmovl.u8 q12, d24 \n" - "vmovl.u8 q13, d26 \n" - "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 - "vqrdmulh.s16 q11, q11, d0[1] \n" // g - "vqrdmulh.s16 q12, q12, d0[2] \n" // r - "vqrdmulh.s16 q13, q13, d0[3] \n" // a - "vqmovn.u16 d20, q10 \n" - "vqmovn.u16 d22, q11 \n" - "vqmovn.u16 d24, q12 \n" - "vqmovn.u16 d26, q13 \n" - MEMACCESS(1) - "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "cc", "memory", "q0", "q10", "q11", "q12", "q13" - ); + uint32_t value) { + asm volatile( + "vdup.u32 q0, %3 \n" // duplicate scale value. + "vzip.u8 d0, d1 \n" // d0 aarrggbb. + "vshr.u16 q0, q0, #1 \n" // scale / 2. + + // 8 pixel loop. + "1: \n" + "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q10, d20 \n" // b (0 .. 255) + "vmovl.u8 q11, d22 \n" + "vmovl.u8 q12, d24 \n" + "vmovl.u8 q13, d26 \n" + "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 + "vqrdmulh.s16 q11, q11, d0[1] \n" // g + "vqrdmulh.s16 q12, q12, d0[2] \n" // r + "vqrdmulh.s16 q13, q13, d0[3] \n" // a + "vqmovn.u16 d20, q10 \n" + "vqmovn.u16 d22, q11 \n" + "vqmovn.u16 d24, q12 \n" + "vqmovn.u16 d26, q13 \n" + "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. 
+ "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "cc", "memory", "q0", "q10", "q11", "q12", "q13"); } // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels // Similar to ARGBToYJ but stores ARGB. // C code is (15 * b + 75 * g + 38 * r + 64) >> 7; -void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient - "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient - "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B - "vmov d1, d0 \n" // G - "vmov d2, d0 \n" // R - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13" - ); +void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient + "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient + "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B + "vmov d1, d0 \n" // G + "vmov d2, d0 \n" // R + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); } // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. // b = (r * 35 + g * 68 + b * 17) >> 7 // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 -void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d20, #17 \n" // BB coefficient - "vmov.u8 d21, #68 \n" // BG coefficient - "vmov.u8 d22, #35 \n" // BR coefficient - "vmov.u8 d24, #22 \n" // GB coefficient - "vmov.u8 d25, #88 \n" // GG coefficient - "vmov.u8 d26, #45 \n" // GR coefficient - "vmov.u8 d28, #24 \n" // BB coefficient - "vmov.u8 d29, #98 \n" // BG coefficient - "vmov.u8 d30, #50 \n" // BR coefficient - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. - "subs %1, %1, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d20 \n" // B to Sepia B - "vmlal.u8 q2, d1, d21 \n" // G - "vmlal.u8 q2, d2, d22 \n" // R - "vmull.u8 q3, d0, d24 \n" // B to Sepia G - "vmlal.u8 q3, d1, d25 \n" // G - "vmlal.u8 q3, d2, d26 \n" // R - "vmull.u8 q8, d0, d28 \n" // B to Sepia R - "vmlal.u8 q8, d1, d29 \n" // G - "vmlal.u8 q8, d2, d30 \n" // R - "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B - "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G - "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R - MEMACCESS(0) - "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. 
- "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : - : "cc", "memory", "q0", "q1", "q2", "q3", - "q10", "q11", "q12", "q13", "q14", "q15" - ); +void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d20, #17 \n" // BB coefficient + "vmov.u8 d21, #68 \n" // BG coefficient + "vmov.u8 d22, #35 \n" // BR coefficient + "vmov.u8 d24, #22 \n" // GB coefficient + "vmov.u8 d25, #88 \n" // GG coefficient + "vmov.u8 d26, #45 \n" // GR coefficient + "vmov.u8 d28, #24 \n" // BB coefficient + "vmov.u8 d29, #98 \n" // BG coefficient + "vmov.u8 d30, #50 \n" // BR coefficient + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d20 \n" // B to Sepia B + "vmlal.u8 q2, d1, d21 \n" // G + "vmlal.u8 q2, d2, d22 \n" // R + "vmull.u8 q3, d0, d24 \n" // B to Sepia G + "vmlal.u8 q3, d1, d25 \n" // G + "vmlal.u8 q3, d2, d26 \n" // R + "vmull.u8 q8, d0, d28 \n" // B to Sepia R + "vmlal.u8 q8, d1, d29 \n" // G + "vmlal.u8 q8, d2, d30 \n" // R + "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B + "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G + "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R + "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13", + "q14", "q15"); } // Tranform 8 ARGB pixels (32 bytes) with color matrix. // TODO(fbarchard): Was same as Sepia except matrix is provided. This function // needs to saturate. Consider doing a non-saturating version. -void ARGBColorMatrixRow_NEON(const uint8* src_argb, - uint8* dst_argb, - const int8* matrix_argb, +void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, int width) { - asm volatile ( - MEMACCESS(3) - "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. - "vmovl.s8 q0, d4 \n" // B,G coefficients s16. - "vmovl.s8 q1, d5 \n" // R,A coefficients s16. - - "1: \n" - MEMACCESS(0) - "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q8, d16 \n" // b (0 .. 
255) 16 bit - "vmovl.u8 q9, d18 \n" // g - "vmovl.u8 q10, d20 \n" // r - "vmovl.u8 q11, d22 \n" // a - "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B - "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G - "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R - "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A - "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B - "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G - "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R - "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B - "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G - "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R - "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B - "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G - "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R - "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B - "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G - "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R - "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A - MEMACCESS(1) - "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. + "vmovl.s8 q0, d4 \n" // B,G coefficients s16. + "vmovl.s8 q1, d5 \n" // R,A coefficients s16. + + "1: \n" + "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q8, d16 \n" // b (0 .. 
255) 16 bit + "vmovl.u8 q9, d18 \n" // g + "vmovl.u8 q10, d20 \n" // r + "vmovl.u8 q11, d22 \n" // a + "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B + "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G + "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R + "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A + "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B + "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G + "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R + "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B + "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G + "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R + "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B + "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G + "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R + "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B + "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G + "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R + "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A + "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); } // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(1) - "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q0, d0, d1 \n" // multiply B - "vmull.u8 q1, d2, d3 \n" // multiply G - "vmull.u8 q2, d4, d5 \n" // multiply R - "vmull.u8 q3, d6, d7 \n" // multiply A - "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B - "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G - "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R - "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3" - ); + asm volatile( + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. 
+ "vmull.u8 q0, d0, d1 \n" // multiply B + "vmull.u8 q1, d2, d3 \n" // multiply G + "vmull.u8 q2, d4, d5 \n" // multiply R + "vmull.u8 q3, d6, d7 \n" // multiply A + "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B + "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G + "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R + "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); } // Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +void ARGBAddRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 q0, q0, q2 \n" // add B, G - "vqadd.u8 q1, q1, q3 \n" // add R, A - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3" - ); + asm volatile( + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 q0, q0, q2 \n" // add B, G + "vqadd.u8 q1, q1, q3 \n" // add R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); } // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_NEON(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +void ARGBSubtractRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqsub.u8 q0, q0, q2 \n" // subtract B, G - "vqsub.u8 q1, q1, q3 \n" // subtract R, A - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3" - ); + asm volatile( + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqsub.u8 q0, q0, q2 \n" // subtract B, G + "vqsub.u8 q1, q1, q3 \n" // subtract R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); } // Adds Sobel X and Sobel Y and stores Sobel into ARGB. 
@@ -2647,58 +2455,50 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, // R = Sobel // G = Sobel // B = Sobel -void SobelRow_NEON(const uint8* src_sobelx, - const uint8* src_sobely, - uint8* dst_argb, +void SobelRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. - MEMACCESS(1) - "vld1.8 {d1}, [%1]! \n" // load 8 sobely. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d0, d0, d1 \n" // add - "vmov.u8 d1, d0 \n" - "vmov.u8 d2, d0 \n" - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1" - ); + asm volatile( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d1}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d0, d0, d1 \n" // add + "vmov.u8 d1, d0 \n" + "vmov.u8 d2, d0 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1"); } // Adds Sobel X and Sobel Y and stores Sobel into plane. -void SobelToPlaneRow_NEON(const uint8* src_sobelx, - const uint8* src_sobely, - uint8* dst_y, +void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, int width) { - asm volatile ( - // 16 pixel loop. - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. - MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" // load 16 sobely. - "subs %3, %3, #16 \n" // 16 processed per loop. - "vqadd.u8 q0, q0, q1 \n" // add - MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" // store 16 pixels. - "bgt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1" - ); + asm volatile( + // 16 pixel loop. + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + "vld1.8 {q1}, [%1]! \n" // load 16 sobely. + "subs %3, %3, #16 \n" // 16 processed per loop. + "vqadd.u8 q0, q0, q1 \n" // add + "vst1.8 {q0}, [%2]! \n" // store 16 pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1"); } // Mixes Sobel X, Sobel Y and Sobel into ARGB. @@ -2706,75 +2506,64 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, // R = Sobel X // G = Sobel // B = Sobel Y -void SobelXYRow_NEON(const uint8* src_sobelx, - const uint8* src_sobely, - uint8* dst_argb, +void SobelXYRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. - MEMACCESS(1) - "vld1.8 {d0}, [%1]! \n" // load 8 sobely. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d1, d0, d2 \n" // add - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1" - ); + asm volatile( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d0}, [%1]! 
\n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d1, d0, d2 \n" // add + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1"); } // SobelX as a matrix is // -1 0 1 // -2 0 2 // -1 0 1 -void SobelXRow_NEON(const uint8* src_y0, - const uint8* src_y1, - const uint8* src_y2, - uint8* dst_sobelx, +void SobelXRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {d0}, [%0],%5 \n" // top - MEMACCESS(0) - "vld1.8 {d1}, [%0],%6 \n" - "vsubl.u8 q0, d0, d1 \n" - MEMACCESS(1) - "vld1.8 {d2}, [%1],%5 \n" // center * 2 - MEMACCESS(1) - "vld1.8 {d3}, [%1],%6 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" - MEMACCESS(2) - "vld1.8 {d2}, [%2],%5 \n" // bottom - MEMACCESS(2) - "vld1.8 {d3}, [%2],%6 \n" - "subs %4, %4, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" - MEMACCESS(3) - "vst1.8 {d0}, [%3]! \n" // store 8 sobelx - "bgt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : "r"(2), // %5 - "r"(6) // %6 - : "cc", "memory", "q0", "q1" // Clobber List + asm volatile( + "1: \n" + "vld1.8 {d0}, [%0],%5 \n" // top + "vld1.8 {d1}, [%0],%6 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.8 {d2}, [%1],%5 \n" // center * 2 + "vld1.8 {d3}, [%1],%6 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.8 {d2}, [%2],%5 \n" // bottom + "vld1.8 {d3}, [%2],%6 \n" + "subs %4, %4, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.8 {d0}, [%3]! \n" // store 8 sobelx + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2), // %5 + "r"(6) // %6 + : "cc", "memory", "q0", "q1" // Clobber List ); } @@ -2782,99 +2571,317 @@ void SobelXRow_NEON(const uint8* src_y0, // -1 -2 -1 // 0 0 0 // 1 2 1 -void SobelYRow_NEON(const uint8* src_y0, - const uint8* src_y1, - uint8* dst_sobely, +void SobelYRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {d0}, [%0],%4 \n" // left - MEMACCESS(1) - "vld1.8 {d1}, [%1],%4 \n" - "vsubl.u8 q0, d0, d1 \n" - MEMACCESS(0) - "vld1.8 {d2}, [%0],%4 \n" // center * 2 - MEMACCESS(1) - "vld1.8 {d3}, [%1],%4 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" - MEMACCESS(0) - "vld1.8 {d2}, [%0],%5 \n" // right - MEMACCESS(1) - "vld1.8 {d3}, [%1],%5 \n" - "subs %3, %3, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" - MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 sobely - "bgt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : "r"(1), // %4 - "r"(6) // %5 - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { - asm volatile ( - "vdup.32 q0, %3 \n" - - "1: \n" - MEMACCESS(0) - "vld1.8 {q1}, [%0]! 
\n" // load 8 shorts - "subs %2, %2, #8 \n" // 8 pixels per loop - "vmovl.u16 q2, d2 \n" // 8 int's - "vmovl.u16 q3, d3 \n" - "vcvt.f32.u32 q2, q2 \n" // 8 floats - "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, q0 \n" // adjust exponent - "vmul.f32 q3, q3, q0 \n" - "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat - "vqshrn.u32 d3, q3, #13 \n" - MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(1.9259299444e-34f) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3" - ); -} - -// TODO(fbarchard): multiply by element. -void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) { - asm volatile ( - "vdup.32 q0, %3 \n" - - "1: \n" - MEMACCESS(0) - "vld1.8 {q1}, [%0]! \n" // load 8 shorts - "subs %2, %2, #8 \n" // 8 pixels per loop - "vmovl.u16 q2, d2 \n" // 8 int's - "vmovl.u16 q3, d3 \n" - "vcvt.f32.u32 q2, q2 \n" // 8 floats - "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, q0 \n" // adjust exponent - "vmul.f32 q3, q3, q0 \n" - "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat - "vqshrn.u32 d3, q3, #13 \n" - MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(scale * 1.9259299444e-34f) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3" - ); + asm volatile( + "1: \n" + "vld1.8 {d0}, [%0],%4 \n" // left + "vld1.8 {d1}, [%1],%4 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.8 {d2}, [%0],%4 \n" // center * 2 + "vld1.8 {d3}, [%1],%4 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.8 {d2}, [%0],%5 \n" // right + "vld1.8 {d3}, [%1],%5 \n" + "subs %3, %3, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.8 {d0}, [%2]! \n" // store 8 sobely + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1), // %4 + "r"(6) // %5 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// %y passes a float as a scalar vector for vector * scalar multiply. +// the regoster must be d0 to d15 and indexed with [0] or [1] to access +// the float in the first or second float of the d-reg + +void HalfFloat1Row_NEON(const uint16_t* src, + uint16_t* dst, + float /*unused*/, + int width) { + asm volatile( + + "1: \n" + "vld1.8 {q1}, [%0]! \n" // load 8 shorts + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // adjust exponent + "vmul.f32 q3, q3, %y3 \n" + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d3, q3, #13 \n" + "vst1.8 {q1}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(1.9259299444e-34f) // %3 + : "cc", "memory", "q1", "q2", "q3"); +} + +void HalfFloatRow_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + asm volatile( + + "1: \n" + "vld1.8 {q1}, [%0]! \n" // load 8 shorts + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // adjust exponent + "vmul.f32 q3, q3, %y3 \n" + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d3, q3, #13 \n" + "vst1.8 {q1}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale * 1.9259299444e-34f) // %3 + : "cc", "memory", "q1", "q2", "q3"); +} + +void ByteToFloatRow_NEON(const uint8_t* src, + float* dst, + float scale, + int width) { + asm volatile( + + "1: \n" + "vld1.8 {d2}, [%0]! \n" // load 8 bytes + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u8 q1, d2 \n" // 8 shorts + "vmovl.u16 q2, d2 \n" // 8 ints + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // scale + "vmul.f32 q3, q3, %y3 \n" + "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale) // %3 + : "cc", "memory", "q1", "q2", "q3"); +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussCol_NEON(const uint16_t* src0, + const uint16_t* src1, + const uint16_t* src2, + const uint16_t* src3, + const uint16_t* src4, + uint32_t* dst, + int width) { + asm volatile( + "vmov.u16 d6, #4 \n" // constant 4 + "vmov.u16 d7, #6 \n" // constant 6 + + "1: \n" + "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows + "vld1.16 {q2}, [%4]! \n" + "vaddl.u16 q0, d2, d4 \n" // * 1 + "vaddl.u16 q1, d3, d5 \n" // * 1 + "vld1.16 {q2}, [%1]! \n" + "vmlal.u16 q0, d4, d6 \n" // * 4 + "vmlal.u16 q1, d5, d6 \n" // * 4 + "vld1.16 {q2}, [%2]! \n" + "vmlal.u16 q0, d4, d7 \n" // * 6 + "vmlal.u16 q1, d5, d7 \n" // * 6 + "vld1.16 {q2}, [%3]! \n" + "vmlal.u16 q0, d4, d6 \n" // * 4 + "vmlal.u16 q1, d5, d6 \n" // * 4 + "subs %6, %6, #8 \n" // 8 processed per loop + "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples + "bgt 1b \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(src4), // %4 + "+r"(dst), // %5 + "+r"(width) // %6 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { + const uint32_t* src1 = src + 1; + const uint32_t* src2 = src + 2; + const uint32_t* src3 = src + 3; + asm volatile( + "vmov.u32 q10, #4 \n" // constant 4 + "vmov.u32 q11, #6 \n" // constant 6 + + "1: \n" + "vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples + "vld1.32 {q2}, [%0] \n" + "vadd.u32 q0, q0, q1 \n" // * 1 + "vadd.u32 q1, q1, q2 \n" // * 1 + "vld1.32 {q2, q3}, [%2]! \n" + "vmla.u32 q0, q2, q11 \n" // * 6 + "vmla.u32 q1, q3, q11 \n" // * 6 + "vld1.32 {q2, q3}, [%1]! \n" + "vld1.32 {q8, q9}, [%3]! \n" + "vadd.u32 q2, q2, q8 \n" // add rows for * 4 + "vadd.u32 q3, q3, q9 \n" + "vmla.u32 q0, q2, q10 \n" // * 4 + "vmla.u32 q1, q3, q10 \n" // * 4 + "subs %5, %5, #8 \n" // 8 processed per loop + "vqshrn.u32 d0, q0, #8 \n" // round and pack + "vqshrn.u32 d1, q1, #8 \n" + "vst1.u16 {q0}, [%4]! \n" // store 8 samples + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(dst), // %4 + "+r"(width) // %5 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); +} + +// Convert biplanar NV21 to packed YUV24 +void NV21ToYUV24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q2}, [%0]! \n" // load 16 Y values + "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values + "vmov d1, d0 \n" + "vzip.u8 d0, d1 \n" // VV + "vmov d3, d2 \n" + "vzip.u8 d2, d3 \n" // UU + "subs %3, %3, #16 \n" // 16 pixels per loop + "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels + "vst3.8 {d1, d3, d5}, [%2]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2"); +} + +void AYUVToUVRow_NEON(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_uv, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_AYUV + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV + // pixels. + "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV + // pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV + // pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vqrshrun.s16 d1, q0, #2 \n" // 2x2 average + "vqrshrun.s16 d0, q1, #2 \n" + "subs %3, %3, #16 \n" // 16 processed per loop. + "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV. + "bgt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_stride_ayuv), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); +} + +void AYUVToVURow_NEON(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_vu, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_AYUV + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV + // pixels. + "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV + // pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV + // pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average + "vqrshrun.s16 d1, q1, #2 \n" + "subs %3, %3, #16 \n" // 16 processed per loop. + "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU. + "bgt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_stride_ayuv), // %1 + "+r"(dst_vu), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); +} + +// Copy row of AYUV Y's into Y. +// Similar to ARGBExtractAlphaRow_NEON +void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q2}, [%1]! \n" // store 16 Y's. + "bgt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +// Convert biplanar UV channel of NV12 to NV21 +void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + asm volatile( + "1: \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values + "vld2.8 {d1, d3}, [%0]! \n" + "vorr.u8 q2, q0, q0 \n" // move U after V + "subs %2, %2, #16 \n" // 16 pixels per loop + "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2"); } #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__).. 
diff --git a/files/source/row_neon64.cc b/files/source/row_neon64.cc index ebd685e4..f5cbb470 100644 --- a/files/source/row_neon64.cc +++ b/files/source/row_neon64.cc @@ -19,54 +19,42 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) // Read 8 Y, 4 U and 4 V from 422 -#define READYUV422 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - MEMACCESS(1) \ - "ld1 {v1.s}[0], [%1], #4 \n" \ - MEMACCESS(2) \ - "ld1 {v1.s}[1], [%2], #4 \n" +#define READYUV422 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v1.s}[0], [%1], #4 \n" \ + "ld1 {v1.s}[1], [%2], #4 \n" // Read 8 Y, 8 U and 8 V from 444 -#define READYUV444 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - MEMACCESS(1) \ - "ld1 {v1.d}[0], [%1], #8 \n" \ - MEMACCESS(2) \ - "ld1 {v1.d}[1], [%2], #8 \n" \ - "uaddlp v1.8h, v1.16b \n" \ - "rshrn v1.8b, v1.8h, #1 \n" +#define READYUV444 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v1.d}[0], [%1], #8 \n" \ + "ld1 {v1.d}[1], [%2], #8 \n" \ + "uaddlp v1.8h, v1.16b \n" \ + "rshrn v1.8b, v1.8h, #1 \n" // Read 8 Y, and set 4 U and 4 V to 128 #define READYUV400 \ - MEMACCESS(0) \ "ld1 {v0.8b}, [%0], #8 \n" \ "movi v1.8b , #128 \n" // Read 8 Y and 4 UV from NV12 -#define READNV12 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - MEMACCESS(1) \ - "ld1 {v2.8b}, [%1], #8 \n" \ - "uzp1 v1.8b, v2.8b, v2.8b \n" \ - "uzp2 v3.8b, v2.8b, v2.8b \n" \ - "ins v1.s[1], v3.s[0] \n" +#define READNV12 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v2.8b}, [%1], #8 \n" \ + "uzp1 v1.8b, v2.8b, v2.8b \n" \ + "uzp2 v3.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" // Read 8 Y and 4 VU from NV21 -#define READNV21 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - MEMACCESS(1) \ - "ld1 {v2.8b}, [%1], #8 \n" \ - "uzp1 v3.8b, v2.8b, v2.8b \n" \ - "uzp2 v1.8b, v2.8b, v2.8b \n" \ - "ins v1.s[1], v3.s[0] \n" +#define READNV21 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v2.8b}, [%1], #8 \n" \ + "uzp1 v3.8b, v2.8b, v2.8b \n" \ + "uzp2 v1.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" // Read 8 YUY2 #define READYUY2 \ - MEMACCESS(0) \ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \ "uzp2 v3.8b, v1.8b, v1.8b \n" \ "uzp1 v1.8b, v1.8b, v1.8b \n" \ @@ -74,7 +62,6 @@ extern "C" { // Read 8 UYVY #define READUYVY \ - MEMACCESS(0) \ "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \ "orr v0.8b, v3.8b, v3.8b \n" \ "uzp1 v1.8b, v2.8b, v2.8b \n" \ @@ -125,10 +112,10 @@ extern "C" { ".8h, #6 \n" /* G */ \ "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ -void I444ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I444ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -138,7 +125,6 @@ void I444ToARGBRow_NEON(const uint8* src_y, READYUV444 YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -155,10 +141,10 @@ void I444ToARGBRow_NEON(const uint8* src_y, ); } -void I422ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -168,7 +154,6 @@ void I422ToARGBRow_NEON(const uint8* src_y, READYUV422 YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -185,11 +170,11 @@ 
void I422ToARGBRow_NEON(const uint8* src_y, ); } -void I422AlphaToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - const uint8* src_a, - uint8* dst_argb, +void I422AlphaToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -197,10 +182,8 @@ void I422AlphaToARGBRow_NEON(const uint8* src_y, "1: \n" READYUV422 YUVTORGB(v22, v21, v20) - MEMACCESS(3) "ld1 {v23.8b}, [%3], #8 \n" "subs %w5, %w5, #8 \n" - MEMACCESS(4) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -218,10 +201,10 @@ void I422AlphaToARGBRow_NEON(const uint8* src_y, ); } -void I422ToRGBARow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, +void I422ToRGBARow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -231,7 +214,6 @@ void I422ToRGBARow_NEON(const uint8* src_y, READYUV422 YUVTORGB(v23, v22, v21) "subs %w4, %w4, #8 \n" - MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -248,10 +230,10 @@ void I422ToRGBARow_NEON(const uint8* src_y, ); } -void I422ToRGB24Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb24, +void I422ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -260,7 +242,6 @@ void I422ToRGB24Row_NEON(const uint8* src_y, READYUV422 YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - MEMACCESS(3) "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -284,34 +265,31 @@ void I422ToRGB24Row_NEON(const uint8* src_y, "sri v0.8h, v21.8h, #5 \n" /* RG */ \ "sri v0.8h, v20.8h, #11 \n" /* RGB */ -void I422ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" - ARGBTORGB565 - MEMACCESS(3) - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb565), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB( + v22, v21, + v20) "subs %w4, %w4, #8 \n" ARGBTORGB565 + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels + // RGB565. 
+ "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb565), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); } #define ARGBTOARGB1555 \ @@ -323,35 +301,32 @@ void I422ToRGB565Row_NEON(const uint8* src_y, "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ "sri v0.8h, v20.8h, #11 \n" /* ARGB */ -void I422ToARGB1555Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, +void I422ToARGB1555Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" - READYUV422 - YUVTORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" - ARGBTOARGB1555 - MEMACCESS(3) - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb1555), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v23.8b, #255 \n" + "1: \n" READYUV422 YUVTORGB( + v22, v21, + v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555 + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels + // RGB565. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb1555), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); } #define ARGBTOARGB4444 \ @@ -364,10 +339,10 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ -void I422ToARGB4444Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -379,7 +354,6 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, "subs %w4, %w4, #8 \n" "movi v23.8b, #255 \n" ARGBTOARGB4444 - MEMACCESS(3) "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. 
"b.gt 1b \n" : "+r"(src_y), // %0 @@ -396,7 +370,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, ); } -void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { +void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { asm volatile ( YUVTORGB_SETUP "movi v23.8b, #255 \n" @@ -404,7 +378,6 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { READYUV400 YUVTORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" - MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -419,29 +392,26 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { ); } -void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { - asm volatile ( - "movi v23.8b, #255 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v20.8b}, [%0], #8 \n" - "orr v21.8b, v20.8b, v20.8b \n" - "orr v22.8b, v20.8b, v20.8b \n" - "subs %w2, %w2, #8 \n" - MEMACCESS(1) - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v20", "v21", "v22", "v23" - ); +void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "movi v23.8b, #255 \n" + "1: \n" + "ld1 {v20.8b}, [%0], #8 \n" + "orr v21.8b, v20.8b, v20.8b \n" + "orr v22.8b, v20.8b, v20.8b \n" + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v20", "v21", "v22", "v23"); } -void NV12ToARGBRow_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV12ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -451,7 +421,6 @@ void NV12ToARGBRow_NEON(const uint8* src_y, READNV12 YUVTORGB(v22, v21, v20) "subs %w3, %w3, #8 \n" - MEMACCESS(2) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -467,9 +436,9 @@ void NV12ToARGBRow_NEON(const uint8* src_y, ); } -void NV21ToARGBRow_NEON(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, +void NV21ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -479,7 +448,6 @@ void NV21ToARGBRow_NEON(const uint8* src_y, READNV21 YUVTORGB(v22, v21, v20) "subs %w3, %w3, #8 \n" - MEMACCESS(2) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -495,24 +463,22 @@ void NV21ToARGBRow_NEON(const uint8* src_y, ); } -void NV12ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { +void NV12ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { asm volatile ( YUVTORGB_SETUP "1: \n" READNV12 YUVTORGB(v22, v21, v20) "subs %w3, %w3, #8 \n" - ARGBTORGB565 - MEMACCESS(2) - "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. 
+ "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 - "+r"(dst_rgb565), // %2 + "+r"(dst_rgb24), // %2 "+r"(width) // %3 : [kUVToRB]"r"(&yuvconstants->kUVToRB), [kUVToG]"r"(&yuvconstants->kUVToG), @@ -523,8 +489,59 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, ); } -void YUY2ToARGBRow_NEON(const uint8* src_yuy2, - uint8* dst_argb, +void NV21ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "1: \n" + READNV21 + YUVTORGB(v22, v21, v20) + "subs %w3, %w3, #8 \n" + "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_rgb24), // %2 + "+r"(width) // %3 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void NV12ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READNV12 YUVTORGB( + v22, v21, + v20) "subs %w3, %w3, #8 \n" ARGBTORGB565 + "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels + // RGB565. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); +} + +void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -534,7 +551,6 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, READYUY2 YUVTORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" - MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "b.gt 1b \n" : "+r"(src_yuy2), // %0 @@ -549,8 +565,8 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, ); } -void UYVYToARGBRow_NEON(const uint8* src_uyvy, - uint8* dst_argb, +void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -560,7 +576,6 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, READUYVY YUVTORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" - MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" "b.gt 1b \n" : "+r"(src_uyvy), // %0 @@ -576,231 +591,250 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, } // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. 
-void SplitUVRow_NEON(const uint8* src_uv, - uint8* dst_u, - uint8* dst_v, +void SplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV - "subs %w3, %w3, #16 \n" // 16 processed per loop - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" // store U - MEMACCESS(2) - "st1 {v1.16b}, [%2], #16 \n" // store V - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1" // Clobber List + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV + "subs %w3, %w3, #16 \n" // 16 processed per loop + "st1 {v0.16b}, [%1], #16 \n" // store U + "st1 {v1.16b}, [%2], #16 \n" // store V + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List ); } // Reads 16 U's and V's and writes out 16 pairs of UV. -void MergeUVRow_NEON(const uint8* src_u, - const uint8* src_v, - uint8* dst_uv, +void MergeUVRow_NEON(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load U - MEMACCESS(1) - "ld1 {v1.16b}, [%1], #16 \n" // load V - "subs %w3, %w3, #16 \n" // 16 processed per loop - MEMACCESS(2) - "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV - "b.gt 1b \n" - : - "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. -void CopyRow_NEON(const uint8* src, uint8* dst, int count) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 - "subs %w2, %w2, #32 \n" // 32 processed per loop - MEMACCESS(1) - "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(count) // %2 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -// SetRow writes 'count' bytes using an 8 bit value repeated. -void SetRow_NEON(uint8* dst, uint8 v8, int count) { - asm volatile ( - "dup v0.16b, %w2 \n" // duplicate 16 bytes - "1: \n" - "subs %w1, %w1, #16 \n" // 16 bytes per loop - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" - : "+r"(dst), // %0 - "+r"(count) // %1 - : "r"(v8) // %2 - : "cc", "memory", "v0" - ); -} - -void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { - asm volatile ( - "dup v0.4s, %w2 \n" // duplicate 4 ints - "1: \n" - "subs %w1, %w1, #4 \n" // 4 ints per loop - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" - : "+r"(dst), // %0 - "+r"(count) // %1 - : "r"(v32) // %2 - : "cc", "memory", "v0" - ); -} - -void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { - asm volatile ( - // Start at end of source row. - "add %0, %0, %w2, sxtw \n" - "sub %0, %0, #16 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 - "subs %w2, %w2, #16 \n" // 16 pixels per loop. 
- "rev64 v0.16b, v0.16b \n" - MEMACCESS(1) - "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 - MEMACCESS(1) - "st1 {v0.D}[0], [%1], #8 \n" - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)-16) // %3 - : "cc", "memory", "v0" - ); -} - -void MirrorUVRow_NEON(const uint8* src_uv, - uint8* dst_u, - uint8* dst_v, + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load U + "ld1 {v1.16b}, [%1], #16 \n" // load V + "subs %w3, %w3, #16 \n" // 16 processed per loop + "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV + "b.gt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. +void SplitRGBRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, int width) { - asm volatile ( - // Start at end of source row. - "add %0, %0, %w3, sxtw #1 \n" - "sub %0, %0, #16 \n" - "1: \n" - MEMACCESS(0) - "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 - "subs %w3, %w3, #8 \n" // 8 pixels per loop. - "rev64 v0.8b, v0.8b \n" - "rev64 v1.8b, v1.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // dst += 8 - MEMACCESS(2) - "st1 {v1.8b}, [%2], #8 \n" - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)-16) // %4 - : "cc", "memory", "v0", "v1" - ); -} - -void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { - asm volatile ( - // Start at end of source row. - "add %0, %0, %w2, sxtw #2 \n" - "sub %0, %0, #16 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 - "subs %w2, %w2, #4 \n" // 4 pixels per loop. - "rev64 v0.4s, v0.4s \n" - MEMACCESS(1) - "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 - MEMACCESS(1) - "st1 {v0.D}[0], [%1], #8 \n" - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)-16) // %3 - : "cc", "memory", "v0" - ); -} - -void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { - asm volatile ( - "movi v4.8b, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - MEMACCESS(1) - "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List - ); -} - -void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { - asm volatile ( - "movi v5.8b, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v3.8b, v1.8b, v1.8b \n" // move g - "orr v4.8b, v0.8b, v0.8b \n" // move r - MEMACCESS(1) - "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); -} - -void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "orr v3.8b, v1.8b, v1.8b \n" // move g - "orr v4.8b, v0.8b, v0.8b \n" // move r - MEMACCESS(1) - "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List + asm volatile( + "1: \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB + "subs %w4, %w4, #16 \n" // 16 processed per loop + "st1 {v0.16b}, [%1], #16 \n" // store R + "st1 {v1.16b}, [%2], #16 \n" // store G + "st1 {v2.16b}, [%3], #16 \n" // store B + "b.gt 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); +} + +// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time +void MergeRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load R + "ld1 {v1.16b}, [%1], #16 \n" // load G + "ld1 {v2.16b}, [%2], #16 \n" // load B + "subs %w4, %w4, #16 \n" // 16 processed per loop + "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); +} + +// Copy multiple of 32. +void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "1: \n" + "ldp q0, q1, [%0], #32 \n" + "subs %w2, %w2, #32 \n" // 32 processed per loop + "stp q0, q1, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +// SetRow writes 'width' bytes using an 8 bit value repeated. +void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { + asm volatile( + "dup v0.16b, %w2 \n" // duplicate 16 bytes + "1: \n" + "subs %w1, %w1, #16 \n" // 16 bytes per loop + "st1 {v0.16b}, [%0], #16 \n" // store + "b.gt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v8) // %2 + : "cc", "memory", "v0"); +} + +void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { + asm volatile( + "dup v0.4s, %w2 \n" // duplicate 4 ints + "1: \n" + "subs %w1, %w1, #4 \n" // 4 ints per loop + "st1 {v0.16b}, [%0], #16 \n" // store + "b.gt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v32) // %2 + : "cc", "memory", "v0"); +} + +void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "add %0, %0, %w2, sxtw \n" + "sub %0, %0, #16 \n" + "1: \n" + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 + "subs %w2, %w2, #16 \n" // 16 pixels per loop. + "rev64 v0.16b, v0.16b \n" + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 + "st1 {v0.D}[0], [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)-16) // %3 + : "cc", "memory", "v0"); +} + +void MirrorUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + // Start at end of source row. + "add %0, %0, %w3, sxtw #1 \n" + "sub %0, %0, #16 \n" + "1: \n" + "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 + "subs %w3, %w3, #8 \n" // 8 pixels per loop. 
+ "rev64 v0.8b, v0.8b \n" + "rev64 v1.8b, v1.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // dst += 8 + "st1 {v1.8b}, [%2], #8 \n" + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((ptrdiff_t)-16) // %4 + : "cc", "memory", "v0", "v1"); +} + +void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "add %0, %0, %w2, sxtw #2 \n" + "sub %0, %0, #16 \n" + "1: \n" + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 + "subs %w2, %w2, #4 \n" // 4 pixels per loop. + "rev64 v0.4s, v0.4s \n" + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 + "st1 {v0.D}[0], [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)-16) // %3 + : "cc", "memory", "v0"); +} + +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v4.8b, #255 \n" // Alpha + "1: \n" + "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + asm volatile( + "movi v5.8b, #255 \n" // Alpha + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v3.8b, v1.8b, v1.8b \n" // move g + "orr v4.8b, v0.8b, v0.8b \n" // move r + "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} + +void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + asm volatile( + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v3.8b, v1.8b, v1.8b \n" // move g + "orr v4.8b, v0.8b, v0.8b \n" // move r + "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List ); } @@ -817,22 +851,22 @@ void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ "dup v2.2D, v0.D[1] \n" /* R */ -void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { - asm volatile ( - "movi v3.8b, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - RGB565TOARGB - MEMACCESS(1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List +void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // Alpha + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ RGB565TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List ); } @@ -873,24 +907,23 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ "dup v1.2D, v0.D[1] \n" /* G */ -void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, - uint8* dst_argb, +void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_argb, int width) { - asm volatile ( - "movi v3.8b, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - MEMACCESS(1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + asm volatile( + "movi v3.8b, #255 \n" // Alpha + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + // pixels + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } @@ -906,477 +939,429 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, "dup v0.2D, v2.D[1] \n" \ "dup v1.2D, v3.D[1] \n" -void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, - uint8* dst_argb, +void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_argb, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - MEMACCESS(1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); -} - -void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - MEMACCESS(1) - "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List - ); -} - -void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v4.8b, v2.8b, v2.8b \n" // mov g - "orr v5.8b, v1.8b, v1.8b \n" // mov b - MEMACCESS(1) - "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_raw), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); -} - -void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. 
- "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - MEMACCESS(1) - "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -void YUY2ToUV422Row_NEON(const uint8* src_yuy2, - uint8* dst_u, - uint8* dst_v, + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + // pixels + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +void ARGBToRGB24Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels - "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "st1 {v1.8b}, [%1], #8 \n" // store 8 U. - MEMACCESS(2) - "st1 {v3.8b}, [%2], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void UYVYToUV422Row_NEON(const uint8* src_uyvy, - uint8* dst_u, - uint8* dst_v, + asm volatile( + "1: \n" + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of + // RGB24. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + asm volatile( + "1: \n" + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v4.8b, v2.8b, v2.8b \n" // mov g + "orr v5.8b, v1.8b, v1.8b \n" // mov b + "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} + +void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. 
+ "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels - "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 U. - MEMACCESS(2) - "st1 {v2.8b}, [%2], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void YUY2ToUVRow_NEON(const uint8* src_yuy2, + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "st1 {v1.8b}, [%1], #8 \n" // store 8 U. + "st1 {v3.8b}, [%2], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "st1 {v0.8b}, [%1], #8 \n" // store 8 U. + "st1 {v2.8b}, [%2], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, int stride_yuy2, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - const uint8* src_yuy2b = src_yuy2 + stride_yuy2; - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row - "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U - "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V - MEMACCESS(2) - "st1 {v1.8b}, [%2], #8 \n" // store 8 U. - MEMACCESS(3) - "st1 {v3.8b}, [%3], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(src_yuy2b), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", - "v5", "v6", "v7" // Clobber List - ); -} - -void UYVYToUVRow_NEON(const uint8* src_uyvy, + const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U + "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V + "st1 {v1.8b}, [%2], #8 \n" // store 8 U. + "st1 {v3.8b}, [%3], #8 \n" // store 8 V. 
+ "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(src_yuy2b), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7" // Clobber List + ); +} + +void UYVYToUVRow_NEON(const uint8_t* src_uyvy, int stride_uyvy, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - const uint8* src_uyvyb = src_uyvy + stride_uyvy; - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row - "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U - "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V - MEMACCESS(2) - "st1 {v0.8b}, [%2], #8 \n" // store 8 U. - MEMACCESS(3) - "st1 {v2.8b}, [%3], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(src_uyvyb), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", - "v5", "v6", "v7" // Clobber List + const uint8_t* src_uyvyb = src_uyvy + stride_uyvy; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U + "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V + "st1 {v0.8b}, [%2], #8 \n" // store 8 U. + "st1 {v2.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(src_uyvyb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7" // Clobber List ); } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_NEON(const uint8* src_argb, - uint8* dst_argb, - const uint8* shuffler, +void ARGBShuffleRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, int width) { - asm volatile ( - MEMACCESS(3) - "ld1 {v2.16b}, [%3] \n" // shuffler - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. - "subs %w2, %w2, #4 \n" // 4 processed per loop - "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels - MEMACCESS(1) - "st1 {v1.16b}, [%1], #16 \n" // store 4. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); -} - -void I422ToYUY2Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_yuy2, + asm volatile( + "ld1 {v2.16b}, [%3] \n" // shuffler + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. + "subs %w2, %w2, #4 \n" // 4 processed per loop + "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels + "st1 {v1.16b}, [%1], #16 \n" // store 4. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); +} + +void I422ToYUY2Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys - "orr v2.8b, v1.8b, v1.8b \n" - MEMACCESS(1) - "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us - MEMACCESS(2) - "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs - "subs %w4, %w4, #16 \n" // 16 pixels - MEMACCESS(3) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. 
- "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3" - ); -} - -void I422ToUYVYRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_uyvy, + asm volatile( + "1: \n" + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys + "orr v2.8b, v1.8b, v1.8b \n" + "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us + "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs + "subs %w4, %w4, #16 \n" // 16 pixels + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +void I422ToUYVYRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys - "orr v3.8b, v2.8b, v2.8b \n" - MEMACCESS(1) - "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us - MEMACCESS(2) - "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs - "subs %w4, %w4, #16 \n" // 16 pixels - MEMACCESS(3) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3" - ); -} - -void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGBTORGB565 - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb565), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v20", "v21", "v22", "v23" - ); -} - -void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, - uint8* dst_rgb, - const uint32 dither4, + asm volatile( + "1: \n" + "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys + "orr v3.8b, v2.8b, v2.8b \n" + "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us + "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs + "subs %w4, %w4, #16 \n" // 16 pixels + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +void ARGBToRGB565Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb565, + int width) { + asm volatile( + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v20", "v21", "v22", "v23"); +} + +void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, int width) { - asm volatile ( - "dup v1.4s, %w2 \n" // dither4 - "1: \n" - MEMACCESS(1) - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v20.8b, v20.8b, v1.8b \n" - "uqadd v21.8b, v21.8b, v1.8b \n" - "uqadd v22.8b, v22.8b, v1.8b \n" - ARGBTORGB565 - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. 
- "b.gt 1b \n" - : "+r"(dst_rgb) // %0 - : "r"(src_argb), // %1 - "r"(dither4), // %2 - "r"(width) // %3 - : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" - ); -} - -void ARGBToARGB1555Row_NEON(const uint8* src_argb, - uint8* dst_argb1555, + asm volatile( + "dup v1.4s, %w2 \n" // dither4 + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v20.8b, v20.8b, v1.8b \n" + "uqadd v21.8b, v21.8b, v1.8b \n" + "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565 + "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(dst_rgb) // %0 + : "r"(src_argb), // %1 + "r"(dither4), // %2 + "r"(width) // %3 + : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"); +} + +void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb1555, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGBTOARGB1555 - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb1555), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v20", "v21", "v22", "v23" - ); -} - -void ARGBToARGB4444Row_NEON(const uint8* src_argb, - uint8* dst_argb4444, + asm volatile( + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels + // ARGB1555. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v20", "v21", "v22", "v23"); +} + +void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb4444, int width) { - asm volatile ( - "movi v4.16b, #0x0f \n" // bits to clear with vbic. - "1: \n" - MEMACCESS(0) - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGBTOARGB4444 - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb4444), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" - ); -} - -void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #13 \n" // B * 0.1016 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #33 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v3.8h, v0.8b, v4.8b \n" // B - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); -} - -void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pixels - "subs %w2, %w2, #16 \n" // 16 processed per loop - MEMACCESS(1) - "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. 
- "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #15 \n" // B * 0.11400 coefficient - "movi v5.8b, #75 \n" // G * 0.58700 coefficient - "movi v6.8b, #38 \n" // R * 0.29900 coefficient - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v3.8h, v0.8b, v4.8b \n" // B - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" - ); + asm volatile( + "movi v4.16b, #0x0f \n" // bits to clear with + // vbic. + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels + // ARGB4444. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"); +} + +void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 + // pixels + "subs %w2, %w2, #16 \n" // 16 processed per loop + "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #15 \n" // B * 0.11400 coefficient + "movi v5.8b, #75 \n" // G * 0.58700 coefficient + "movi v6.8b, #38 \n" // R * 0.29900 coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); } // 8x1 pixels. 
-void ARGBToUV444Row_NEON(const uint8* src_argb, - uint8* dst_u, - uint8* dst_v, +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient - "movi v25.8b, #74 \n" // UG -0.5781 coefficient - "movi v26.8b, #38 \n" // UR -0.2969 coefficient - "movi v27.8b, #18 \n" // VB -0.1406 coefficient - "movi v28.8b, #94 \n" // VG -0.7344 coefficient - "movi v29.16b,#0x80 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v24.8b \n" // B - "umlsl v4.8h, v1.8b, v25.8b \n" // G - "umlsl v4.8h, v2.8b, v26.8b \n" // R - "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned - - "umull v3.8h, v2.8b, v24.8b \n" // R - "umlsl v3.8h, v1.8b, v28.8b \n" // G - "umlsl v3.8h, v0.8b, v27.8b \n" // B - "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned - - "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V - - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. - MEMACCESS(2) - "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", - "v24", "v25", "v26", "v27", "v28", "v29" - ); + asm volatile( + "movi v24.8b, #112 \n" // UB / VR 0.875 + // coefficient + "movi v25.8b, #74 \n" // UG -0.5781 coefficient + "movi v26.8b, #38 \n" // UR -0.2969 coefficient + "movi v27.8b, #18 \n" // VB -0.1406 coefficient + "movi v28.8b, #94 \n" // VG -0.7344 coefficient + "movi v29.16b,#0x80 \n" // 128.5 + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + // pixels. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlsl v4.8h, v1.8b, v25.8b \n" // G + "umlsl v4.8h, v2.8b, v26.8b \n" // R + "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned + + "umull v3.8h, v2.8b, v24.8b \n" // R + "umlsl v3.8h, v1.8b, v28.8b \n" // G + "umlsl v3.8h, v0.8b, v27.8b \n" // B + "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned + + "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26", + "v27", "v28", "v29"); } #define RGBTOUV_SETUP_REG \ @@ -1388,43 +1373,37 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
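For reference, the U/V arithmetic in ARGBToUV444Row_NEON above (and in the RGBTOUV macro that follows) reduces to the scalar form below. The 0x8080 term is the "128.5" bias loaded into v29/v25 and the coefficients come straight from the movi setup; the helper names are illustrative, not library API.

#include <stdint.h>

// 112*b - 74*g - 38*r is in [-28560, 28560]; adding 0x8080 keeps the sum in
// 0..65535, so the >>8 lands in 16..240 and the cast cannot wrap.
static inline uint8_t RGBToU_Sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
}

static inline uint8_t RGBToV_Sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}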
+// clang-format off #define RGBTOUV(QB, QG, QR) \ - "mul v3.8h, " #QB \ - ",v20.8h \n" /* B */ \ - "mul v4.8h, " #QR \ - ",v20.8h \n" /* R */ \ - "mls v3.8h, " #QG \ - ",v21.8h \n" /* G */ \ - "mls v4.8h, " #QG \ - ",v24.8h \n" /* G */ \ - "mls v3.8h, " #QR \ - ",v22.8h \n" /* R */ \ - "mls v4.8h, " #QB \ - ",v23.8h \n" /* B */ \ + "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ + "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ + "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ + "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ + "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ + "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ +// clang-format on // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. // TODO(fbarchard): consider ptrdiff_t for all strides. -void ARGBToUVRow_NEON(const uint8* src_argb, +void ARGBToUVRow_NEON(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - const uint8* src_argb_1 = src_argb + src_stride_argb; + const uint8_t* src_argb_1 = src_argb + src_stride_argb; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -1436,9 +1415,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_argb), // %0 @@ -1453,12 +1430,12 @@ void ARGBToUVRow_NEON(const uint8* src_argb, } // TODO(fbarchard): Subsample match C code. -void ARGBToUVJRow_NEON(const uint8* src_argb, +void ARGBToUVJRow_NEON(const uint8_t* src_argb, int src_stride_argb, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - const uint8* src_argb_1 = src_argb + src_stride_argb; + const uint8_t* src_argb_1 = src_argb + src_stride_argb; asm volatile ( "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 @@ -1467,12 +1444,10 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) "1: \n" - MEMACCESS(0) "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -1484,9 +1459,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
"b.gt 1b \n" : "+r"(src_argb), // %0 @@ -1500,21 +1473,19 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, ); } -void BGRAToUVRow_NEON(const uint8* src_bgra, +void BGRAToUVRow_NEON(const uint8_t* src_bgra, int src_stride_bgra, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - const uint8* src_bgra_1 = src_bgra + src_stride_bgra; + const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. @@ -1526,9 +1497,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_bgra), // %0 @@ -1542,21 +1511,19 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, ); } -void ABGRToUVRow_NEON(const uint8* src_abgr, +void ABGRToUVRow_NEON(const uint8_t* src_abgr, int src_stride_abgr, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - const uint8* src_abgr_1 = src_abgr + src_stride_abgr; + const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -1568,9 +1535,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v2.8h, v1.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_abgr), // %0 @@ -1584,21 +1549,19 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, ); } -void RGBAToUVRow_NEON(const uint8* src_rgba, +void RGBAToUVRow_NEON(const uint8_t* src_rgba, int src_stride_rgba, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - const uint8* src_rgba_1 = src_rgba + src_stride_rgba; + const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. @@ -1610,9 +1573,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
"b.gt 1b \n" : "+r"(src_rgba), // %0 @@ -1626,21 +1587,19 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, ); } -void RGB24ToUVRow_NEON(const uint8* src_rgb24, +void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, int src_stride_rgb24, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; + const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -1652,9 +1611,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_rgb24), // %0 @@ -1668,21 +1625,19 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, ); } -void RAWToUVRow_NEON(const uint8* src_raw, +void RAWToUVRow_NEON(const uint8_t* src_raw, int src_stride_raw, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - const uint8* src_raw_1 = src_raw + src_stride_raw; + const uint8_t* src_raw_1 = src_raw + src_stride_raw; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -1694,9 +1649,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v2.8h, v1.8h, v0.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_raw), // %0 @@ -1711,717 +1664,656 @@ void RAWToUVRow_NEON(const uint8* src_raw, } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void RGB565ToUVRow_NEON(const uint8* src_rgb565, +void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, int src_stride_rgb565, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; - asm volatile ( - "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 - "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 - "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 - "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 - "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 - "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - RGB565TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. - RGB565TOARGB - "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 
- "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. - RGB565TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. - RGB565TOARGB - "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ins v16.D[1], v17.D[0] \n" - "ins v18.D[1], v19.D[0] \n" - "ins v20.D[1], v21.D[0] \n" - - "urshr v4.8h, v16.8h, #1 \n" // 2x average - "urshr v5.8h, v18.8h, #1 \n" - "urshr v6.8h, v20.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - "mul v16.8h, v4.8h, v22.8h \n" // B - "mls v16.8h, v5.8h, v23.8h \n" // G - "mls v16.8h, v6.8h, v24.8h \n" // R - "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned - "mul v17.8h, v6.8h, v22.8h \n" // R - "mls v17.8h, v5.8h, v26.8h \n" // G - "mls v17.8h, v4.8h, v25.8h \n" // B - "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(src_rgb565_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", - "v25", "v26", "v27" - ); + const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565; + asm volatile( + "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / + // 2 + "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 + "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 + "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 + "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 + "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + RGB565TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. + RGB565TOARGB + "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. + RGB565TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. + RGB565TOARGB + "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v17.D[0] \n" + "ins v18.D[1], v19.D[0] \n" + "ins v20.D[1], v21.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v18.8h, #1 \n" + "urshr v6.8h, v20.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. 
+ "mul v16.8h, v4.8h, v22.8h \n" // B + "mls v16.8h, v5.8h, v23.8h \n" // G + "mls v16.8h, v6.8h, v24.8h \n" // R + "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned + "mul v17.8h, v6.8h, v22.8h \n" // R + "mls v17.8h, v5.8h, v26.8h \n" // G + "mls v17.8h, v4.8h, v25.8h \n" // B + "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_rgb565_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27"); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, +void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, int src_stride_argb1555, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ins v16.D[1], v26.D[0] \n" - "ins v17.D[1], v27.D[0] \n" - "ins v18.D[1], v28.D[0] \n" - - "urshr v4.8h, v16.8h, #1 \n" // 2x average - "urshr v5.8h, v17.8h, #1 \n" - "urshr v6.8h, v18.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - "mul v2.8h, v4.8h, v20.8h \n" // B - "mls v2.8h, v5.8h, v21.8h \n" // G - "mls v2.8h, v6.8h, v22.8h \n" // R - "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned - "mul v3.8h, v6.8h, v20.8h \n" // R - "mls v3.8h, v5.8h, v24.8h \n" // G - "mls v3.8h, v4.8h, v23.8h \n" // B - "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(src_argb1555_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28" - ); + const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555; + asm volatile( + RGBTOUV_SETUP_REG + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 
+ RGB555TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v17.8h, #1 \n" + "urshr v6.8h, v18.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + "mul v2.8h, v4.8h, v20.8h \n" // B + "mls v2.8h, v5.8h, v21.8h \n" // G + "mls v2.8h, v6.8h, v22.8h \n" // R + "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned + "mul v3.8h, v6.8h, v20.8h \n" // R + "mls v3.8h, v5.8h, v24.8h \n" // G + "mls v3.8h, v4.8h, v23.8h \n" // B + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_argb1555_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + "v28"); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, +void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, int src_stride_argb4444, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 
- - "ins v16.D[1], v26.D[0] \n" - "ins v17.D[1], v27.D[0] \n" - "ins v18.D[1], v28.D[0] \n" - - "urshr v4.8h, v16.8h, #1 \n" // 2x average - "urshr v5.8h, v17.8h, #1 \n" - "urshr v6.8h, v18.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - "mul v2.8h, v4.8h, v20.8h \n" // B - "mls v2.8h, v5.8h, v21.8h \n" // G - "mls v2.8h, v6.8h, v22.8h \n" // R - "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned - "mul v3.8h, v6.8h, v20.8h \n" // R - "mls v3.8h, v5.8h, v24.8h \n" // G - "mls v3.8h, v4.8h, v23.8h \n" // B - "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(src_argb4444_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28" - - ); -} - -void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { - asm volatile ( - "movi v24.8b, #13 \n" // B * 0.1016 coefficient - "movi v25.8b, #65 \n" // G * 0.5078 coefficient - "movi v26.8b, #33 \n" // R * 0.2578 coefficient - "movi v27.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - RGB565TOARGB - "umull v3.8h, v0.8b, v24.8b \n" // B - "umlal v3.8h, v1.8b, v25.8b \n" // G - "umlal v3.8h, v2.8b, v26.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v27.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", - "v24", "v25", "v26", "v27" - ); -} - -void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #13 \n" // B * 0.1016 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #33 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - "umull v3.8h, v0.8b, v4.8b \n" // B - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); -} - -void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { - asm volatile ( - "movi v24.8b, #13 \n" // B * 0.1016 coefficient - "movi v25.8b, #65 \n" // G * 0.5078 coefficient - "movi v26.8b, #33 \n" // R * 0.2578 coefficient - "movi v27.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- ARGB4444TOARGB - "umull v3.8h, v0.8b, v24.8b \n" // B - "umlal v3.8h, v1.8b, v25.8b \n" // G - "umlal v3.8h, v2.8b, v26.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v27.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" - ); -} - -void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #33 \n" // R * 0.2578 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #13 \n" // B * 0.1016 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v1.8b, v4.8b \n" // R - "umlal v16.8h, v2.8b, v5.8b \n" // G - "umlal v16.8h, v3.8b, v6.8b \n" // B - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); -} - -void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #33 \n" // R * 0.2578 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #13 \n" // B * 0.1016 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // R - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // B - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); -} - -void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #13 \n" // B * 0.1016 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #33 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v1.8b, v4.8b \n" // B - "umlal v16.8h, v2.8b, v5.8b \n" // G - "umlal v16.8h, v3.8b, v6.8b \n" // R - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); -} - -void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #13 \n" // B * 0.1016 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #33 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "umull v16.8h, v0.8b, v4.8b \n" // B - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); -} - -void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #33 \n" // R * 0.2578 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #13 \n" // B * 0.1016 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); + const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444; + asm volatile( + RGBTOUV_SETUP_REG + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v17.8h, #1 \n" + "urshr v6.8h, v18.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + "mul v2.8h, v4.8h, v20.8h \n" // B + "mls v2.8h, v5.8h, v21.8h \n" // G + "mls v2.8h, v6.8h, v22.8h \n" // R + "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned + "mul v3.8h, v6.8h, v20.8h \n" // R + "mls v3.8h, v5.8h, v24.8h \n" // G + "mls v3.8h, v4.8h, v23.8h \n" // B + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_argb4444_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + "v28" + + ); +} + +void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + asm volatile( + "movi v24.8b, #13 \n" // B * 0.1016 coefficient + "movi v25.8b, #65 \n" // G * 0.5078 coefficient + "movi v26.8b, #33 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26", + "v27"); +} + +void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width) { + asm volatile( + "movi v24.8b, #13 \n" // B * 0.1016 coefficient + "movi v25.8b, #65 \n" // G * 0.5078 coefficient + "movi v26.8b, #33 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"); +} + +void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // R + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "b.gt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +} + +void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // R + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +} + +void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // B + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +} + +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +} + +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } // Bilinear filter 16x2 -> 16x1 -void InterpolateRow_NEON(uint8* dst_ptr, - const uint8* src_ptr, +void InterpolateRow_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; - const uint8* src_ptr1 = src_ptr + src_stride; - asm volatile ( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" - - "dup v5.16b, %w4 \n" - "dup v4.16b, %w5 \n" - // General purpose row blend. - "1: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "umull v2.8h, v0.8b, v4.8b \n" - "umull2 v3.8h, v0.16b, v4.16b \n" - "umlal v2.8h, v1.8b, v5.8b \n" - "umlal2 v3.8h, v1.16b, v5.16b \n" - "rshrn v0.8b, v2.8h, #8 \n" - "rshrn2 v0.16b, v3.8h, #8 \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" - - // Blend 50 / 50. - "50: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - "subs %w3, %w3, #16 \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_ptr1), // %2 - "+r"(dst_width), // %3 - "+r"(y1_fraction), // %4 - "+r"(y0_fraction) // %5 - : - : "cc", "memory", "v0", "v1", "v3", "v4", "v5" - ); + const uint8_t* src_ptr1 = src_ptr + src_stride; + asm volatile( + "cmp %w4, #0 \n" + "b.eq 100f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + + "dup v5.16b, %w4 \n" + "dup v4.16b, %w5 \n" + // General purpose row blend. + "1: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v2.8h, v0.8b, v4.8b \n" + "umull2 v3.8h, v0.16b, v4.16b \n" + "umlal v2.8h, v1.8b, v5.8b \n" + "umlal2 v3.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v2.8h, #8 \n" + "rshrn2 v0.16b, v3.8h, #8 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(dst_width), // %3 + "+r"(y1_fraction), // %4 + "+r"(y0_fraction) // %5 + : + : "cc", "memory", "v0", "v1", "v3", "v4", "v5"); } // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +void ARGBBlendRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { - asm volatile ( - "subs %w3, %w3, #8 \n" - "b.lt 89f \n" - // Blend 8 pixels. - "8: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels - "subs %w3, %w3, #8 \n" // 8 processed per loop. 
- "umull v16.8h, v4.8b, v3.8b \n" // db * a - "umull v17.8h, v5.8b, v3.8b \n" // dg * a - "umull v18.8h, v6.8b, v3.8b \n" // dr * a - "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 - "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 - "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 - "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) - "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) - "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) - "uqadd v0.8b, v0.8b, v4.8b \n" // + sb - "uqadd v1.8b, v1.8b, v5.8b \n" // + sg - "uqadd v2.8b, v2.8b, v6.8b \n" // + sr - "movi v3.8b, #255 \n" // a = 255 - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.ge 8b \n" - - "89: \n" - "adds %w3, %w3, #8-1 \n" - "b.lt 99f \n" - - // Blend 1 pixels. - "1: \n" - MEMACCESS(0) - "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. - MEMACCESS(1) - "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. - "subs %w3, %w3, #1 \n" // 1 processed per loop. - "umull v16.8h, v4.8b, v3.8b \n" // db * a - "umull v17.8h, v5.8b, v3.8b \n" // dg * a - "umull v18.8h, v6.8b, v3.8b \n" // dr * a - "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 - "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 - "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 - "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) - "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) - "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) - "uqadd v0.8b, v0.8b, v4.8b \n" // + sb - "uqadd v1.8b, v1.8b, v5.8b \n" // + sg - "uqadd v2.8b, v2.8b, v6.8b \n" // + sr - "movi v3.8b, #255 \n" // a = 255 - MEMACCESS(2) - "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. - "b.ge 1b \n" - - "99: \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18" - ); + asm volatile( + "subs %w3, %w3, #8 \n" + "b.lt 89f \n" + // Blend 8 pixels. + "8: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 + // pixels + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 + // pixels + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + // pixels + "b.ge 8b \n" + + "89: \n" + "adds %w3, %w3, #8-1 \n" + "b.lt 99f \n" + + // Blend 1 pixels. + "1: \n" + "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. + "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. + "subs %w3, %w3, #1 \n" // 1 processed per loop. 
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. + "b.ge 1b \n" + + "99: \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18"); } // Attenuate 8 pixels at a time. -void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - // Attenuate 8 pixels. - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v3.8b \n" // b * a - "umull v5.8h, v1.8b, v3.8b \n" // g * a - "umull v6.8h, v2.8b, v3.8b \n" // r * a - "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 - "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 - "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 - MEMACCESS(1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" - ); +void ARGBAttenuateRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + // Attenuate 8 pixels. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v3.8b \n" // b * a + "umull v5.8h, v1.8b, v3.8b \n" // g * a + "umull v6.8h, v2.8b, v3.8b \n" // r * a + "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 + "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 + "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + // pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); } // Quantize 8 ARGB pixels (32 bytes). // dst = (dst * scale >> 16) * interval_size + interval_offset; -void ARGBQuantizeRow_NEON(uint8* dst_argb, +void ARGBQuantizeRow_NEON(uint8_t* dst_argb, int scale, int interval_size, int interval_offset, int width) { - asm volatile ( - "dup v4.8h, %w2 \n" - "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 - "dup v5.8h, %w3 \n" // interval multiply. - "dup v6.8h, %w4 \n" // interval add - - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB. - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "uxtl v0.8h, v0.8b \n" // b (0 .. 
255) - "uxtl v1.8h, v1.8b \n" - "uxtl v2.8h, v2.8b \n" - "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale - "sqdmulh v1.8h, v1.8h, v4.8h \n" // g - "sqdmulh v2.8h, v2.8h, v4.8h \n" // r - "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size - "mul v1.8h, v1.8h, v5.8h \n" // g - "mul v2.8h, v2.8h, v5.8h \n" // r - "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset - "add v1.8h, v1.8h, v6.8h \n" // g - "add v2.8h, v2.8h, v6.8h \n" // r - "uqxtn v0.8b, v0.8h \n" - "uqxtn v1.8b, v1.8h \n" - "uqxtn v2.8b, v2.8h \n" - MEMACCESS(0) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" - ); + asm volatile( + "dup v4.8h, %w2 \n" + "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 + "dup v5.8h, %w3 \n" // interval multiply. + "dup v6.8h, %w4 \n" // interval add + + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "uxtl v0.8h, v0.8b \n" // b (0 .. 255) + "uxtl v1.8h, v1.8b \n" + "uxtl v2.8h, v2.8b \n" + "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale + "sqdmulh v1.8h, v1.8h, v4.8h \n" // g + "sqdmulh v2.8h, v2.8h, v4.8h \n" // r + "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size + "mul v1.8h, v1.8h, v5.8h \n" // g + "mul v2.8h, v2.8h, v5.8h \n" // r + "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset + "add v1.8h, v1.8h, v6.8h \n" // g + "add v2.8h, v2.8h, v6.8h \n" // r + "uqxtn v0.8b, v0.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v2.8b, v2.8h \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); } // Shade 8 pixels at a time by specified value. // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. -void ARGBShadeRow_NEON(const uint8* src_argb, - uint8* dst_argb, +void ARGBShadeRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, int width, - uint32 value) { - asm volatile ( - "dup v0.4s, %w3 \n" // duplicate scale value. - "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. - "ushr v0.8h, v0.8h, #1 \n" // scale / 2. - - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uxtl v4.8h, v4.8b \n" // b (0 .. 255) - "uxtl v5.8h, v5.8b \n" - "uxtl v6.8h, v6.8b \n" - "uxtl v7.8h, v7.8b \n" - "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 - "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g - "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r - "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a - "uqxtn v4.8b, v4.8h \n" - "uqxtn v5.8b, v5.8h \n" - "uqxtn v6.8b, v6.8h \n" - "uqxtn v7.8b, v7.8h \n" - MEMACCESS(1) - "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "cc", "memory", "v0", "v4", "v5", "v6", "v7" - ); + uint32_t value) { + asm volatile( + "dup v0.4s, %w3 \n" // duplicate scale value. + "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. + "ushr v0.8h, v0.8h, #1 \n" // scale / 2. + + // 8 pixel loop. 
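ARGBQuantizeRow_NEON above halves 'scale' and then uses sqdmulh, a doubling multiply that keeps the high half, which matches the (dst * scale >> 16) in the formula comment; alpha is passed through untouched. ARGBShadeRow_NEON, whose loop continues just below, uses a similar fixed-point trick to scale each channel by the corresponding byte of 'value'. A scalar sketch of the quantize step only, with an illustrative helper name:

#include <stdint.h>

static void ARGBQuantize_Sketch(uint8_t* dst_argb, int scale,
                                int interval_size, int interval_offset,
                                int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 3; ++c) {  // B, G, R; byte 3 (alpha) is left as-is
      int v = ((dst_argb[c] * scale) >> 16) * interval_size + interval_offset;
      dst_argb[c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
    dst_argb += 4;
  }
}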
+ "1: \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v4.8h, v4.8b \n" // b (0 .. 255) + "uxtl v5.8h, v5.8b \n" + "uxtl v6.8h, v6.8b \n" + "uxtl v7.8h, v7.8b \n" + "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 + "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g + "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r + "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a + "uqxtn v4.8b, v4.8h \n" + "uqxtn v5.8b, v5.8h \n" + "uqxtn v6.8b, v6.8h \n" + "uqxtn v7.8b, v7.8h \n" + "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "cc", "memory", "v0", "v4", "v5", "v6", "v7"); } // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels // Similar to ARGBToYJ but stores ARGB. // C code is (15 * b + 75 * g + 38 * r + 64) >> 7; -void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "movi v24.8b, #15 \n" // B * 0.11400 coefficient - "movi v25.8b, #75 \n" // G * 0.58700 coefficient - "movi v26.8b, #38 \n" // R * 0.29900 coefficient - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v24.8b \n" // B - "umlal v4.8h, v1.8b, v25.8b \n" // G - "umlal v4.8h, v2.8b, v26.8b \n" // R - "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B - "orr v1.8b, v0.8b, v0.8b \n" // G - "orr v2.8b, v0.8b, v0.8b \n" // R - MEMACCESS(1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26" - ); +void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + "movi v24.8b, #15 \n" // B * 0.11400 coefficient + "movi v25.8b, #75 \n" // G * 0.58700 coefficient + "movi v26.8b, #38 \n" // R * 0.29900 coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlal v4.8h, v1.8b, v25.8b \n" // G + "umlal v4.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B + "orr v1.8b, v0.8b, v0.8b \n" // G + "orr v2.8b, v0.8b, v0.8b \n" // R + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"); } // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. @@ -2429,202 +2321,180 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 -void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { - asm volatile ( - "movi v20.8b, #17 \n" // BB coefficient - "movi v21.8b, #68 \n" // BG coefficient - "movi v22.8b, #35 \n" // BR coefficient - "movi v24.8b, #22 \n" // GB coefficient - "movi v25.8b, #88 \n" // GG coefficient - "movi v26.8b, #45 \n" // GR coefficient - "movi v28.8b, #24 \n" // BB coefficient - "movi v29.8b, #98 \n" // BG coefficient - "movi v30.8b, #50 \n" // BR coefficient - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. - "subs %w1, %w1, #8 \n" // 8 processed per loop. 
- "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B - "umlal v4.8h, v1.8b, v21.8b \n" // G - "umlal v4.8h, v2.8b, v22.8b \n" // R - "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G - "umlal v5.8h, v1.8b, v25.8b \n" // G - "umlal v5.8h, v2.8b, v26.8b \n" // R - "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R - "umlal v6.8h, v1.8b, v29.8b \n" // G - "umlal v6.8h, v2.8b, v30.8b \n" // R - "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B - "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G - "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R - MEMACCESS(0) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. - "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30" - ); +void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { + asm volatile( + "movi v20.8b, #17 \n" // BB coefficient + "movi v21.8b, #68 \n" // BG coefficient + "movi v22.8b, #35 \n" // BR coefficient + "movi v24.8b, #22 \n" // GB coefficient + "movi v25.8b, #88 \n" // GG coefficient + "movi v26.8b, #45 \n" // GR coefficient + "movi v28.8b, #24 \n" // BB coefficient + "movi v29.8b, #98 \n" // BG coefficient + "movi v30.8b, #50 \n" // BR coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B + "umlal v4.8h, v1.8b, v21.8b \n" // G + "umlal v4.8h, v2.8b, v22.8b \n" // R + "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G + "umlal v5.8h, v1.8b, v25.8b \n" // G + "umlal v5.8h, v2.8b, v26.8b \n" // R + "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R + "umlal v6.8h, v1.8b, v29.8b \n" // G + "umlal v6.8h, v2.8b, v30.8b \n" // R + "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B + "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G + "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"); } // Tranform 8 ARGB pixels (32 bytes) with color matrix. // TODO(fbarchard): Was same as Sepia except matrix is provided. This function // needs to saturate. Consider doing a non-saturating version. -void ARGBColorMatrixRow_NEON(const uint8* src_argb, - uint8* dst_argb, - const int8* matrix_argb, +void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, int width) { - asm volatile ( - MEMACCESS(3) - "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. - "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. - "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. - - "1: \n" - MEMACCESS(0) - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uxtl v16.8h, v16.8b \n" // b (0 .. 
255) 16 bit - "uxtl v17.8h, v17.8b \n" // g - "uxtl v18.8h, v18.8b \n" // r - "uxtl v19.8h, v19.8b \n" // a - "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B - "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G - "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R - "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A - "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B - "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G - "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R - "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B - "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G - "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R - "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B - "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G - "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R - "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B - "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G - "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R - "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A - MEMACCESS(1) - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", - "v18", "v19", "v22", "v23", "v24", "v25" - ); + asm volatile( + "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. + "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. + "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. + + "1: \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v16.8h, v16.8b \n" // b (0 .. 
255) 16 bit + "uxtl v17.8h, v17.8b \n" // g + "uxtl v18.8h, v18.8b \n" // r + "uxtl v19.8h, v19.8b \n" // a + "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B + "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G + "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R + "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A + "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B + "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G + "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R + "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B + "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G + "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R + "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B + "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G + "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R + "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B + "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G + "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R + "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v22", "v23", "v24", "v25"); } // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // multiply B - "umull v1.8h, v1.8b, v5.8b \n" // multiply G - "umull v2.8h, v2.8b, v6.8b \n" // multiply R - "umull v3.8h, v3.8b, v7.8b \n" // multiply A - "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B - "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G - "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R - "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); + asm volatile( + // 8 pixel loop. 
+ "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v0.8h, v0.8b, v4.8b \n" // multiply B + "umull v1.8h, v1.8b, v5.8b \n" // multiply G + "umull v2.8h, v2.8b, v6.8b \n" // multiply R + "umull v3.8h, v3.8b, v7.8b \n" // multiply A + "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B + "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G + "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R + "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } // Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +void ARGBAddRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v0.8b, v0.8b, v4.8b \n" - "uqadd v1.8b, v1.8b, v5.8b \n" - "uqadd v2.8b, v2.8b, v6.8b \n" - "uqadd v3.8b, v3.8b, v7.8b \n" - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); + asm volatile( + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v4.8b \n" + "uqadd v1.8b, v1.8b, v5.8b \n" + "uqadd v2.8b, v2.8b, v6.8b \n" + "uqadd v3.8b, v3.8b, v7.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_NEON(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +void ARGBSubtractRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqsub v0.8b, v0.8b, v4.8b \n" - "uqsub v1.8b, v1.8b, v5.8b \n" - "uqsub v2.8b, v2.8b, v6.8b \n" - "uqsub v3.8b, v3.8b, v7.8b \n" - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); + asm volatile( + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. 
+ "uqsub v0.8b, v0.8b, v4.8b \n" + "uqsub v1.8b, v1.8b, v5.8b \n" + "uqsub v2.8b, v2.8b, v6.8b \n" + "uqsub v3.8b, v3.8b, v7.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } // Adds Sobel X and Sobel Y and stores Sobel into ARGB. @@ -2632,58 +2502,50 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, // R = Sobel // G = Sobel // B = Sobel -void SobelRow_NEON(const uint8* src_sobelx, - const uint8* src_sobely, - uint8* dst_argb, +void SobelRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, int width) { - asm volatile ( - "movi v3.8b, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. - MEMACCESS(1) - "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v0.8b, v0.8b, v1.8b \n" // add - "orr v1.8b, v0.8b, v0.8b \n" - "orr v2.8b, v0.8b, v0.8b \n" - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3" - ); + asm volatile( + "movi v3.8b, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. + "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v1.8b \n" // add + "orr v1.8b, v0.8b, v0.8b \n" + "orr v2.8b, v0.8b, v0.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); } // Adds Sobel X and Sobel Y and stores Sobel into plane. -void SobelToPlaneRow_NEON(const uint8* src_sobelx, - const uint8* src_sobely, - uint8* dst_y, +void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, int width) { - asm volatile ( - // 16 pixel loop. - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. - MEMACCESS(1) - "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "uqadd v0.16b, v0.16b, v1.16b \n" // add - MEMACCESS(2) - "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. - "b.gt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1" - ); + asm volatile( + // 16 pixel loop. + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. + "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "uqadd v0.16b, v0.16b, v1.16b \n" // add + "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1"); } // Mixes Sobel X, Sobel Y and Sobel into ARGB. @@ -2691,75 +2553,64 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, // R = Sobel X // G = Sobel // B = Sobel Y -void SobelXYRow_NEON(const uint8* src_sobelx, - const uint8* src_sobely, - uint8* dst_argb, +void SobelXYRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, int width) { - asm volatile ( - "movi v3.8b, #255 \n" // alpha - // 8 pixel loop. 
- "1: \n" - MEMACCESS(0) - "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. - MEMACCESS(1) - "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v1.8b, v0.8b, v2.8b \n" // add - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3" - ); + asm volatile( + "movi v3.8b, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. + "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v1.8b, v0.8b, v2.8b \n" // add + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); } // SobelX as a matrix is // -1 0 1 // -2 0 2 // -1 0 1 -void SobelXRow_NEON(const uint8* src_y0, - const uint8* src_y1, - const uint8* src_y2, - uint8* dst_sobelx, +void SobelXRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.8b}, [%0],%5 \n" // top - MEMACCESS(0) - "ld1 {v1.8b}, [%0],%6 \n" - "usubl v0.8h, v0.8b, v1.8b \n" - MEMACCESS(1) - "ld1 {v2.8b}, [%1],%5 \n" // center * 2 - MEMACCESS(1) - "ld1 {v3.8b}, [%1],%6 \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "add v0.8h, v0.8h, v1.8h \n" - MEMACCESS(2) - "ld1 {v2.8b}, [%2],%5 \n" // bottom - MEMACCESS(2) - "ld1 {v3.8b}, [%2],%6 \n" - "subs %w4, %w4, #8 \n" // 8 pixels - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "abs v0.8h, v0.8h \n" - "uqxtn v0.8b, v0.8h \n" - MEMACCESS(3) - "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx - "b.gt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : "r"(2LL), // %5 - "r"(6LL) // %6 - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + asm volatile( + "1: \n" + "ld1 {v0.8b}, [%0],%5 \n" // top + "ld1 {v1.8b}, [%0],%6 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + "ld1 {v2.8b}, [%1],%5 \n" // center * 2 + "ld1 {v3.8b}, [%1],%6 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + "ld1 {v2.8b}, [%2],%5 \n" // bottom + "ld1 {v3.8b}, [%2],%6 \n" + "subs %w4, %w4, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx + "b.gt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2LL), // %5 + "r"(6LL) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } @@ -2767,93 +2618,414 @@ void SobelXRow_NEON(const uint8* src_y0, // -1 -2 -1 // 0 0 0 // 1 2 1 -void SobelYRow_NEON(const uint8* src_y0, - const uint8* src_y1, - uint8* dst_sobely, +void SobelYRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.8b}, [%0],%4 \n" // left - MEMACCESS(1) - "ld1 {v1.8b}, [%1],%4 \n" - "usubl v0.8h, v0.8b, v1.8b \n" - MEMACCESS(0) - "ld1 {v2.8b}, [%0],%4 \n" // center * 2 - MEMACCESS(1) - "ld1 {v3.8b}, [%1],%4 \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "add v0.8h, v0.8h, v1.8h 
\n" - MEMACCESS(0) - "ld1 {v2.8b}, [%0],%5 \n" // right - MEMACCESS(1) - "ld1 {v3.8b}, [%1],%5 \n" - "subs %w3, %w3, #8 \n" // 8 pixels - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "abs v0.8h, v0.8h \n" - "uqxtn v0.8b, v0.8h \n" - MEMACCESS(2) - "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely - "b.gt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : "r"(1LL), // %4 - "r"(6LL) // %5 - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + asm volatile( + "1: \n" + "ld1 {v0.8b}, [%0],%4 \n" // left + "ld1 {v1.8b}, [%1],%4 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + "ld1 {v2.8b}, [%0],%4 \n" // center * 2 + "ld1 {v3.8b}, [%1],%4 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + "ld1 {v2.8b}, [%0],%5 \n" // right + "ld1 {v3.8b}, [%1],%5 \n" + "subs %w3, %w3, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely + "b.gt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1LL), // %4 + "r"(6LL) // %5 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } // Caveat - rounds float to half float whereas scaling version truncates. -void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts - "subs %w2, %w2, #8 \n" // 8 pixels per loop - "uxtl v2.4s, v1.4h \n" // 8 int's - "uxtl2 v3.4s, v1.8h \n" - "scvtf v2.4s, v2.4s \n" // 8 floats - "scvtf v3.4s, v3.4s \n" - "fcvtn v1.4h, v2.4s \n" // 8 half floats - "fcvtn2 v1.8h, v3.4s \n" - MEMACCESS(1) - "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3" - ); -} - -void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts - "subs %w2, %w2, #8 \n" // 8 pixels per loop - "uxtl v2.4s, v1.4h \n" // 8 int's - "uxtl2 v3.4s, v1.8h \n" - "scvtf v2.4s, v2.4s \n" // 8 floats - "scvtf v3.4s, v3.4s \n" - "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent - "fmul v3.4s, v3.4s, %3.s[0] \n" - "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat - "uqshrn2 v1.8h, v3.4s, #13 \n" - MEMACCESS(1) - "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "w"(scale * 1.9259299444e-34f) // %3 - : "cc", "memory", "v1", "v2", "v3" - ); +void HalfFloat1Row_NEON(const uint16_t* src, + uint16_t* dst, + float /*unused*/, + int width) { + asm volatile( + "1: \n" + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fcvtn v1.4h, v2.4s \n" // 8 half floats + "fcvtn2 v1.8h, v3.4s \n" + "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3"); +} + +void HalfFloatRow_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + asm volatile( + "1: \n" + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "uxtl2 v3.4s, v1.8h \n" + "scvtf 
v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent + "fmul v3.4s, v3.4s, %3.s[0] \n" + "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat + "uqshrn2 v1.8h, v3.4s, #13 \n" + "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale * 1.9259299444e-34f) // %3 + : "cc", "memory", "v1", "v2", "v3"); +} + +void ByteToFloatRow_NEON(const uint8_t* src, + float* dst, + float scale, + int width) { + asm volatile( + "1: \n" + "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v1.8h, v1.8b \n" // 8 shorts + "uxtl v2.4s, v1.4h \n" // 8 ints + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fmul v2.4s, v2.4s, %3.s[0] \n" // scale + "fmul v3.4s, v3.4s, %3.s[0] \n" + "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale) // %3 + : "cc", "memory", "v1", "v2", "v3"); +} + +float ScaleMaxSamples_NEON(const float* src, + float* dst, + float scale, + int width) { + float fmax; + asm volatile( + "movi v5.4s, #0 \n" // max + "movi v6.4s, #0 \n" + + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "fmul v4.4s, v2.4s, %4.s[0] \n" // scale + "fmax v5.4s, v5.4s, v1.4s \n" // max + "fmax v6.4s, v6.4s, v2.4s \n" + "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + "fmax v5.4s, v5.4s, v6.4s \n" // max + "fmaxv %s3, v5.4s \n" // signed max acculator + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width), // %2 + "=w"(fmax) // %3 + : "w"(scale) // %4 + : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); + return fmax; +} + +float ScaleSumSamples_NEON(const float* src, + float* dst, + float scale, + int width) { + float fsum; + asm volatile( + "movi v5.4s, #0 \n" // max + "movi v6.4s, #0 \n" // max + + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "fmul v4.4s, v2.4s, %4.s[0] \n" + "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares + "fmla v6.4s, v2.4s, v2.4s \n" + "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + "faddp v5.4s, v5.4s, v6.4s \n" + "faddp v5.4s, v5.4s, v5.4s \n" + "faddp %3.4s, v5.4s, v5.4s \n" // sum + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width), // %2 + "=w"(fsum) // %3 + : "w"(scale) // %4 + : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); + return fsum; +} + +void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { + asm volatile( + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v1.4s, v1.4s, %3.s[0] \n" // scale + "fmul v2.4s, v2.4s, %3.s[0] \n" // scale + "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale) // %3 + : "cc", "memory", "v1", "v2"); +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 
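The 1, 4, 6, 4, 1 weights are one pass of a separable 5-tap binomial (Gaussian-like) blur: GaussCol_NEON below applies them vertically across five input rows and widens to 32 bits, and GaussRow_NEON then applies the same weights horizontally and rounds the combined 16x16 weight back out (the uqrshrn #8). A minimal scalar sketch of the vertical pass, with illustrative names that are not libyuv symbols:

#include <stdint.h>

// Plain-C sketch of the vertical 1,4,6,4,1 pass; weights sum to 16 and no
// normalization happens here - the horizontal pass divides by 256 later.
static void GaussColSketch(const uint16_t* r0, const uint16_t* r1,
                           const uint16_t* r2, const uint16_t* r3,
                           const uint16_t* r4, uint32_t* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = (uint32_t)r0[x] + 4u * r1[x] + 6u * r2[x] + 4u * r3[x] +
             (uint32_t)r4[x];
  }
}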
+void GaussCol_NEON(const uint16_t* src0, + const uint16_t* src1, + const uint16_t* src2, + const uint16_t* src3, + const uint16_t* src4, + uint32_t* dst, + int width) { + asm volatile( + "movi v6.8h, #4 \n" // constant 4 + "movi v7.8h, #6 \n" // constant 6 + + "1: \n" + "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows + "ld1 {v2.8h}, [%4], #16 \n" + "uaddl v0.4s, v1.4h, v2.4h \n" // * 1 + "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1 + "ld1 {v2.8h}, [%1], #16 \n" + "umlal v0.4s, v2.4h, v6.4h \n" // * 4 + "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 + "ld1 {v2.8h}, [%2], #16 \n" + "umlal v0.4s, v2.4h, v7.4h \n" // * 6 + "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6 + "ld1 {v2.8h}, [%3], #16 \n" + "umlal v0.4s, v2.4h, v6.4h \n" // * 4 + "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 + "subs %w6, %w6, #8 \n" // 8 processed per loop + "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(src4), // %4 + "+r"(dst), // %5 + "+r"(width) // %6 + : + : "cc", "memory", "v0", "v1", "v2", "v6", "v7"); +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { + const uint32_t* src1 = src + 1; + const uint32_t* src2 = src + 2; + const uint32_t* src3 = src + 3; + asm volatile( + "movi v6.4s, #4 \n" // constant 4 + "movi v7.4s, #6 \n" // constant 6 + + "1: \n" + "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples + "add v0.4s, v0.4s, v1.4s \n" // * 1 + "add v1.4s, v1.4s, v2.4s \n" // * 1 + "ld1 {v2.4s,v3.4s}, [%2], #32 \n" + "mla v0.4s, v2.4s, v7.4s \n" // * 6 + "mla v1.4s, v3.4s, v7.4s \n" // * 6 + "ld1 {v2.4s,v3.4s}, [%1], #32 \n" + "ld1 {v4.4s,v5.4s}, [%3], #32 \n" + "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4 + "add v3.4s, v3.4s, v5.4s \n" + "mla v0.4s, v2.4s, v6.4s \n" // * 4 + "mla v1.4s, v3.4s, v6.4s \n" // * 4 + "subs %w5, %w5, #8 \n" // 8 processed per loop + "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack + "uqrshrn2 v0.8h, v1.4s, #8 \n" + "st1 {v0.8h}, [%4], #16 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(dst), // %4 + "+r"(width) // %5 + : "r"(32LL) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +// Convert biplanar NV21 to packed YUV24 +void NV21ToYUV24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + asm volatile( + "1: \n" + "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values + "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values + "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values + "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values + "subs %w3, %w3, #16 \n" // 16 pixels per loop + "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2"); +} + +void AYUVToUVRow_NEON(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_uv, + int width) { + const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; + asm volatile( + + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + // pixels. + "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. 
+ "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average + "uqrshrn v2.8b, v1.8h, #2 \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV. + "b.gt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_ayuv_1), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +void AYUVToVURow_NEON(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_vu, + int width) { + const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; + asm volatile( + + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + // pixels. + "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. + "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average + "uqrshrn v1.8b, v1.8h, #2 \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU. + "b.gt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_ayuv_1), // %1 + "+r"(dst_vu), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +// Copy row of AYUV Y's into Y +void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + // pixels + "subs %w2, %w2, #16 \n" // 16 pixels per loop + "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels + "b.gt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +void FloatDivToByteRow_NEON(const float* src_weights, + const float* src_values, + uint8_t* dst_out, + uint8_t* dst_mask, + int width) { + asm volatile( + "movi v0.4s, #0 \n" + + "1: \n" + "ld1 {v1.4s,v2.4s}, [%0], #32 \n" // load 8 float weights + "ld1 {v3.4s,v4.4s}, [%1], #32 \n" // load 8 float values + "subs %w4, %w4, #8 \n" // 8 pixels per loop + + "fdiv v1.4s, v3.4s, v1.4s \n" // values / weights + "fdiv v2.4s, v4.4s, v2.4s \n" + + "fcvtas v1.4s, v1.4s \n" // float to int + "fcvtas v2.4s, v2.4s \n" // float to int + "uqxtn v1.4h, v1.4s \n" // 8 shorts + "uqxtn2 v1.8h, v2.4s \n" + "uqxtn v1.8b, v1.8h \n" // 8 bytes + + "st1 {v1.8b}, [%2], #8 \n" // store 8 byte out + + "fcmgt v5.4s, v1.4s, v0.4s \n" // cmp weight to zero + "fcmgt v6.4s, v2.4s, v0.4s \n" + "uqxtn v5.4h, v5.4s \n" // 8 shorts + "uqxtn2 v5.8h, v6.4s \n" + "uqxtn v5.8b, v1.8h \n" // 8 bytes + + "st1 {v5.8b}, [%3], #8 \n" // store 8 byte mask + + "b.gt 1b \n" + : "+r"(src_weights), // %0 + "+r"(src_values), // %1 + "+r"(dst_out), // %2 + "+r"(dst_mask), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); +} + +// Convert biplanar UV channel of NV12 to NV21 +void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + asm volatile( + "1: \n" + "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 UV values + "orr v2.16b, v0.16b, v0.16b \n" // move U after V + "subs %w2, %w2, #16 \n" // 16 pixels per loop + "st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2"); } #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) diff --git a/files/source/row_win.cc b/files/source/row_win.cc index 202f2b8d..27e3da7b 100644 --- 
a/files/source/row_win.cc +++ b/files/source/row_win.cc @@ -28,27 +28,27 @@ extern "C" { #if defined(_M_X64) // Read 4 UV from 422, upsample to 8 UV. -#define READYUV422 \ - xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \ - xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ - u_buf += 4; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ +#define READYUV422 \ + xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ + xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + u_buf += 4; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ y_buf += 8; // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 \ - xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \ - xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ - u_buf += 4; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; \ - xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ +#define READYUVA422 \ + xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ + xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + u_buf += 4; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + y_buf += 8; \ + xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ a_buf += 8; // Convert 8 pixels: 8 UV and 8 Y. @@ -84,15 +84,15 @@ extern "C" { dst_argb += 32; #if defined(HAS_I422TOARGBROW_SSSE3) -void I422ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +void I422ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __m128i xmm0, xmm1, xmm2, xmm4; const __m128i xmm5 = _mm_set1_epi8(-1); - const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; + const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; while (width > 0) { READYUV422 YUVTORGB(yuvconstants) @@ -103,15 +103,15 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, #endif #if defined(HAS_I422ALPHATOARGBROW_SSSE3) -void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, +void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __m128i xmm0, xmm1, xmm2, xmm4, xmm5; - const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; + const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; while (width > 0) { READYUVA422 YUVTORGB(yuvconstants) @@ -255,8 +255,8 @@ static const lvec8 kShuffleNV21 = { }; // Duplicates gray value 3 times and fills in alpha opaque. -__declspec(naked) void J400ToARGBRow_SSE2(const uint8* src_y, - uint8* dst_argb, +__declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y, + uint8_t* dst_argb, int width) { __asm { mov eax, [esp + 4] // src_y @@ -285,8 +285,8 @@ __declspec(naked) void J400ToARGBRow_SSE2(const uint8* src_y, #ifdef HAS_J400TOARGBROW_AVX2 // Duplicates gray value 3 times and fills in alpha opaque. 
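The same per-pixel expansion applies to both the SSE2 and AVX2 variants; a scalar sketch of what "duplicates gray value 3 times and fills in alpha opaque" means, assuming libyuv's little-endian B,G,R,A byte order (illustrative name, not a libyuv symbol):

#include <stdint.h>

// Scalar sketch of J400 (gray) to ARGB: replicate Y into B/G/R, alpha = 255.
static void J400ToARGBSketch(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    uint8_t y = src_y[x];
    dst_argb[0] = y;    // B
    dst_argb[1] = y;    // G
    dst_argb[2] = y;    // R
    dst_argb[3] = 255;  // A (opaque)
    dst_argb += 4;
  }
}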
-__declspec(naked) void J400ToARGBRow_AVX2(const uint8* src_y, - uint8* dst_argb, +__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y, + uint8_t* dst_argb, int width) { __asm { mov eax, [esp + 4] // src_y @@ -316,8 +316,8 @@ __declspec(naked) void J400ToARGBRow_AVX2(const uint8* src_y, } #endif // HAS_J400TOARGBROW_AVX2 -__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, - uint8* dst_argb, +__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_argb, int width) { __asm { mov eax, [esp + 4] // src_rgb24 @@ -355,8 +355,8 @@ __declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, } } -__declspec(naked) void RAWToARGBRow_SSSE3(const uint8* src_raw, - uint8* dst_argb, +__declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw, + uint8_t* dst_argb, int width) { __asm { mov eax, [esp + 4] // src_raw @@ -394,8 +394,8 @@ __declspec(naked) void RAWToARGBRow_SSSE3(const uint8* src_raw, } } -__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8* src_raw, - uint8* dst_rgb24, +__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, + uint8_t* dst_rgb24, int width) { __asm { mov eax, [esp + 4] // src_raw @@ -430,8 +430,8 @@ __declspec(naked) void RAWToRGB24Row_SSSE3(const uint8* src_raw, // v * (256 + 8) // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 // 20 instructions. -__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, - uint8* dst_argb, +__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565, + uint8_t* dst_argb, int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits @@ -486,8 +486,8 @@ __declspec(naked) void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, // v * 256 + v * 8 // v * (256 + 8) // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 -__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, - uint8* dst_argb, +__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, + uint8_t* dst_argb, int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits @@ -537,8 +537,8 @@ __declspec(naked) void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, #endif // HAS_RGB565TOARGBROW_AVX2 #ifdef HAS_ARGB1555TOARGBROW_AVX2 -__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, - uint8* dst_argb, +__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, + uint8_t* dst_argb, int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits @@ -589,8 +589,8 @@ __declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, #endif // HAS_ARGB1555TOARGBROW_AVX2 #ifdef HAS_ARGB4444TOARGBROW_AVX2 -__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, - uint8* dst_argb, +__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, + uint8_t* dst_argb, int width) { __asm { mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f @@ -627,8 +627,8 @@ __declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, #endif // HAS_ARGB4444TOARGBROW_AVX2 // 24 instructions -__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, - uint8* dst_argb, +__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555, + uint8_t* dst_argb, int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits @@ -680,8 +680,8 @@ __declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, } // 18 instructions. 
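ARGB4444 keeps 4 bits per channel, and the usual 8-bit expansion replicates each nibble into both halves of the output byte (the 0x0f0f0f0f mask above isolates the low nibbles). A hedged scalar sketch, assuming B and G in the first byte and R and A in the second; the exact nibble order is an assumption, and the name is illustrative:

#include <stdint.h>

// Scalar sketch of ARGB4444 to ARGB8888: expand each 4-bit channel to 8 bits
// by nibble replication (x -> (x << 4) | x).
static void ARGB4444ToARGBSketch(const uint8_t* src_argb4444, uint8_t* dst_argb,
                                 int width) {
  for (int x = 0; x < width; ++x) {
    uint8_t b = src_argb4444[0] & 0x0f;
    uint8_t g = src_argb4444[0] >> 4;
    uint8_t r = src_argb4444[1] & 0x0f;
    uint8_t a = src_argb4444[1] >> 4;
    dst_argb[0] = (uint8_t)((b << 4) | b);
    dst_argb[1] = (uint8_t)((g << 4) | g);
    dst_argb[2] = (uint8_t)((r << 4) | r);
    dst_argb[3] = (uint8_t)((a << 4) | a);
    src_argb4444 += 2;
    dst_argb += 4;
  }
}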
-__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, - uint8* dst_argb, +__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444, + uint8_t* dst_argb, int width) { __asm { mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f @@ -718,8 +718,8 @@ __declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, } } -__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8* src_argb, - uint8* dst_rgb, +__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb, + uint8_t* dst_rgb, int width) { __asm { mov eax, [esp + 4] // src_argb @@ -757,8 +757,8 @@ __declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8* src_argb, } } -__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8* src_argb, - uint8* dst_rgb, +__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_rgb, int width) { __asm { mov eax, [esp + 4] // src_argb @@ -796,8 +796,8 @@ __declspec(naked) void ARGBToRAWRow_SSSE3(const uint8* src_argb, } } -__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8* src_argb, - uint8* dst_rgb, +__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, int width) { __asm { mov eax, [esp + 4] // src_argb @@ -834,9 +834,9 @@ __declspec(naked) void ARGBToRGB565Row_SSE2(const uint8* src_argb, } } -__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, - uint8* dst_rgb, - const uint32 dither4, +__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, int width) { __asm { @@ -881,9 +881,9 @@ __declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, } #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 -__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, - uint8* dst_rgb, - const uint32 dither4, +__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, int width) { __asm { mov eax, [esp + 4] // src_argb @@ -925,8 +925,8 @@ __declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, #endif // HAS_ARGBTORGB565DITHERROW_AVX2 // TODO(fbarchard): Improve sign extension/packing. 
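ARGBToARGB1555Row packs 8-bit channels down to a 16-bit 1-5-5-5 pixel by keeping only the top bits of each channel. A minimal scalar sketch of that packing, assuming the usual little-endian ARGB1555 bit layout (illustrative name, not a libyuv symbol):

#include <stdint.h>

// Scalar sketch of ARGB8888 to ARGB1555: top 5 bits of B/G/R, top bit of A.
static void ARGBToARGB1555Sketch(const uint8_t* src_argb, uint8_t* dst_rgb,
                                 int width) {
  for (int x = 0; x < width; ++x) {
    uint16_t b = src_argb[0] >> 3;
    uint16_t g = src_argb[1] >> 3;
    uint16_t r = src_argb[2] >> 3;
    uint16_t a = src_argb[3] >> 7;
    uint16_t pixel = (uint16_t)(b | (g << 5) | (r << 10) | (a << 15));
    dst_rgb[0] = (uint8_t)(pixel & 0xff);
    dst_rgb[1] = (uint8_t)(pixel >> 8);
    src_argb += 4;
    dst_rgb += 2;
  }
}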
-__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8* src_argb, - uint8* dst_rgb, +__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, int width) { __asm { mov eax, [esp + 4] // src_argb @@ -967,8 +967,8 @@ __declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8* src_argb, } } -__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8* src_argb, - uint8* dst_rgb, +__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, int width) { __asm { mov eax, [esp + 4] // src_argb @@ -998,8 +998,8 @@ __declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8* src_argb, } #ifdef HAS_ARGBTORGB565ROW_AVX2 -__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8* src_argb, - uint8* dst_rgb, +__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, int width) { __asm { mov eax, [esp + 4] // src_argb @@ -1036,8 +1036,8 @@ __declspec(naked) void ARGBToRGB565Row_AVX2(const uint8* src_argb, #endif // HAS_ARGBTORGB565ROW_AVX2 #ifdef HAS_ARGBTOARGB1555ROW_AVX2 -__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8* src_argb, - uint8* dst_rgb, +__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, int width) { __asm { mov eax, [esp + 4] // src_argb @@ -1077,8 +1077,8 @@ __declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8* src_argb, #endif // HAS_ARGBTOARGB1555ROW_AVX2 #ifdef HAS_ARGBTOARGB4444ROW_AVX2 -__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8* src_argb, - uint8* dst_rgb, +__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, int width) { __asm { mov eax, [esp + 4] // src_argb @@ -1109,8 +1109,8 @@ __declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8* src_argb, #endif // HAS_ARGBTOARGB4444ROW_AVX2 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. -__declspec(naked) void ARGBToYRow_SSSE3(const uint8* src_argb, - uint8* dst_y, +__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, int width) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -1145,8 +1145,8 @@ __declspec(naked) void ARGBToYRow_SSSE3(const uint8* src_argb, // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. -__declspec(naked) void ARGBToYJRow_SSSE3(const uint8* src_argb, - uint8* dst_y, +__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, int width) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -1185,8 +1185,8 @@ __declspec(naked) void ARGBToYJRow_SSSE3(const uint8* src_argb, static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; // Convert 32 ARGB pixels (128 bytes) to 32 Y values. -__declspec(naked) void ARGBToYRow_AVX2(const uint8* src_argb, - uint8* dst_y, +__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_y, int width) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -1225,8 +1225,8 @@ __declspec(naked) void ARGBToYRow_AVX2(const uint8* src_argb, #ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
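The "J" rows use the full-range JPEG luma; earlier in this diff the NEON gray comment gives the scalar form as (15 * b + 75 * g + 38 * r + 64) >> 7, i.e. rounded 7-bit fixed point with no +16 offset. A sketch on that assumption (illustrative name, not a libyuv symbol):

#include <stdint.h>

// Scalar sketch of the full-range luma used by the YJ rows:
// Y = (15*B + 75*G + 38*R + 64) >> 7, reading B,G,R,A byte order.
static void ARGBToYJSketch(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  for (int x = 0; x < width; ++x) {
    int b = src_argb[0];
    int g = src_argb[1];
    int r = src_argb[2];
    dst_y[x] = (uint8_t)((15 * b + 75 * g + 38 * r + 64) >> 7);
    src_argb += 4;
  }
}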
-__declspec(naked) void ARGBToYJRow_AVX2(const uint8* src_argb, - uint8* dst_y, +__declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_y, int width) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -1265,8 +1265,8 @@ __declspec(naked) void ARGBToYJRow_AVX2(const uint8* src_argb, } #endif // HAS_ARGBTOYJROW_AVX2 -__declspec(naked) void BGRAToYRow_SSSE3(const uint8* src_argb, - uint8* dst_y, +__declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, int width) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -1299,8 +1299,8 @@ __declspec(naked) void BGRAToYRow_SSSE3(const uint8* src_argb, } } -__declspec(naked) void ABGRToYRow_SSSE3(const uint8* src_argb, - uint8* dst_y, +__declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, int width) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -1333,8 +1333,8 @@ __declspec(naked) void ABGRToYRow_SSSE3(const uint8* src_argb, } } -__declspec(naked) void RGBAToYRow_SSSE3(const uint8* src_argb, - uint8* dst_y, +__declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, int width) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -1367,10 +1367,10 @@ __declspec(naked) void RGBAToYRow_SSSE3(const uint8* src_argb, } } -__declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0, +__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, int src_stride_argb, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { __asm { push esi @@ -1410,9 +1410,9 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1426,7 +1426,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0, packsswb xmm0, xmm1 paddb xmm0, xmm5 // -> unsigned - // step 3 - store 8 U and 8 V values + // step 3 - store 8 U and 8 V values movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] @@ -1439,10 +1439,10 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0, } } -__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0, +__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, int src_stride_argb, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { __asm { push esi @@ -1482,9 +1482,9 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1499,7 +1499,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0, psraw xmm1, 8 packsswb xmm0, xmm1 - // step 3 - store 8 U and 8 V values + // step 3 - store 8 U and 8 V values movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] @@ -1513,10 +1513,10 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0, } #ifdef 
HAS_ARGBTOUVROW_AVX2 -__declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0, +__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0, int src_stride_argb, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { __asm { push esi @@ -1549,9 +1549,9 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0, vshufps ymm2, ymm2, ymm3, 0xdd vpavgb ymm2, ymm2, ymm4 // mutated by vshufps - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 32 different pixels, its 16 pixels of U and 16 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 32 different pixels, its 16 pixels of U and 16 of V vpmaddubsw ymm1, ymm0, ymm7 // U vpmaddubsw ymm3, ymm2, ymm7 vpmaddubsw ymm0, ymm0, ymm6 // V @@ -1565,7 +1565,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0, vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw vpaddb ymm0, ymm0, ymm5 // -> unsigned - // step 3 - store 16 U and 16 V values + // step 3 - store 16 U and 16 V values vextractf128 [edx], ymm0, 0 // U vextractf128 [edx + edi], ymm0, 1 // V lea edx, [edx + 16] @@ -1581,10 +1581,10 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0, #endif // HAS_ARGBTOUVROW_AVX2 #ifdef HAS_ARGBTOUVJROW_AVX2 -__declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0, +__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, int src_stride_argb, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { __asm { push esi @@ -1617,9 +1617,9 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0, vshufps ymm2, ymm2, ymm3, 0xdd vpavgb ymm2, ymm2, ymm4 // mutated by vshufps - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 32 different pixels, its 16 pixels of U and 16 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 32 different pixels, its 16 pixels of U and 16 of V vpmaddubsw ymm1, ymm0, ymm7 // U vpmaddubsw ymm3, ymm2, ymm7 vpmaddubsw ymm0, ymm0, ymm6 // V @@ -1634,7 +1634,7 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0, vpermq ymm0, ymm0, 0xd8 // For vpacksswb vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw - // step 3 - store 16 U and 16 V values + // step 3 - store 16 U and 16 V values vextractf128 [edx], ymm0, 0 // U vextractf128 [edx + edi], ymm0, 1 // V lea edx, [edx + 16] @@ -1649,9 +1649,9 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0, } #endif // HAS_ARGBTOUVJROW_AVX2 -__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8* src_argb0, - uint8* dst_u, - uint8* dst_v, +__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0, + uint8_t* dst_u, + uint8_t* dst_v, int width) { __asm { push edi @@ -1707,10 +1707,10 @@ __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8* src_argb0, } } -__declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0, +__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0, int src_stride_argb, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { __asm { push esi @@ -1750,9 +1750,9 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - 
convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1766,7 +1766,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0, packsswb xmm0, xmm1 paddb xmm0, xmm5 // -> unsigned - // step 3 - store 8 U and 8 V values + // step 3 - store 8 U and 8 V values movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] @@ -1779,10 +1779,10 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0, } } -__declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0, +__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0, int src_stride_argb, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { __asm { push esi @@ -1822,9 +1822,9 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1838,7 +1838,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0, packsswb xmm0, xmm1 paddb xmm0, xmm5 // -> unsigned - // step 3 - store 8 U and 8 V values + // step 3 - store 8 U and 8 V values movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] @@ -1851,10 +1851,10 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0, } } -__declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0, +__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0, int src_stride_argb, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { __asm { push esi @@ -1894,9 +1894,9 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1910,7 +1910,7 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0, packsswb xmm0, xmm1 paddb xmm0, xmm5 // -> unsigned - // step 3 - store 8 U and 8 V values + // step 3 - store 8 U and 8 V values movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] @@ -2065,10 +2065,10 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0, // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). __declspec(naked) void I422ToARGBRow_AVX2( - const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __asm { @@ -2105,11 +2105,11 @@ __declspec(naked) void I422ToARGBRow_AVX2( // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. 
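In 4:2:2 input there is one U/V pair per two Y samples; as the READYUV422 macro earlier in this file suggests, these rows appear to upsample chroma by simple replication before the matrix multiply. A scalar sketch of just that replication step (illustrative name, not a libyuv symbol):

#include <stdint.h>

// Scalar sketch of 4:2:2 chroma upsampling by replication: pixel x reuses the
// U (or V) sample stored at x/2.
static void UpsampleUV422Sketch(const uint8_t* u_422, uint8_t* u_444, int width) {
  for (int x = 0; x < width; ++x) {
    u_444[x] = u_422[x / 2];
  }
}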
__declspec(naked) void I422AlphaToARGBRow_AVX2( - const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __asm { @@ -2148,10 +2148,10 @@ __declspec(naked) void I422AlphaToARGBRow_AVX2( // 16 pixels // 16 UV values with 16 Y producing 16 ARGB (64 bytes). __declspec(naked) void I444ToARGBRow_AVX2( - const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __asm { @@ -2187,9 +2187,9 @@ __declspec(naked) void I444ToARGBRow_AVX2( // 16 pixels. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). __declspec(naked) void NV12ToARGBRow_AVX2( - const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, + const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __asm { @@ -2222,9 +2222,9 @@ __declspec(naked) void NV12ToARGBRow_AVX2( // 16 pixels. // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). __declspec(naked) void NV21ToARGBRow_AVX2( - const uint8* y_buf, - const uint8* vu_buf, - uint8* dst_argb, + const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __asm { @@ -2257,8 +2257,8 @@ __declspec(naked) void NV21ToARGBRow_AVX2( // 16 pixels. // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). __declspec(naked) void YUY2ToARGBRow_AVX2( - const uint8* src_yuy2, - uint8* dst_argb, + const uint8_t* src_yuy2, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __asm { @@ -2288,8 +2288,8 @@ __declspec(naked) void YUY2ToARGBRow_AVX2( // 16 pixels. // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). __declspec(naked) void UYVYToARGBRow_AVX2( - const uint8* src_uyvy, - uint8* dst_argb, + const uint8_t* src_uyvy, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __asm { @@ -2319,10 +2319,10 @@ __declspec(naked) void UYVYToARGBRow_AVX2( // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). __declspec(naked) void I422ToRGBARow_AVX2( - const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __asm { @@ -2551,10 +2551,10 @@ __declspec(naked) void I422ToRGBARow_AVX2( // 8 pixels. // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). __declspec(naked) void I444ToARGBRow_SSSE3( - const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __asm { @@ -2588,10 +2588,10 @@ __declspec(naked) void I444ToARGBRow_SSSE3( // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). 
__declspec(naked) void I422ToRGB24Row_SSSE3( - const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgb24, + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { __asm { @@ -2626,10 +2626,10 @@ __declspec(naked) void I422ToRGB24Row_SSSE3( // 8 pixels // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). __declspec(naked) void I422ToRGB565Row_SSSE3( - const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb565_buf, + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb565_buf, const struct YuvConstants* yuvconstants, int width) { __asm { @@ -2669,10 +2669,10 @@ __declspec(naked) void I422ToRGB565Row_SSSE3( // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). __declspec(naked) void I422ToARGBRow_SSSE3( - const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __asm { @@ -2706,11 +2706,11 @@ __declspec(naked) void I422ToARGBRow_SSSE3( // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB. __declspec(naked) void I422AlphaToARGBRow_SSSE3( - const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __asm { @@ -2746,9 +2746,9 @@ __declspec(naked) void I422AlphaToARGBRow_SSSE3( // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). __declspec(naked) void NV12ToARGBRow_SSSE3( - const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, + const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __asm { @@ -2778,9 +2778,9 @@ __declspec(naked) void NV12ToARGBRow_SSSE3( // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). __declspec(naked) void NV21ToARGBRow_SSSE3( - const uint8* y_buf, - const uint8* vu_buf, - uint8* dst_argb, + const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __asm { @@ -2810,8 +2810,8 @@ __declspec(naked) void NV21ToARGBRow_SSSE3( // 8 pixels. // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes). __declspec(naked) void YUY2ToARGBRow_SSSE3( - const uint8* src_yuy2, - uint8* dst_argb, + const uint8_t* src_yuy2, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __asm { @@ -2838,8 +2838,8 @@ __declspec(naked) void YUY2ToARGBRow_SSSE3( // 8 pixels. // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes). 
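I422ToRGB565Row above emits 16 bytes for 8 pixels because each converted pixel is packed down from 8-bit channels to the 5-6-5 layout. The packing itself is plain bit slicing; a one-line helper (hypothetical name) shows the destination layout the SIMD row ultimately produces:

#include <stdint.h>

/* Pack 8-bit R, G, B into RGB565: 5 bits red, 6 bits green, 5 bits blue. */
static uint16_t PackRGB565(uint8_t r, uint8_t g, uint8_t b) {
  return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}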
__declspec(naked) void UYVYToARGBRow_SSSE3( - const uint8* src_uyvy, - uint8* dst_argb, + const uint8_t* src_uyvy, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __asm { @@ -2864,10 +2864,10 @@ __declspec(naked) void UYVYToARGBRow_SSSE3( } __declspec(naked) void I422ToRGBARow_SSSE3( - const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgba, + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { __asm { @@ -2900,8 +2900,8 @@ __declspec(naked) void I422ToRGBARow_SSSE3( #ifdef HAS_I400TOARGBROW_SSE2 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). -__declspec(naked) void I400ToARGBRow_SSE2(const uint8* y_buf, - uint8* rgb_buf, +__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf, + uint8_t* rgb_buf, int width) { __asm { mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) @@ -2927,7 +2927,7 @@ __declspec(naked) void I400ToARGBRow_SSE2(const uint8* y_buf, psrlw xmm0, 6 packuswb xmm0, xmm0 // G - // Step 2: Weave into ARGB + // Step 2: Weave into ARGB punpcklbw xmm0, xmm0 // GG movdqa xmm1, xmm0 punpcklwd xmm0, xmm0 // BGRA first 4 pixels @@ -2947,8 +2947,8 @@ __declspec(naked) void I400ToARGBRow_SSE2(const uint8* y_buf, #ifdef HAS_I400TOARGBROW_AVX2 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). // note: vpunpcklbw mutates and vpackuswb unmutates. -__declspec(naked) void I400ToARGBRow_AVX2(const uint8* y_buf, - uint8* rgb_buf, +__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf, + uint8_t* rgb_buf, int width) { __asm { mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) @@ -2975,8 +2975,8 @@ __declspec(naked) void I400ToARGBRow_AVX2(const uint8* y_buf, vpsrlw ymm0, ymm0, 6 vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 - // TODO(fbarchard): Weave alpha with unpack. - // Step 2: Weave into ARGB + // TODO(fbarchard): Weave alpha with unpack. + // Step 2: Weave into ARGB vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates vpermq ymm1, ymm1, 0xd8 vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels @@ -3000,8 +3000,8 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; // TODO(fbarchard): Replace lea with -16 offset. 
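I400ToARGBRow above scales grey Y by the 1.164 factor called out in its first constant ("round(1.164 * 64 * 256)") and then weaves the result into B, G, R with opaque alpha ("Step 2: Weave into ARGB"). A scalar sketch of that behaviour, assuming the usual limited-range 1.164*(Y-16) expansion; the function name, the 2.14 fixed-point constant and the rounding below are illustrative, not the assembly's exact arithmetic:

#include <stdint.h>

static uint8_t Clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Grey Y expanded to full range and replicated into B, G, R, with A = 255. */
static void I400ToARGBRow_Scalar(const uint8_t* y_buf, uint8_t* rgb_buf,
                                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8_t g = Clamp255(((y_buf[x] - 16) * 19070 + 8192) >> 14);  /* ~1.164 */
    rgb_buf[0] = g;    /* B */
    rgb_buf[1] = g;    /* G */
    rgb_buf[2] = g;    /* R */
    rgb_buf[3] = 255;  /* A */
    rgb_buf += 4;
  }
}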
-__declspec(naked) void MirrorRow_SSSE3(const uint8* src, - uint8* dst, +__declspec(naked) void MirrorRow_SSSE3(const uint8_t* src, + uint8_t* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -3022,7 +3022,9 @@ __declspec(naked) void MirrorRow_SSSE3(const uint8* src, #endif // HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_AVX2 -__declspec(naked) void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void MirrorRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst @@ -3048,9 +3050,9 @@ __declspec(naked) void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; -__declspec(naked) void MirrorUVRow_SSSE3(const uint8* src, - uint8* dst_u, - uint8* dst_v, +__declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, int width) { __asm { push edi @@ -3079,8 +3081,8 @@ __declspec(naked) void MirrorUVRow_SSSE3(const uint8* src, #endif // HAS_MIRRORUVROW_SSSE3 #ifdef HAS_ARGBMIRRORROW_SSE2 -__declspec(naked) void ARGBMirrorRow_SSE2(const uint8* src, - uint8* dst, +__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src, + uint8_t* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -3105,8 +3107,8 @@ __declspec(naked) void ARGBMirrorRow_SSE2(const uint8* src, // Shuffle table for reversing the bytes. static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; -__declspec(naked) void ARGBMirrorRow_AVX2(const uint8* src, - uint8* dst, +__declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src, + uint8_t* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -3127,9 +3129,9 @@ __declspec(naked) void ARGBMirrorRow_AVX2(const uint8* src, #endif // HAS_ARGBMIRRORROW_AVX2 #ifdef HAS_SPLITUVROW_SSE2 -__declspec(naked) void SplitUVRow_SSE2(const uint8* src_uv, - uint8* dst_u, - uint8* dst_v, +__declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width) { __asm { push edi @@ -3167,9 +3169,9 @@ __declspec(naked) void SplitUVRow_SSE2(const uint8* src_uv, #endif // HAS_SPLITUVROW_SSE2 #ifdef HAS_SPLITUVROW_AVX2 -__declspec(naked) void SplitUVRow_AVX2(const uint8* src_uv, - uint8* dst_u, - uint8* dst_v, +__declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width) { __asm { push edi @@ -3207,9 +3209,9 @@ __declspec(naked) void SplitUVRow_AVX2(const uint8* src_uv, #endif // HAS_SPLITUVROW_AVX2 #ifdef HAS_MERGEUVROW_SSE2 -__declspec(naked) void MergeUVRow_SSE2(const uint8* src_u, - const uint8* src_v, - uint8* dst_uv, +__declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width) { __asm { push edi @@ -3239,9 +3241,9 @@ __declspec(naked) void MergeUVRow_SSE2(const uint8* src_u, #endif // HAS_MERGEUVROW_SSE2 #ifdef HAS_MERGEUVROW_AVX2 -__declspec(naked) void MergeUVRow_AVX2(const uint8* src_u, - const uint8* src_v, - uint8* dst_uv, +__declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width) { __asm { push edi @@ -3273,12 +3275,14 @@ __declspec(naked) void MergeUVRow_AVX2(const uint8* src_u, #endif // HAS_MERGEUVROW_AVX2 #ifdef HAS_COPYROW_SSE2 -// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. 
-__declspec(naked) void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { +// CopyRow copys 'width' bytes using a 16 byte load/store, 32 bytes at time. +__declspec(naked) void CopyRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count + mov ecx, [esp + 12] // width test eax, 15 jne convertloopu test edx, 15 @@ -3310,12 +3314,14 @@ __declspec(naked) void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { #endif // HAS_COPYROW_SSE2 #ifdef HAS_COPYROW_AVX -// CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time. -__declspec(naked) void CopyRow_AVX(const uint8* src, uint8* dst, int count) { +// CopyRow copys 'width' bytes using a 32 byte load/store, 64 bytes at time. +__declspec(naked) void CopyRow_AVX(const uint8_t* src, + uint8_t* dst, + int width) { __asm { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count + mov ecx, [esp + 12] // width convertloop: vmovdqu ymm0, [eax] @@ -3334,13 +3340,15 @@ __declspec(naked) void CopyRow_AVX(const uint8* src, uint8* dst, int count) { #endif // HAS_COPYROW_AVX // Multiple of 1. -__declspec(naked) void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { +__declspec(naked) void CopyRow_ERMS(const uint8_t* src, + uint8_t* dst, + int width) { __asm { mov eax, esi mov edx, edi mov esi, [esp + 4] // src mov edi, [esp + 8] // dst - mov ecx, [esp + 12] // count + mov ecx, [esp + 12] // width rep movsb mov edi, edx mov esi, eax @@ -3350,13 +3358,13 @@ __declspec(naked) void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { #ifdef HAS_ARGBCOPYALPHAROW_SSE2 // width in pixels -__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8* src, - uint8* dst, +__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src, + uint8_t* dst, int width) { __asm { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count + mov ecx, [esp + 12] // width pcmpeqb xmm0, xmm0 // generate mask 0xff000000 pslld xmm0, 24 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff @@ -3387,13 +3395,13 @@ __declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8* src, #ifdef HAS_ARGBCOPYALPHAROW_AVX2 // width in pixels -__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8* src, - uint8* dst, +__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src, + uint8_t* dst, int width) { __asm { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count + mov ecx, [esp + 12] // width vpcmpeqb ymm0, ymm0, ymm0 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff @@ -3417,8 +3425,8 @@ __declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8* src, #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 // width in pixels -__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, - uint8* dst_a, +__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_a, int width) { __asm { mov eax, [esp + 4] // src_argb @@ -3445,8 +3453,8 @@ __declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 // width in pixels -__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, - uint8* dst_a, +__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_a, int width) { __asm { mov eax, [esp + 4] // src_argb @@ -3481,13 +3489,13 @@ __declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 // width in pixels -__declspec(naked) void 
ARGBCopyYToAlphaRow_SSE2(const uint8* src, - uint8* dst, +__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, + uint8_t* dst, int width) { __asm { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count + mov ecx, [esp + 12] // width pcmpeqb xmm0, xmm0 // generate mask 0xff000000 pslld xmm0, 24 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff @@ -3520,13 +3528,13 @@ __declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8* src, #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 // width in pixels -__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8* src, - uint8* dst, +__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, + uint8_t* dst, int width) { __asm { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count + mov ecx, [esp + 12] // width vpcmpeqb ymm0, ymm0, ymm0 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff @@ -3551,16 +3559,16 @@ __declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8* src, #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 #ifdef HAS_SETROW_X86 -// Write 'count' bytes using an 8 bit value repeated. -// Count should be multiple of 4. -__declspec(naked) void SetRow_X86(uint8* dst, uint8 v8, int count) { +// Write 'width' bytes using an 8 bit value repeated. +// width should be multiple of 4. +__declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { __asm { movzx eax, byte ptr [esp + 8] // v8 mov edx, 0x01010101 // Duplicate byte to all bytes. mul edx // overwrites edx with upper part of result. mov edx, edi mov edi, [esp + 4] // dst - mov ecx, [esp + 12] // count + mov ecx, [esp + 12] // width shr ecx, 2 rep stosd mov edi, edx @@ -3568,26 +3576,28 @@ __declspec(naked) void SetRow_X86(uint8* dst, uint8 v8, int count) { } } -// Write 'count' bytes using an 8 bit value repeated. -__declspec(naked) void SetRow_ERMS(uint8* dst, uint8 v8, int count) { +// Write 'width' bytes using an 8 bit value repeated. +__declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { __asm { mov edx, edi mov edi, [esp + 4] // dst mov eax, [esp + 8] // v8 - mov ecx, [esp + 12] // count + mov ecx, [esp + 12] // width rep stosb mov edi, edx ret } } -// Write 'count' 32 bit values. -__declspec(naked) void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { +// Write 'width' 32 bit values. 
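SetRow_X86 above leans on two x86 idioms: "mul 0x01010101" replicates the 8-bit value into every byte of a 32-bit word, and "rep stosd" then stores that word repeatedly, which is why the comment asks for width to be a multiple of 4. The same idea in portable C (hypothetical name; memcpy handles the possibly unaligned store):

#include <stdint.h>
#include <string.h>

static void SetRow_Scalar(uint8_t* dst, uint8_t v8, int width) {
  uint32_t v32 = v8 * 0x01010101u;  /* duplicate the byte into all four lanes */
  int x;
  for (x = 0; x < width; x += 4) {
    memcpy(dst + x, &v32, 4);       /* one 4-byte store per iteration */
  }
}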
+__declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb, + uint32_t v32, + int width) { __asm { mov edx, edi mov edi, [esp + 4] // dst mov eax, [esp + 8] // v32 - mov ecx, [esp + 12] // count + mov ecx, [esp + 12] // width rep stosd mov edi, edx ret @@ -3596,8 +3606,8 @@ __declspec(naked) void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { #endif // HAS_SETROW_X86 #ifdef HAS_YUY2TOYROW_AVX2 -__declspec(naked) void YUY2ToYRow_AVX2(const uint8* src_yuy2, - uint8* dst_y, +__declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_y, int width) { __asm { mov eax, [esp + 4] // src_yuy2 @@ -3623,10 +3633,10 @@ __declspec(naked) void YUY2ToYRow_AVX2(const uint8* src_yuy2, } } -__declspec(naked) void YUY2ToUVRow_AVX2(const uint8* src_yuy2, +__declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, int stride_yuy2, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { __asm { push esi @@ -3669,9 +3679,9 @@ __declspec(naked) void YUY2ToUVRow_AVX2(const uint8* src_yuy2, } } -__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, - uint8* dst_u, - uint8* dst_v, +__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, int width) { __asm { push edi @@ -3709,8 +3719,8 @@ __declspec(naked) void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, } } -__declspec(naked) void UYVYToYRow_AVX2(const uint8* src_uyvy, - uint8* dst_y, +__declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_y, int width) { __asm { mov eax, [esp + 4] // src_uyvy @@ -3734,10 +3744,10 @@ __declspec(naked) void UYVYToYRow_AVX2(const uint8* src_uyvy, } } -__declspec(naked) void UYVYToUVRow_AVX2(const uint8* src_uyvy, +__declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, int stride_uyvy, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { __asm { push esi @@ -3780,9 +3790,9 @@ __declspec(naked) void UYVYToUVRow_AVX2(const uint8* src_uyvy, } } -__declspec(naked) void UYVYToUV422Row_AVX2(const uint8* src_uyvy, - uint8* dst_u, - uint8* dst_v, +__declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, int width) { __asm { push edi @@ -3822,8 +3832,8 @@ __declspec(naked) void UYVYToUV422Row_AVX2(const uint8* src_uyvy, #endif // HAS_YUY2TOYROW_AVX2 #ifdef HAS_YUY2TOYROW_SSE2 -__declspec(naked) void YUY2ToYRow_SSE2(const uint8* src_yuy2, - uint8* dst_y, +__declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_y, int width) { __asm { mov eax, [esp + 4] // src_yuy2 @@ -3847,10 +3857,10 @@ __declspec(naked) void YUY2ToYRow_SSE2(const uint8* src_yuy2, } } -__declspec(naked) void YUY2ToUVRow_SSE2(const uint8* src_yuy2, +__declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, int stride_yuy2, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { __asm { push esi @@ -3892,9 +3902,9 @@ __declspec(naked) void YUY2ToUVRow_SSE2(const uint8* src_yuy2, } } -__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, - uint8* dst_u, - uint8* dst_v, +__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, int width) { __asm { push edi @@ -3929,8 +3939,8 @@ __declspec(naked) void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, } } -__declspec(naked) void UYVYToYRow_SSE2(const uint8* src_uyvy, - uint8* dst_y, +__declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_y, int width) { __asm { mov eax, [esp + 4] // 
src_uyvy @@ -3952,10 +3962,10 @@ __declspec(naked) void UYVYToYRow_SSE2(const uint8* src_uyvy, } } -__declspec(naked) void UYVYToUVRow_SSE2(const uint8* src_uyvy, +__declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, int stride_uyvy, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_u, + uint8_t* dst_v, int width) { __asm { push esi @@ -3997,9 +4007,9 @@ __declspec(naked) void UYVYToUVRow_SSE2(const uint8* src_uyvy, } } -__declspec(naked) void UYVYToUV422Row_SSE2(const uint8* src_uyvy, - uint8* dst_u, - uint8* dst_v, +__declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, int width) { __asm { push edi @@ -4041,10 +4051,10 @@ __declspec(naked) void UYVYToUV422Row_SSE2(const uint8* src_uyvy, // =((A2*C2)+(B2*(255-C2))+255)/256 // signed version of math // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -__declspec(naked) void BlendPlaneRow_SSSE3(const uint8* src0, - const uint8* src1, - const uint8* alpha, - uint8* dst, +__declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, int width) { __asm { push esi @@ -4067,7 +4077,7 @@ __declspec(naked) void BlendPlaneRow_SSSE3(const uint8* src0, sub edx, esi sub edi, esi - // 8 pixel loop. + // 8 pixel loop. convertloop8: movq xmm0, qword ptr [esi] // alpha punpcklbw xmm0, xmm0 @@ -4098,10 +4108,10 @@ __declspec(naked) void BlendPlaneRow_SSSE3(const uint8* src0, // =((A2*C2)+(B2*(255-C2))+255)/256 // signed version of math // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -__declspec(naked) void BlendPlaneRow_AVX2(const uint8* src0, - const uint8* src1, - const uint8* alpha, - uint8* dst, +__declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, int width) { __asm { push esi @@ -4123,7 +4133,7 @@ __declspec(naked) void BlendPlaneRow_AVX2(const uint8* src0, sub edx, esi sub edi, esi - // 32 pixel loop. + // 32 pixel loop. convertloop32: vmovdqu ymm0, [esi] // alpha vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31 @@ -4162,9 +4172,9 @@ static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; // Blend 8 pixels at a time. -__declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { __asm { push esi @@ -4183,7 +4193,7 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0, sub ecx, 4 jl convertloop4b // less than 4 pixels? - // 4 pixel loop. + // 4 pixel loop. convertloop4: movdqu xmm3, [eax] // src argb lea eax, [eax + 16] @@ -4212,7 +4222,7 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0, add ecx, 4 - 1 jl convertloop1b - // 1 pixel loop. + // 1 pixel loop. 
convertloop1: movd xmm3, [eax] // src argb lea eax, [eax + 4] @@ -4253,8 +4263,8 @@ static const uvec8 kShuffleAlpha1 = { 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, }; -__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8* src_argb, - uint8* dst_argb, +__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, int width) { __asm { mov eax, [esp + 4] // src_argb0 @@ -4298,8 +4308,8 @@ __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8* src_argb, static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u}; -__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8* src_argb, - uint8* dst_argb, +__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, int width) { __asm { mov eax, [esp + 4] // src_argb0 @@ -4336,8 +4346,8 @@ __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8* src_argb, #ifdef HAS_ARGBUNATTENUATEROW_SSE2 // Unattenuate 4 pixels at a time. -__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8* src_argb, - uint8* dst_argb, +__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, int width) { __asm { push ebx @@ -4392,8 +4402,8 @@ static const uvec8 kUnattenShuffleAlpha_AVX2 = { // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. // USE_GATHER is not on by default, due to being a slow instruction. #ifdef USE_GATHER -__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb, - uint8* dst_argb, +__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, int width) { __asm { mov eax, [esp + 4] // src_argb0 @@ -4426,8 +4436,8 @@ __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb, } } #else // USE_GATHER -__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb, - uint8* dst_argb, +__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, int width) { __asm { @@ -4495,8 +4505,8 @@ __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb, #ifdef HAS_ARGBGRAYROW_SSSE3 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. -__declspec(naked) void ARGBGrayRow_SSSE3(const uint8* src_argb, - uint8* dst_argb, +__declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, int width) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -4552,7 +4562,7 @@ static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0}; // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. -__declspec(naked) void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { +__declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { __asm { mov eax, [esp + 4] /* dst_argb */ mov ecx, [esp + 8] /* width */ @@ -4608,9 +4618,9 @@ __declspec(naked) void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { // Same as Sepia except matrix is provided. // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. 
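The "signed version of math" quoted above the BlendPlaneRow functions is an exact rearrangement of the unsigned blend, not an approximation: centring A and B on 128 keeps each pairwise product-sum inside a signed 16-bit range (what a pmaddubsw-style multiply-add produces), and the +32768+127 restores the +255 rounding bias removed by the centring. A brute-force check of the identity over all byte inputs:

#include <assert.h>

int main(void) {
  int a, b, c;
  for (a = 0; a < 256; ++a) {
    for (b = 0; b < 256; ++b) {
      for (c = 0; c < 256; ++c) {
        int unsigned_form = (a * c + b * (255 - c) + 255) / 256;
        int signed_form =
            ((a - 128) * c + (b - 128) * (255 - c) + 32768 + 127) / 256;
        assert(unsigned_form == signed_form);  /* numerators are identical */
      }
    }
  }
  return 0;
}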
-__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, - uint8* dst_argb, - const int8* matrix_argb, +__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, int width) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -4670,7 +4680,7 @@ __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, #ifdef HAS_ARGBQUANTIZEROW_SSE2 // Quantize 4 ARGB pixels (16 bytes). -__declspec(naked) void ARGBQuantizeRow_SSE2(uint8* dst_argb, +__declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, int scale, int interval_size, int interval_offset, @@ -4717,10 +4727,10 @@ __declspec(naked) void ARGBQuantizeRow_SSE2(uint8* dst_argb, #ifdef HAS_ARGBSHADEROW_SSE2 // Shade 4 pixels at a time by specified value. -__declspec(naked) void ARGBShadeRow_SSE2(const uint8* src_argb, - uint8* dst_argb, +__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, int width, - uint32 value) { + uint32_t value) { __asm { mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_argb @@ -4752,9 +4762,9 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8* src_argb, #ifdef HAS_ARGBMULTIPLYROW_SSE2 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { __asm { push esi @@ -4792,9 +4802,9 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8* src_argb0, #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. // TODO(fbarchard): Port this to posix, neon and other math functions. -__declspec(naked) void ARGBAddRow_SSE2(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { __asm { push esi @@ -4841,9 +4851,9 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8* src_argb0, #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. -__declspec(naked) void ARGBSubtractRow_SSE2(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { __asm { push esi @@ -4871,9 +4881,9 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8* src_argb0, #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { __asm { push esi @@ -4909,9 +4919,9 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8* src_argb0, #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) void ARGBAddRow_AVX2(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { __asm { push esi @@ -4939,9 +4949,9 @@ __declspec(naked) void ARGBAddRow_AVX2(const uint8* src_argb0, #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. 
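ARGBAddRow and ARGBSubtractRow above combine the two source rows byte by byte across all four channels. The scalar model below assumes unsigned saturation (clamping at 0 and 255), the conventional behaviour for 8-bit SSE2 pixel add/subtract; the helper names are illustrative, not libyuv API:

#include <stdint.h>

static uint8_t AddSat(uint8_t a, uint8_t b) {
  int v = a + b;
  return (uint8_t)(v > 255 ? 255 : v);
}

static uint8_t SubSat(uint8_t a, uint8_t b) {
  int v = a - b;
  return (uint8_t)(v < 0 ? 0 : v);
}

/* Combine two ARGB rows; width is in pixels, 4 bytes per pixel. */
static void ARGBAddRow_Scalar(const uint8_t* src0, const uint8_t* src1,
                              uint8_t* dst, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    dst[i] = AddSat(src0[i], src1[i]);
  }
}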
-__declspec(naked) void ARGBSubtractRow_AVX2(const uint8* src_argb0, - const uint8* src_argb1, - uint8* dst_argb, +__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, int width) { __asm { push esi @@ -4972,10 +4982,10 @@ __declspec(naked) void ARGBSubtractRow_AVX2(const uint8* src_argb0, // -1 0 1 // -2 0 2 // -1 0 1 -__declspec(naked) void SobelXRow_SSE2(const uint8* src_y0, - const uint8* src_y1, - const uint8* src_y2, - uint8* dst_sobelx, +__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, int width) { __asm { push esi @@ -5030,9 +5040,9 @@ __declspec(naked) void SobelXRow_SSE2(const uint8* src_y0, // -1 -2 -1 // 0 0 0 // 1 2 1 -__declspec(naked) void SobelYRow_SSE2(const uint8* src_y0, - const uint8* src_y1, - uint8* dst_sobely, +__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, int width) { __asm { push esi @@ -5084,9 +5094,9 @@ __declspec(naked) void SobelYRow_SSE2(const uint8* src_y0, // R = Sobel // G = Sobel // B = Sobel -__declspec(naked) void SobelRow_SSE2(const uint8* src_sobelx, - const uint8* src_sobely, - uint8* dst_argb, +__declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, int width) { __asm { push esi @@ -5132,9 +5142,9 @@ __declspec(naked) void SobelRow_SSE2(const uint8* src_sobelx, #ifdef HAS_SOBELTOPLANEROW_SSE2 // Adds Sobel X and Sobel Y and stores Sobel into a plane. -__declspec(naked) void SobelToPlaneRow_SSE2(const uint8* src_sobelx, - const uint8* src_sobely, - uint8* dst_y, +__declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, int width) { __asm { push esi @@ -5166,9 +5176,9 @@ __declspec(naked) void SobelToPlaneRow_SSE2(const uint8* src_sobelx, // R = Sobel X // G = Sobel // B = Sobel Y -__declspec(naked) void SobelXYRow_SSE2(const uint8* src_sobelx, - const uint8* src_sobely, - uint8* dst_argb, +__declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, int width) { __asm { push esi @@ -5225,11 +5235,11 @@ __declspec(naked) void SobelXYRow_SSE2(const uint8* src_sobelx, // count is number of averaged pixels to produce. // Does 4 pixels at a time. // This function requires alignment on accumulation buffer pointers. -void CumulativeSumToAverageRow_SSE2(const int32* topleft, - const int32* botleft, +void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, + const int32_t* botleft, int width, int area, - uint8* dst, + uint8_t* dst, int count) { __asm { mov eax, topleft // eax topleft @@ -5256,7 +5266,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, cvtps2dq xmm5, xmm5 // 0.16 fixed point packssdw xmm5, xmm5 // 16 bit shorts - // 4 pixel loop small blocks. + // 4 pixel loop small blocks. 
s4: // top left movdqu xmm0, [eax] @@ -5298,7 +5308,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, jmp l4b - // 4 pixel loop + // 4 pixel loop l4: // top left movdqu xmm0, [eax] @@ -5350,7 +5360,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: movdqu xmm0, [eax] psubd xmm0, [eax + edx * 4] @@ -5375,9 +5385,9 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 // Creates a table of cumulative sums where each value is a sum of all values // above and to the left of the value. -void ComputeCumulativeSumRow_SSE2(const uint8* row, - int32* cumsum, - const int32* previous_cumsum, +void ComputeCumulativeSumRow_SSE2(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, int width) { __asm { mov eax, row @@ -5392,7 +5402,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, test edx, 15 jne l4b - // 4 pixel loop + // 4 pixel loop l4: movdqu xmm2, [eax] // 4 argb pixels 16 bytes. lea eax, [eax + 16] @@ -5438,9 +5448,9 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: - movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. + movd xmm2, dword ptr [eax] // 1 argb pixel, 4 bytes. lea eax, [eax + 4] punpcklbw xmm2, xmm1 punpcklwd xmm2, xmm1 @@ -5460,9 +5470,9 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, #ifdef HAS_ARGBAFFINEROW_SSE2 // Copy ARGB pixels from source image with slope to a row of destination. -__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb, +__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, int src_argb_stride, - uint8* dst_argb, + uint8_t* dst_argb, const float* uv_dudv, int width) { __asm { @@ -5481,7 +5491,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb, sub ecx, 4 jl l4b - // setup for 4 pixel loop + // setup for 4 pixel loop pshufd xmm7, xmm7, 0x44 // dup dudv pshufd xmm5, xmm5, 0 // dup 4, stride movdqa xmm0, xmm2 // x0, y0, x1, y1 @@ -5493,7 +5503,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb, addps xmm3, xmm4 addps xmm4, xmm4 // dudv *= 4 - // 4 pixel loop + // 4 pixel loop l4: cvttps2dq xmm0, xmm2 // x, y float to int first 2 cvttps2dq xmm1, xmm3 // x, y float to int next 2 @@ -5524,7 +5534,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: cvttps2dq xmm0, xmm2 // x, y float to int packssdw xmm0, xmm0 // x, y as shorts @@ -5546,8 +5556,8 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb, #ifdef HAS_INTERPOLATEROW_AVX2 // Bilinear filter 32x2 -> 32x1 -__declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr, - const uint8* src_ptr, +__declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr, + const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { @@ -5598,7 +5608,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr, jg xloop jmp xloop99 - // Blend 50 / 50. + // Blend 50 / 50. xloop50: vmovdqu ymm0, [esi] vpavgb ymm0, ymm0, [esi + edx] @@ -5608,7 +5618,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr, jg xloop50 jmp xloop99 - // Blend 100 / 0 - Copy row unchanged. + // Blend 100 / 0 - Copy row unchanged. 
xloop100: rep movsb @@ -5623,8 +5633,8 @@ __declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr, // Bilinear filter 16x2 -> 16x1 // TODO(fbarchard): Consider allowing 256 using memcpy. -__declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr, - const uint8* src_ptr, +__declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { @@ -5638,7 +5648,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr, mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) sub edi, esi - // Dispatch to specialized filters if applicable. + // Dispatch to specialized filters if applicable. cmp eax, 0 je xloop100 // 0 /256. Blend 100 / 0. cmp eax, 128 @@ -5678,7 +5688,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr, jg xloop jmp xloop99 - // Blend 50 / 50. + // Blend 50 / 50. xloop50: movdqu xmm0, [esi] movdqu xmm1, [esi + edx] @@ -5689,7 +5699,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr, jg xloop50 jmp xloop99 - // Blend 100 / 0 - Copy row unchanged. + // Blend 100 / 0 - Copy row unchanged. xloop100: movdqu xmm0, [esi] movdqu [esi + edi], xmm0 @@ -5705,9 +5715,9 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr, } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8* src_argb, - uint8* dst_argb, - const uint8* shuffler, +__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, int width) { __asm { mov eax, [esp + 4] // src_argb @@ -5732,9 +5742,9 @@ __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8* src_argb, } #ifdef HAS_ARGBSHUFFLEROW_AVX2 -__declspec(naked) void ARGBShuffleRow_AVX2(const uint8* src_argb, - uint8* dst_argb, - const uint8* shuffler, +__declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, int width) { __asm { mov eax, [esp + 4] // src_argb @@ -5761,133 +5771,16 @@ __declspec(naked) void ARGBShuffleRow_AVX2(const uint8* src_argb, } #endif // HAS_ARGBSHUFFLEROW_AVX2 -__declspec(naked) void ARGBShuffleRow_SSE2(const uint8* src_argb, - uint8* dst_argb, - const uint8* shuffler, - int width) { - __asm { - push ebx - push esi - mov eax, [esp + 8 + 4] // src_argb - mov edx, [esp + 8 + 8] // dst_argb - mov esi, [esp + 8 + 12] // shuffler - mov ecx, [esp + 8 + 16] // width - pxor xmm5, xmm5 - - mov ebx, [esi] // shuffler - cmp ebx, 0x03000102 - je shuf_3012 - cmp ebx, 0x00010203 - je shuf_0123 - cmp ebx, 0x00030201 - je shuf_0321 - cmp ebx, 0x02010003 - je shuf_2103 - - // TODO(fbarchard): Use one source pointer and 3 offsets. 
- shuf_any1: - movzx ebx, byte ptr [esi] - movzx ebx, byte ptr [eax + ebx] - mov [edx], bl - movzx ebx, byte ptr [esi + 1] - movzx ebx, byte ptr [eax + ebx] - mov [edx + 1], bl - movzx ebx, byte ptr [esi + 2] - movzx ebx, byte ptr [eax + ebx] - mov [edx + 2], bl - movzx ebx, byte ptr [esi + 3] - movzx ebx, byte ptr [eax + ebx] - mov [edx + 3], bl - lea eax, [eax + 4] - lea edx, [edx + 4] - sub ecx, 1 - jg shuf_any1 - jmp shuf99 - - shuf_0123: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm5 - punpckhbw xmm1, xmm5 - pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB - pshuflw xmm0, xmm0, 01Bh - pshufhw xmm1, xmm1, 01Bh - pshuflw xmm1, xmm1, 01Bh - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg shuf_0123 - jmp shuf99 - - shuf_0321: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm5 - punpckhbw xmm1, xmm5 - pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB - pshuflw xmm0, xmm0, 039h - pshufhw xmm1, xmm1, 039h - pshuflw xmm1, xmm1, 039h - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg shuf_0321 - jmp shuf99 - - shuf_2103: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm5 - punpckhbw xmm1, xmm5 - pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA - pshuflw xmm0, xmm0, 093h - pshufhw xmm1, xmm1, 093h - pshuflw xmm1, xmm1, 093h - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg shuf_2103 - jmp shuf99 - - shuf_3012: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm5 - punpckhbw xmm1, xmm5 - pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB - pshuflw xmm0, xmm0, 0C6h - pshufhw xmm1, xmm1, 0C6h - pshuflw xmm1, xmm1, 0C6h - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg shuf_3012 - - shuf99: - pop esi - pop ebx - ret - } -} - // YUY2 - Macro-pixel = 2 image pixels // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... // UYVY - Macro-pixel = 2 image pixels // U0Y0V0Y1 -__declspec(naked) void I422ToYUY2Row_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, +__declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, int width) { __asm { push esi @@ -5921,10 +5814,10 @@ __declspec(naked) void I422ToYUY2Row_SSE2(const uint8* src_y, } } -__declspec(naked) void I422ToUYVYRow_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, +__declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, int width) { __asm { push esi @@ -5959,8 +5852,8 @@ __declspec(naked) void I422ToUYVYRow_SSE2(const uint8* src_y, } #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 -__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb, - uint8* dst_argb, +__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, const float* poly, int width) { __asm { @@ -5971,7 +5864,7 @@ __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb, mov ecx, [esp + 4 + 16] /* width */ pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. - // 2 pixel loop. + // 2 pixel loop. 
convertloop: // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel @@ -6018,8 +5911,8 @@ __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb, #endif // HAS_ARGBPOLYNOMIALROW_SSE2 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 -__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8* src_argb, - uint8* dst_argb, +__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, const float* poly, int width) { __asm { @@ -6058,8 +5951,8 @@ __declspec(naked) void ARGBPolynomialRow_AVX2(const uint8* src_argb, #ifdef HAS_HALFFLOATROW_SSE2 static float kExpBias = 1.9259299444e-34f; -__declspec(naked) void HalfFloatRow_SSE2(const uint16* src, - uint16* dst, +__declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src, + uint16_t* dst, float scale, int width) { __asm { @@ -6072,7 +5965,7 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16* src, pxor xmm5, xmm5 sub edx, eax - // 8 pixel loop. + // 8 pixel loop. convertloop: movdqu xmm2, xmmword ptr [eax] // 8 shorts add eax, 16 @@ -6095,8 +5988,8 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16* src, #endif // HAS_HALFFLOATROW_SSE2 #ifdef HAS_HALFFLOATROW_AVX2 -__declspec(naked) void HalfFloatRow_AVX2(const uint16* src, - uint16* dst, +__declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src, + uint16_t* dst, float scale, int width) { __asm { @@ -6110,7 +6003,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16* src, vpxor ymm5, ymm5, ymm5 sub edx, eax - // 16 pixel loop. + // 16 pixel loop. convertloop: vmovdqu ymm2, [eax] // 16 shorts add eax, 32 @@ -6133,8 +6026,8 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16* src, #endif // HAS_HALFFLOATROW_AVX2 #ifdef HAS_HALFFLOATROW_F16C -__declspec(naked) void HalfFloatRow_F16C(const uint16* src, - uint16* dst, +__declspec(naked) void HalfFloatRow_F16C(const uint16_t* src, + uint16_t* dst, float scale, int width) { __asm { @@ -6144,7 +6037,7 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16* src, mov ecx, [esp + 16] /* width */ sub edx, eax - // 16 pixel loop. + // 16 pixel loop. convertloop: vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts @@ -6167,8 +6060,8 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16* src, #ifdef HAS_ARGBCOLORTABLEROW_X86 // Tranform ARGB pixels with color table. -__declspec(naked) void ARGBColorTableRow_X86(uint8* dst_argb, - const uint8* table_argb, +__declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, int width) { __asm { push esi @@ -6201,8 +6094,8 @@ __declspec(naked) void ARGBColorTableRow_X86(uint8* dst_argb, #ifdef HAS_RGBCOLORTABLEROW_X86 // Tranform RGB pixels with color table. -__declspec(naked) void RGBColorTableRow_X86(uint8* dst_argb, - const uint8* table_argb, +__declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, int width) { __asm { push esi @@ -6233,11 +6126,11 @@ __declspec(naked) void RGBColorTableRow_X86(uint8* dst_argb, #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 // Tranform RGB pixels with luma table. 
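kExpBias in the HalfFloatRow functions above is 2^-112. Multiplying a single-precision value by 2^-112 lowers its exponent by exactly the difference between the float bias (127) and the half bias (15), so for results inside half precision's normal range the IEEE half encoding can be read straight out of the float's bit pattern after a 13-bit right shift. A scalar sketch of that rebias trick (hypothetical name), assuming in-range inputs and ignoring rounding, denormals and overflow:

#include <stdint.h>
#include <string.h>

/* Convert one uint16 sample to IEEE half after scaling, via the 2^-112 trick. */
static uint16_t ScaleToHalf(uint16_t v, float scale) {
  float f = (float)v * scale * 1.9259299444e-34f;  /* scale, then * 2^-112 */
  uint32_t bits;
  memcpy(&bits, &f, 4);            /* reinterpret the float's bit pattern */
  return (uint16_t)(bits >> 13);   /* exponent and top mantissa bits -> half */
}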
-__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, - uint8* dst_argb, +__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, int width, - const uint8* luma, - uint32 lumacoeff) { + const uint8_t* luma, + uint32_t lumacoeff) { __asm { push esi push edi @@ -6252,7 +6145,7 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, psllw xmm4, 8 pxor xmm5, xmm5 - // 4 pixel loop. + // 4 pixel loop. convertloop: movdqu xmm0, xmmword ptr [eax] // generate luma ptr pmaddubsw xmm0, xmm3 diff --git a/files/source/scale.cc b/files/source/scale.cc index 010ad9d4..ab085496 100644 --- a/files/source/scale.cc +++ b/files/source/scale.cc @@ -39,12 +39,12 @@ static void ScalePlaneDown2(int src_width, int dst_height, int src_stride, int dst_stride, - const uint8* src_ptr, - uint8* dst_ptr, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) = + void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width) = filtering == kFilterNone ? ScaleRowDown2_C : (filtering == kFilterLinear ? ScaleRowDown2Linear_C @@ -103,13 +103,6 @@ static void ScalePlaneDown2(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN2_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) && - IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - ScaleRowDown2 = filtering ? ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2; - } -#endif #if defined(HAS_SCALEROWDOWN2_MSA) if (TestCpuFlag(kCpuHasMSA)) { ScaleRowDown2 = @@ -125,6 +118,21 @@ static void ScalePlaneDown2(int src_width, } } #endif +#if defined(HAS_SCALEROWDOWN2_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_MMI + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MMI + : ScaleRowDown2Box_Any_MMI); + if (IS_ALIGNED(dst_width, 8)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MMI + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_MMI + : ScaleRowDown2Box_MMI); + } + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -143,12 +151,12 @@ static void ScalePlaneDown2_16(int src_width, int dst_height, int src_stride, int dst_stride, - const uint16* src_ptr, - uint16* dst_ptr, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width) = + void (*ScaleRowDown2)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width) = filtering == kFilterNone ? ScaleRowDown2_16_C : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C @@ -176,12 +184,12 @@ static void ScalePlaneDown2_16(int src_width, : ScaleRowDown2Box_16_SSE2); } #endif -#if defined(HAS_SCALEROWDOWN2_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) && - IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - ScaleRowDown2 = - filtering ? ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2; +#if defined(HAS_SCALEROWDOWN2_16_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_MMI + : (filtering == kFilterLinear + ? 
ScaleRowDown2Linear_16_MMI + : ScaleRowDown2Box_16_MMI); } #endif @@ -206,12 +214,12 @@ static void ScalePlaneDown4(int src_width, int dst_height, int src_stride, int dst_stride, - const uint8* src_ptr, - uint8* dst_ptr, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) = + void (*ScaleRowDown4)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width) = filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C; int row_stride = src_stride << 2; (void)src_width; @@ -247,13 +255,6 @@ static void ScalePlaneDown4(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN4_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2; - } -#endif #if defined(HAS_SCALEROWDOWN4_MSA) if (TestCpuFlag(kCpuHasMSA)) { ScaleRowDown4 = @@ -263,6 +264,15 @@ static void ScalePlaneDown4(int src_width, } } #endif +#if defined(HAS_SCALEROWDOWN4_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_MMI : ScaleRowDown4_Any_MMI; + if (IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_MMI : ScaleRowDown4_MMI; + } + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -280,12 +290,12 @@ static void ScalePlaneDown4_16(int src_width, int dst_height, int src_stride, int dst_stride, - const uint16* src_ptr, - uint16* dst_ptr, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width) = + void (*ScaleRowDown4)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width) = filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C; int row_stride = src_stride << 2; (void)src_width; @@ -306,12 +316,9 @@ static void ScalePlaneDown4_16(int src_width, filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2; } #endif -#if defined(HAS_SCALEROWDOWN4_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - ScaleRowDown4 = - filtering ? ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2; +#if defined(HAS_SCALEROWDOWN4_16_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_MMI : ScaleRowDown4_16_MMI; } #endif @@ -332,14 +339,14 @@ static void ScalePlaneDown34(int src_width, int dst_height, int src_stride, int dst_stride, - const uint8* src_ptr, - uint8* dst_ptr, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + void (*ScaleRowDown34_0)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; (void)src_width; (void)src_height; @@ -371,6 +378,26 @@ static void ScalePlaneDown34(int src_width, } } #endif +#if defined(HAS_SCALEROWDOWN34_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_MSA; + ScaleRowDown34_1 = ScaleRowDown34_Any_MSA; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_MSA; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_MSA; + } + if (dst_width % 48 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_MSA; + ScaleRowDown34_1 = ScaleRowDown34_MSA; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_MSA; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_MSA; + } + } + } +#endif #if defined(HAS_SCALEROWDOWN34_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { if (!filtering) { @@ -391,19 +418,6 @@ static void ScalePlaneDown34(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN34_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_DSPR2; - ScaleRowDown34_1 = ScaleRowDown34_DSPR2; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_DSPR2; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_DSPR2; - } - } -#endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); @@ -434,14 +448,14 @@ static void ScalePlaneDown34_16(int src_width, int dst_height, int src_stride, int dst_stride, - const uint16* src_ptr, - uint16* dst_ptr, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); - void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); + void (*ScaleRowDown34_0)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; (void)src_width; (void)src_height; @@ -475,19 +489,6 @@ static void ScalePlaneDown34_16(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN34_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_16_DSPR2; - ScaleRowDown34_1 = ScaleRowDown34_16_DSPR2; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_DSPR2; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_DSPR2; - } - } -#endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); @@ -533,14 +534,14 @@ static void ScalePlaneDown38(int src_width, int dst_height, int src_stride, int dst_stride, - const uint8* src_ptr, - uint8* dst_ptr, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + void (*ScaleRowDown38_3)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; assert(dst_width % 3 == 0); (void)src_width; @@ -592,19 +593,6 @@ static void ScalePlaneDown38(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN38_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_DSPR2; - ScaleRowDown38_2 = ScaleRowDown38_DSPR2; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_DSPR2; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_DSPR2; - } - } -#endif #if defined(HAS_SCALEROWDOWN38_MSA) if (TestCpuFlag(kCpuHasMSA)) { if (!filtering) { @@ -655,14 +643,14 @@ static void ScalePlaneDown38_16(int src_width, int dst_height, int src_stride, int dst_stride, - const uint16* src_ptr, - uint16* dst_ptr, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); - void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); + void (*ScaleRowDown38_3)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; (void)src_width; (void)src_height; @@ -696,19 +684,6 @@ static void ScalePlaneDown38_16(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN38_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_16_DSPR2; - ScaleRowDown38_2 = ScaleRowDown38_16_DSPR2; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_DSPR2; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_DSPR2; - } - } -#endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); @@ -735,8 +710,8 @@ static void ScalePlaneDown38_16(int src_width, #define MIN1(x) ((x) < 1 ? 
1 : (x)) -static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { - uint32 sum = 0u; +static __inline uint32_t SumPixels(int iboxwidth, const uint16_t* src_ptr) { + uint32_t sum = 0u; int x; assert(iboxwidth > 0); for (x = 0; x < iboxwidth; ++x) { @@ -745,8 +720,8 @@ static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { return sum; } -static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) { - uint32 sum = 0u; +static __inline uint32_t SumPixels_16(int iboxwidth, const uint32_t* src_ptr) { + uint32_t sum = 0u; int x; assert(iboxwidth > 0); for (x = 0; x < iboxwidth; ++x) { @@ -759,8 +734,8 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx, - const uint16* src_ptr, - uint8* dst_ptr) { + const uint16_t* src_ptr, + uint8_t* dst_ptr) { int i; int scaletbl[2]; int minboxwidth = dx >> 16; @@ -781,8 +756,8 @@ static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx, - const uint32* src_ptr, - uint16* dst_ptr) { + const uint32_t* src_ptr, + uint16_t* dst_ptr) { int i; int scaletbl[2]; int minboxwidth = dx >> 16; @@ -802,11 +777,12 @@ static void ScaleAddCols2_16_C(int dst_width, static void ScaleAddCols0_C(int dst_width, int boxheight, int x, - int, - const uint16* src_ptr, - uint8* dst_ptr) { + int dx, + const uint16_t* src_ptr, + uint8_t* dst_ptr) { int scaleval = 65536 / boxheight; int i; + (void)dx; src_ptr += (x >> 16); for (i = 0; i < dst_width; ++i) { *dst_ptr++ = src_ptr[i] * scaleval >> 16; @@ -817,8 +793,8 @@ static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, - const uint16* src_ptr, - uint8* dst_ptr) { + const uint16_t* src_ptr, + uint8_t* dst_ptr) { int boxwidth = MIN1(dx >> 16); int scaleval = 65536 / (boxwidth * boxheight); int i; @@ -833,8 +809,8 @@ static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx, - const uint32* src_ptr, - uint16* dst_ptr) { + const uint32_t* src_ptr, + uint16_t* dst_ptr) { int boxwidth = MIN1(dx >> 16); int scaleval = 65536 / (boxwidth * boxheight); int i; @@ -857,8 +833,8 @@ static void ScalePlaneBox(int src_width, int dst_height, int src_stride, int dst_stride, - const uint8* src_ptr, - uint8* dst_ptr) { + const uint8_t* src_ptr, + uint8_t* dst_ptr) { int j, k; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -870,14 +846,14 @@ static void ScalePlaneBox(int src_width, &dx, &dy); src_width = Abs(src_width); { - // Allocate a row buffer of uint16. + // Allocate a row buffer of uint16_t. align_buffer_64(row16, src_width * 2); void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, - const uint16* src_ptr, uint8* dst_ptr) = + const uint16_t* src_ptr, uint8_t* dst_ptr) = (dx & 0xffff) ? ScaleAddCols2_C : ((dx != 0x10000) ? 
ScaleAddCols1_C : ScaleAddCols0_C); - void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) = - ScaleAddRow_C; + void (*ScaleAddRow)(const uint8_t* src_ptr, uint16_t* dst_ptr, + int src_width) = ScaleAddRow_C; #if defined(HAS_SCALEADDROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ScaleAddRow = ScaleAddRow_Any_SSE2; @@ -910,11 +886,11 @@ static void ScalePlaneBox(int src_width, } } #endif -#if defined(HAS_SCALEADDROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - ScaleAddRow = ScaleAddRow_Any_DSPR2; - if (IS_ALIGNED(src_width, 16)) { - ScaleAddRow = ScaleAddRow_DSPR2; +#if defined(HAS_SCALEADDROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ScaleAddRow = ScaleAddRow_Any_MMI; + if (IS_ALIGNED(src_width, 8)) { + ScaleAddRow = ScaleAddRow_MMI; } } #endif @@ -922,7 +898,7 @@ static void ScalePlaneBox(int src_width, for (j = 0; j < dst_height; ++j) { int boxheight; int iy = y >> 16; - const uint8* src = src_ptr + iy * src_stride; + const uint8_t* src = src_ptr + iy * src_stride; y += dy; if (y > max_y) { y = max_y; @@ -930,10 +906,10 @@ static void ScalePlaneBox(int src_width, boxheight = MIN1((y >> 16) - iy); memset(row16, 0, src_width * 2); for (k = 0; k < boxheight; ++k) { - ScaleAddRow(src, (uint16*)(row16), src_width); + ScaleAddRow(src, (uint16_t*)(row16), src_width); src += src_stride; } - ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr); + ScaleAddCols(dst_width, boxheight, x, dx, (uint16_t*)(row16), dst_ptr); dst_ptr += dst_stride; } free_aligned_buffer_64(row16); @@ -946,8 +922,8 @@ static void ScalePlaneBox_16(int src_width, int dst_height, int src_stride, int dst_stride, - const uint16* src_ptr, - uint16* dst_ptr) { + const uint16_t* src_ptr, + uint16_t* dst_ptr) { int j, k; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -959,13 +935,13 @@ static void ScalePlaneBox_16(int src_width, &dx, &dy); src_width = Abs(src_width); { - // Allocate a row buffer of uint32. + // Allocate a row buffer of uint32_t. align_buffer_64(row32, src_width * 4); void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, - const uint32* src_ptr, uint16* dst_ptr) = + const uint32_t* src_ptr, uint16_t* dst_ptr) = (dx & 0xffff) ? 
ScaleAddCols2_16_C : ScaleAddCols1_16_C; - void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) = - ScaleAddRow_16_C; + void (*ScaleAddRow)(const uint16_t* src_ptr, uint32_t* dst_ptr, + int src_width) = ScaleAddRow_16_C; #if defined(HAS_SCALEADDROW_16_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) { @@ -973,10 +949,15 @@ static void ScalePlaneBox_16(int src_width, } #endif +#if defined(HAS_SCALEADDROW_16_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(src_width, 4)) { + ScaleAddRow = ScaleAddRow_16_MMI; + } +#endif for (j = 0; j < dst_height; ++j) { int boxheight; int iy = y >> 16; - const uint16* src = src_ptr + iy * src_stride; + const uint16_t* src = src_ptr + iy * src_stride; y += dy; if (y > max_y) { y = max_y; @@ -984,10 +965,10 @@ static void ScalePlaneBox_16(int src_width, boxheight = MIN1((y >> 16) - iy); memset(row32, 0, src_width * 4); for (k = 0; k < boxheight; ++k) { - ScaleAddRow(src, (uint32*)(row32), src_width); + ScaleAddRow(src, (uint32_t*)(row32), src_width); src += src_stride; } - ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr); + ScaleAddCols(dst_width, boxheight, x, dx, (uint32_t*)(row32), dst_ptr); dst_ptr += dst_stride; } free_aligned_buffer_64(row32); @@ -1001,8 +982,8 @@ void ScalePlaneBilinearDown(int src_width, int dst_height, int src_stride, int dst_stride, - const uint8* src_ptr, - uint8* dst_ptr, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -1015,10 +996,10 @@ void ScalePlaneBilinearDown(int src_width, const int max_y = (src_height - 1) << 16; int j; - void (*ScaleFilterCols)(uint8 * dst_ptr, const uint8* src_ptr, int dst_width, - int x, int dx) = + void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, + int dst_width, int x, int dx) = (src_width >= 32768) ? 
ScaleFilterCols64_C : ScaleFilterCols_C; - void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr, + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -1049,14 +1030,6 @@ void ScalePlaneBilinearDown(int src_width, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - InterpolateRow = InterpolateRow_Any_DSPR2; - if (IS_ALIGNED(src_width, 4)) { - InterpolateRow = InterpolateRow_DSPR2; - } - } -#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; @@ -1065,6 +1038,14 @@ void ScalePlaneBilinearDown(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + InterpolateRow = InterpolateRow_Any_MMI; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_MMI; + } + } +#endif #if defined(HAS_SCALEFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { @@ -1079,13 +1060,21 @@ void ScalePlaneBilinearDown(int src_width, } } #endif +#if defined(HAS_SCALEFILTERCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleFilterCols = ScaleFilterCols_MSA; + } + } +#endif if (y > max_y) { y = max_y; } for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint8* src = src_ptr + yi * src_stride; + const uint8_t* src = src_ptr + yi * src_stride; if (filtering == kFilterLinear) { ScaleFilterCols(dst_ptr, src, dst_width, x, dx); } else { @@ -1108,8 +1097,8 @@ void ScalePlaneBilinearDown_16(int src_width, int dst_height, int src_stride, int dst_stride, - const uint16* src_ptr, - uint16* dst_ptr, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -1122,10 +1111,10 @@ void ScalePlaneBilinearDown_16(int src_width, const int max_y = (src_height - 1) << 16; int j; - void (*ScaleFilterCols)(uint16 * dst_ptr, const uint16* src_ptr, + void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width, int x, int dx) = (src_width >= 32768) ? 
ScaleFilterCols64_16_C : ScaleFilterCols_16_C; - void (*InterpolateRow)(uint16 * dst_ptr, const uint16* src_ptr, + void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_16_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -1164,14 +1153,6 @@ void ScalePlaneBilinearDown_16(int src_width, } } #endif -#if defined(HAS_INTERPOLATEROW_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - InterpolateRow = InterpolateRow_Any_16_DSPR2; - if (IS_ALIGNED(src_width, 4)) { - InterpolateRow = InterpolateRow_16_DSPR2; - } - } -#endif #if defined(HAS_SCALEFILTERCOLS_16_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { @@ -1184,13 +1165,13 @@ void ScalePlaneBilinearDown_16(int src_width, for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint16* src = src_ptr + yi * src_stride; + const uint16_t* src = src_ptr + yi * src_stride; if (filtering == kFilterLinear) { ScaleFilterCols(dst_ptr, src, dst_width, x, dx); } else { int yf = (y >> 8) & 255; - InterpolateRow((uint16*)row, src, src_stride, src_width, yf); - ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx); + InterpolateRow((uint16_t*)row, src, src_stride, src_width, yf); + ScaleFilterCols(dst_ptr, (uint16_t*)row, dst_width, x, dx); } dst_ptr += dst_stride; y += dy; @@ -1208,8 +1189,8 @@ void ScalePlaneBilinearUp(int src_width, int dst_height, int src_stride, int dst_stride, - const uint8* src_ptr, - uint8* dst_ptr, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { int j; // Initial source x/y coordinate and step values as 16.16 fixed point. @@ -1218,11 +1199,11 @@ void ScalePlaneBilinearUp(int src_width, int dx = 0; int dy = 0; const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr, + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - void (*ScaleFilterCols)(uint8 * dst_ptr, const uint8* src_ptr, int dst_width, - int x, int dx) = + void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, + int dst_width, int x, int dx) = filtering ? 
ScaleFilterCols_C : ScaleCols_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, &dx, &dy); @@ -1252,14 +1233,6 @@ void ScalePlaneBilinearUp(int src_width, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - InterpolateRow = InterpolateRow_Any_DSPR2; - if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_DSPR2; - } - } -#endif if (filtering && src_width >= 32768) { ScaleFilterCols = ScaleFilterCols64_C; @@ -1277,6 +1250,14 @@ void ScalePlaneBilinearUp(int src_width, } } #endif +#if defined(HAS_SCALEFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleFilterCols = ScaleFilterCols_MSA; + } + } +#endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleFilterCols = ScaleColsUp2_C; #if defined(HAS_SCALECOLS_SSE2) @@ -1284,6 +1265,11 @@ void ScalePlaneBilinearUp(int src_width, ScaleFilterCols = ScaleColsUp2_SSE2; } #endif +#if defined(HAS_SCALECOLS_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleColsUp2_MMI; + } +#endif } if (y > max_y) { @@ -1291,13 +1277,13 @@ void ScalePlaneBilinearUp(int src_width, } { int yi = y >> 16; - const uint8* src = src_ptr + yi * src_stride; + const uint8_t* src = src_ptr + yi * src_stride; // Allocate 2 row buffers. const int kRowSize = (dst_width + 31) & ~31; align_buffer_64(row, kRowSize * 2); - uint8* rowptr = row; + uint8_t* rowptr = row; int rowstride = kRowSize; int lasty = yi; @@ -1343,8 +1329,8 @@ void ScalePlaneBilinearUp_16(int src_width, int dst_height, int src_stride, int dst_stride, - const uint16* src_ptr, - uint16* dst_ptr, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { int j; // Initial source x/y coordinate and step values as 16.16 fixed point. @@ -1353,10 +1339,10 @@ void ScalePlaneBilinearUp_16(int src_width, int dx = 0; int dy = 0; const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint16 * dst_ptr, const uint16* src_ptr, + void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_16_C; - void (*ScaleFilterCols)(uint16 * dst_ptr, const uint16* src_ptr, + void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width, int x, int dx) = filtering ? ScaleFilterCols_16_C : ScaleCols_16_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -1395,14 +1381,6 @@ void ScalePlaneBilinearUp_16(int src_width, } } #endif -#if defined(HAS_INTERPOLATEROW_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - InterpolateRow = InterpolateRow_Any_16_DSPR2; - if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_16_DSPR2; - } - } -#endif if (filtering && src_width >= 32768) { ScaleFilterCols = ScaleFilterCols64_16_C; @@ -1419,6 +1397,11 @@ void ScalePlaneBilinearUp_16(int src_width, ScaleFilterCols = ScaleColsUp2_16_SSE2; } #endif +#if defined(HAS_SCALECOLS_16_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleColsUp2_16_MMI; + } +#endif } if (y > max_y) { @@ -1426,13 +1409,13 @@ void ScalePlaneBilinearUp_16(int src_width, } { int yi = y >> 16; - const uint16* src = src_ptr + yi * src_stride; + const uint16_t* src = src_ptr + yi * src_stride; // Allocate 2 row buffers. 
const int kRowSize = (dst_width + 31) & ~31; align_buffer_64(row, kRowSize * 4); - uint16* rowptr = (uint16*)row; + uint16_t* rowptr = (uint16_t*)row; int rowstride = kRowSize; int lasty = yi; @@ -1483,11 +1466,11 @@ static void ScalePlaneSimple(int src_width, int dst_height, int src_stride, int dst_stride, - const uint8* src_ptr, - uint8* dst_ptr) { + const uint8_t* src_ptr, + uint8_t* dst_ptr) { int i; - void (*ScaleCols)(uint8 * dst_ptr, const uint8* src_ptr, int dst_width, int x, - int dx) = ScaleCols_C; + void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width, + int x, int dx) = ScaleCols_C; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; @@ -1504,6 +1487,11 @@ static void ScalePlaneSimple(int src_width, ScaleCols = ScaleColsUp2_SSE2; } #endif +#if defined(HAS_SCALECOLS_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { + ScaleCols = ScaleColsUp2_MMI; + } +#endif } for (i = 0; i < dst_height; ++i) { @@ -1519,10 +1507,10 @@ static void ScalePlaneSimple_16(int src_width, int dst_height, int src_stride, int dst_stride, - const uint16* src_ptr, - uint16* dst_ptr) { + const uint16_t* src_ptr, + uint16_t* dst_ptr) { int i; - void (*ScaleCols)(uint16 * dst_ptr, const uint16* src_ptr, int dst_width, + void (*ScaleCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width, int x, int dx) = ScaleCols_16_C; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -1540,6 +1528,11 @@ static void ScalePlaneSimple_16(int src_width, ScaleCols = ScaleColsUp2_16_SSE2; } #endif +#if defined(HAS_SCALECOLS_16_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { + ScaleCols = ScaleColsUp2_16_MMI; + } +#endif } for (i = 0; i < dst_height; ++i) { @@ -1553,11 +1546,11 @@ static void ScalePlaneSimple_16(int src_width, // This function dispatches to a specialized scaler based on scale factor. LIBYUV_API -void ScalePlane(const uint8* src, +void ScalePlane(const uint8_t* src, int src_stride, int src_width, int src_height, - uint8* dst, + uint8_t* dst, int dst_stride, int dst_width, int dst_height, @@ -1636,11 +1629,11 @@ void ScalePlane(const uint8* src, } LIBYUV_API -void ScalePlane_16(const uint16* src, +void ScalePlane_16(const uint16_t* src, int src_stride, int src_width, int src_height, - uint16* dst, + uint16_t* dst, int dst_stride, int dst_width, int dst_height, @@ -1663,7 +1656,7 @@ void ScalePlane_16(const uint16* src, CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height); return; } - if (dst_width == src_width) { + if (dst_width == src_width && filtering != kFilterBox) { int dy = FixedDiv(src_height, dst_height); // Arbitrary scale vertically, but unscaled vertically. ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride, @@ -1692,7 +1685,7 @@ void ScalePlane_16(const uint16* src, return; } if (4 * dst_width == src_width && 4 * dst_height == src_height && - filtering != kFilterBilinear) { + (filtering == kFilterBox || filtering == kFilterNone)) { // optimized, 1/4 ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); @@ -1722,19 +1715,19 @@ void ScalePlane_16(const uint16* src, // This function in turn calls a scaling function for each plane. 
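For orientation, a minimal caller of the I420Scale entry point that follows could look like the sketch below. The 640x360 frame size, the contiguous plane layout, and the HalveI420 helper name are illustrative assumptions; only the I420Scale signature and kFilterBox come from the sources in this diff.

/* Hypothetical usage sketch: shrink a contiguous 640x360 I420 frame to
   320x180. Strides equal plane widths here; real buffers may pad rows. */
#include <stdint.h>
#include "libyuv/scale.h"

int HalveI420(const uint8_t* src, uint8_t* dst) {
  const int sw = 640, sh = 360, dw = 320, dh = 180;
  const uint8_t* src_y = src;
  const uint8_t* src_u = src + sw * sh;
  const uint8_t* src_v = src_u + (sw / 2) * (sh / 2);
  uint8_t* dst_y = dst;
  uint8_t* dst_u = dst + dw * dh;
  uint8_t* dst_v = dst_u + (dw / 2) * (dh / 2);
  /* Y is scaled at full resolution, U and V at half resolution. */
  return I420Scale(src_y, sw, src_u, sw / 2, src_v, sw / 2, sw, sh,
                   dst_y, dw, dst_u, dw / 2, dst_v, dw / 2, dw, dh,
                   kFilterBox);
}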
LIBYUV_API -int I420Scale(const uint8* src_y, +int I420Scale(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, int src_width, int src_height, - uint8* dst_y, + uint8_t* dst_y, int dst_stride_y, - uint8* dst_u, + uint8_t* dst_u, int dst_stride_u, - uint8* dst_v, + uint8_t* dst_v, int dst_stride_v, int dst_width, int dst_height, @@ -1759,19 +1752,19 @@ int I420Scale(const uint8* src_y, } LIBYUV_API -int I420Scale_16(const uint16* src_y, +int I420Scale_16(const uint16_t* src_y, int src_stride_y, - const uint16* src_u, + const uint16_t* src_u, int src_stride_u, - const uint16* src_v, + const uint16_t* src_v, int src_stride_v, int src_width, int src_height, - uint16* dst_y, + uint16_t* dst_y, int dst_stride_y, - uint16* dst_u, + uint16_t* dst_u, int dst_stride_u, - uint16* dst_v, + uint16_t* dst_v, int dst_stride_v, int dst_width, int dst_height, @@ -1795,19 +1788,88 @@ int I420Scale_16(const uint16* src_y, return 0; } +// Scale an I444 image. +// This function in turn calls a scaling function for each plane. + +LIBYUV_API +int I444Scale(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u, + dst_width, dst_height, filtering); + ScalePlane(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v, + dst_width, dst_height, filtering); + return 0; +} + +LIBYUV_API +int I444Scale_16(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane_16(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u, + dst_width, dst_height, filtering); + ScalePlane_16(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v, + dst_width, dst_height, filtering); + return 0; +} + // Deprecated api LIBYUV_API -int Scale(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, +int Scale(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, int src_stride_y, int src_stride_u, int src_stride_v, int src_width, int src_height, - uint8* dst_y, - uint8* dst_u, - uint8* dst_v, + uint8_t* dst_y, + uint8_t* dst_u, + uint8_t* dst_v, int dst_stride_y, int dst_stride_u, int dst_stride_v, @@ -1820,43 +1882,6 @@ int Scale(const uint8* src_y, dst_height, interpolate ? 
kFilterBox : kFilterNone); } -// Deprecated api -LIBYUV_API -int ScaleOffset(const uint8* src, - int src_width, - int src_height, - uint8* dst, - int dst_width, - int dst_height, - int dst_yoffset, - LIBYUV_BOOL interpolate) { - // Chroma requires offset to multiple of 2. - int dst_yoffset_even = dst_yoffset & ~1; - int src_halfwidth = SUBSAMPLE(src_width, 1, 1); - int src_halfheight = SUBSAMPLE(src_height, 1, 1); - int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); - int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); - int aheight = dst_height - dst_yoffset_even * 2; // actual output height - const uint8* src_y = src; - const uint8* src_u = src + src_width * src_height; - const uint8* src_v = - src + src_width * src_height + src_halfwidth * src_halfheight; - uint8* dst_y = dst + dst_yoffset_even * dst_width; - uint8* dst_u = - dst + dst_width * dst_height + (dst_yoffset_even >> 1) * dst_halfwidth; - uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight + - (dst_yoffset_even >> 1) * dst_halfwidth; - if (!src || src_width <= 0 || src_height <= 0 || !dst || dst_width <= 0 || - dst_height <= 0 || dst_yoffset_even < 0 || - dst_yoffset_even >= dst_height) { - return -1; - } - return I420Scale(src_y, src_width, src_u, src_halfwidth, src_v, src_halfwidth, - src_width, src_height, dst_y, dst_width, dst_u, - dst_halfwidth, dst_v, dst_halfwidth, dst_width, aheight, - interpolate ? kFilterBox : kFilterNone); -} - #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/scale_any.cc b/files/source/scale_any.cc index d64ba7a9..17831372 100644 --- a/files/source/scale_any.cc +++ b/files/source/scale_any.cc @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <string.h> // For memset/memcpy + #include "libyuv/scale.h" #include "libyuv/scale_row.h" @@ -19,22 +21,32 @@ extern "C" { #endif // Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols -#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \ - void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, \ - int dx) { \ - int n = dst_width & ~MASK; \ - if (n > 0) { \ - TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ - } \ - TERP_C(dst_ptr + n * BPP, src_ptr, dst_width & MASK, x + n * dx, dx); \ +#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \ + int dx) { \ + int r = dst_width & MASK; \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ + } \ + TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \ } #ifdef HAS_SCALEFILTERCOLS_NEON CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) #endif +#ifdef HAS_SCALEFILTERCOLS_MSA +CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15) +#endif #ifdef HAS_SCALEARGBCOLS_NEON CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7) #endif +#ifdef HAS_SCALEARGBCOLS_MSA +CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3) +#endif +#ifdef HAS_SCALEARGBCOLS_MMI +CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0) +#endif #ifdef HAS_SCALEARGBFILTERCOLS_NEON CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON, @@ -42,34 +54,42 @@ CANY(ScaleARGBFilterCols_Any_NEON, 4, 3) #endif +#ifdef HAS_SCALEARGBFILTERCOLS_MSA +CANY(ScaleARGBFilterCols_Any_MSA, + ScaleARGBFilterCols_MSA, + ScaleARGBFilterCols_C, + 4, + 7) +#endif #undef CANY // Fixed scale down. 
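The SDANY/SDODD wrappers that follow all apply one pattern: run the SIMD row function over the largest prefix it can handle, then let the C function finish the tail. A restatement of that pattern as an ordinary function is sketched below; the ScaleRowFn typedef and the function name are assumptions for illustration, the FACTOR=2/BPP=1 arithmetic mirrors the ScaleRowDown2 case of the SDANY macro.

#include <stddef.h>
#include <stdint.h>

typedef void (*ScaleRowFn)(const uint8_t* src, ptrdiff_t stride,
                           uint8_t* dst, int dst_width);

/* Prefix handled by SIMD, remainder by C; modulo handles non-power-of-2
   masks, as the macro's own comment notes. */
static void ScaleRowDown2_AnyPattern(const uint8_t* src, ptrdiff_t stride,
                                     uint8_t* dst, int dst_width,
                                     ScaleRowFn simd, ScaleRowFn c_fn,
                                     int mask) {
  int r = dst_width % (mask + 1);  /* pixels the SIMD path cannot cover */
  int n = dst_width - r;
  if (n > 0) {
    simd(src, stride, dst, n);
  }
  c_fn(src + n * 2, stride, dst + n, r);  /* FACTOR=2, BPP=1 as in SDANY */
}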
-#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, \ - int dst_width) { \ - int r = (int)((unsigned int)dst_width % (MASK + 1)); \ - int n = dst_width - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ - dst_ptr + n * BPP, r); \ +// Mask may be non-power of 2, so use MOD +#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ + int dst_width) { \ + int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \ + int n = dst_width - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r); \ } // Fixed scale down for odd source width. Used by I420Blend subsampling. // Since dst_width is (width + 1) / 2, this function scales one less pixel // and copies the last pixel. -#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, \ - int dst_width) { \ - int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); \ - int n = dst_width - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ - dst_ptr + n * BPP, r); \ +#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ + int dst_width) { \ + int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \ + int n = (dst_width - 1) - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r + 1); \ } #ifdef HAS_SCALEROWDOWN2_SSSE3 @@ -150,6 +170,27 @@ SDANY(ScaleRowDown2Box_Any_MSA, 1, 31) #endif +#ifdef HAS_SCALEROWDOWN2_MMI +SDANY(ScaleRowDown2_Any_MMI, ScaleRowDown2_MMI, ScaleRowDown2_C, 2, 1, 7) +SDANY(ScaleRowDown2Linear_Any_MMI, + ScaleRowDown2Linear_MMI, + ScaleRowDown2Linear_C, + 2, + 1, + 7) +SDANY(ScaleRowDown2Box_Any_MMI, + ScaleRowDown2Box_MMI, + ScaleRowDown2Box_C, + 2, + 1, + 7) +SDODD(ScaleRowDown2Box_Odd_MMI, + ScaleRowDown2Box_MMI, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 7) +#endif #ifdef HAS_SCALEROWDOWN4_SSSE3 SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7) SDANY(ScaleRowDown4Box_Any_SSSE3, @@ -186,6 +227,15 @@ SDANY(ScaleRowDown4Box_Any_MSA, 1, 15) #endif +#ifdef HAS_SCALEROWDOWN4_MMI +SDANY(ScaleRowDown4_Any_MMI, ScaleRowDown4_MMI, ScaleRowDown4_C, 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_MMI, + ScaleRowDown4Box_MMI, + ScaleRowDown4Box_C, + 4, + 1, + 7) +#endif #ifdef HAS_SCALEROWDOWN34_SSSE3 SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3, @@ -226,6 +276,26 @@ SDANY(ScaleRowDown34_1_Box_Any_NEON, 1, 23) #endif +#ifdef HAS_SCALEROWDOWN34_MSA +SDANY(ScaleRowDown34_Any_MSA, + ScaleRowDown34_MSA, + ScaleRowDown34_C, + 4 / 3, + 1, + 47) +SDANY(ScaleRowDown34_0_Box_Any_MSA, + ScaleRowDown34_0_Box_MSA, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 47) +SDANY(ScaleRowDown34_1_Box_Any_MSA, + ScaleRowDown34_1_Box_MSA, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 47) +#endif #ifdef HAS_SCALEROWDOWN38_SSSE3 SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3, @@ -347,19 +417,39 @@ 
SDANY(ScaleARGBRowDown2Box_Any_MSA, 4, 3) #endif +#ifdef HAS_SCALEARGBROWDOWN2_MMI +SDANY(ScaleARGBRowDown2_Any_MMI, + ScaleARGBRowDown2_MMI, + ScaleARGBRowDown2_C, + 2, + 4, + 1) +SDANY(ScaleARGBRowDown2Linear_Any_MMI, + ScaleARGBRowDown2Linear_MMI, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 1) +SDANY(ScaleARGBRowDown2Box_Any_MMI, + ScaleARGBRowDown2Box_MMI, + ScaleARGBRowDown2Box_C, + 2, + 4, + 1) +#endif #undef SDANY // Scale down by even scale factor. -#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \ - uint8* dst_ptr, int dst_width) { \ - int r = (int)((unsigned int)dst_width % (MASK + 1)); \ - int n = dst_width - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \ - dst_ptr + n * BPP, r); \ +#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \ + uint8_t* dst_ptr, int dst_width) { \ + int r = dst_width & MASK; \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \ + dst_ptr + n * BPP, r); \ } #ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2 @@ -398,15 +488,66 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_MSA, 4, 3) #endif +#ifdef HAS_SCALEARGBROWDOWNEVEN_MMI +SDAANY(ScaleARGBRowDownEven_Any_MMI, + ScaleARGBRowDownEven_MMI, + ScaleARGBRowDownEven_C, + 4, + 1) +SDAANY(ScaleARGBRowDownEvenBox_Any_MMI, + ScaleARGBRowDownEvenBox_MMI, + ScaleARGBRowDownEvenBox_C, + 4, + 1) +#endif -// Add rows box filter scale down. -#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ - void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \ - int n = src_width & ~MASK; \ +#ifdef SASIMDONLY +// This also works and uses memcpy and SIMD instead of C, but is slower on ARM + +// Add rows box filter scale down. Using macro from row_any +#define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint16_t dst_temp[32]); \ + SIMD_ALIGNED(uint8_t src_temp[32]); \ + memset(dst_temp, 0, 32 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ if (n > 0) { \ - SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ + ANY_SIMD(src_ptr, dst_ptr, n); \ } \ - SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ + memcpy(src_temp, src_ptr + n * SBPP, r * SBPP); \ + memcpy(dst_temp, dst_ptr + n * BPP, r * BPP); \ + ANY_SIMD(src_temp, dst_temp, MASK + 1); \ + memcpy(dst_ptr + n * BPP, dst_temp, r * BPP); \ + } + +#ifdef HAS_SCALEADDROW_SSE2 +SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15) +#endif +#ifdef HAS_SCALEADDROW_AVX2 +SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31) +#endif +#ifdef HAS_SCALEADDROW_NEON +SAROW(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, 1, 2, 15) +#endif +#ifdef HAS_SCALEADDROW_MSA +SAROW(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, 1, 2, 15) +#endif +#ifdef HAS_SCALEADDROW_MMI +SAROW(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, 1, 2, 7) +#endif +#undef SAANY + +#else + +// Add rows box filter scale down. 
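As context for the row-add helpers wrapped here, the box filter works in two passes: source rows are summed into a uint16_t accumulator row (ScaleAddRow), then groups of accumulated columns are averaged with a 16.16 reciprocal (ScaleAddCols). The sketch below restates that idea in plain C and is not the library's exact code; the BoxScaleRow name is an assumption.

#include <stdint.h>

/* Sum 'boxheight' source rows into a uint16_t accumulator row, then
   average 'boxwidth' accumulated columns per output pixel. */
static void BoxScaleRow(const uint8_t* src, int src_stride,
                        int boxwidth, int boxheight,
                        uint16_t* row, uint8_t* dst, int dst_width) {
  int scaleval = 65536 / (boxwidth * boxheight);  /* 16.16 reciprocal */
  int x, y, k;
  for (x = 0; x < dst_width * boxwidth; ++x) {
    row[x] = 0;
  }
  for (y = 0; y < boxheight; ++y) {
    for (x = 0; x < dst_width * boxwidth; ++x) {
      row[x] += src[y * src_stride + x];        /* like ScaleAddRow_C */
    }
  }
  for (x = 0; x < dst_width; ++x) {
    uint32_t sum = 0;
    for (k = 0; k < boxwidth; ++k) {
      sum += row[x * boxwidth + k];
    }
    dst[x] = (uint8_t)(sum * scaleval >> 16);   /* like ScaleAddCols1_C */
  }
}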
+#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \ + int n = src_width & ~MASK; \ + if (n > 0) { \ + SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ + } \ + SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ } #ifdef HAS_SCALEADDROW_SSE2 @@ -421,11 +562,13 @@ SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15) #ifdef HAS_SCALEADDROW_MSA SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15) #endif -#ifdef HAS_SCALEADDROW_DSPR2 -SAANY(ScaleAddRow_Any_DSPR2, ScaleAddRow_DSPR2, ScaleAddRow_C, 15) +#ifdef HAS_SCALEADDROW_MMI +SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7) #endif #undef SAANY +#endif // SASIMDONLY + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/scale_argb.cc b/files/source/scale_argb.cc index 1ea28f0d..beef380a 100644 --- a/files/source/scale_argb.cc +++ b/files/source/scale_argb.cc @@ -36,8 +36,8 @@ static void ScaleARGBDown2(int src_width, int dst_height, int src_stride, int dst_stride, - const uint8* src_argb, - uint8* dst_argb, + const uint8_t* src_argb, + uint8_t* dst_argb, int x, int dx, int y, @@ -45,8 +45,8 @@ static void ScaleARGBDown2(int src_width, enum FilterMode filtering) { int j; int row_stride = src_stride * (dy >> 16); - void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) = + void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride, + uint8_t* dst_argb, int dst_width) = filtering == kFilterNone ? ScaleARGBRowDown2_C : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C @@ -111,6 +111,22 @@ static void ScaleARGBDown2(int src_width, } } #endif +#if defined(HAS_SCALEARGBROWDOWN2_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_Any_MMI + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MMI + : ScaleARGBRowDown2Box_Any_MMI); + if (IS_ALIGNED(dst_width, 2)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_MMI + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MMI + : ScaleARGBRowDown2Box_MMI); + } + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -131,8 +147,8 @@ static void ScaleARGBDown4Box(int src_width, int dst_height, int src_stride, int dst_stride, - const uint8* src_argb, - uint8* dst_argb, + const uint8_t* src_argb, + uint8_t* dst_argb, int x, int dx, int y, @@ -142,8 +158,8 @@ static void ScaleARGBDown4Box(int src_width, const int kRowSize = (dst_width * 2 * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); int row_stride = src_stride * (dy >> 16); - void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) = + void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride, + uint8_t* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C; // Advance to odd row, even column. 
src_argb += (y >> 16) * src_stride + (x >> 16) * 4; @@ -189,8 +205,8 @@ static void ScaleARGBDownEven(int src_width, int dst_height, int src_stride, int dst_stride, - const uint8* src_argb, - uint8* dst_argb, + const uint8_t* src_argb, + uint8_t* dst_argb, int x, int dx, int y, @@ -199,8 +215,8 @@ static void ScaleARGBDownEven(int src_width, int j; int col_step = dx >> 16; int row_stride = (dy >> 16) * src_stride; - void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride, - int src_step, uint8* dst_argb, int dst_width) = + void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride, + int src_step, uint8_t* dst_argb, int dst_width) = filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C; (void)src_width; (void)src_height; @@ -237,6 +253,16 @@ static void ScaleARGBDownEven(int src_width, } } #endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MMI + : ScaleARGBRowDownEven_Any_MMI; + if (IS_ALIGNED(dst_width, 2)) { + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_MMI : ScaleARGBRowDownEven_MMI; + } + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -255,23 +281,23 @@ static void ScaleARGBBilinearDown(int src_width, int dst_height, int src_stride, int dst_stride, - const uint8* src_argb, - uint8* dst_argb, + const uint8_t* src_argb, + uint8_t* dst_argb, int x, int dx, int y, int dy, enum FilterMode filtering) { int j; - void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb, + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb, + void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C; - int64 xlast = x + (int64)(dst_width - 1) * dx; - int64 xl = (dx >= 0) ? x : xlast; - int64 xr = (dx >= 0) ? xlast : x; + int64_t xlast = x + (int64_t)(dst_width - 1) * dx; + int64_t xl = (dx >= 0) ? x : xlast; + int64_t xr = (dx >= 0) ? xlast : x; int clip_src_width; xl = (xl >> 16) & ~3; // Left edge aligned. xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. @@ -306,15 +332,6 @@ static void ScaleARGBBilinearDown(int src_width, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) && - IS_ALIGNED(src_stride, 4)) { - InterpolateRow = InterpolateRow_Any_DSPR2; - if (IS_ALIGNED(clip_src_width, 4)) { - InterpolateRow = InterpolateRow_DSPR2; - } - } -#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; @@ -336,6 +353,14 @@ static void ScaleARGBBilinearDown(int src_width, } } #endif +#if defined(HAS_SCALEARGBFILTERCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; + } + } +#endif // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. // Allocate a row of ARGB. 
{ @@ -347,7 +372,7 @@ static void ScaleARGBBilinearDown(int src_width, } for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint8* src = src_argb + yi * src_stride; + const uint8_t* src = src_argb + yi * src_stride; if (filtering == kFilterLinear) { ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx); } else { @@ -372,18 +397,18 @@ static void ScaleARGBBilinearUp(int src_width, int dst_height, int src_stride, int dst_stride, - const uint8* src_argb, - uint8* dst_argb, + const uint8_t* src_argb, + uint8_t* dst_argb, int x, int dx, int y, int dy, enum FilterMode filtering) { int j; - void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb, + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb, + void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) = filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; const int max_y = (src_height - 1) << 16; @@ -411,12 +436,6 @@ static void ScaleARGBBilinearUp(int src_width, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_argb, 4) && - IS_ALIGNED(dst_stride, 4)) { - InterpolateRow = InterpolateRow_DSPR2; - } -#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; @@ -425,6 +444,14 @@ static void ScaleARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + InterpolateRow = InterpolateRow_Any_MMI; + if (IS_ALIGNED(dst_width, 2)) { + InterpolateRow = InterpolateRow_MMI; + } + } +#endif if (src_width >= 32768) { ScaleARGBFilterCols = filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C; @@ -442,6 +469,14 @@ static void ScaleARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_SCALEARGBFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; + } + } +#endif #if defined(HAS_SCALEARGBCOLS_SSE2) if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBCols_SSE2; @@ -455,6 +490,22 @@ static void ScaleARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_SCALEARGBCOLS_MSA) + if (!filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBCols_MSA; + } + } +#endif +#if defined(HAS_SCALEARGBCOLS_MMI) + if (!filtering && TestCpuFlag(kCpuHasMMI)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_MMI; + if (IS_ALIGNED(dst_width, 1)) { + ScaleARGBFilterCols = ScaleARGBCols_MMI; + } + } +#endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; #if defined(HAS_SCALEARGBCOLSUP2_SSE2) @@ -462,6 +513,11 @@ static void ScaleARGBBilinearUp(int src_width, ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; } #endif +#if defined(HAS_SCALEARGBCOLSUP2_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBColsUp2_MMI; + } +#endif } if (y > max_y) { @@ -470,13 +526,13 @@ static void ScaleARGBBilinearUp(int src_width, { int yi = y >> 16; - const uint8* src = src_argb + yi * src_stride; + const uint8_t* src = src_argb + yi * src_stride; // Allocate 2 rows of ARGB. 
const int kRowSize = (dst_width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); - uint8* rowptr = row; + uint8_t* rowptr = row; int rowstride = kRowSize; int lasty = yi; @@ -526,18 +582,18 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_stride_u, int src_stride_v, int dst_stride_argb, - const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, + const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, int x, int dx, int y, int dy, enum FilterMode filtering) { int j; - void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, - const uint8* v_buf, uint8* rgb_buf, int width) = + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, int width) = I422ToARGBRow_C; #if defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { @@ -563,15 +619,6 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif -#if defined(HAS_I422TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - I422ToARGBRow = I422ToARGBRow_DSPR2; - } -#endif #if defined(HAS_I422TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToARGBRow = I422ToARGBRow_Any_MSA; @@ -581,7 +628,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } #endif - void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb, + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; #if defined(HAS_INTERPOLATEROW_SSSE3) @@ -608,12 +655,6 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_argb, 4) && - IS_ALIGNED(dst_stride_argb, 4)) { - InterpolateRow = InterpolateRow_DSPR2; - } -#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; @@ -623,7 +664,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } #endif - void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb, + void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) = filtering ? 
ScaleARGBFilterCols_C : ScaleARGBCols_C; if (src_width >= 32768) { @@ -643,6 +684,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_SCALEARGBFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; + } + } +#endif #if defined(HAS_SCALEARGBCOLS_SSE2) if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBCols_SSE2; @@ -656,6 +705,22 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_SCALEARGBCOLS_MSA) + if (!filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBCols_MSA; + } + } +#endif +#if defined(HAS_SCALEARGBCOLS_MMI) + if (!filtering && TestCpuFlag(kCpuHasMMI)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_MMI; + if (IS_ALIGNED(dst_width, 1)) { + ScaleARGBFilterCols = ScaleARGBCols_MMI; + } + } +#endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; #if defined(HAS_SCALEARGBCOLSUP2_SSE2) @@ -663,6 +728,11 @@ static void ScaleYUVToARGBBilinearUp(int src_width, ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; } #endif +#if defined(HAS_SCALEARGBCOLSUP2_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBColsUp2_MMI; + } +#endif } const int max_y = (src_height - 1) << 16; @@ -672,9 +742,9 @@ static void ScaleYUVToARGBBilinearUp(int src_width, const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate. int yi = y >> 16; int uv_yi = yi >> kYShift; - const uint8* src_row_y = src_y + yi * src_stride_y; - const uint8* src_row_u = src_u + uv_yi * src_stride_u; - const uint8* src_row_v = src_v + uv_yi * src_stride_v; + const uint8_t* src_row_y = src_y + yi * src_stride_y; + const uint8_t* src_row_u = src_u + uv_yi * src_stride_u; + const uint8_t* src_row_v = src_v + uv_yi * src_stride_v; // Allocate 2 rows of ARGB. const int kRowSize = (dst_width * 4 + 31) & ~31; @@ -683,7 +753,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, // Allocate 1 row of ARGB for source conversion. align_buffer_64(argb_row, src_width * 4); - uint8* rowptr = row; + uint8_t* rowptr = row; int rowstride = kRowSize; int lasty = yi; @@ -755,15 +825,15 @@ static void ScaleARGBSimple(int src_width, int dst_height, int src_stride, int dst_stride, - const uint8* src_argb, - uint8* dst_argb, + const uint8_t* src_argb, + uint8_t* dst_argb, int x, int dx, int y, int dy) { int j; - void (*ScaleARGBCols)(uint8 * dst_argb, const uint8* src_argb, int dst_width, - int x, int dx) = + void (*ScaleARGBCols)(uint8_t * dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = (src_width >= 32768) ? 
ScaleARGBCols64_C : ScaleARGBCols_C; (void)src_height; #if defined(HAS_SCALEARGBCOLS_SSE2) @@ -779,6 +849,22 @@ static void ScaleARGBSimple(int src_width, } } #endif +#if defined(HAS_SCALEARGBCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBCols = ScaleARGBCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBCols = ScaleARGBCols_MSA; + } + } +#endif +#if defined(HAS_SCALEARGBCOLS_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ScaleARGBCols = ScaleARGBCols_Any_MMI; + if (IS_ALIGNED(dst_width, 1)) { + ScaleARGBCols = ScaleARGBCols_MMI; + } + } +#endif if (src_width * 2 == dst_width && x < 0x8000) { ScaleARGBCols = ScaleARGBColsUp2_C; #if defined(HAS_SCALEARGBCOLSUP2_SSE2) @@ -786,6 +872,11 @@ static void ScaleARGBSimple(int src_width, ScaleARGBCols = ScaleARGBColsUp2_SSE2; } #endif +#if defined(HAS_SCALEARGBCOLSUP2_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { + ScaleARGBCols = ScaleARGBColsUp2_MMI; + } +#endif } for (j = 0; j < dst_height; ++j) { @@ -799,11 +890,11 @@ static void ScaleARGBSimple(int src_width, // ScaleARGB a ARGB. // This function in turn calls a scaling function // suitable for handling the desired resolutions. -static void ScaleARGB(const uint8* src, +static void ScaleARGB(const uint8_t* src, int src_stride, int src_width, int src_height, - uint8* dst, + uint8_t* dst, int dst_stride, int dst_width, int dst_height, @@ -832,13 +923,13 @@ static void ScaleARGB(const uint8* src, &dx, &dy); src_width = Abs(src_width); if (clip_x) { - int64 clipf = (int64)(clip_x)*dx; + int64_t clipf = (int64_t)(clip_x)*dx; x += (clipf & 0xffff); src += (clipf >> 16) * 4; dst += clip_x * 4; } if (clip_y) { - int64 clipf = (int64)(clip_y)*dy; + int64_t clipf = (int64_t)(clip_y)*dy; y += (clipf & 0xffff); src += (clipf >> 16) * src_stride; dst += clip_y * dst_stride; @@ -904,11 +995,11 @@ static void ScaleARGB(const uint8* src, } LIBYUV_API -int ARGBScaleClip(const uint8* src_argb, +int ARGBScaleClip(const uint8_t* src_argb, int src_stride_argb, int src_width, int src_height, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int dst_width, int dst_height, @@ -932,11 +1023,11 @@ int ARGBScaleClip(const uint8* src_argb, // Scale an ARGB image. LIBYUV_API -int ARGBScale(const uint8* src_argb, +int ARGBScale(const uint8_t* src_argb, int src_stride_argb, int src_width, int src_height, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, int dst_width, int dst_height, @@ -953,18 +1044,18 @@ int ARGBScale(const uint8* src_argb, // Scale with YUV conversion to ARGB and clipping. LIBYUV_API -int YUVToARGBScaleClip(const uint8* src_y, +int YUVToARGBScaleClip(const uint8_t* src_y, int src_stride_y, - const uint8* src_u, + const uint8_t* src_u, int src_stride_u, - const uint8* src_v, + const uint8_t* src_v, int src_stride_v, - uint32 src_fourcc, + uint32_t src_fourcc, int src_width, int src_height, - uint8* dst_argb, + uint8_t* dst_argb, int dst_stride_argb, - uint32 dst_fourcc, + uint32_t dst_fourcc, int dst_width, int dst_height, int clip_x, @@ -972,7 +1063,7 @@ int YUVToARGBScaleClip(const uint8* src_y, int clip_width, int clip_height, enum FilterMode filtering) { - uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4); + uint8_t* argb_buffer = (uint8_t*)malloc(src_width * src_height * 4); int r; (void)src_fourcc; // TODO(fbarchard): implement and/or assert. 
(void)dst_fourcc; diff --git a/files/source/scale_common.cc b/files/source/scale_common.cc index 1bef39df..63690271 100644 --- a/files/source/scale_common.cc +++ b/files/source/scale_common.cc @@ -28,9 +28,9 @@ static __inline int Abs(int v) { } // CPU agnostic row functions -void ScaleRowDown2_C(const uint8* src_ptr, +void ScaleRowDown2_C(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst, + uint8_t* dst, int dst_width) { int x; (void)src_stride; @@ -45,9 +45,9 @@ void ScaleRowDown2_C(const uint8* src_ptr, } } -void ScaleRowDown2_16_C(const uint16* src_ptr, +void ScaleRowDown2_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16* dst, + uint16_t* dst, int dst_width) { int x; (void)src_stride; @@ -62,11 +62,11 @@ void ScaleRowDown2_16_C(const uint16* src_ptr, } } -void ScaleRowDown2Linear_C(const uint8* src_ptr, +void ScaleRowDown2Linear_C(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst, + uint8_t* dst, int dst_width) { - const uint8* s = src_ptr; + const uint8_t* s = src_ptr; int x; (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { @@ -80,11 +80,11 @@ void ScaleRowDown2Linear_C(const uint8* src_ptr, } } -void ScaleRowDown2Linear_16_C(const uint16* src_ptr, +void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16* dst, + uint16_t* dst, int dst_width) { - const uint16* s = src_ptr; + const uint16_t* s = src_ptr; int x; (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { @@ -98,12 +98,12 @@ void ScaleRowDown2Linear_16_C(const uint16* src_ptr, } } -void ScaleRowDown2Box_C(const uint8* src_ptr, +void ScaleRowDown2Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst, + uint8_t* dst, int dst_width) { - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; @@ -117,12 +117,12 @@ void ScaleRowDown2Box_C(const uint8* src_ptr, } } -void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, +void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst, + uint8_t* dst, int dst_width) { - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; int x; dst_width -= 1; for (x = 0; x < dst_width - 1; x += 2) { @@ -141,12 +141,12 @@ void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, dst[0] = (s[0] + t[0] + 1) >> 1; } -void ScaleRowDown2Box_16_C(const uint16* src_ptr, +void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16* dst, + uint16_t* dst, int dst_width) { - const uint16* s = src_ptr; - const uint16* t = src_ptr + src_stride; + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; @@ -160,9 +160,9 @@ void ScaleRowDown2Box_16_C(const uint16* src_ptr, } } -void ScaleRowDown4_C(const uint8* src_ptr, +void ScaleRowDown4_C(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst, + uint8_t* dst, int dst_width) { int x; (void)src_stride; @@ -177,9 +177,9 @@ void ScaleRowDown4_C(const uint8* src_ptr, } } -void ScaleRowDown4_16_C(const uint16* src_ptr, +void ScaleRowDown4_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16* dst, + uint16_t* dst, int dst_width) { int x; (void)src_stride; @@ -194,9 +194,9 @@ void ScaleRowDown4_16_C(const uint16* src_ptr, } } -void ScaleRowDown4Box_C(const uint8* src_ptr, 
+void ScaleRowDown4Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst, + uint8_t* dst, int dst_width) { intptr_t stride = src_stride; int x; @@ -232,9 +232,9 @@ void ScaleRowDown4Box_C(const uint8* src_ptr, } } -void ScaleRowDown4Box_16_C(const uint16* src_ptr, +void ScaleRowDown4Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16* dst, + uint16_t* dst, int dst_width) { intptr_t stride = src_stride; int x; @@ -270,9 +270,9 @@ void ScaleRowDown4Box_16_C(const uint16* src_ptr, } } -void ScaleRowDown34_C(const uint8* src_ptr, +void ScaleRowDown34_C(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst, + uint8_t* dst, int dst_width) { int x; (void)src_stride; @@ -286,9 +286,9 @@ void ScaleRowDown34_C(const uint8* src_ptr, } } -void ScaleRowDown34_16_C(const uint16* src_ptr, +void ScaleRowDown34_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16* dst, + uint16_t* dst, int dst_width) { int x; (void)src_stride; @@ -303,21 +303,21 @@ void ScaleRowDown34_16_C(const uint16* src_ptr, } // Filter rows 0 and 1 together, 3 : 1 -void ScaleRowDown34_0_Box_C(const uint8* src_ptr, +void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* d, + uint8_t* d, int dst_width) { - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; int x; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { - uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; d[0] = (a0 * 3 + b0 + 2) >> 2; d[1] = (a1 * 3 + b1 + 2) >> 2; d[2] = (a2 * 3 + b2 + 2) >> 2; @@ -327,21 +327,21 @@ void ScaleRowDown34_0_Box_C(const uint8* src_ptr, } } -void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, +void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16* d, + uint16_t* d, int dst_width) { - const uint16* s = src_ptr; - const uint16* t = src_ptr + src_stride; + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; int x; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { - uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; d[0] = (a0 * 3 + b0 + 2) >> 2; d[1] = (a1 * 3 + b1 + 2) >> 2; d[2] = (a2 * 3 + b2 + 2) >> 2; @@ -352,21 +352,21 @@ void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, } // Filter rows 1 and 2 together, 1 : 1 -void ScaleRowDown34_1_Box_C(const uint8* src_ptr, +void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* d, + uint8_t* d, int 
dst_width) { - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; int x; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { - uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; d[0] = (a0 + b0 + 1) >> 1; d[1] = (a1 + b1 + 1) >> 1; d[2] = (a2 + b2 + 1) >> 1; @@ -376,21 +376,21 @@ void ScaleRowDown34_1_Box_C(const uint8* src_ptr, } } -void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, +void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16* d, + uint16_t* d, int dst_width) { - const uint16* s = src_ptr; - const uint16* t = src_ptr + src_stride; + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; int x; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { - uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; d[0] = (a0 + b0 + 1) >> 1; d[1] = (a1 + b1 + 1) >> 1; d[2] = (a2 + b2 + 1) >> 1; @@ -401,8 +401,8 @@ void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, } // Scales a single row of pixels using point sampling. -void ScaleCols_C(uint8* dst_ptr, - const uint8* src_ptr, +void ScaleCols_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, int dst_width, int x, int dx) { @@ -419,8 +419,8 @@ void ScaleCols_C(uint8* dst_ptr, } } -void ScaleCols_16_C(uint16* dst_ptr, - const uint16* src_ptr, +void ScaleCols_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, int dst_width, int x, int dx) { @@ -438,8 +438,8 @@ void ScaleCols_16_C(uint16* dst_ptr, } // Scales a single row of pixels up by 2x using point sampling. -void ScaleColsUp2_C(uint8* dst_ptr, - const uint8* src_ptr, +void ScaleColsUp2_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, int dst_width, int x, int dx) { @@ -456,8 +456,8 @@ void ScaleColsUp2_C(uint8* dst_ptr, } } -void ScaleColsUp2_16_C(uint16* dst_ptr, - const uint16* src_ptr, +void ScaleColsUp2_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, int dst_width, int x, int dx) { @@ -477,15 +477,15 @@ void ScaleColsUp2_16_C(uint16* dst_ptr, // (1-f)a + fb can be replaced with a + f(b-a) #if defined(__arm__) || defined(__aarch64__) #define BLENDER(a, b, f) \ - (uint8)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) + (uint8_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) #else // Intel uses 7 bit math with rounding. 
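Both BLENDER variants compute the same linear interpolation a + f*(b - a) with f as a 16.16 fraction; the ARM form keeps all 16 fractional bits while the x86 form drops to 7 bits with rounding to match its SIMD code. A small standalone check of the two formulas, with assumed sample values, is sketched below.

#include <stdint.h>
#include <stdio.h>

static uint8_t BlendArm(int a, int b, int f) {   /* 16-bit fraction */
  return (uint8_t)(a + ((f * (b - a) + 0x8000) >> 16));
}
static uint8_t BlendX86(int a, int b, int f) {   /* 7-bit fraction */
  return (uint8_t)(a + (((f >> 9) * (b - a) + 0x40) >> 7));
}

int main(void) {
  int f = 0x8000;  /* halfway between the two source pixels */
  printf("%d %d\n", BlendArm(10, 20, f), BlendX86(10, 20, f));  /* 15 15 */
  return 0;
}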
#define BLENDER(a, b, f) \ - (uint8)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7)) + (uint8_t)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7)) #endif -void ScaleFilterCols_C(uint8* dst_ptr, - const uint8* src_ptr, +void ScaleFilterCols_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, int dst_width, int x, int dx) { @@ -511,15 +511,15 @@ void ScaleFilterCols_C(uint8* dst_ptr, } } -void ScaleFilterCols64_C(uint8* dst_ptr, - const uint8* src_ptr, +void ScaleFilterCols64_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, int dst_width, int x32, int dx) { - int64 x = (int64)(x32); + int64_t x = (int64_t)(x32); int j; for (j = 0; j < dst_width - 1; j += 2) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int a = src_ptr[xi]; int b = src_ptr[xi + 1]; dst_ptr[0] = BLENDER(a, b, x & 0xffff); @@ -532,7 +532,7 @@ void ScaleFilterCols64_C(uint8* dst_ptr, dst_ptr += 2; } if (dst_width & 1) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int a = src_ptr[xi]; int b = src_ptr[xi + 1]; dst_ptr[0] = BLENDER(a, b, x & 0xffff); @@ -540,12 +540,14 @@ void ScaleFilterCols64_C(uint8* dst_ptr, } #undef BLENDER -// Same as 8 bit arm blender but return is cast to uint16 +// Same as 8 bit arm blender but return is cast to uint16_t #define BLENDER(a, b, f) \ - (uint16)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) + (uint16_t)( \ + (int)(a) + \ + (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> 16)) -void ScaleFilterCols_16_C(uint16* dst_ptr, - const uint16* src_ptr, +void ScaleFilterCols_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, int dst_width, int x, int dx) { @@ -571,15 +573,15 @@ void ScaleFilterCols_16_C(uint16* dst_ptr, } } -void ScaleFilterCols64_16_C(uint16* dst_ptr, - const uint16* src_ptr, +void ScaleFilterCols64_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, int dst_width, int x32, int dx) { - int64 x = (int64)(x32); + int64_t x = (int64_t)(x32); int j; for (j = 0; j < dst_width - 1; j += 2) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int a = src_ptr[xi]; int b = src_ptr[xi + 1]; dst_ptr[0] = BLENDER(a, b, x & 0xffff); @@ -592,7 +594,7 @@ void ScaleFilterCols64_16_C(uint16* dst_ptr, dst_ptr += 2; } if (dst_width & 1) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int a = src_ptr[xi]; int b = src_ptr[xi + 1]; dst_ptr[0] = BLENDER(a, b, x & 0xffff); @@ -600,9 +602,9 @@ void ScaleFilterCols64_16_C(uint16* dst_ptr, } #undef BLENDER -void ScaleRowDown38_C(const uint8* src_ptr, +void ScaleRowDown38_C(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst, + uint8_t* dst, int dst_width) { int x; (void)src_stride; @@ -616,9 +618,9 @@ void ScaleRowDown38_C(const uint8* src_ptr, } } -void ScaleRowDown38_16_C(const uint16* src_ptr, +void ScaleRowDown38_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16* dst, + uint16_t* dst, int dst_width) { int x; (void)src_stride; @@ -633,9 +635,9 @@ void ScaleRowDown38_16_C(const uint16* src_ptr, } // 8x3 -> 3x1 -void ScaleRowDown38_3_Box_C(const uint8* src_ptr, +void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { intptr_t stride = src_stride; int i; @@ -663,9 +665,9 @@ void ScaleRowDown38_3_Box_C(const uint8* src_ptr, } } -void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr, +void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, + uint16_t* dst_ptr, int dst_width) { intptr_t stride = src_stride; int i; @@ -694,9 +696,9 @@ void ScaleRowDown38_3_Box_16_C(const 
uint16* src_ptr, } // 8x2 -> 3x1 -void ScaleRowDown38_2_Box_C(const uint8* src_ptr, +void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { intptr_t stride = src_stride; int i; @@ -719,9 +721,9 @@ void ScaleRowDown38_2_Box_C(const uint8* src_ptr, } } -void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, +void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, + uint16_t* dst_ptr, int dst_width) { intptr_t stride = src_stride; int i; @@ -744,7 +746,7 @@ void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, } } -void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) { +void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { int x; assert(src_width > 0); for (x = 0; x < src_width - 1; x += 2) { @@ -758,7 +760,9 @@ void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) { } } -void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) { +void ScaleAddRow_16_C(const uint16_t* src_ptr, + uint32_t* dst_ptr, + int src_width) { int x; assert(src_width > 0); for (x = 0; x < src_width - 1; x += 2) { @@ -772,12 +776,12 @@ void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) { } } -void ScaleARGBRowDown2_C(const uint8* src_argb, +void ScaleARGBRowDown2_C(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, + uint8_t* dst_argb, int dst_width) { - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int x; (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { @@ -791,9 +795,9 @@ void ScaleARGBRowDown2_C(const uint8* src_argb, } } -void ScaleARGBRowDown2Linear_C(const uint8* src_argb, +void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, + uint8_t* dst_argb, int dst_width) { int x; (void)src_stride; @@ -807,9 +811,9 @@ void ScaleARGBRowDown2Linear_C(const uint8* src_argb, } } -void ScaleARGBRowDown2Box_C(const uint8* src_argb, +void ScaleARGBRowDown2Box_C(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, + uint8_t* dst_argb, int dst_width) { int x; for (x = 0; x < dst_width; ++x) { @@ -830,13 +834,13 @@ void ScaleARGBRowDown2Box_C(const uint8* src_argb, } } -void ScaleARGBRowDownEven_C(const uint8* src_argb, +void ScaleARGBRowDownEven_C(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, + uint8_t* dst_argb, int dst_width) { - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); (void)src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { @@ -850,10 +854,10 @@ void ScaleARGBRowDownEven_C(const uint8* src_argb, } } -void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, +void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, + uint8_t* dst_argb, int dst_width) { int x; for (x = 0; x < dst_width; ++x) { @@ -875,13 +879,13 @@ void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, } // Scales a single row of pixels using point sampling. 
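For orientation, the three ARGB 2x down-scalers above differ only in sampling: the plain version keeps one pixel of each horizontal pair, the Linear version averages a horizontal pair, and the Box version averages a 2x2 block. Their loop bodies are context lines elided by the hunks; assuming the box filter reduces to a per-channel average with rounding, a minimal sketch of that step:

  #include <assert.h>
  #include <stdint.h>

  /* Per-channel 2x2 box average with rounding (assumed form of the Box reducer). */
  static uint8_t box2x2(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
    return (uint8_t)((a + b + c + d + 2) >> 2);
  }

  int main(void) {
    assert(box2x2(10, 11, 12, 13) == 12);  /* (46 + 2) >> 2 */
    return 0;
  }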
-void ScaleARGBCols_C(uint8* dst_argb, - const uint8* src_argb, +void ScaleARGBCols_C(uint8_t* dst_argb, + const uint8_t* src_argb, int dst_width, int x, int dx) { - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int j; for (j = 0; j < dst_width - 1; j += 2) { dst[0] = src[x >> 16]; @@ -895,14 +899,14 @@ void ScaleARGBCols_C(uint8* dst_argb, } } -void ScaleARGBCols64_C(uint8* dst_argb, - const uint8* src_argb, +void ScaleARGBCols64_C(uint8_t* dst_argb, + const uint8_t* src_argb, int dst_width, int x32, int dx) { - int64 x = (int64)(x32); - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); + int64_t x = (int64_t)(x32); + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int j; for (j = 0; j < dst_width - 1; j += 2) { dst[0] = src[x >> 16]; @@ -917,13 +921,13 @@ void ScaleARGBCols64_C(uint8* dst_argb, } // Scales a single row of pixels up by 2x using point sampling. -void ScaleARGBColsUp2_C(uint8* dst_argb, - const uint8* src_argb, +void ScaleARGBColsUp2_C(uint8_t* dst_argb, + const uint8_t* src_argb, int dst_width, int x, int dx) { - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int j; (void)x; (void)dx; @@ -941,24 +945,24 @@ void ScaleARGBColsUp2_C(uint8* dst_argb, // Mimics SSSE3 blender #define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7 #define BLENDERC(a, b, f, s) \ - (uint32)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) + (uint32_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) #define BLENDER(a, b, f) \ BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | BLENDERC(a, b, f, 8) | \ BLENDERC(a, b, f, 0) -void ScaleARGBFilterCols_C(uint8* dst_argb, - const uint8* src_argb, +void ScaleARGBFilterCols_C(uint8_t* dst_argb, + const uint8_t* src_argb, int dst_width, int x, int dx) { - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int j; for (j = 0; j < dst_width - 1; j += 2) { int xi = x >> 16; int xf = (x >> 9) & 0x7f; - uint32 a = src[xi]; - uint32 b = src[xi + 1]; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; dst[0] = BLENDER(a, b, xf); x += dx; xi = x >> 16; @@ -972,26 +976,26 @@ void ScaleARGBFilterCols_C(uint8* dst_argb, if (dst_width & 1) { int xi = x >> 16; int xf = (x >> 9) & 0x7f; - uint32 a = src[xi]; - uint32 b = src[xi + 1]; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; dst[0] = BLENDER(a, b, xf); } } -void ScaleARGBFilterCols64_C(uint8* dst_argb, - const uint8* src_argb, +void ScaleARGBFilterCols64_C(uint8_t* dst_argb, + const uint8_t* src_argb, int dst_width, int x32, int dx) { - int64 x = (int64)(x32); - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); + int64_t x = (int64_t)(x32); + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int j; for (j = 0; j < dst_width - 1; j += 2) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int xf = (x >> 9) & 0x7f; - uint32 a = src[xi]; - uint32 b = src[xi + 1]; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; dst[0] = BLENDER(a, b, xf); x += dx; xi = x >> 16; @@ -1003,10 +1007,10 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, dst += 2; } if (dst_width & 1) { - int64 xi 
= x >> 16; + int64_t xi = x >> 16; int xf = (x >> 9) & 0x7f; - uint32 a = src[xi]; - uint32 b = src[xi + 1]; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; dst[0] = BLENDER(a, b, xf); } } @@ -1020,8 +1024,8 @@ void ScalePlaneVertical(int src_height, int dst_height, int src_stride, int dst_stride, - const uint8* src_argb, - uint8* dst_argb, + const uint8_t* src_argb, + uint8_t* dst_argb, int x, int y, int dy, @@ -1029,7 +1033,7 @@ void ScalePlaneVertical(int src_height, enum FilterMode filtering) { // TODO(fbarchard): Allow higher bpp. int dst_width_bytes = dst_width * bpp; - void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb, + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; @@ -1063,16 +1067,6 @@ void ScalePlaneVertical(int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) && - IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_argb, 4) && - IS_ALIGNED(dst_stride, 4)) { - InterpolateRow = InterpolateRow_Any_DSPR2; - if (IS_ALIGNED(dst_width_bytes, 4)) { - InterpolateRow = InterpolateRow_DSPR2; - } - } -#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; @@ -1081,6 +1075,14 @@ void ScalePlaneVertical(int src_height, } } #endif +#if defined(HAS_INTERPOLATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + InterpolateRow = InterpolateRow_Any_MMI; + if (IS_ALIGNED(dst_width_bytes, 8)) { + InterpolateRow = InterpolateRow_MMI; + } + } +#endif for (j = 0; j < dst_height; ++j) { int yi; int yf; @@ -1100,8 +1102,8 @@ void ScalePlaneVertical_16(int src_height, int dst_height, int src_stride, int dst_stride, - const uint16* src_argb, - uint16* dst_argb, + const uint16_t* src_argb, + uint16_t* dst_argb, int x, int y, int dy, @@ -1109,7 +1111,7 @@ void ScalePlaneVertical_16(int src_height, enum FilterMode filtering) { // TODO(fbarchard): Allow higher wpp. int dst_width_words = dst_width * wpp; - void (*InterpolateRow)(uint16 * dst_argb, const uint16* src_argb, + void (*InterpolateRow)(uint16_t * dst_argb, const uint16_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_16_C; const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; @@ -1151,16 +1153,6 @@ void ScalePlaneVertical_16(int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) && - IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_argb, 4) && - IS_ALIGNED(dst_stride, 4)) { - InterpolateRow = InterpolateRow_Any_16_DSPR2; - if (IS_ALIGNED(dst_width_bytes, 4)) { - InterpolateRow = InterpolateRow_16_DSPR2; - } - } -#endif for (j = 0; j < dst_height; ++j) { int yi; int yf; @@ -1222,12 +1214,12 @@ enum FilterMode ScaleFilterReduce(int src_width, // Divide num by div and return as 16.16 fixed point result. int FixedDiv_C(int num, int div) { - return (int)(((int64)(num) << 16) / div); + return (int)(((int64_t)(num) << 16) / div); } // Divide num by div and return as 16.16 fixed point result. int FixedDiv1_C(int num, int div) { - return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1)); + return (int)((((int64_t)(num) << 16) - 0x00010001) / (div - 1)); } #define CENTERSTART(dx, s) (dx < 0) ? 
-((-dx >> 1) + s) : ((dx >> 1) + s) @@ -1306,6 +1298,35 @@ void ScaleSlope(int src_width, } #undef CENTERSTART +// Read 8x2 upsample with filtering and write 16x1. +// actually reads an extra pixel, so 9x2. +void ScaleRowUp2_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* src2 = src_ptr + src_stride; + + int x; + for (x = 0; x < dst_width - 1; x += 2) { + uint16_t p0 = src_ptr[0]; + uint16_t p1 = src_ptr[1]; + uint16_t p2 = src2[0]; + uint16_t p3 = src2[1]; + dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; + dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4; + ++src_ptr; + ++src2; + dst += 2; + } + if (dst_width & 1) { + uint16_t p0 = src_ptr[0]; + uint16_t p1 = src_ptr[1]; + uint16_t p2 = src2[0]; + uint16_t p3 = src2[1]; + dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/scale_gcc.cc b/files/source/scale_gcc.cc index f0ac56fc..90a49f30 100644 --- a/files/source/scale_gcc.cc +++ b/files/source/scale_gcc.cc @@ -21,462 +21,458 @@ extern "C" { (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) // Offsets for source bytes 0 to 9 -static uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, - 128, 128, 128, 128, 128, 128, 128, 128}; +static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -static uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, - 128, 128, 128, 128, 128, 128, 128, 128}; +static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, - 128, 128, 128, 128, 128, 128, 128, 128}; +static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 0 to 10 -static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; +static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13}; +static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, + 8, 9, 9, 10, 10, 11, 12, 13}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
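The new ScaleRowUp2_16_C above applies 9/16, 3/16, 3/16, 1/16 weights, which is just the separable product of (3,1)/4 bilinear taps applied across and then down. A small standalone check of that factorization (illustrative values only):

  #include <assert.h>
  #include <stdint.h>

  int main(void) {
    uint16_t p0 = 100, p1 = 200;  /* top-row pair    */
    uint16_t p2 = 300, p3 = 400;  /* bottom-row pair */
    /* Single pass, as written in ScaleRowUp2_16_C. */
    int one_pass = (9 * p0 + 3 * p1 + 3 * p2 + p3 + 8) >> 4;
    /* Separable form: (3,1) taps across each row, then (3,1) taps down, one final round. */
    int row_top = 3 * p0 + p1;
    int row_bot = 3 * p2 + p3;
    int two_pass = (3 * row_top + row_bot + 8) >> 4;
    assert(one_pass == two_pass);  /* both give 175 for this input */
    return 0;
  }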
-static uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, - 10, 11, 12, 13, 13, 14, 14, 15}; +static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, + 10, 11, 12, 13, 13, 14, 14, 15}; // Coefficients for source bytes 0 to 10 -static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; +static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; // Coefficients for source bytes 10 to 21 -static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; +static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; // Coefficients for source bytes 21 to 31 -static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; +static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; // Coefficients for source bytes 21 to 31 -static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; +static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; -static uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128}; +static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; -static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, - 6, 8, 11, 14, 128, 128, 128, 128}; +static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, + 6, 8, 11, 14, 128, 128, 128, 128}; // Arrange words 0,3,6 into 0,1,2 -static uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128}; +static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; // Arrange words 0,3,6 into 3,4,5 -static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, - 6, 7, 12, 13, 128, 128, 128, 128}; +static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, + 6, 7, 12, 13, 128, 128, 128, 128}; // Scaling values for boxes of 3x3 and 2x3 -static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, - 65536 / 9, 65536 / 6, 0, 0}; +static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, + 65536 / 9, 65536 / 6, 0, 0}; // Arrange first value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, - 11, 128, 14, 128, 128, 128, 128, 128}; +static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, + 11, 128, 14, 128, 128, 128, 128, 128}; // Arrange second value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, - 12, 128, 15, 128, 128, 128, 128, 128}; +static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, + 12, 128, 15, 128, 128, 128, 128, 128}; // Arrange third value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, - 13, 128, 128, 128, 128, 128, 128, 128}; +static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, + 13, 128, 128, 128, 128, 128, 128, 128}; // Scaling values for boxes of 3x2 and 2x2 -static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, - 65536 / 3, 65536 / 2, 0, 0}; +static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, + 65536 / 3, 65536 / 2, 0, 0}; // GCC versions of row functions are verbatim conversions from Visual C. 
// Generated using gcc disassembly on Visual C object file: // objdump -D yuvscaler.obj >yuvscaler.txt -void ScaleRowDown2_SSSE3(const uint8* src_ptr, +void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); + asm volatile( + // 16 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } -void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, +void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5" - ); + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm4", "xmm5"); } -void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, +void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "psrlw $0x1,%%xmm0 \n" - "psrlw $0x1,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 
\n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "psrlw $0x1,%%xmm0 \n" + "psrlw $0x1,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #ifdef HAS_SCALEROWDOWN2_AVX2 -void ScaleRowDown2_AVX2(const uint8* src_ptr, +void ScaleRowDown2_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } -void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, +void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20, 0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5" - ); + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 
\n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm4", "xmm5"); } -void ScaleRowDown2Box_AVX2(const uint8* src_ptr, +void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { - asm volatile ( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2 - MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3 - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vpsrlw $0x1,%%ymm0,%%ymm0 \n" - "vpsrlw $0x1,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" + "vpsrlw $0x1,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SCALEROWDOWN2_AVX2 -void ScaleRowDown4_SSSE3(const uint8* src_ptr, +void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrld $0x18,%%xmm5 \n" - "pslld $0x10,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - 
: "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm5" - ); + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + "pslld $0x10,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); } -void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, +void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { intptr_t stridex3; - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "packuswb %%xmm4,%%xmm4 \n" - "psllw $0x3,%%xmm5 \n" - "lea " MEMLEA4(0x00,4,4,2) ",%3 \n" - - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2 - MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3 - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm0 \n" - "paddw %%xmm5,%%xmm0 \n" - "psrlw $0x4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "=&r"(stridex3) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "packuswb %%xmm4,%%xmm4 \n" + "psllw $0x3,%%xmm5 \n" + "lea 0x00(%4,%4,2),%3 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%4,2),%%xmm2 \n" + "movdqu 0x10(%0,%4,2),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + 
"+r"(dst_width), // %2 + "=&r"(stridex3) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #ifdef HAS_SCALEROWDOWN4_AVX2 -void ScaleRowDown4_AVX2(const uint8* src_ptr, +void ScaleRowDown4_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrld $0x18,%%ymm5,%%ymm5 \n" - "vpslld $0x10,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm5" - ); + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrld $0x18,%%ymm5,%%ymm5 \n" + "vpslld $0x10,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); } -void ScaleRowDown4Box_AVX2(const uint8* src_ptr, +void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { - asm volatile ( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpsllw $0x3,%%ymm4,%%ymm5 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2 - MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3 - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - MEMOPREG(vmovdqu,0x00,0,3,2,ymm2) // vmovdqu (%0,%3,2),%%ymm2 - MEMOPREG(vmovdqu,0x20,0,3,2,ymm3) // vmovdqu 0x20(%0,%3,2),%%ymm3 - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - MEMOPREG(vmovdqu,0x00,0,4,1,ymm2) // vmovdqu (%0,%4,1),%%ymm2 - MEMOPREG(vmovdqu,0x20,0,4,1,ymm3) // vmovdqu 0x20(%0,%4,1),%%ymm3 - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpsrlw $0x4,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - 
: "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(src_stride * 3)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpsllw $0x3,%%ymm4,%%ymm5 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpsrlw $0x4,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(src_stride * 3)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_SCALEROWDOWN4_AVX2 -void ScaleRowDown34_SSSE3(const uint8* src_ptr, +void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { (void)src_stride; asm volatile( @@ -487,34 +483,35 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, : "m"(kShuf0), // %0 "m"(kShuf1), // %1 "m"(kShuf2) // %2 - ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "palignr $0x8,%%xmm0,%%xmm1 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "movq %%xmm1," MEMACCESS2(0x8,1) " \n" - "movq %%xmm2," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x18,1) ",%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm2 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, +void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { asm volatile( "movdqa %0,%%xmm2 \n" // kShuf01 @@ 
-524,7 +521,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, : "m"(kShuf01), // %0 "m"(kShuf11), // %1 "m"(kShuf21) // %2 - ); + ); asm volatile( "movdqa %0,%%xmm5 \n" // kMadd01 "movdqa %1,%%xmm0 \n" // kMadd11 @@ -533,54 +530,54 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, : "m"(kMadd01), // %0 "m"(kMadd11), // %1 "m"(kRound34) // %2 - ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS(1) " \n" - "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS2(0x8,1) " \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3),%%xmm7 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x18,1) ",%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } -void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, +void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { asm volatile( "movdqa %0,%%xmm2 \n" // kShuf01 @@ -590,7 +587,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, : "m"(kShuf01), // %0 "m"(kShuf11), // %1 "m"(kShuf21) // %2 - ); + ); asm volatile( "movdqa %0,%%xmm5 \n" // kMadd01 "movdqa %1,%%xmm0 \n" // kMadd11 @@ -599,90 +596,89 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, : "m"(kMadd01), // %0 "m"(kMadd11), // %1 "m"(kRound34) // %2 - ); - - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3,1),%%xmm7 - "pavgb %%xmm6,%%xmm7 \n" - 
"pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS(1) " \n" - "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7 - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS2(0x8,1) " \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3,1),%%xmm7 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x18,1) ",%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); + + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } -void ScaleRowDown38_SSSE3(const uint8* src_ptr, +void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "movhlps %%xmm0,%%xmm1 \n" - "movd %%xmm1," MEMACCESS2(0x8,1) " \n" - "lea " MEMLEA(0xc,1) ",%1 \n" - "sub $0xc,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kShuf38a), // %3 - "m"(kShuf38b) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5" - ); + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movhlps %%xmm0,%%xmm1 \n" + 
"movd %%xmm1,0x8(%1) \n" + "lea 0xc(%1),%1 \n" + "sub $0xc,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kShuf38a), // %3 + "m"(kShuf38b) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); } -void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, +void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { asm volatile( "movdqa %0,%%xmm2 \n" @@ -694,41 +690,40 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, "m"(kShufAb1), // %1 "m"(kShufAb2), // %2 "m"(kScaleAb2) // %3 - ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm1) // movdqu (%0,%3,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "paddusw %%xmm6,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "paddusw %%xmm0,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1," MEMACCESS(1) " \n" - "psrlq $0x10,%%xmm1 \n" - "movd %%xmm1," MEMACCESS2(0x2,1) " \n" - "lea " MEMLEA(0x6,1) ",%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "paddusw %%xmm6,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "paddusw %%xmm0,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,(%1) \n" + "psrlq $0x10,%%xmm1 \n" + "movd %%xmm1,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } -void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, +void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { asm volatile( "movdqa %0,%%xmm2 \n" @@ -739,530 +734,534 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, : "m"(kShufAc), // %0 "m"(kShufAc3), // %1 "m"(kScaleAc33) // %2 - ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm6) // movdqu (%0,%3,1),%%xmm6 - "movhlps %%xmm0,%%xmm1 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,3,2,xmm6) // movdqu (%0,%3,2),%%xmm6 - "lea " MEMLEA(0x10,0) ",%0 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "movdqa %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "pshufb %%xmm3,%%xmm7 \n" - "paddusw %%xmm7,%%xmm6 \n" - "pmulhuw 
%%xmm4,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movd %%xmm6," MEMACCESS(1) " \n" - "psrlq $0x10,%%xmm6 \n" - "movd %%xmm6," MEMACCESS2(0x2,1) " \n" - "lea " MEMLEA(0x6,1) ",%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm6 \n" + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqu 0x00(%0,%3,2),%%xmm6 \n" + "lea 0x10(%0),%0 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "movdqa %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "pshufb %%xmm3,%%xmm7 \n" + "paddusw %%xmm7,%%xmm6 \n" + "pmulhuw %%xmm4,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movd %%xmm6,(%1) \n" + "psrlq $0x10,%%xmm6 \n" + "movd %%xmm6,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } // Reads 16xN bytes and produces 16 shorts at a time. -void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { - asm volatile ( - "pxor %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm3 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16 - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,1) ",%%xmm1 \n" - "movdqa %%xmm3,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "paddusw %%xmm2,%%xmm0 \n" - "paddusw %%xmm3,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); +void ScaleAddRow_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + asm volatile( + + "pxor %%xmm5,%%xmm5 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" // src_ptr += 16 + "movdqu (%1),%%xmm0 \n" + "movdqu 0x10(%1),%%xmm1 \n" + "movdqa %%xmm3,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #ifdef HAS_SCALEADDROW_AVX2 // Reads 32 bytes and accumulates to 32 shorts at a time. 
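ScaleAddRow_SSE2 above, and the AVX2 version that follows, widen each source byte to 16 bits and accumulate it into dst_ptr with saturating adds (paddusw / vpaddusw); rows accumulated this way feed the box filters. A scalar sketch of the per-element operation, assuming the usual row-accumulate semantics (the C reference body is elided by the hunks):

  #include <stdint.h>

  /* Accumulate one row of bytes into a row of 16-bit sums (scalar sketch). */
  static void ScaleAddRowScalar(const uint8_t* src, uint16_t* dst, int width) {
    int x;
    for (x = 0; x < width; ++x) {
      dst[x] = (uint16_t)(dst[x] + src[x]);  /* SIMD versions saturate rather than wrap */
    }
  }

  int main(void) {
    uint8_t src[4] = {1, 2, 3, 4};
    uint16_t sums[4] = {0, 0, 0, 0};
    ScaleAddRowScalar(src, sums, 4);
    ScaleAddRowScalar(src, sums, 4);  /* accumulate a second "row" */
    return sums[3] == 8 ? 0 : 1;
  }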
-void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { - asm volatile ( - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm3 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" // src_ptr += 32 - "vpermq $0xd8,%%ymm3,%%ymm3 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpaddusw " MEMACCESS(1) ",%%ymm2,%%ymm0 \n" - "vpaddusw " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); +void ScaleAddRow_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + asm volatile( + + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm3 \n" + "lea 0x20(%0),%0 \n" // src_ptr += 32 + "vpermq $0xd8,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpaddusw (%1),%%ymm2,%%ymm0 \n" + "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SCALEADDROW_AVX2 // Constant for making pixels signed to avoid pmaddubsw // saturation. -static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; +static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // Constant for making pixels unsigned and adding .5 for rounding. -static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, - 0x4040, 0x4040, 0x4040, 0x4040}; +static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, + 0x4040, 0x4040, 0x4040, 0x4040}; // Bilinear column filtering. SSSE3 version. -void ScaleFilterCols_SSSE3(uint8* dst_ptr, - const uint8* src_ptr, +void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, int dst_width, int x, int dx) { intptr_t x0, x1, temp_pixel; - asm volatile ( - "movd %6,%%xmm2 \n" - "movd %7,%%xmm3 \n" - "movl $0x04040000,%k2 \n" - "movd %k2,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" // 0x007f007f - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $15,%%xmm7 \n" // 0x00010001 - - "pextrw $0x1,%%xmm2,%k3 \n" - "subl $0x2,%5 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - - LABELALIGN - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2 - "movd %k2,%%xmm4 \n" - "pshufb %%xmm5,%%xmm1 \n" - "punpcklwd %%xmm4,%%xmm0 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + 1 - "paddusb %%xmm7,%%xmm1 \n" - "pmaddubsw %%xmm0,%%xmm1 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "paddw %9,%%xmm1 \n" // make pixels unsigned. 
- "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,%k2 \n" - "mov %w2," MEMACCESS(0) " \n" - "lea " MEMLEA(0x2,0) ",%0 \n" - "subl $0x2,%5 \n" - "jge 2b \n" - - LABELALIGN - "29: \n" - "addl $0x1,%5 \n" - "jl 99f \n" - MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm2 \n" - "pshufb %%xmm5,%%xmm2 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm2 \n" - "paddusb %%xmm7,%%xmm2 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "paddw %9,%%xmm2 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movd %%xmm2,%k2 \n" - "mov %b2," MEMACCESS(0) " \n" - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "=&a"(temp_pixel), // %2 - "=&r"(x0), // %3 - "=&r"(x1), // %4 + asm volatile( + "movd %6,%%xmm2 \n" + "movd %7,%%xmm3 \n" + "movl $0x04040000,%k2 \n" + "movd %k2,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" // 0x007f007f + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $15,%%xmm7 \n" // 0x00010001 + + "pextrw $0x1,%%xmm2,%k3 \n" + "subl $0x2,%5 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movzwl 0x00(%1,%4,1),%k2 \n" + "movd %k2,%%xmm4 \n" + "pshufb %%xmm5,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm0 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + + // 1 + "paddusb %%xmm7,%%xmm1 \n" + "pmaddubsw %%xmm0,%%xmm1 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "paddw %9,%%xmm1 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,%k2 \n" + "mov %w2,(%0) \n" + "lea 0x2(%0),%0 \n" + "subl $0x2,%5 \n" + "jge 2b \n" + + LABELALIGN + "29: \n" + "addl $0x1,%5 \n" + "jl 99f \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm2 \n" + "pshufb %%xmm5,%%xmm2 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm2 \n" + "paddusb %%xmm7,%%xmm2 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "paddw %9,%%xmm2 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movd %%xmm2,%k2 \n" + "mov %b2,(%0) \n" + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "=&a"(temp_pixel), // %2 + "=&r"(x0), // %3 + "=&r"(x1), // %4 #if defined(__x86_64__) - "+rm"(dst_width) // %5 + "+rm"(dst_width) // %5 #else - "+m"(dst_width) // %5 + "+m"(dst_width) // %5 #endif - : "rm"(x), // %6 - "rm"(dx), // %7 + : "rm"(x), // %6 + "rm"(dx), // %7 #if defined(__x86_64__) - "x"(kFsub80), // %8 - "x"(kFadd40) // %9 + "x"(kFsub80), // %8 + "x"(kFadd40) // %9 #else - "m"(kFsub80), // %8 - "m"(kFadd40) // %9 + "m"(kFsub80), // %8 + "m"(kFadd40) // %9 #endif - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } // Reads 4 pixels, duplicates them and writes 8 pixels. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
-void ScaleColsUp2_SSE2(uint8* dst_ptr, - const uint8* src_ptr, +void ScaleColsUp2_SSE2(uint8_t* dst_ptr, + const uint8_t* src_ptr, int dst_width, int x, int dx) { (void)x; (void)dx; - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(0) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } -void ScaleARGBRowDown2_SSE2(const uint8* src_argb, +void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, + uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "shufps $0xdd,%%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "shufps $0xdd,%%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } -void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, +void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, + uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } -void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, +void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, + uint8_t* dst_argb, int dst_width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 - 
MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3" - ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } // Reads 4 pixels at a time. // Alignment requirement: dst_argb 16 byte aligned. -void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, +void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, + uint8_t* dst_argb, int dst_width) { intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; (void)src_stride; - asm volatile ( - "lea " MEMLEA3(0x00,1,4) ",%1 \n" - "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" - LABELALIGN - "1: \n" - "movd " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 - "punpckldq %%xmm1,%%xmm0 \n" - MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2 - MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3 - "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" - "punpckldq %%xmm3,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width), // %3 - "=&r"(src_stepx_x12) // %4 - :: "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3" - ); + asm volatile( + "lea 0x00(,%1,4),%1 \n" + "lea 0x00(%1,%1,2),%4 \n" + + LABELALIGN + "1: \n" + "movd (%0),%%xmm0 \n" + "movd 0x00(%0,%1,1),%%xmm1 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd 0x00(%0,%1,2),%%xmm2 \n" + "movd 0x00(%0,%4,1),%%xmm3 \n" + "lea 0x00(%0,%1,4),%0 \n" + "punpckldq %%xmm3,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width), // %3 + "=&r"(src_stepx_x12) // %4 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } // Blends four 2x2 to 4x1. // Alignment requirement: dst_argb 16 byte aligned. 
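The next kernel blends four 2x2 blocks into 4 output pixels. A scalar sketch of that box filter follows (hypothetical _Sketch name, not part of the patch); note the SSE2 body approximates the rounded 4-tap average with two rounds of pavgb.

#include <stddef.h>
#include <stdint.h>

// Rounded average of a 2x2 ARGB block sampled every src_stepx pixels.
static void ScaleARGBRowDownEvenBox_Sketch(const uint8_t* src_argb,
                                           ptrdiff_t src_stride,
                                           int src_stepx,
                                           uint8_t* dst_argb,
                                           int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    const uint8_t* p = src_argb + (ptrdiff_t)x * src_stepx * 4;  // top-left pixel
    for (c = 0; c < 4; ++c) {  // B, G, R, A
      dst_argb[x * 4 + c] = (uint8_t)((p[c] + p[c + 4] + p[src_stride + c] +
                                       p[src_stride + c + 4] + 2) >> 2);
    }
  }
}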
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, +void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, + uint8_t* dst_argb, int dst_width) { intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; intptr_t row1 = (intptr_t)(src_stride); - asm volatile ( - "lea " MEMLEA3(0x00,1,4) ",%1 \n" - "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" - "lea " MEMLEA4(0x00,0,5,1) ",%5 \n" - - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0 - MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1 - MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1 - "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" - "movq " MEMACCESS(5) ",%%xmm2 \n" - MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2 - MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3 - MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3 - "lea " MEMLEA4(0x00,5,1,4) ",%5 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+rm"(dst_width), // %3 - "=&r"(src_stepx_x12), // %4 - "+r"(row1) // %5 - :: "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3" - ); + asm volatile( + "lea 0x00(,%1,4),%1 \n" + "lea 0x00(%1,%1,2),%4 \n" + "lea 0x00(%0,%5,1),%5 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movhps 0x00(%0,%1,1),%%xmm0 \n" + "movq 0x00(%0,%1,2),%%xmm1 \n" + "movhps 0x00(%0,%4,1),%%xmm1 \n" + "lea 0x00(%0,%1,4),%0 \n" + "movq (%5),%%xmm2 \n" + "movhps 0x00(%5,%1,1),%%xmm2 \n" + "movq 0x00(%5,%1,2),%%xmm3 \n" + "movhps 0x00(%5,%4,1),%%xmm3 \n" + "lea 0x00(%5,%1,4),%5 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+rm"(dst_width), // %3 + "=&r"(src_stepx_x12), // %4 + "+r"(row1) // %5 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } -void ScaleARGBCols_SSE2(uint8* dst_argb, - const uint8* src_argb, +void ScaleARGBCols_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, int dst_width, int x, int dx) { intptr_t x0, x1; - asm volatile ( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "pshufd $0x11,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x5,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "cmp $0x0,%4 \n" - "jl 99f \n" - "sub $0x4,%4 \n" - "jl 49f \n" - - LABELALIGN - "40: \n" - MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 - MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 - "pextrw $0x5,%%xmm2,%k0 \n" - "pextrw $0x7,%%xmm2,%k1 \n" - "paddd %%xmm3,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" - MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1 - MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4 - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "punpckldq %%xmm4,%%xmm1 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " 
MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" - - "49: \n" - "test $0x2,%4 \n" - "je 29f \n" - MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 - MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 - "pextrw $0x5,%%xmm2,%k0 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x8,2) ",%2 \n" - "29: \n" - "test $0x1,%4 \n" - "je 99f \n" - MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 - "movd %%xmm0," MEMACCESS(2) " \n" - "99: \n" - : "=&a"(x0), // %0 - "=&d"(x1), // %1 - "+r"(dst_argb), // %2 - "+r"(src_argb), // %3 - "+r"(dst_width) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" - ); + asm volatile( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "pshufd $0x11,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x5,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "cmp $0x0,%4 \n" + "jl 99f \n" + "sub $0x4,%4 \n" + "jl 49f \n" + + LABELALIGN + "40: \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd 0x00(%3,%1,4),%%xmm1 \n" + "pextrw $0x5,%%xmm2,%k0 \n" + "pextrw $0x7,%%xmm2,%k1 \n" + "paddd %%xmm3,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd 0x00(%3,%0,4),%%xmm1 \n" + "movd 0x00(%3,%1,4),%%xmm4 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" + + "49: \n" + "test $0x2,%4 \n" + "je 29f \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd 0x00(%3,%1,4),%%xmm1 \n" + "pextrw $0x5,%%xmm2,%k0 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%2) \n" + "lea 0x8(%2),%2 \n" + "29: \n" + "test $0x1,%4 \n" + "je 99f \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "99: \n" + : "=&a"(x0), // %0 + "=&d"(x1), // %1 + "+r"(dst_argb), // %2 + "+r"(src_argb), // %3 + "+r"(dst_width) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } // Reads 4 pixels, duplicates them and writes 8 pixels. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, - const uint8* src_argb, +void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, int dst_width, int x, int dx) { (void)x; (void)dx; - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpckldq %%xmm0,%%xmm0 \n" - "punpckhdq %%xmm1,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(0) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", NACL_R14 - "xmm0", "xmm1" - ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpckldq %%xmm0,%%xmm0 \n" + "punpckhdq %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } // Shuffle table for arranging 2 pixels into pairs for pmaddubsw -static uvec8 kShuffleColARGB = { +static const uvec8 kShuffleColARGB = { 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel }; // Shuffle table for duplicating 2 fractions into 8 bytes each -static uvec8 kShuffleFractions = { +static const uvec8 kShuffleFractions = { 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, }; // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version -void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, - const uint8* src_argb, +void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, + const uint8_t* src_argb, int dst_width, int x, int dx) { @@ -1273,69 +1272,67 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, : : "m"(kShuffleColARGB), // %0 "m"(kShuffleFractions) // %1 - ); - - asm volatile ( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "sub $0x2,%2 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - - LABELALIGN - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 - "psrlw $0x9,%%xmm1 \n" - MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0 - "pshufb %%xmm5,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(0) " \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "sub $0x2,%2 \n" - "jge 2b \n" - - LABELALIGN - "29: \n" - "add $0x1,%2 \n" - "jl 99f \n" - "psrlw $0x9,%%xmm2 \n" - MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 - "pshufb %%xmm5,%%xmm2 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm2 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0," MEMACCESS(0) " \n" - - LABELALIGN - "99: \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+rm"(dst_width), // %2 - "=&r"(x0), // %3 - "=&r"(x1) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" ); + + asm volatile( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 
\n" + "pextrw $0x1,%%xmm2,%k3 \n" + "sub $0x2,%2 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movq 0x00(%1,%3,4),%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movhps 0x00(%1,%4,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%0) \n" + "lea 0x8(%0),%0 \n" + "sub $0x2,%2 \n" + "jge 2b \n" + + LABELALIGN + "29: \n" + "add $0x1,%2 \n" + "jl 99f \n" + "psrlw $0x9,%%xmm2 \n" + "movq 0x00(%1,%3,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm2 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%0) \n" + + LABELALIGN "99: \n" // clang-format error. + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+rm"(dst_width), // %2 + "=&r"(x0), // %3 + "=&r"(x1) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } // Divide num by div and return as 16.16 fixed point result. diff --git a/files/source/scale_mmi.cc b/files/source/scale_mmi.cc new file mode 100644 index 00000000..990463c2 --- /dev/null +++ b/files/source/scale_mmi.cc @@ -0,0 +1,1113 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" + +#include <assert.h> +#include <string.h> + +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" // For CopyARGB +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for Mips MMI. 
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) + +// clang-format off + +// CPU agnostic row functions +void ScaleRowDown2_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + + uint64_t src0, src1, dest; + const uint64_t shift = 0x8ULL; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" + "psrlh %[src0], %[src0], %[shift] \n\t" + + "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" + "psrlh %[src1], %[src1], %[shift] \n\t" + + "packushb %[dest], %[src0], %[src1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) + : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), + [shift] "f"(shift) + : "memory"); +} + +void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + + uint64_t src0, src1; + uint64_t dest, dest0, dest1; + + const uint64_t mask = 0x00ff00ff00ff00ffULL; + const uint64_t shift = 0x8ULL; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" + "and %[dest0], %[src0], %[mask] \n\t" + "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" + "and %[dest1], %[src1], %[mask] \n\t" + "packushb %[dest0], %[dest0], %[dest1] \n\t" + + "psrlh %[src0], %[src0], %[shift] \n\t" + "psrlh %[src1], %[src1], %[shift] \n\t" + "packushb %[dest1], %[src0], %[src1] \n\t" + + "pavgb %[dest], %[dest0], %[dest1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest0] "=&f"(dest0), + [dest1] "=&f"(dest1), [dest] "=&f"(dest) + : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [mask] "f"(mask), + [shift] "f"(shift), [width] "r"(dst_width) + : "memory"); +} + +void ScaleRowDown2Box_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + + uint64_t s0, s1, t0, t1; + uint64_t dest, dest0, dest1; + + const uint64_t ph = 0x0002000200020002ULL; + const uint64_t mask = 0x00ff00ff00ff00ffULL; + const uint64_t shift0 = 0x2ULL; + const uint64_t shift1 = 0x8ULL; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[s0], 0x00(%[s]) \n\t" + "gsldlc1 %[s0], 0x07(%[s]) \n\t" + "psrlh %[s1], %[s0], %[shift1] \n\t" + "and %[s0], %[s0], %[mask] \n\t" + + "gsldrc1 %[t0], 0x00(%[t]) \n\t" + "gsldlc1 %[t0], 0x07(%[t]) \n\t" + "psrlh %[t1], %[t0], %[shift1] \n\t" + "and %[t0], %[t0], %[mask] \n\t" + + "paddh %[dest0], %[s0], %[s1] \n\t" + "paddh %[dest0], %[dest0], %[t0] \n\t" + "paddh %[dest0], %[dest0], %[t1] \n\t" + "paddh %[dest0], %[dest0], %[ph] \n\t" + "psrlh %[dest0], %[dest0], %[shift0] \n\t" + + "gsldrc1 %[s0], 0x08(%[s]) \n\t" + "gsldlc1 %[s0], 0x0f(%[s]) \n\t" + "psrlh %[s1], %[s0], %[shift1] \n\t" + "and %[s0], %[s0], %[mask] \n\t" + + "gsldrc1 %[t0], 0x08(%[t]) \n\t" + "gsldlc1 %[t0], 0x0f(%[t]) \n\t" + "psrlh %[t1], %[t0], %[shift1] \n\t" + "and %[t0], 
%[t0], %[mask] \n\t" + + "paddh %[dest1], %[s0], %[s1] \n\t" + "paddh %[dest1], %[dest1], %[t0] \n\t" + "paddh %[dest1], %[dest1], %[t1] \n\t" + "paddh %[dest1], %[dest1], %[ph] \n\t" + "psrlh %[dest1], %[dest1], %[shift0] \n\t" + + "packushb %[dest], %[dest0], %[dest1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[s], %[s], 0x10 \n\t" + "daddiu %[t], %[t], 0x10 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1), + [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest] "=&f"(dest) + : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width), + [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph), + [mask] "f"(mask) + : "memory"); +} + +void ScaleARGBRowDown2_MMI(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + + uint64_t src0, src1, dest; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" + "punpckhwd %[dest], %[src0], %[src1] \n\t" + + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) + : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width) + : "memory"); +} + +void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + + uint64_t src0, src1; + uint64_t dest, dest_hi, dest_lo; + + __asm__ volatile( + "1: \n\t" + "lwc1 %[src0], 0x00(%[src_ptr]) \n\t" + "lwc1 %[src1], 0x08(%[src_ptr]) \n\t" + "punpcklwd %[dest_lo], %[src0], %[src1] \n\t" + "lwc1 %[src0], 0x04(%[src_ptr]) \n\t" + "lwc1 %[src1], 0x0c(%[src_ptr]) \n\t" + "punpcklwd %[dest_hi], %[src0], %[src1] \n\t" + + "pavgb %[dest], %[dest_lo], %[dest_hi] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), + [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) + : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width) + : "memory"); +} + +void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + const uint8_t* s = src_argb; + const uint8_t* t = src_argb + src_stride; + + uint64_t s0, s_hi, s_lo; + uint64_t t0, t_hi, t_lo; + uint64_t dest, dest_hi, dest_lo; + + const uint64_t mask = 0x0ULL; + const uint64_t ph = 0x0002000200020002ULL; + const uint64_t shfit = 0x2ULL; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[s0], 0x00(%[s]) \n\t" + "gsldlc1 %[s0], 0x07(%[s]) \n\t" + "punpcklbh %[s_lo], %[s0], %[mask] \n\t" + "punpckhbh %[s_hi], %[s0], %[mask] \n\t" + "paddh %[dest_lo], %[s_lo], %[s_hi] \n\t" + + "gsldrc1 %[t0], 0x00(%[t]) \n\t" + "gsldlc1 %[t0], 0x07(%[t]) \n\t" + "punpcklbh %[t_lo], %[t0], %[mask] \n\t" + "punpckhbh %[t_hi], %[t0], %[mask] \n\t" + 
"paddh %[dest_lo], %[dest_lo], %[t_lo] \n\t" + "paddh %[dest_lo], %[dest_lo], %[t_hi] \n\t" + + "paddh %[dest_lo], %[dest_lo], %[ph] \n\t" + "psrlh %[dest_lo], %[dest_lo], %[shfit] \n\t" + + "gsldrc1 %[s0], 0x08(%[s]) \n\t" + "gsldlc1 %[s0], 0x0f(%[s]) \n\t" + "punpcklbh %[s_lo], %[s0], %[mask] \n\t" + "punpckhbh %[s_hi], %[s0], %[mask] \n\t" + "paddh %[dest_hi], %[s_lo], %[s_hi] \n\t" + + "gsldrc1 %[t0], 0x08(%[t]) \n\t" + "gsldlc1 %[t0], 0x0f(%[t]) \n\t" + "punpcklbh %[t_lo], %[t0], %[mask] \n\t" + "punpckhbh %[t_hi], %[t0], %[mask] \n\t" + "paddh %[dest_hi], %[dest_hi], %[t_lo] \n\t" + "paddh %[dest_hi], %[dest_hi], %[t_hi] \n\t" + + "paddh %[dest_hi], %[dest_hi], %[ph] \n\t" + "psrlh %[dest_hi], %[dest_hi], %[shfit] \n\t" + + "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[s], %[s], 0x10 \n\t" + "daddiu %[t], %[t], 0x10 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [s0] "=&f"(s0), [t0] "=&f"(t0), [dest_hi] "=&f"(dest_hi), + [dest_lo] "=&f"(dest_lo), [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), + [t_hi] "=&f"(t_hi), [t_lo] "=&f"(t_lo), [dest] "=&f"(dest) + : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width), + [mask] "f"(mask), [ph] "f"(ph), [shfit] "f"(shfit) + : "memory"); +} + +void ScaleRowDown2_16_MMI(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + (void)src_stride; + + uint64_t src0, src1, dest; + const uint64_t shift = 0x10ULL; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" + "psrlw %[src0], %[src0], %[shift] \n\t" + + "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" + "psrlw %[src1], %[src1], %[shift] \n\t" + + "packsswh %[dest], %[src0], %[src1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) + : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), + [shift] "f"(shift) + : "memory"); +} + +void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + (void)src_stride; + + uint64_t src0, src1; + uint64_t dest, dest_hi, dest_lo; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" + "punpcklhw %[dest_lo], %[src0], %[src1] \n\t" + "punpckhhw %[dest_hi], %[src0], %[src1] \n\t" + + "punpcklhw %[src0], %[dest_lo], %[dest_hi] \n\t" + "punpckhhw %[src1], %[dest_lo], %[dest_hi] \n\t" + + "pavgh %[dest], %[src0], %[src1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), + [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) + : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width) + : "memory"); +} + +void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + 
int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + + uint64_t s0, s1, s_hi, s_lo; + uint64_t t0, t1, t_hi, t_lo; + uint64_t dest, dest0, dest1; + + const uint64_t ph = 0x0000000200000002ULL; + const uint64_t mask = 0x0000ffff0000ffffULL; + const uint64_t shift0 = 0x10ULL; + const uint64_t shift1 = 0x2ULL; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[s0], 0x00(%[s]) \n\t" + "gsldlc1 %[s0], 0x07(%[s]) \n\t" + "psrlw %[s1], %[s0], %[shift0] \n\t" + "and %[s0], %[s0], %[mask] \n\t" + + "gsldrc1 %[t0], 0x00(%[t]) \n\t" + "gsldlc1 %[t0], 0x07(%[t]) \n\t" + "psrlw %[t1], %[t0], %[shift0] \n\t" + "and %[t0], %[t0], %[mask] \n\t" + + "paddw %[dest0], %[s0], %[s1] \n\t" + "paddw %[dest0], %[dest0], %[t0] \n\t" + "paddw %[dest0], %[dest0], %[t1] \n\t" + "paddw %[dest0], %[dest0], %[ph] \n\t" + "psrlw %[dest0], %[dest0], %[shift1] \n\t" + + "gsldrc1 %[s0], 0x08(%[s]) \n\t" + "gsldlc1 %[s0], 0x0f(%[s]) \n\t" + "psrlw %[s1], %[s0], %[shift0] \n\t" + "and %[s0], %[s0], %[mask] \n\t" + + "gsldrc1 %[t0], 0x08(%[t]) \n\t" + "gsldlc1 %[t0], 0x0f(%[t]) \n\t" + "psrlw %[t1], %[t0], %[shift0] \n\t" + "and %[t0], %[t0], %[mask] \n\t" + + "paddw %[dest1], %[s0], %[s1] \n\t" + "paddw %[dest1], %[dest1], %[t0] \n\t" + "paddw %[dest1], %[dest1], %[t1] \n\t" + "paddw %[dest1], %[dest1], %[ph] \n\t" + "psrlw %[dest1], %[dest1], %[shift1] \n\t" + + "packsswh %[dest], %[dest0], %[dest1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[s], %[s], 0x10 \n\t" + "daddiu %[t], %[t], 0x10 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1), + [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), [t_hi] "=&f"(t_hi), + [t_lo] "=&f"(t_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), + [dest] "=&f"(dest) + : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width), + [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph), + [mask] "f"(mask) + : "memory"); +} + +void ScaleRowDown4_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + + uint64_t src0, src1; + uint64_t dest, dest_hi, dest_lo; + + const uint64_t shift = 0x10ULL; + const uint64_t mask = 0x000000ff000000ffULL; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" + "psrlw %[src0], %[src0], %[shift] \n\t" + "and %[src0], %[src0], %[mask] \n\t" + "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" + "psrlw %[src1], %[src1], %[shift] \n\t" + "and %[src1], %[src1], %[mask] \n\t" + "packsswh %[dest_lo], %[src0], %[src1] \n\t" + + "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t" + "psrlw %[src0], %[src0], %[shift] \n\t" + "and %[src0], %[src0], %[mask] \n\t" + "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t" + "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t" + "psrlw %[src1], %[src1], %[shift] \n\t" + "and %[src1], %[src1], %[mask] \n\t" + "packsswh %[dest_hi], %[src0], %[src1] \n\t" + + "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), + [dest_lo] "=&f"(dest_lo), 
[dest] "=&f"(dest) + : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), + [shift] "f"(shift), [mask] "f"(mask) + : "memory"); +} + +void ScaleRowDown4_16_MMI(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + (void)src_stride; + + uint64_t src0, src1; + uint64_t dest, dest_hi, dest_lo; + + const uint64_t mask = 0x0ULL; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" + "punpckhhw %[dest_lo], %[src0], %[src1] \n\t" + "punpcklhw %[dest_lo], %[dest_lo], %[mask] \n\t" + + "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t" + "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t" + "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t" + "punpckhhw %[dest_hi], %[src0], %[src1] \n\t" + "punpcklhw %[dest_hi], %[dest_hi], %[mask] \n\t" + + "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), + [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) + : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), + [mask] "f"(mask) + : "memory"); +} + +#define DO_SCALEROWDOWN4BOX_PUNPCKADD() \ + "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \ + "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \ + "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \ + "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" + +#define DO_SCALEROWDOWN4BOX_LOOP(reg) \ + "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \ + "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \ + "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \ + \ + "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \ + DO_SCALEROWDOWN4BOX_PUNPCKADD() \ + \ + "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \ + DO_SCALEROWDOWN4BOX_PUNPCKADD() \ + \ + "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \ + DO_SCALEROWDOWN4BOX_PUNPCKADD() \ + \ + "pmaddhw %[dest_lo], %[dest_lo], %[mask1] \n\t" \ + "pmaddhw %[dest_hi], %[dest_hi], %[mask1] \n\t" \ + "packsswh " #reg ", %[dest_lo], %[dest_hi] \n\t" \ + "pmaddhw " #reg ", " #reg ", %[mask1] \n\t" \ + "paddh " #reg ", " #reg ", %[ph] \n\t" \ + "psrlh " #reg ", " #reg ", %[shift] \n\t" \ + \ + "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \ + "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \ + "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \ + "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t" + +/* LibYUVScaleTest.ScaleDownBy4_Box */ +void ScaleRowDown4Box_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + const uint8_t* src0_ptr = src_ptr; + const uint8_t* src1_ptr = src_ptr + src_stride; + const uint8_t* src2_ptr = src_ptr + src_stride * 2; + const uint8_t* src3_ptr = src_ptr + src_stride * 3; + + uint64_t src, src_hi, src_lo; + uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3; + + const uint64_t mask0 = 0x0ULL; + const uint64_t mask1 = 0x0001000100010001ULL; + const uint64_t ph = 0x0008000800080008ULL; + const uint64_t shift = 0x4ULL; + + __asm__ volatile( + "1: \n\t" + + DO_SCALEROWDOWN4BOX_LOOP(%[dest0]) + DO_SCALEROWDOWN4BOX_LOOP(%[dest1]) + DO_SCALEROWDOWN4BOX_LOOP(%[dest2]) + DO_SCALEROWDOWN4BOX_LOOP(%[dest3]) + + "packsswh %[dest_lo], %[dest0], %[dest1] \n\t" + "packsswh %[dest_hi], %[dest2], %[dest3] \n\t" + + "packushb 
%[dest], %[dest_lo], %[dest_hi] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), + [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), + [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), + [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest) + : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), + [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst), + [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0), + [ph] "f"(ph), [mask1] "f"(mask1) + : "memory"); +} + +#define DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ + "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \ + "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \ + "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \ + "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" + +#define DO_SCALEROWDOWN4BOX_16_LOOP(reg) \ + "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \ + "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \ + "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \ + \ + "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \ + DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ + \ + "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \ + DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ + \ + "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \ + DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ + \ + "paddw %[dest], %[dest_lo], %[dest_hi] \n\t" \ + "punpckhwd %[dest_hi], %[dest], %[dest] \n\t" \ + "paddw %[dest], %[dest_hi], %[dest] \n\t" \ + "paddw %[dest], %[dest], %[ph] \n\t" \ + "psraw %[dest], %[dest], %[shift] \n\t" \ + "and " #reg ", %[dest], %[mask1] \n\t" \ + \ + "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \ + "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \ + "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \ + "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t" + +/* LibYUVScaleTest.ScaleDownBy4_Box_16 */ +void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* src0_ptr = src_ptr; + const uint16_t* src1_ptr = src_ptr + src_stride; + const uint16_t* src2_ptr = src_ptr + src_stride * 2; + const uint16_t* src3_ptr = src_ptr + src_stride * 3; + + uint64_t src, src_hi, src_lo; + uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3; + + const uint64_t mask0 = 0x0ULL; + const uint64_t mask1 = 0x00000000ffffffffULL; + const uint64_t ph = 0x0000000800000008ULL; + const uint64_t shift = 0x04ULL; + + __asm__ volatile( + "1: \n\t" + + DO_SCALEROWDOWN4BOX_16_LOOP(%[dest0]) + DO_SCALEROWDOWN4BOX_16_LOOP(%[dest1]) + DO_SCALEROWDOWN4BOX_16_LOOP(%[dest2]) + DO_SCALEROWDOWN4BOX_16_LOOP(%[dest3]) + "punpcklwd %[dest_lo], %[dest0], %[dest1] \n\t" + "punpcklwd %[dest_hi], %[dest2], %[dest3] \n\t" + + "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), + [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), + [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), + [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest) + : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), + [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst), + [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0), + [ph] "f"(ph), [mask1] "f"(mask1) + : "memory"); +} + +// Scales a 
single row of pixels up by 2x using point sampling. +void ScaleColsUp2_MMI(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + uint64_t src, dest; + + (void)x; + (void)dx; + + __asm__ volatile( + "1: \n\t" + "lwc1 %[src], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[dest], %[src], %[src] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src), [dest] "=&f"(dest) + : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width) + : "memory"); +} + +void ScaleColsUp2_16_MMI(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx) { + uint64_t src, dest; + + (void)x; + (void)dx; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" + + "punpcklhw %[dest], %[src], %[src] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "punpckhhw %[dest], %[src], %[src] \n\t" + "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src] "=&f"(src), [dest] "=&f"(dest) + : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width) + : "memory"); +} + +void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { + uint64_t src, src_hi, src_lo, dest0, dest1; + const uint64_t mask = 0x0ULL; + + __asm__ volatile( + "1: \n\t" + "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" + "punpcklbh %[src_lo], %[src], %[mask] \n\t" + "punpckhbh %[src_hi], %[src], %[mask] \n\t" + + "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" + "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" + "paddush %[dest0], %[dest0], %[src_lo] \n\t" + "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" + "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" + "paddush %[dest1], %[dest1], %[src_hi] \n\t" + + "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" + "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi), + [src_lo] "=&f"(src_lo), [src] "=&f"(src) + : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width), + [mask] "f"(mask) + : "memory"); +} + +void ScaleAddRow_16_MMI(const uint16_t* src_ptr, + uint32_t* dst_ptr, + int src_width) { + uint64_t src, src_hi, src_lo, dest0, dest1; + const uint64_t mask = 0x0ULL; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" + "punpcklhw %[src_lo], %[src], %[mask] \n\t" + "punpckhhw %[src_hi], %[src], %[mask] \n\t" + + "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" + "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" + "paddw %[dest0], %[dest0], %[src_lo] \n\t" + "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" + + "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" + "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" + "paddw %[dest1], %[dest1], %[src_hi] \n\t" + "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[dest1], 
0x08(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi), + [src_lo] "=&f"(src_lo), [src] "=&f"(src) + : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width), + [mask] "f"(mask) + : "memory"); +} + +void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + + uint64_t src0, src1, dest; + + __asm__ volatile( + "1: \n\t" + "lwc1 %[src0], 0x00(%[src_ptr]) \n\t" + "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t" + "lwc1 %[src1], 0x00(%[src_ptr]) \n\t" + "punpcklwd %[dest], %[src0], %[src1] \n\t" + + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) + : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), + [src_stepx_4] "r"(src_stepx << 2), [width] "r"(dst_width) + : "memory"); +} + +void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + const uint8_t* src0_ptr = src_argb; + const uint8_t* src1_ptr = src_argb + src_stride; + + uint64_t src0, src1, src_hi, src_lo; + uint64_t dest, dest_hi, dest_lo, dest0, dest1; + + const uint64_t mask = 0x0ULL; + const uint64_t ph = 0x0002000200020002ULL; + const uint64_t shift = 0x2ULL; + + __asm__ volatile( + "1: \n\t" + + "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t" + "punpcklbh %[dest_lo], %[src0], %[mask] \n\t" + "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t" + "punpcklbh %[dest_hi], %[src0], %[mask] \n\t" + + "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t" + "punpcklbh %[src_lo], %[src1], %[mask] \n\t" + "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t" + "punpcklbh %[src_hi], %[src1], %[mask] \n\t" + "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" + "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" + "paddh %[dest0], %[dest_hi], %[dest_lo] \n\t" + "paddh %[dest0], %[dest0], %[ph] \n\t" + "psrlh %[dest0], %[dest0], %[shift] \n\t" + + "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t" + "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t" + + "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t" + "punpcklbh %[dest_lo], %[src0], %[mask] \n\t" + "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t" + "punpcklbh %[dest_hi], %[src0], %[mask] \n\t" + + "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t" + "punpcklbh %[src_lo], %[src1], %[mask] \n\t" + "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t" + "punpcklbh %[src_hi], %[src1], %[mask] \n\t" + "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" + "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" + "paddh %[dest1], %[dest_hi], %[dest_lo] \n\t" + "paddh %[dest1], %[dest1], %[ph] \n\t" + "psrlh %[dest1], %[dest1], %[shift] \n\t" + + "packushb %[dest], %[dest0], %[dest1] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t" + "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" + "daddi %[width], %[width], -0x02 \n\t" + "bnez %[width], 1b \n\t" + : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), + [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), + [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0), + 
[src1] "=&f"(src1), [dest] "=&f"(dest) + : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), + [dst_ptr] "r"(dst_argb), [width] "r"(dst_width), + [src_stepx_4] "r"(src_stepx << 2), [shift] "f"(shift), [mask] "f"(mask), + [ph] "f"(ph) + : "memory"); +} + +// Scales a single row of pixels using point sampling. +void ScaleARGBCols_MMI(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + + const uint32_t* src_tmp; + + uint64_t dest, offset; + + const uint64_t shift0 = 16; + const uint64_t shift1 = 2; + + __asm__ volatile( + "1: \n\t" + "srav %[offset], %[x], %[shift0] \n\t" + "sllv %[offset], %[offset], %[shift1] \n\t" + "dadd %[src_tmp], %[src_ptr], %[offset] \n\t" + "lwc1 %[dest], 0x00(%[src_tmp]) \n\t" + "swc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + "dadd %[x], %[x], %[dx] \n\t" + + "daddiu %[dst_ptr], %[dst_ptr], 0x04 \n\t" + "daddi %[width], %[width], -0x01 \n\t" + "bnez %[width], 1b \n\t" + : [dest] "=&f"(dest), [offset] "=&r"(offset), [src_tmp] "=&r"(src_tmp) + : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width), + [dx] "r"(dx), [x] "r"(x), [shift0] "r"(shift0), [shift1] "r"(shift1) + : "memory"); +} + +// Scales a single row of pixels up by 2x using point sampling. +void ScaleARGBColsUp2_MMI(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + uint64_t src, dest0, dest1; + (void)x; + (void)dx; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" + "punpcklwd %[dest0], %[src], %[src] \n\t" + "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" + "punpckhwd %[dest1], %[src], %[src] \n\t" + "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src] "=&f"(src) + : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width) + : "memory"); +} + +// Divide num by div and return as 16.16 fixed point result. +/* LibYUVBaseTest.TestFixedDiv */ +int FixedDiv_MIPS(int num, int div) { + int quotient = 0; + const int shift = 16; + + asm( + "dsll %[num], %[num], %[shift] \n\t" + "ddiv %[num], %[div] \t\n" + "mflo %[quo] \t\n" + : [quo] "+&r"(quotient) + : [num] "r"(num), [div] "r"(div), [shift] "r"(shift)); + + return quotient; +} + +// Divide num by div and return as 16.16 fixed point result. +/* LibYUVScaleTest.ARGBScaleTo320x240_Linear */ +int FixedDiv1_MIPS(int num, int div) { + int quotient = 0; + const int shift = 16; + const int val1 = 1; + const int64_t val11 = 0x00010001ULL; + + asm( + "dsll %[num], %[num], %[shift] \n\t" + "dsub %[num], %[num], %[val11] \n\t" + "dsub %[div], %[div], %[val1] \n\t" + "ddiv %[num], %[div] \t\n" + "mflo %[quo] \t\n" + : [quo] "+&r"(quotient) + : [num] "r"(num), [div] "r"(div), [val1] "r"(val1), [val11] "r"(val11), + [shift] "r"(shift)); + + return quotient; +} + +// Read 8x2 upsample with filtering and write 16x1. +// actually reads an extra pixel, so 9x2. 
+void ScaleRowUp2_16_MMI(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* src2_ptr = src_ptr + src_stride; + + uint64_t src0, src1; + uint64_t dest, dest04, dest15, dest26, dest37; + uint64_t tmp0, tmp1, tmp2, tmp3; + + const uint64_t mask0 = 0x0003000900030009ULL; + const uint64_t mask1 = 0x0001000300010003ULL; + const uint64_t mask2 = 0x0009000300090003ULL; + const uint64_t mask3 = 0x0003000100030001ULL; + const uint64_t ph = 0x0000000800000008ULL; + const uint64_t shift = 4; + + __asm__ volatile( + "1: \n\t" + "gsldrc1 %[src0], 0x00(%[src1_ptr]) \n\t" + "gsldlc1 %[src0], 0x07(%[src1_ptr]) \n\t" + "pmaddhw %[dest04], %[src0], %[mask0] \n\t" + "gsldrc1 %[src1], 0x00(%[src2_ptr]) \n\t" + "gsldlc1 %[src1], 0x07(%[src2_ptr]) \n\t" + "pmaddhw %[dest], %[src1], %[mask1] \n\t" + "paddw %[dest04], %[dest04], %[dest] \n\t" + "paddw %[dest04], %[dest04], %[ph] \n\t" + "psrlw %[dest04], %[dest04], %[shift] \n\t" + + "pmaddhw %[dest15], %[src0], %[mask2] \n\t" + "pmaddhw %[dest], %[src1], %[mask3] \n\t" + "paddw %[dest15], %[dest15], %[dest] \n\t" + "paddw %[dest15], %[dest15], %[ph] \n\t" + "psrlw %[dest15], %[dest15], %[shift] \n\t" + + "gsldrc1 %[src0], 0x02(%[src1_ptr]) \n\t" + "gsldlc1 %[src0], 0x09(%[src1_ptr]) \n\t" + "pmaddhw %[dest26], %[src0], %[mask0] \n\t" + "gsldrc1 %[src1], 0x02(%[src2_ptr]) \n\t" + "gsldlc1 %[src1], 0x09(%[src2_ptr]) \n\t" + "pmaddhw %[dest], %[src1], %[mask1] \n\t" + "paddw %[dest26], %[dest26], %[dest] \n\t" + "paddw %[dest26], %[dest26], %[ph] \n\t" + "psrlw %[dest26], %[dest26], %[shift] \n\t" + + "pmaddhw %[dest37], %[src0], %[mask2] \n\t" + "pmaddhw %[dest], %[src1], %[mask3] \n\t" + "paddw %[dest37], %[dest37], %[dest] \n\t" + "paddw %[dest37], %[dest37], %[ph] \n\t" + "psrlw %[dest37], %[dest37], %[shift] \n\t" + + /* tmp0 = ( 00 04 02 06 ) */ + "packsswh %[tmp0], %[dest04], %[dest26] \n\t" + /* tmp1 = ( 01 05 03 07 ) */ + "packsswh %[tmp1], %[dest15], %[dest37] \n\t" + + /* tmp2 = ( 00 01 04 05 )*/ + "punpcklhw %[tmp2], %[tmp0], %[tmp1] \n\t" + /* tmp3 = ( 02 03 06 07 )*/ + "punpckhhw %[tmp3], %[tmp0], %[tmp1] \n\t" + + /* ( 00 01 02 03 ) */ + "punpcklwd %[dest], %[tmp2], %[tmp3] \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + + /* ( 04 05 06 07 ) */ + "punpckhwd %[dest], %[tmp2], %[tmp3] \n\t" + "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" + + "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" + "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x08 \n\t" + "bnez %[width], 1b \n\t" + : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest04] "=&f"(dest04), + [dest15] "=&f"(dest15), [dest26] "=&f"(dest26), [dest37] "=&f"(dest37), + [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), + [tmp3] "=&f"(tmp3), [dest] "=&f"(dest) + : [src1_ptr] "r"(src_ptr), [src2_ptr] "r"(src2_ptr), [dst_ptr] "r"(dst), + [width] "r"(dst_width), [mask0] "f"(mask0), [mask1] "f"(mask1), + [mask2] "f"(mask2), [mask3] "f"(mask3), [shift] "f"(shift), [ph] "f"(ph) + : "memory"); +} + +// clang-format on + +#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/files/source/scale_msa.cc b/files/source/scale_msa.cc index bfcd10fc..482a521f 100644 --- a/files/source/scale_msa.cc +++ b/files/source/scale_msa.cc @@ -21,6 +21,14 @@ namespace libyuv { extern "C" { #endif +#define 
LOAD_INDEXED_DATA(srcp, indx0, out0) \ + { \ + out0[0] = srcp[indx0[0]]; \ + out0[1] = srcp[indx0[1]]; \ + out0[2] = srcp[indx0[2]]; \ + out0[3] = srcp[indx0[3]]; \ + } + void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, @@ -119,13 +127,13 @@ void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, } } -void ScaleARGBRowDownEvenBox_MSA(const uint8* src_argb, +void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, + uint8_t* dst_argb, int dst_width) { int x; - const uint8* nxt_argb = src_argb + src_stride; + const uint8_t* nxt_argb = src_argb + src_stride; int32_t stepx = src_stepx * 4; int64_t data0, data1, data2, data3; v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0}; @@ -545,6 +553,394 @@ void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { } } +void ScaleFilterCols_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + v4i32 vec_x = __msa_fill_w(x); + v4i32 vec_dx = __msa_fill_w(dx); + v4i32 vec_const = {0, 1, 2, 3}; + v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8u16 reg0, reg1; + v16u8 dst0; + v4i32 const_0xFFFF = __msa_fill_w(0xFFFF); + v4i32 const_0x40 = __msa_fill_w(0x40); + + vec0 = vec_dx * vec_const; + vec1 = vec_dx * 4; + vec_x += vec0; + + for (j = 0; j < dst_width - 1; j += 16) { + vec2 = vec_x >> 16; + vec6 = vec_x & const_0xFFFF; + vec_x += vec1; + vec3 = vec_x >> 16; + vec7 = vec_x & const_0xFFFF; + vec_x += vec1; + vec4 = vec_x >> 16; + vec8 = vec_x & const_0xFFFF; + vec_x += vec1; + vec5 = vec_x >> 16; + vec9 = vec_x & const_0xFFFF; + vec_x += vec1; + vec6 >>= 9; + vec7 >>= 9; + vec8 >>= 9; + vec9 >>= 9; + LOAD_INDEXED_DATA(src_ptr, vec2, tmp0); + LOAD_INDEXED_DATA(src_ptr, vec3, tmp1); + LOAD_INDEXED_DATA(src_ptr, vec4, tmp2); + LOAD_INDEXED_DATA(src_ptr, vec5, tmp3); + vec2 += 1; + vec3 += 1; + vec4 += 1; + vec5 += 1; + LOAD_INDEXED_DATA(src_ptr, vec2, tmp4); + LOAD_INDEXED_DATA(src_ptr, vec3, tmp5); + LOAD_INDEXED_DATA(src_ptr, vec4, tmp6); + LOAD_INDEXED_DATA(src_ptr, vec5, tmp7); + tmp4 -= tmp0; + tmp5 -= tmp1; + tmp6 -= tmp2; + tmp7 -= tmp3; + tmp4 *= vec6; + tmp5 *= vec7; + tmp6 *= vec8; + tmp7 *= vec9; + tmp4 += const_0x40; + tmp5 += const_0x40; + tmp6 += const_0x40; + tmp7 += const_0x40; + tmp4 >>= 7; + tmp5 >>= 7; + tmp6 >>= 7; + tmp7 >>= 7; + tmp0 += tmp4; + tmp1 += tmp5; + tmp2 += tmp6; + tmp3 += tmp7; + reg0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + reg1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + __msa_st_b(dst0, dst_ptr, 0); + dst_ptr += 16; + } +} + +void ScaleARGBCols_MSA(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int j; + v4i32 x_vec = __msa_fill_w(x); + v4i32 dx_vec = __msa_fill_w(dx); + v4i32 const_vec = {0, 1, 2, 3}; + v4i32 vec0, vec1, vec2; + v4i32 dst0; + + vec0 = dx_vec * const_vec; + vec1 = dx_vec * 4; + x_vec += vec0; + + for (j = 0; j < dst_width; j += 4) { + vec2 = x_vec >> 16; + x_vec += vec1; + LOAD_INDEXED_DATA(src, vec2, dst0); + __msa_st_w(dst0, dst, 0); + dst += 4; + } +} + +void ScaleARGBFilterCols_MSA(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + int j; + v4u32 src0, src1, src2, src3; 
+ v4u32 vec0, vec1, vec2, vec3; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 mult0, mult1, mult2, mult3; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 dst0, dst1; + v4u32 vec_x = (v4u32)__msa_fill_w(x); + v4u32 vec_dx = (v4u32)__msa_fill_w(dx); + v4u32 vec_const = {0, 1, 2, 3}; + v16u8 const_0x7f = (v16u8)__msa_fill_b(0x7f); + + vec0 = vec_dx * vec_const; + vec1 = vec_dx * 4; + vec_x += vec0; + + for (j = 0; j < dst_width - 1; j += 8) { + vec2 = vec_x >> 16; + reg0 = (v16u8)(vec_x >> 9); + vec_x += vec1; + vec3 = vec_x >> 16; + reg1 = (v16u8)(vec_x >> 9); + vec_x += vec1; + reg0 = reg0 & const_0x7f; + reg1 = reg1 & const_0x7f; + reg0 = (v16u8)__msa_shf_b((v16i8)reg0, 0); + reg1 = (v16u8)__msa_shf_b((v16i8)reg1, 0); + reg2 = reg0 ^ const_0x7f; + reg3 = reg1 ^ const_0x7f; + mult0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)reg2); + mult1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)reg2); + mult2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)reg3); + mult3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)reg3); + LOAD_INDEXED_DATA(src, vec2, src0); + LOAD_INDEXED_DATA(src, vec3, src1); + vec2 += 1; + vec3 += 1; + LOAD_INDEXED_DATA(src, vec2, src2); + LOAD_INDEXED_DATA(src, vec3, src3); + reg4 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + reg5 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + reg6 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + reg7 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + tmp0 = __msa_dotp_u_h(reg4, mult0); + tmp1 = __msa_dotp_u_h(reg5, mult1); + tmp2 = __msa_dotp_u_h(reg6, mult2); + tmp3 = __msa_dotp_u_h(reg7, mult3); + tmp0 >>= 7; + tmp1 >>= 7; + tmp2 >>= 7; + tmp3 >>= 7; + dst0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + dst1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + __msa_st_b(dst0, dst_argb, 0); + __msa_st_b(dst1, dst_argb, 16); + dst_argb += 32; + } +} + +void ScaleRowDown34_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + (void)src_stride; + v16u8 src0, src1, src2, src3; + v16u8 vec0, vec1, vec2; + v16i8 mask0 = {0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20}; + v16i8 mask1 = {5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25}; + v16i8 mask2 = {11, 12, 13, 15, 16, 17, 19, 20, + 21, 23, 24, 25, 27, 28, 29, 31}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src2, (v16i8)src1); + vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src2); + __msa_st_b((v16i8)vec0, dst, 0); + __msa_st_b((v16i8)vec1, dst, 16); + __msa_st_b((v16i8)vec2, dst, 32); + src_ptr += 64; + dst += 48; + } +} + +void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 vec6, vec7, vec8, vec9, vec10, vec11; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5; + v8i16 reg6, reg7, reg8, reg9, reg10, reg11; + v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; + v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; + v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; + v16i8 
mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; + v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, + 16, 17, 17, 18, 18, 19, 20, 21}; + v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; + v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; + v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; + v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); + vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); + vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); + vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); + vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); + vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); + vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); + vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); + vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); + reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); + reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); + reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); + reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); + reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); + reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); + reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); + reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); + reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); + reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); + reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); + reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); + reg0 = __msa_srar_h(reg0, shft0); + reg1 = __msa_srar_h(reg1, shft1); + reg2 = __msa_srar_h(reg2, shft2); + reg3 = __msa_srar_h(reg3, shft0); + reg4 = __msa_srar_h(reg4, shft1); + reg5 = __msa_srar_h(reg5, shft2); + reg6 = __msa_srar_h(reg6, shft0); + reg7 = __msa_srar_h(reg7, shft1); + reg8 = __msa_srar_h(reg8, shft2); + reg9 = __msa_srar_h(reg9, shft0); + reg10 = __msa_srar_h(reg10, shft1); + reg11 = __msa_srar_h(reg11, shft2); + reg0 = reg0 * 3 + reg6; + reg1 = reg1 * 3 + reg7; + reg2 = reg2 * 3 + reg8; + reg3 = reg3 * 3 + reg9; + reg4 = reg4 * 3 + reg10; + reg5 = reg5 * 3 + reg11; + reg0 = __msa_srari_h(reg0, 2); + reg1 = __msa_srari_h(reg1, 2); + reg2 = __msa_srari_h(reg2, 2); + reg3 = __msa_srari_h(reg3, 2); + reg4 = __msa_srari_h(reg4, 2); + reg5 = __msa_srari_h(reg5, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); + dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + __msa_st_b((v16i8)dst0, d, 0); + __msa_st_b((v16i8)dst1, d, 16); + __msa_st_b((v16i8)dst2, d, 32); + s += 64; + t += 64; + d += 48; + } +} + +void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 vec6, vec7, vec8, vec9, vec10, 
vec11; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5; + v8i16 reg6, reg7, reg8, reg9, reg10, reg11; + v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; + v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; + v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; + v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; + v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, + 16, 17, 17, 18, 18, 19, 20, 21}; + v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; + v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; + v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; + v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); + vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); + vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); + vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); + vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); + vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); + vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); + vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); + vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); + reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); + reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); + reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); + reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); + reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); + reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); + reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); + reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); + reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); + reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); + reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); + reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); + reg0 = __msa_srar_h(reg0, shft0); + reg1 = __msa_srar_h(reg1, shft1); + reg2 = __msa_srar_h(reg2, shft2); + reg3 = __msa_srar_h(reg3, shft0); + reg4 = __msa_srar_h(reg4, shft1); + reg5 = __msa_srar_h(reg5, shft2); + reg6 = __msa_srar_h(reg6, shft0); + reg7 = __msa_srar_h(reg7, shft1); + reg8 = __msa_srar_h(reg8, shft2); + reg9 = __msa_srar_h(reg9, shft0); + reg10 = __msa_srar_h(reg10, shft1); + reg11 = __msa_srar_h(reg11, shft2); + reg0 += reg6; + reg1 += reg7; + reg2 += reg8; + reg3 += reg9; + reg4 += reg10; + reg5 += reg11; + reg0 = __msa_srari_h(reg0, 1); + reg1 = __msa_srari_h(reg1, 1); + reg2 = __msa_srari_h(reg2, 1); + reg3 = __msa_srari_h(reg3, 1); + reg4 = __msa_srari_h(reg4, 1); + reg5 = __msa_srari_h(reg5, 1); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); + dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + __msa_st_b((v16i8)dst0, d, 0); + __msa_st_b((v16i8)dst1, d, 16); + __msa_st_b((v16i8)dst2, d, 32); + s += 64; + t += 64; + d += 48; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/scale_neon.cc 
b/files/source/scale_neon.cc index 9b4dce33..366b155b 100644 --- a/files/source/scale_neon.cc +++ b/files/source/scale_neon.cc @@ -23,590 +23,529 @@ extern "C" { // Provided by Fritz Koenig // Read 32x1 throw away even pixels, and write 16x1. -void ScaleRowDown2_NEON(const uint8* src_ptr, +void ScaleRowDown2_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst, + uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( - "1: \n" - // load even pixels into q0, odd into q1 - MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" - "subs %2, %2, #16 \n" // 16 processed per loop - MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" // store odd pixels - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1" // Clobber List + asm volatile( + "1: \n" + // load even pixels into q0, odd into q1 + "vld2.8 {q0, q1}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q1}, [%1]! \n" // store odd pixels + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List ); } // Read 32x1 average down and write 16x1. -void ScaleRowDown2Linear_NEON(const uint8* src_ptr, +void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst, + uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc - "subs %2, %2, #16 \n" // 16 processed per loop - "vpaddl.u8 q0, q0 \n" // add adjacent - "vpaddl.u8 q1, q1 \n" - "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #1 \n" - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1" // Clobber List + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List ); } // Read 32x2 average down and write 16x1. -void ScaleRowDown2Box_NEON(const uint8* src_ptr, +void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst, + uint8_t* dst, int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %0 \n" - "1: \n" - MEMACCESS(0) - "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc - MEMACCESS(1) - "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc - "subs %3, %3, #16 \n" // 16 processed per loop - "vpaddl.u8 q0, q0 \n" // row 1 add adjacent - "vpaddl.u8 q1, q1 \n" - "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1 - "vpadal.u8 q1, q3 \n" - "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #2 \n" - MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "q0", "q1", "q2", "q3" // Clobber List + asm volatile( + // change the stride to row 2 pointer + "add %1, %0 \n" + "1: \n" + "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc + "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc + "subs %3, %3, #16 \n" // 16 processed per loop + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent + + // row1 + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and + // pack + "vrshrn.u16 d1, q1, #2 \n" + "vst1.8 {q0}, [%2]! 
\n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "q0", "q1", "q2", "q3" // Clobber List ); } -void ScaleRowDown4_NEON(const uint8* src_ptr, +void ScaleRowDown4_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #8 \n" // 8 processed per loop - MEMACCESS(1) - "vst1.8 {d2}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1", "memory", "cc" - ); + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #8 \n" // 8 processed per loop + "vst1.8 {d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1", "memory", "cc"); } -void ScaleRowDown4Box_NEON(const uint8* src_ptr, +void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { - const uint8* src_ptr1 = src_ptr + src_stride; - const uint8* src_ptr2 = src_ptr + src_stride * 2; - const uint8* src_ptr3 = src_ptr + src_stride * 3; - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load up 16x4 - MEMACCESS(3) - "vld1.8 {q1}, [%3]! \n" - MEMACCESS(4) - "vld1.8 {q2}, [%4]! \n" - MEMACCESS(5) - "vld1.8 {q3}, [%5]! \n" - "subs %2, %2, #4 \n" - "vpaddl.u8 q0, q0 \n" - "vpadal.u8 q0, q1 \n" - "vpadal.u8 q0, q2 \n" - "vpadal.u8 q0, q3 \n" - "vpaddl.u16 q0, q0 \n" - "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding - "vmovn.u16 d0, q0 \n" - MEMACCESS(1) - "vst1.32 {d0[0]}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_ptr1), // %3 - "+r"(src_ptr2), // %4 - "+r"(src_ptr3) // %5 - : - : "q0", "q1", "q2", "q3", "memory", "cc" - ); + const uint8_t* src_ptr1 = src_ptr + src_stride; + const uint8_t* src_ptr2 = src_ptr + src_stride * 2; + const uint8_t* src_ptr3 = src_ptr + src_stride * 3; + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load up 16x4 + "vld1.8 {q1}, [%3]! \n" + "vld1.8 {q2}, [%4]! \n" + "vld1.8 {q3}, [%5]! \n" + "subs %2, %2, #4 \n" + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" + "vpaddl.u16 q0, q0 \n" + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding + "vmovn.u16 d0, q0 \n" + "vst1.32 {d0[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_ptr1), // %3 + "+r"(src_ptr2), // %4 + "+r"(src_ptr3) // %5 + : + : "q0", "q1", "q2", "q3", "memory", "cc"); } // Down scale from 4 to 3 pixels. Use the neon multilane read/write // to load up the every 4th pixel into a 4 different registers. // Point samples 32 pixels to 24 pixels. -void ScaleRowDown34_NEON(const uint8* src_ptr, +void ScaleRowDown34_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #24 \n" - "vmov d2, d3 \n" // order d0, d1, d2 - MEMACCESS(1) - "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "d0", "d1", "d2", "d3", "memory", "cc" - ); + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! 
\n" // src line 0 + "subs %2, %2, #24 \n" + "vmov d2, d3 \n" // order d0, d1, d2 + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "d0", "d1", "d2", "d3", "memory", "cc"); } -void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, +void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { - asm volatile ( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - MEMACCESS(3) - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - "subs %2, %2, #24 \n" - - // filter src line 0 with src line 1 - // expand chars to shorts to allow for room - // when adding lines together - "vmovl.u8 q8, d4 \n" - "vmovl.u8 q9, d5 \n" - "vmovl.u8 q10, d6 \n" - "vmovl.u8 q11, d7 \n" - - // 3 * line_0 + line_1 - "vmlal.u8 q8, d0, d24 \n" - "vmlal.u8 q9, d1, d24 \n" - "vmlal.u8 q10, d2, d24 \n" - "vmlal.u8 q11, d3, d24 \n" - - // (3 * line_0 + line_1) >> 2 - "vqrshrn.u16 d0, q8, #2 \n" - "vqrshrn.u16 d1, q9, #2 \n" - "vqrshrn.u16 d2, q10, #2 \n" - "vqrshrn.u16 d3, q11, #2 \n" - - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "vmovl.u8 q8, d1 \n" - "vmlal.u8 q8, d0, d24 \n" - "vqrshrn.u16 d0, q8, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" - - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "vmovl.u8 q8, d2 \n" - "vmlal.u8 q8, d3, d24 \n" - "vqrshrn.u16 d2, q8, #2 \n" - - MEMACCESS(1) - "vst3.8 {d0, d1, d2}, [%1]! \n" - - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" - ); + asm volatile( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" + + // 3 * line_0 + line_1 + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" + + // (3 * line_0 + line_1) >> 2 + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" + + "vst3.8 {d0, d1, d2}, [%1]! \n" + + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", + "cc"); } -void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, +void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { - asm volatile ( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - MEMACCESS(3) - "vld4.8 {d4, d5, d6, d7}, [%3]! 
\n" // src line 1 - "subs %2, %2, #24 \n" - // average src line 0 with src line 1 - "vrhadd.u8 q0, q0, q2 \n" - "vrhadd.u8 q1, q1, q3 \n" - - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "vmovl.u8 q3, d1 \n" - "vmlal.u8 q3, d0, d24 \n" - "vqrshrn.u16 d0, q3, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" - - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "vmovl.u8 q3, d2 \n" - "vmlal.u8 q3, d3, d24 \n" - "vqrshrn.u16 d2, q3, #2 \n" - - MEMACCESS(1) - "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" - ); + asm volatile( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" + // average src line 0 with src line 1 + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" + + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"); } #define HAS_SCALEROWDOWN38_NEON -static uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0}; -static uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12, - 18, 6, 14, 19, 0, 0, 0, 0}; -static vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12}; -static vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18}; +static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, + 22, 24, 27, 30, 0, 0, 0, 0}; +static const uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12, + 18, 6, 14, 19, 0, 0, 0, 0}; +static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12}; +static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18}; // 32 -> 12 -void ScaleRowDown38_NEON(const uint8* src_ptr, +void ScaleRowDown38_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( - MEMACCESS(3) - "vld1.8 {q3}, [%3] \n" - "1: \n" - MEMACCESS(0) - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" - "subs %2, %2, #12 \n" - "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" - "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" - MEMACCESS(1) - "vst1.8 {d4}, [%1]! \n" - MEMACCESS(1) - "vst1.32 {d5[0]}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(&kShuf38) // %3 - : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" - ); + asm volatile( + "vld1.8 {q3}, [%3] \n" + "1: \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" + "subs %2, %2, #12 \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + "vst1.8 {d4}, [%1]! \n" + "vst1.32 {d5[0]}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"); } // 32x3 -> 12x1 -void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { - const uint8* src_ptr1 = src_ptr + src_stride * 2; - - asm volatile ( - MEMACCESS(5) - "vld1.16 {q13}, [%5] \n" - MEMACCESS(6) - "vld1.8 {q14}, [%6] \n" - MEMACCESS(7) - "vld1.8 {q15}, [%7] \n" - "add %3, %0 \n" - "1: \n" - - // d0 = 00 40 01 41 02 42 03 43 - // d1 = 10 50 11 51 12 52 13 53 - // d2 = 20 60 21 61 22 62 23 63 - // d3 = 30 70 31 71 32 72 33 73 - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" - MEMACCESS(3) - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - MEMACCESS(4) - "vld4.8 {d16, d17, d18, d19}, [%4]! \n" - "subs %2, %2, #12 \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // d0 = 00 10 01 11 02 12 03 13 - // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" - "vtrn.u8 d16, d17 \n" - - // d2 = 20 30 21 31 22 32 23 33 - // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" - "vtrn.u8 d18, d19 \n" - - // d0 = 00+10 01+11 02+12 03+13 - // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" - "vpaddl.u8 q8, q8 \n" - - // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" - "vpaddl.u8 d19, d19 \n" - - // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 q0, q8 \n" - "vadd.u16 d4, d3, d7 \n" - "vadd.u16 d4, d19 \n" - - // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] - // + s[6 + st * 1] + s[7 + st * 1] - // + s[6 + st * 2] + s[7 + st * 2]) / 6 - "vqrdmulh.s16 q2, q2, q13 \n" - "vmovn.u16 d4, q2 \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" - "vmovl.u8 q9, d18 \n" - - // combine source lines - "vadd.u16 q1, q3 \n" - "vadd.u16 q1, q9 \n" - - // d4 = xx 20 xx 30 xx 22 xx 32 - // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" - - // d4 = xx 20 xx 21 xx 22 xx 23 - // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" - - // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "vqrdmulh.s16 q0, q0, q15 \n" - - // Align for table lookup, vtbl requires registers to - // be adjacent - "vmov.u8 d2, d4 \n" - - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - - MEMACCESS(1) - "vst1.8 {d3}, [%1]! \n" - MEMACCESS(1) - "vst1.32 {d4[0]}, [%1]! 
\n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride), // %3 - "+r"(src_ptr1) // %4 - : "r"(&kMult38_Div6), // %5 - "r"(&kShuf38_2), // %6 - "r"(&kMult38_Div9) // %7 - : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc" - ); + const uint8_t* src_ptr1 = src_ptr + src_stride * 2; + + asm volatile( + "vld1.16 {q13}, [%5] \n" + "vld1.8 {q14}, [%6] \n" + "vld1.8 {q15}, [%7] \n" + "add %3, %0 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "vld4.8 {d16, d17, d18, d19}, [%4]! \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + "vtrn.u8 d16, d17 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + "vtrn.u8 d18, d19 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q8, q8 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d19, d19 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 q0, q8 \n" + "vadd.u16 d4, d3, d7 \n" + "vadd.u16 d4, d19 \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "vqrdmulh.s16 q2, q2, q13 \n" + "vmovn.u16 d4, q2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + "vmovl.u8 q9, d18 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q9 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q15 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + "vst1.8 {d3}, [%1]! \n" + "vst1.32 {d4[0]}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride), // %3 + "+r"(src_ptr1) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", + "cc"); } // 32x2 -> 12x1 -void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, +void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { - asm volatile ( - MEMACCESS(4) - "vld1.16 {q13}, [%4] \n" - MEMACCESS(5) - "vld1.8 {q14}, [%5] \n" - "add %3, %0 \n" - "1: \n" - - // d0 = 00 40 01 41 02 42 03 43 - // d1 = 10 50 11 51 12 52 13 53 - // d2 = 20 60 21 61 22 62 23 63 - // d3 = 30 70 31 71 32 72 33 73 - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" - MEMACCESS(3) - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - "subs %2, %2, #12 \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // d0 = 00 10 01 11 02 12 03 13 - // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" - - // d2 = 20 30 21 31 22 32 23 33 - // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" - - // d0 = 00+10 01+11 02+12 03+13 - // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" - - // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" - - // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 d4, d3, d7 \n" - - // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "vqrshrn.u16 d4, q2, #2 \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" - - // combine source lines - "vadd.u16 q1, q3 \n" - - // d4 = xx 20 xx 30 xx 22 xx 32 - // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" - - // d4 = xx 20 xx 21 xx 22 xx 23 - // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" - - // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "vqrdmulh.s16 q0, q0, q13 \n" - - // Align for table lookup, vtbl requires registers to - // be adjacent - "vmov.u8 d2, d4 \n" - - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - - MEMACCESS(1) - "vst1.8 {d3}, [%1]! \n" - MEMACCESS(1) - "vst1.32 {d4[0]}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : "r"(&kMult38_Div6), // %4 - "r"(&kShuf38_2) // %5 - : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" - ); + asm volatile( + "vld1.16 {q13}, [%4] \n" + "vld1.8 {q14}, [%5] \n" + "add %3, %0 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 d4, d3, d7 \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "vqrshrn.u16 d4, q2, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q13 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + "vst1.8 {d3}, [%1]! \n" + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"); } -void ScaleAddRows_NEON(const uint8* src_ptr, - ptrdiff_t src_stride, - uint16* dst_ptr, - int src_width, - int src_height) { - const uint8* src_tmp; - asm volatile ( - "1: \n" - "mov %0, %1 \n" - "mov r12, %5 \n" - "veor q2, q2, q2 \n" - "veor q3, q3, q3 \n" - "2: \n" - // load 16 pixels into q0 - MEMACCESS(0) - "vld1.8 {q0}, [%0], %3 \n" - "vaddw.u8 q3, q3, d1 \n" - "vaddw.u8 q2, q2, d0 \n" - "subs r12, r12, #1 \n" - "bgt 2b \n" - MEMACCESS(2) - "vst1.16 {q2, q3}, [%2]! \n" // store pixels - "add %1, %1, #16 \n" - "subs %4, %4, #16 \n" // 16 processed per loop - "bgt 1b \n" - : "=&r"(src_tmp), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_ptr), // %2 - "+r"(src_stride), // %3 - "+r"(src_width), // %4 - "+r"(src_height) // %5 - : - : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List +// Add a row of bytes to a row of shorts. Used for box filter. +// Reads 16 bytes and accumulates to 16 shorts at a time. +void ScaleAddRow_NEON(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + asm volatile( + "1: \n" + "vld1.16 {q1, q2}, [%1] \n" // load accumulator + "vld1.8 {q0}, [%0]! \n" // load 16 bytes + "vaddw.u8 q2, q2, d1 \n" // add + "vaddw.u8 q1, q1, d0 \n" + "vst1.16 {q1, q2}, [%1]! 
\n" // store accumulator + "subs %2, %2, #16 \n" // 16 processed per loop + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2" // Clobber List ); } -// clang-format off // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD2_DATA8_LANE(n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n" -// clang-format on +#define LOAD2_DATA8_LANE(n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5 \n" \ + "add %3, %3, %4 \n" \ + "vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n" // The NEON version mimics this formula (from row_common.cc): -// #define BLENDER(a, b, f) (uint8)((int)(a) + +// #define BLENDER(a, b, f) (uint8_t)((int)(a) + // ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) -void ScaleFilterCols_NEON(uint8* dst_ptr, - const uint8* src_ptr, +void ScaleFilterCols_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, int dst_width, int x, int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; - const uint8* src_tmp = src_ptr; + const uint8_t* src_tmp = src_ptr; asm volatile ( "vdup.32 q0, %3 \n" // x "vdup.32 q1, %4 \n" // dx @@ -643,7 +582,6 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, "vadd.s16 q8, q8, q9 \n" "vmovn.s16 d6, q8 \n" - MEMACCESS(0) "vst1.8 {d6}, [%0]! \n" // store pixels "vadd.s32 q1, q1, q0 \n" "vadd.s32 q2, q2, q0 \n" @@ -665,351 +603,299 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, #undef LOAD2_DATA8_LANE // 16x2 -> 16x1 -void ScaleFilterRows_NEON(uint8* dst_ptr, - const uint8* src_ptr, +void ScaleFilterRows_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { - asm volatile ( - "cmp %4, #0 \n" - "beq 100f \n" - "add %2, %1 \n" - "cmp %4, #64 \n" - "beq 75f \n" - "cmp %4, #128 \n" - "beq 50f \n" - "cmp %4, #192 \n" - "beq 25f \n" - - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" - // General purpose row blend. - "1: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 1b \n" - "b 99f \n" - - // Blend 25 / 75. - "25: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 25b \n" - "b 99f \n" - - // Blend 50 / 50. - "50: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 50b \n" - "b 99f \n" - - // Blend 75 / 25. - "75: \n" - MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q0}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 75b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - "subs %3, %3, #16 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! 
\n" - "bgt 100b \n" - - "99: \n" - MEMACCESS(0) - "vst1.8 {d1[7]}, [%0] \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(source_y_fraction) // %4 - : - : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" - ); + asm volatile( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + "vld1.8 {q1}, [%1]! \n" + "vld1.8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" + + "99: \n" + "vst1.8 {d1[7]}, [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction) // %4 + : + : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"); } -void ScaleARGBRowDown2_NEON(const uint8* src_ptr, +void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst, + uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( - "1: \n" - // load even pixels into q0, odd into q1 - MEMACCESS(0) - "vld2.32 {q0, q1}, [%0]! \n" - MEMACCESS(0) - "vld2.32 {q2, q3}, [%0]! \n" - "subs %2, %2, #8 \n" // 8 processed per loop - MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" // store odd pixels - MEMACCESS(1) - "vst1.8 {q3}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + asm volatile( + "1: \n" + "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %2, %2, #8 \n" // 8 processed per loop + "vmov q2, q1 \n" // load next 8 ARGB + "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List ); } -void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, +// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]! +// 4a: 3e04 subs r6, #4 +// 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]! +// 50: ef64 21f4 vorr q9, q10, q10 +// 54: f942 038d vst2.32 {d16-d19}, [r2]! +// 58: d1f5 bne.n 46 <ScaleARGBRowDown2_C+0x46> + +void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, + uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) - "vld4.8 {d1, d3, d5, d7}, [%0]! 
\n" // load next 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. - "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #1 \n" - "vrshrn.u16 d2, q2, #1 \n" - "vrshrn.u16 d3, q3, #1 \n" - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + asm volatile( + "1: \n" + "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %2, %2, #8 \n" // 8 processed per loop + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vrhadd.u8 q1, q2, q3 \n" // rounding half add + "vst2.32 {q0, q1}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List ); } -void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, +void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst, + uint8_t* dst, int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. - MEMACCESS(1) - "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. - MEMACCESS(1) - "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. - "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. - "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #2 \n" - "vrshrn.u16 d2, q2, #2 \n" - "vrshrn.u16 d3, q3, #2 \n" - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" - ); + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB + "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB + "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes + "vrshrn.u16 d1, q1, #2 \n" + "vrshrn.u16 d2, q2, #2 \n" + "vrshrn.u16 d3, q3, #2 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! 
\n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); } // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, +void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, + uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile ( - "mov r12, %3, lsl #2 \n" - "1: \n" - MEMACCESS(0) - "vld1.32 {d0[0]}, [%0], r12 \n" - MEMACCESS(0) - "vld1.32 {d0[1]}, [%0], r12 \n" - MEMACCESS(0) - "vld1.32 {d1[0]}, [%0], r12 \n" - MEMACCESS(0) - "vld1.32 {d1[1]}, [%0], r12 \n" - "subs %2, %2, #4 \n" // 4 pixels per loop. - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"(src_stepx) // %3 - : "memory", "cc", "r12", "q0" - ); + asm volatile( + "mov r12, %3, lsl #2 \n" + "1: \n" + "vld1.32 {d0[0]}, [%0], r12 \n" + "vld1.32 {d0[1]}, [%0], r12 \n" + "vld1.32 {d1[0]}, [%0], r12 \n" + "vld1.32 {d1[1]}, [%0], r12 \n" + "subs %2, %2, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"(src_stepx) // %3 + : "memory", "cc", "r12", "q0"); } // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, +void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, + uint8_t* dst_argb, int dst_width) { - asm volatile ( - "mov r12, %4, lsl #2 \n" - "add %1, %1, %0 \n" - "1: \n" - MEMACCESS(0) - "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 - MEMACCESS(1) - "vld1.8 {d1}, [%1], r12 \n" - MEMACCESS(0) - "vld1.8 {d2}, [%0], r12 \n" - MEMACCESS(1) - "vld1.8 {d3}, [%1], r12 \n" - MEMACCESS(0) - "vld1.8 {d4}, [%0], r12 \n" - MEMACCESS(1) - "vld1.8 {d5}, [%1], r12 \n" - MEMACCESS(0) - "vld1.8 {d6}, [%0], r12 \n" - MEMACCESS(1) - "vld1.8 {d7}, [%1], r12 \n" - "vaddl.u8 q0, d0, d1 \n" - "vaddl.u8 q1, d2, d3 \n" - "vaddl.u8 q2, d4, d5 \n" - "vaddl.u8 q3, d6, d7 \n" - "vswp.8 d1, d2 \n" // ab_cd -> ac_bd - "vswp.8 d5, d6 \n" // ef_gh -> eg_fh - "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) - "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) - "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. - "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. - "subs %3, %3, #4 \n" // 4 pixels per loop. - MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width) // %3 - : "r"(src_stepx) // %4 - : "memory", "cc", "r12", "q0", "q1", "q2", "q3" - ); + asm volatile( + "mov r12, %4, lsl #2 \n" + "add %1, %1, %0 \n" + "1: \n" + "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1 + "vld1.8 {d1}, [%1], r12 \n" + "vld1.8 {d2}, [%0], r12 \n" + "vld1.8 {d3}, [%1], r12 \n" + "vld1.8 {d4}, [%0], r12 \n" + "vld1.8 {d5}, [%1], r12 \n" + "vld1.8 {d6}, [%0], r12 \n" + "vld1.8 {d7}, [%1], r12 \n" + "vaddl.u8 q0, d0, d1 \n" + "vaddl.u8 q1, d2, d3 \n" + "vaddl.u8 q2, d4, d5 \n" + "vaddl.u8 q3, d6, d7 \n" + "vswp.8 d1, d2 \n" // ab_cd -> ac_bd + "vswp.8 d5, d6 \n" // ef_gh -> eg_fh + "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) + "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) + "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. + "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. + "subs %3, %3, #4 \n" // 4 pixels per loop. 
+ "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"(src_stepx) // %4 + : "memory", "cc", "r12", "q0", "q1", "q2", "q3"); } -// clang-format off // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD1_DATA32_LANE(dn, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ +#define LOAD1_DATA32_LANE(dn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ "vld1.32 {" #dn "[" #n "]}, [%6] \n" -// clang-format on -void ScaleARGBCols_NEON(uint8* dst_argb, - const uint8* src_argb, +void ScaleARGBCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, int dst_width, int x, int dx) { int tmp; - const uint8* src_tmp = src_argb; - asm volatile ( - "1: \n" - LOAD1_DATA32_LANE(d0, 0) - LOAD1_DATA32_LANE(d0, 1) - LOAD1_DATA32_LANE(d1, 0) - LOAD1_DATA32_LANE(d1, 1) - LOAD1_DATA32_LANE(d2, 0) - LOAD1_DATA32_LANE(d2, 1) - LOAD1_DATA32_LANE(d3, 0) - LOAD1_DATA32_LANE(d3, 1) - - MEMACCESS(0) - "vst1.32 {q0, q1}, [%0]! \n" // store pixels - "subs %2, %2, #8 \n" // 8 processed per loop - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width), // %2 - "+r"(x), // %3 - "+r"(dx), // %4 - "=&r"(tmp), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "q0", "q1" - ); + const uint8_t* src_tmp = src_argb; + asm volatile( + "1: \n" + // clang-format off + LOAD1_DATA32_LANE(d0, 0) + LOAD1_DATA32_LANE(d0, 1) + LOAD1_DATA32_LANE(d1, 0) + LOAD1_DATA32_LANE(d1, 1) + LOAD1_DATA32_LANE(d2, 0) + LOAD1_DATA32_LANE(d2, 1) + LOAD1_DATA32_LANE(d3, 0) + LOAD1_DATA32_LANE(d3, 1) + // clang-format on + "vst1.32 {q0, q1}, [%0]! \n" // store pixels + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "=&r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1"); } #undef LOAD1_DATA32_LANE -// clang-format off // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD2_DATA32_LANE(dn1, dn2, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n" -// clang-format on - -void ScaleARGBFilterCols_NEON(uint8* dst_argb, - const uint8* src_argb, +#define LOAD2_DATA32_LANE(dn1, dn2, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n" + +void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, int dst_width, int x, int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; - const uint8* src_tmp = src_argb; + const uint8_t* src_tmp = src_argb; asm volatile ( "vdup.32 q0, %3 \n" // x "vdup.32 q1, %4 \n" // dx @@ -1045,7 +931,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, "vshrn.i16 d0, q11, #7 \n" "vshrn.i16 d1, q12, #7 \n" - MEMACCESS(0) "vst1.32 {d0, d1}, [%0]! \n" // store pixels "vadd.s32 q8, q8, q9 \n" "subs %2, %2, #4 \n" // 4 processed per loop diff --git a/files/source/scale_neon64.cc b/files/source/scale_neon64.cc index 1ff5f2bf..0a7b80ce 100644 --- a/files/source/scale_neon64.cc +++ b/files/source/scale_neon64.cc @@ -21,610 +21,544 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) // Read 32x1 throw away even pixels, and write 16x1. 
-void ScaleRowDown2_NEON(const uint8* src_ptr, +void ScaleRowDown2_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst, + uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( - "1: \n" - // load even pixels into v0, odd into v1 - MEMACCESS(0) - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop - MEMACCESS(1) - "st1 {v1.16b}, [%1], #16 \n" // store odd pixels - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1" // Clobber List + asm volatile( + "1: \n" + // load even pixels into v0, odd into v1 + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "st1 {v1.16b}, [%1], #16 \n" // store odd pixels + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List ); } // Read 32x1 average down and write 16x1. -void ScaleRowDown2Linear_NEON(const uint8* src_ptr, +void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst, + uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc - "subs %w2, %w2, #16 \n" // 16 processed per loop - "uaddlp v0.8h, v0.16b \n" // add adjacent - "uaddlp v1.8h, v1.16b \n" - "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack - "rshrn2 v0.16b, v1.8h, #1 \n" - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1" // Clobber List + asm volatile( + "1: \n" + // load even pixels into v0, odd into v1 + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List ); } // Read 32x2 average down and write 16x1. 
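// [Editor's sketch - not part of this patch.] The next hunk is the 2x2 box
// filter variant flagged by the comment above: each destination byte is the
// rounded average of a 2x2 block spanning two source rows. A scalar sketch,
// assuming the same rounding as the vector code (sum of four samples, plus 2,
// shifted right by 2):
//
//   static void ScaleRowDown2Box_Sketch(const uint8_t* src_ptr,
//                                       ptrdiff_t src_stride,
//                                       uint8_t* dst, int dst_width) {
//     const uint8_t* s = src_ptr;               // row 0
//     const uint8_t* t = src_ptr + src_stride;  // row 1
//     int x;
//     for (x = 0; x < dst_width; ++x) {
//       dst[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
//       s += 2;
//       t += 2;
//     }
//   }
//
// In the NEON version below, "uaddlp" widens and adds adjacent bytes within a
// row, "uadalp" accumulates the second row, and "rshrn" does the rounded >> 2.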
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, +void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst, + uint8_t* dst, int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc - MEMACCESS(1) - "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc - "subs %w3, %w3, #16 \n" // 16 processed per loop - "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent - "uaddlp v1.8h, v1.16b \n" - "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1 - "uadalp v1.8h, v3.16b \n" - "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack - "rshrn2 v0.16b, v1.8h, #2 \n" - MEMACCESS(2) - "st1 {v0.16b}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "v0", "v1", "v2", "v3" // Clobber List + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc + "subs %w3, %w3, #16 \n" // 16 processed per loop + "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent + "uaddlp v1.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent + "uadalp v1.8h, v3.16b \n" + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "rshrn2 v0.16b, v1.8h, #2 \n" + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "v0", "v1", "v2", "v3" // Clobber List ); } -void ScaleRowDown4_NEON(const uint8* src_ptr, +void ScaleRowDown4_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "subs %w2, %w2, #8 \n" // 8 processed per loop - MEMACCESS(1) - "st1 {v2.8b}, [%1], #8 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1", "v2", "v3", "memory", "cc" - ); + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #8 \n" // 8 processed per loop + "st1 {v2.8b}, [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); } -void ScaleRowDown4Box_NEON(const uint8* src_ptr, +void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { - const uint8* src_ptr1 = src_ptr + src_stride; - const uint8* src_ptr2 = src_ptr + src_stride * 2; - const uint8* src_ptr3 = src_ptr + src_stride * 3; - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 - MEMACCESS(3) - "ld1 {v1.16b}, [%2], #16 \n" - MEMACCESS(4) - "ld1 {v2.16b}, [%3], #16 \n" - MEMACCESS(5) - "ld1 {v3.16b}, [%4], #16 \n" - "subs %w5, %w5, #4 \n" - "uaddlp v0.8h, v0.16b \n" - "uadalp v0.8h, v1.16b \n" - "uadalp v0.8h, v2.16b \n" - "uadalp v0.8h, v3.16b \n" - "addp v0.8h, v0.8h, v0.8h \n" - "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding - MEMACCESS(1) - "st1 {v0.s}[0], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_ptr1), // %2 - "+r"(src_ptr2), // %3 - "+r"(src_ptr3), // %4 - "+r"(dst_width) // %5 - : - : "v0", "v1", "v2", "v3", "memory", "cc" - ); + const uint8_t* src_ptr1 = src_ptr + src_stride; + 
const uint8_t* src_ptr2 = src_ptr + src_stride * 2; + const uint8_t* src_ptr3 = src_ptr + src_stride * 3; + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 + "ld1 {v1.16b}, [%2], #16 \n" + "ld1 {v2.16b}, [%3], #16 \n" + "ld1 {v3.16b}, [%4], #16 \n" + "subs %w5, %w5, #4 \n" + "uaddlp v0.8h, v0.16b \n" + "uadalp v0.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" + "uadalp v0.8h, v3.16b \n" + "addp v0.8h, v0.8h, v0.8h \n" + "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding + "st1 {v0.s}[0], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(src_ptr2), // %3 + "+r"(src_ptr3), // %4 + "+r"(dst_width) // %5 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); } // Down scale from 4 to 3 pixels. Use the neon multilane read/write // to load up the every 4th pixel into a 4 different registers. // Point samples 32 pixels to 24 pixels. -void ScaleRowDown34_NEON(const uint8* src_ptr, +void ScaleRowDown34_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "subs %w2, %w2, #24 \n" - "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2 - MEMACCESS(1) - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1", "v2", "v3", "memory", "cc" - ); + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #24 \n" + "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2 + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); } -void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, +void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { - asm volatile ( - "movi v20.8b, #3 \n" - "add %3, %3, %0 \n" - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - MEMACCESS(3) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 - "subs %w2, %w2, #24 \n" - - // filter src line 0 with src line 1 - // expand chars to shorts to allow for room - // when adding lines together - "ushll v16.8h, v4.8b, #0 \n" - "ushll v17.8h, v5.8b, #0 \n" - "ushll v18.8h, v6.8b, #0 \n" - "ushll v19.8h, v7.8b, #0 \n" - - // 3 * line_0 + line_1 - "umlal v16.8h, v0.8b, v20.8b \n" - "umlal v17.8h, v1.8b, v20.8b \n" - "umlal v18.8h, v2.8b, v20.8b \n" - "umlal v19.8h, v3.8b, v20.8b \n" - - // (3 * line_0 + line_1) >> 2 - "uqrshrn v0.8b, v16.8h, #2 \n" - "uqrshrn v1.8b, v17.8h, #2 \n" - "uqrshrn v2.8b, v18.8h, #2 \n" - "uqrshrn v3.8b, v19.8h, #2 \n" - - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "ushll v16.8h, v1.8b, #0 \n" - "umlal v16.8h, v0.8b, v20.8b \n" - "uqrshrn v0.8b, v16.8h, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "urhadd v1.8b, v1.8b, v2.8b \n" - - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "ushll v16.8h, v2.8b, #0 \n" - "umlal v16.8h, v3.8b, v20.8b \n" - "uqrshrn v2.8b, v16.8h, #2 \n" - - MEMACCESS(1) - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", - "v20", "memory", "cc" - ); + asm volatile( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: 
\n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "ushll v16.8h, v4.8b, #0 \n" + "ushll v17.8h, v5.8b, #0 \n" + "ushll v18.8h, v6.8b, #0 \n" + "ushll v19.8h, v7.8b, #0 \n" + + // 3 * line_0 + line_1 + "umlal v16.8h, v0.8b, v20.8b \n" + "umlal v17.8h, v1.8b, v20.8b \n" + "umlal v18.8h, v2.8b, v20.8b \n" + "umlal v19.8h, v3.8b, v20.8b \n" + + // (3 * line_0 + line_1) >> 2 + "uqrshrn v0.8b, v16.8h, #2 \n" + "uqrshrn v1.8b, v17.8h, #2 \n" + "uqrshrn v2.8b, v18.8h, #2 \n" + "uqrshrn v3.8b, v19.8h, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "ushll v16.8h, v1.8b, #0 \n" + "umlal v16.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v16.8h, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "ushll v16.8h, v2.8b, #0 \n" + "umlal v16.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v16.8h, #2 \n" + + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v20", "memory", "cc"); } -void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, +void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { - asm volatile ( - "movi v20.8b, #3 \n" - "add %3, %3, %0 \n" - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - MEMACCESS(3) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 - "subs %w2, %w2, #24 \n" - // average src line 0 with src line 1 - "urhadd v0.8b, v0.8b, v4.8b \n" - "urhadd v1.8b, v1.8b, v5.8b \n" - "urhadd v2.8b, v2.8b, v6.8b \n" - "urhadd v3.8b, v3.8b, v7.8b \n" - - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "ushll v4.8h, v1.8b, #0 \n" - "umlal v4.8h, v0.8b, v20.8b \n" - "uqrshrn v0.8b, v4.8h, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "urhadd v1.8b, v1.8b, v2.8b \n" - - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "ushll v4.8h, v2.8b, #0 \n" - "umlal v4.8h, v3.8b, v20.8b \n" - "uqrshrn v2.8b, v4.8h, #2 \n" - - MEMACCESS(1) - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc" - ); + asm volatile( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" + // average src line 0 with src line 1 + "urhadd v0.8b, v0.8b, v4.8b \n" + "urhadd v1.8b, v1.8b, v5.8b \n" + "urhadd v2.8b, v2.8b, v6.8b \n" + "urhadd v3.8b, v3.8b, v7.8b \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "ushll v4.8h, v1.8b, #0 \n" + "umlal v4.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v4.8h, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "ushll v4.8h, v2.8b, #0 \n" + "umlal v4.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v4.8h, #2 \n" + + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", 
"cc"); } -static uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0}; -static uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20, - 34, 6, 22, 35, 0, 0, 0, 0}; -static vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12}; -static vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18}; +static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, + 22, 24, 27, 30, 0, 0, 0, 0}; +static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20, + 34, 6, 22, 35, 0, 0, 0, 0}; +static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12}; +static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18}; // 32 -> 12 -void ScaleRowDown38_NEON(const uint8* src_ptr, +void ScaleRowDown38_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( - MEMACCESS(3) - "ld1 {v3.16b}, [%3] \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #12 \n" - "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" - MEMACCESS(1) - "st1 {v2.8b}, [%1], #8 \n" - MEMACCESS(1) - "st1 {v2.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(&kShuf38) // %3 - : "v0", "v1", "v2", "v3", "memory", "cc" - ); + asm volatile( + "ld1 {v3.16b}, [%3] \n" + "1: \n" + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #12 \n" + "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" + "st1 {v2.8b}, [%1], #8 \n" + "st1 {v2.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "v0", "v1", "v2", "v3", "memory", "cc"); } // 32x3 -> 12x1 -void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { - const uint8* src_ptr1 = src_ptr + src_stride * 2; + const uint8_t* src_ptr1 = src_ptr + src_stride * 2; ptrdiff_t tmp_src_stride = src_stride; - asm volatile ( - MEMACCESS(5) - "ld1 {v29.8h}, [%5] \n" - MEMACCESS(6) - "ld1 {v30.16b}, [%6] \n" - MEMACCESS(7) - "ld1 {v31.8h}, [%7] \n" - "add %2, %2, %0 \n" - "1: \n" - - // 00 40 01 41 02 42 03 43 - // 10 50 11 51 12 52 13 53 - // 20 60 21 61 22 62 23 63 - // 30 70 31 71 32 72 33 73 - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" - MEMACCESS(3) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - MEMACCESS(4) - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" - "subs %w4, %w4, #12 \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 - // 00 10 01 11 02 12 03 13 - // 40 50 41 51 42 52 43 53 - "trn1 v20.8b, v0.8b, v1.8b \n" - "trn2 v21.8b, v0.8b, v1.8b \n" - "trn1 v22.8b, v4.8b, v5.8b \n" - "trn2 v23.8b, v4.8b, v5.8b \n" - "trn1 v24.8b, v16.8b, v17.8b \n" - "trn2 v25.8b, v16.8b, v17.8b \n" - - // 20 30 21 31 22 32 23 33 - // 60 70 61 71 62 72 63 73 - "trn1 v0.8b, v2.8b, v3.8b \n" - "trn2 v1.8b, v2.8b, v3.8b \n" - "trn1 v4.8b, v6.8b, v7.8b \n" - "trn2 v5.8b, v6.8b, v7.8b \n" - "trn1 v16.8b, v18.8b, v19.8b \n" - "trn2 v17.8b, v18.8b, v19.8b \n" - - // 00+10 01+11 02+12 03+13 - // 40+50 41+51 42+52 43+53 - "uaddlp v20.4h, v20.8b \n" - "uaddlp v21.4h, v21.8b \n" - "uaddlp v22.4h, v22.8b \n" - "uaddlp v23.4h, v23.8b \n" - "uaddlp v24.4h, v24.8b \n" - "uaddlp v25.4h, v25.8b \n" - - // 60+70 61+71 62+72 63+73 - "uaddlp v1.4h, v1.8b \n" - "uaddlp v5.4h, v5.8b \n" - "uaddlp v17.4h, v17.8b \n" - - // combine source lines - "add v20.4h, v20.4h, v22.4h \n" - "add v21.4h, v21.4h, v23.4h \n" - "add v20.4h, v20.4h, v24.4h \n" - "add v21.4h, v21.4h, v25.4h \n" - "add v2.4h, v1.4h, v5.4h \n" - "add v2.4h, v2.4h, v17.4h \n" - - // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] - // + s[6 + st * 1] + s[7 + st * 1] - // + s[6 + st * 2] + s[7 + st * 2]) / 6 - "sqrdmulh v2.8h, v2.8h, v29.8h \n" - "xtn v2.8b, v2.8h \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "ushll v16.8h, v16.8b, #0 \n" - "uaddl v0.8h, v0.8b, v4.8b \n" - - // combine source lines - "add v0.8h, v0.8h, v16.8h \n" - - // xx 20 xx 21 xx 22 xx 23 - // xx 30 xx 31 xx 32 xx 33 - "trn1 v1.8h, v0.8h, v0.8h \n" - "trn2 v4.8h, v0.8h, v0.8h \n" - "xtn v0.4h, v1.4s \n" - "xtn v4.4h, v4.4s \n" - - // 0+1+2, 3+4+5 - "add v20.8h, v20.8h, v0.8h \n" - "add v21.8h, v21.8h, v4.8h \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "sqrdmulh v0.8h, v20.8h, v31.8h \n" - "sqrdmulh v1.8h, v21.8h, v31.8h \n" - - // Align for table lookup, vtbl requires registers to - // be adjacent - "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" - - MEMACCESS(1) - "st1 {v3.8b}, [%1], #8 \n" - MEMACCESS(1) - "st1 {v3.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(tmp_src_stride), // %2 - "+r"(src_ptr1), // %3 - "+r"(dst_width) // %4 - : "r"(&kMult38_Div6), // %5 - "r"(&kShuf38_2), // %6 - "r"(&kMult38_Div9) // %7 - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", - "v30", "v31", "memory", "cc" - ); + asm volatile( + "ld1 {v29.8h}, [%5] \n" + "ld1 {v30.16b}, [%6] \n" + "ld1 {v31.8h}, [%7] \n" + "add %2, %2, %0 \n" + "1: \n" + + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" + "subs %w4, %w4, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v20.8b, v0.8b, v1.8b \n" + "trn2 v21.8b, v0.8b, v1.8b \n" + "trn1 v22.8b, v4.8b, v5.8b \n" + "trn2 v23.8b, v4.8b, v5.8b \n" + "trn1 v24.8b, v16.8b, v17.8b \n" + "trn2 v25.8b, v16.8b, v17.8b \n" + + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + "trn1 v16.8b, v18.8b, v19.8b \n" + "trn2 v17.8b, v18.8b, v19.8b \n" + + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v20.4h, v20.8b \n" + "uaddlp v21.4h, v21.8b \n" + "uaddlp v22.4h, v22.8b \n" + "uaddlp v23.4h, v23.8b \n" + "uaddlp v24.4h, v24.8b \n" + "uaddlp v25.4h, v25.8b \n" + + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + "uaddlp v17.4h, v17.8b \n" + + // combine source lines + "add v20.4h, v20.4h, v22.4h \n" + "add v21.4h, v21.4h, v23.4h \n" + "add v20.4h, v20.4h, v24.4h \n" + "add v21.4h, v21.4h, v25.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + "add v2.4h, v2.4h, v17.4h \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "sqrdmulh v2.8h, v2.8h, v29.8h \n" + "xtn v2.8b, v2.8h \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "ushll v16.8h, v16.8b, #0 \n" + "uaddl v0.8h, v0.8b, v4.8b \n" + + // combine source lines + "add v0.8h, v0.8h, v16.8h \n" + + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + + // 0+1+2, 3+4+5 + "add v20.8h, v20.8h, v0.8h \n" + "add v21.8h, v21.8h, v4.8h \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "sqrdmulh v0.8h, v20.8h, v31.8h \n" + "sqrdmulh v1.8h, v21.8h, v31.8h \n" + + // Align for table lookup, vtbl requires registers to be adjacent + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" + + "st1 {v3.8b}, [%1], #8 \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_src_stride), // %2 + "+r"(src_ptr1), // %3 + "+r"(dst_width) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31", + "memory", "cc"); } // 32x2 -> 12x1 -void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, +void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { // TODO(fbarchard): use src_stride directly for clang 3.5+. 
ptrdiff_t tmp_src_stride = src_stride; - asm volatile ( - MEMACCESS(4) - "ld1 {v30.8h}, [%4] \n" - MEMACCESS(5) - "ld1 {v31.16b}, [%5] \n" - "add %2, %2, %0 \n" - "1: \n" - - // 00 40 01 41 02 42 03 43 - // 10 50 11 51 12 52 13 53 - // 20 60 21 61 22 62 23 63 - // 30 70 31 71 32 72 33 73 - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" - MEMACCESS(3) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - "subs %w3, %w3, #12 \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // 00 10 01 11 02 12 03 13 - // 40 50 41 51 42 52 43 53 - "trn1 v16.8b, v0.8b, v1.8b \n" - "trn2 v17.8b, v0.8b, v1.8b \n" - "trn1 v18.8b, v4.8b, v5.8b \n" - "trn2 v19.8b, v4.8b, v5.8b \n" - - // 20 30 21 31 22 32 23 33 - // 60 70 61 71 62 72 63 73 - "trn1 v0.8b, v2.8b, v3.8b \n" - "trn2 v1.8b, v2.8b, v3.8b \n" - "trn1 v4.8b, v6.8b, v7.8b \n" - "trn2 v5.8b, v6.8b, v7.8b \n" - - // 00+10 01+11 02+12 03+13 - // 40+50 41+51 42+52 43+53 - "uaddlp v16.4h, v16.8b \n" - "uaddlp v17.4h, v17.8b \n" - "uaddlp v18.4h, v18.8b \n" - "uaddlp v19.4h, v19.8b \n" - - // 60+70 61+71 62+72 63+73 - "uaddlp v1.4h, v1.8b \n" - "uaddlp v5.4h, v5.8b \n" - - // combine source lines - "add v16.4h, v16.4h, v18.4h \n" - "add v17.4h, v17.4h, v19.4h \n" - "add v2.4h, v1.4h, v5.4h \n" - - // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "uqrshrn v2.8b, v2.8h, #2 \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - - // combine source lines - "uaddl v0.8h, v0.8b, v4.8b \n" - - // xx 20 xx 21 xx 22 xx 23 - // xx 30 xx 31 xx 32 xx 33 - "trn1 v1.8h, v0.8h, v0.8h \n" - "trn2 v4.8h, v0.8h, v0.8h \n" - "xtn v0.4h, v1.4s \n" - "xtn v4.4h, v4.4s \n" - - // 0+1+2, 3+4+5 - "add v16.8h, v16.8h, v0.8h \n" - "add v17.8h, v17.8h, v4.8h \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "sqrdmulh v0.8h, v16.8h, v30.8h \n" - "sqrdmulh v1.8h, v17.8h, v30.8h \n" - - // Align for table lookup, vtbl requires registers to - // be adjacent - - "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" - - MEMACCESS(1) - "st1 {v3.8b}, [%1], #8 \n" - MEMACCESS(1) - "st1 {v3.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(tmp_src_stride), // %2 - "+r"(dst_width) // %3 - : "r"(&kMult38_Div6), // %4 - "r"(&kShuf38_2) // %5 - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", - "v18", "v19", "v30", "v31", "memory", "cc" - ); + asm volatile( + "ld1 {v30.8h}, [%4] \n" + "ld1 {v31.16b}, [%5] \n" + "add %2, %2, %0 \n" + "1: \n" + + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "subs %w3, %w3, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v16.8b, v0.8b, v1.8b \n" + "trn2 v17.8b, v0.8b, v1.8b \n" + "trn1 v18.8b, v4.8b, v5.8b \n" + "trn2 v19.8b, v4.8b, v5.8b \n" + + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v16.4h, v16.8b \n" + "uaddlp v17.4h, v17.8b \n" + "uaddlp v18.4h, v18.8b \n" + "uaddlp v19.4h, v19.8b \n" + + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + + // combine source lines + "add v16.4h, v16.4h, v18.4h \n" + "add v17.4h, v17.4h, v19.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "uqrshrn v2.8b, v2.8h, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + + // combine source lines + "uaddl v0.8h, v0.8b, v4.8b \n" + + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + + // 0+1+2, 3+4+5 + "add v16.8h, v16.8h, v0.8h \n" + "add v17.8h, v17.8h, v4.8h \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "sqrdmulh v0.8h, v16.8h, v30.8h \n" + "sqrdmulh v1.8h, v17.8h, v30.8h \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" + + "st1 {v3.8b}, [%1], #8 \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_src_stride), // %2 + "+r"(dst_width) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v30", "v31", "memory", "cc"); } -void ScaleAddRows_NEON(const uint8* src_ptr, - ptrdiff_t src_stride, - uint16* dst_ptr, - int src_width, - int src_height) { - const uint8* src_tmp; - asm volatile ( - "1: \n" - "mov %0, %1 \n" - "mov w12, %w5 \n" - "eor v2.16b, v2.16b, v2.16b \n" - "eor v3.16b, v3.16b, v3.16b \n" - "2: \n" - // load 16 pixels into q0 - MEMACCESS(0) - "ld1 {v0.16b}, [%0], %3 \n" - "uaddw2 v3.8h, v3.8h, v0.16b \n" - "uaddw v2.8h, v2.8h, v0.8b \n" - "subs w12, w12, #1 \n" - "b.gt 2b \n" - MEMACCESS(2) - "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels - "add %1, %1, #16 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop - "b.gt 1b \n" - : "=&r"(src_tmp), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_ptr), // %2 - "+r"(src_stride), // %3 - "+r"(src_width), // %4 - "+r"(src_height) // %5 - : - : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List +// Add a row of bytes to a row of shorts. Used for box filter. +// Reads 16 bytes and accumulates to 16 shorts at a time. 
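In scalar terms the row accumulator is simply dst[x] += src[x] over 16-bit sums; a minimal C sketch of that behaviour (hypothetical helper name, not the shipped routine) before the NEON version below:

#include <stdint.h>

// Minimal sketch: add each source byte into the matching 16-bit sum so
// several rows can be accumulated before the box filter divides by the
// number of rows. Name is illustrative only.
static void ScaleAddRow_Sketch(const uint8_t* src_ptr,
                               uint16_t* dst_ptr,
                               int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    dst_ptr[x] = (uint16_t)(dst_ptr[x] + src_ptr[x]);
  }
}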
+void ScaleAddRow_NEON(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + asm volatile( + "1: \n" + "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator + "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes + "uaddw2 v2.8h, v2.8h, v0.16b \n" // add + "uaddw v1.8h, v1.8h, v0.8b \n" + "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator + "subs %w2, %w2, #16 \n" // 16 processed per loop + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2" // Clobber List ); } -// clang-format off // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD2_DATA8_LANE(n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ +#define LOAD2_DATA8_LANE(n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5 \n" \ + "add %3, %3, %4 \n" \ "ld2 {v4.b, v5.b}[" #n "], [%6] \n" -// clang-format on // The NEON version mimics this formula (from row_common.cc): -// #define BLENDER(a, b, f) (uint8)((int)(a) + +// #define BLENDER(a, b, f) (uint8_t)((int)(a) + // ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) -void ScaleFilterCols_NEON(uint8* dst_ptr, - const uint8* src_ptr, +void ScaleFilterCols_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, int dst_width, int x, int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; - const uint8* src_tmp = src_ptr; - int64 dst_width64 = (int64)dst_width; // Work around ios 64 bit warning. - int64 x64 = (int64)x; - int64 dx64 = (int64)dx; + const uint8_t* src_tmp = src_ptr; + int64_t x64 = (int64_t)x; // NOLINT + int64_t dx64 = (int64_t)dx; // NOLINT asm volatile ( "dup v0.4s, %w3 \n" // x "dup v1.4s, %w4 \n" // dx @@ -661,7 +595,6 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, "add v4.8h, v4.8h, v6.8h \n" "xtn v4.8b, v4.8h \n" - MEMACCESS(0) "st1 {v4.8b}, [%0], #8 \n" // store pixels "add v1.4s, v1.4s, v0.4s \n" "add v2.4s, v2.4s, v0.4s \n" @@ -669,7 +602,7 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, "b.gt 1b \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 - "+r"(dst_width64), // %2 + "+r"(dst_width), // %2 "+r"(x64), // %3 "+r"(dx64), // %4 "+r"(tmp), // %5 @@ -683,357 +616,300 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, #undef LOAD2_DATA8_LANE // 16x2 -> 16x1 -void ScaleFilterRows_NEON(uint8* dst_ptr, - const uint8* src_ptr, +void ScaleFilterRows_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { int y_fraction = 256 - source_y_fraction; - asm volatile ( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "add %2, %2, %1 \n" - "cmp %w4, #64 \n" - "b.eq 75f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" - "cmp %w4, #192 \n" - "b.eq 25f \n" - - "dup v5.8b, %w4 \n" - "dup v4.8b, %w5 \n" - // General purpose row blend. - "1: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "umull v6.8h, v0.8b, v4.8b \n" - "umull2 v7.8h, v0.16b, v4.16b \n" - "umlal v6.8h, v1.8b, v5.8b \n" - "umlal2 v7.8h, v1.16b, v5.16b \n" - "rshrn v0.8b, v6.8h, #8 \n" - "rshrn2 v0.16b, v7.8h, #8 \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" - - // Blend 25 / 75. - "25: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 25b \n" - "b 99f \n" - - // Blend 50 / 50. 
- "50: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" - - // Blend 75 / 25. - "75: \n" - MEMACCESS(1) - "ld1 {v1.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v0.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 75b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - "subs %w3, %w3, #16 \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 100b \n" - - "99: \n" - MEMACCESS(0) - "st1 {v0.b}[15], [%0] \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(source_y_fraction),// %4 - "+r"(y_fraction) // %5 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc" - ); + asm volatile( + "cmp %w4, #0 \n" + "b.eq 100f \n" + "add %2, %2, %1 \n" + "cmp %w4, #64 \n" + "b.eq 75f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + "cmp %w4, #192 \n" + "b.eq 25f \n" + + "dup v5.8b, %w4 \n" + "dup v4.8b, %w5 \n" + // General purpose row blend. + "1: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v6.8h, v0.8b, v4.8b \n" + "umull2 v7.8h, v0.16b, v4.16b \n" + "umlal v6.8h, v1.8b, v5.8b \n" + "umlal2 v7.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v6.8h, #8 \n" + "rshrn2 v0.16b, v7.8h, #8 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + "ld1 {v1.16b}, [%1], #16 \n" + "ld1 {v0.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. 
+ "100: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" + + "99: \n" + "st1 {v0.b}[15], [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction), // %4 + "+r"(y_fraction) // %5 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"); } -void ScaleARGBRowDown2_NEON(const uint8* src_ptr, +void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst, + uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( - "1: \n" - // load even pixels into q0, odd into q1 - MEMACCESS (0) - "ld2 {v0.4s, v1.4s}, [%0], #32 \n" - MEMACCESS (0) - "ld2 {v2.4s, v3.4s}, [%0], #32 \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - MEMACCESS (1) - "st1 {v1.16b}, [%1], #16 \n" // store odd pixels - MEMACCESS (1) - "st1 {v3.16b}, [%1], #16 \n" - "b.gt 1b \n" - : "+r" (src_ptr), // %0 - "+r" (dst), // %1 - "+r" (dst_width) // %2 - : - : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + asm volatile( + "1: \n" + // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 + "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "mov v2.16b, v3.16b \n" + "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List ); } -void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, +void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, + uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile ( - "1: \n" - MEMACCESS (0) - // load 8 ARGB pixels. - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. - "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack - "rshrn v1.8b, v1.8h, #1 \n" - "rshrn v2.8b, v2.8h, #1 \n" - "rshrn v3.8b, v3.8h, #1 \n" - MEMACCESS (1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + asm volatile( + "1: \n" + // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 + "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "urhadd v1.16b, v2.16b, v3.16b \n" + "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List ); } -void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, +void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst, + uint8_t* dst, int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - MEMACCESS (0) - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 
- "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. - MEMACCESS (1) - "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels. - "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. - "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. - "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack - "rshrn v1.8b, v1.8h, #2 \n" - "rshrn v2.8b, v2.8h, #2 \n" - "rshrn v3.8b, v3.8h, #2 \n" - MEMACCESS (2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" - "b.gt 1b \n" - : "+r" (src_ptr), // %0 - "+r" (src_stride), // %1 - "+r" (dst), // %2 - "+r" (dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19" - ); + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. + "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 + "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. + "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "rshrn v1.8b, v1.8h, #2 \n" + "rshrn v2.8b, v2.8h, #2 \n" + "rshrn v3.8b, v3.8h, #2 \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); } // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, +void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, + uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.s}[0], [%0], %3 \n" - MEMACCESS(0) - "ld1 {v0.s}[1], [%0], %3 \n" - MEMACCESS(0) - "ld1 {v0.s}[2], [%0], %3 \n" - MEMACCESS(0) - "ld1 {v0.s}[3], [%0], %3 \n" - "subs %w2, %w2, #4 \n" // 4 pixels per loop. - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"((int64)(src_stepx * 4)) // %3 - : "memory", "cc", "v0" - ); + asm volatile( + "1: \n" + "ld1 {v0.s}[0], [%0], %3 \n" + "ld1 {v0.s}[1], [%0], %3 \n" + "ld1 {v0.s}[2], [%0], %3 \n" + "ld1 {v0.s}[3], [%0], %3 \n" + "subs %w2, %w2, #4 \n" // 4 pixels per loop. + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((int64_t)(src_stepx * 4)) // %3 + : "memory", "cc", "v0"); } // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. // TODO(Yang Zhang): Might be worth another optimization pass in future. // It could be upgraded to 8 pixels at a time to start with. 
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, +void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, + uint8_t* dst_argb, int dst_width) { - asm volatile ( - "add %1, %1, %0 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1 - MEMACCESS(1) - "ld1 {v1.8b}, [%1], %4 \n" - MEMACCESS(0) - "ld1 {v2.8b}, [%0], %4 \n" - MEMACCESS(1) - "ld1 {v3.8b}, [%1], %4 \n" - MEMACCESS(0) - "ld1 {v4.8b}, [%0], %4 \n" - MEMACCESS(1) - "ld1 {v5.8b}, [%1], %4 \n" - MEMACCESS(0) - "ld1 {v6.8b}, [%0], %4 \n" - MEMACCESS(1) - "ld1 {v7.8b}, [%1], %4 \n" - "uaddl v0.8h, v0.8b, v1.8b \n" - "uaddl v2.8h, v2.8b, v3.8b \n" - "uaddl v4.8h, v4.8b, v5.8b \n" - "uaddl v6.8h, v6.8b, v7.8b \n" - "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd - "mov v0.d[1], v2.d[0] \n" - "mov v2.d[0], v16.d[1] \n" - "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh - "mov v4.d[1], v6.d[0] \n" - "mov v6.d[0], v16.d[1] \n" - "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) - "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) - "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. - "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. - "subs %w3, %w3, #4 \n" // 4 pixels per loop. - MEMACCESS(2) - "st1 {v0.16b}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width) // %3 - : "r"((int64)(src_stepx * 4)) // %4 - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); + asm volatile( + "add %1, %1, %0 \n" + "1: \n" + "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1 + "ld1 {v1.8b}, [%1], %4 \n" + "ld1 {v2.8b}, [%0], %4 \n" + "ld1 {v3.8b}, [%1], %4 \n" + "ld1 {v4.8b}, [%0], %4 \n" + "ld1 {v5.8b}, [%1], %4 \n" + "ld1 {v6.8b}, [%0], %4 \n" + "ld1 {v7.8b}, [%1], %4 \n" + "uaddl v0.8h, v0.8b, v1.8b \n" + "uaddl v2.8h, v2.8b, v3.8b \n" + "uaddl v4.8h, v4.8b, v5.8b \n" + "uaddl v6.8h, v6.8b, v7.8b \n" + "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd + "mov v0.d[1], v2.d[0] \n" + "mov v2.d[0], v16.d[1] \n" + "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh + "mov v4.d[1], v6.d[0] \n" + "mov v6.d[0], v16.d[1] \n" + "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) + "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) + "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. + "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. + "subs %w3, %w3, #4 \n" // 4 pixels per loop. + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"((int64_t)(src_stepx * 4)) // %4 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } -// clang-format off // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD1_DATA32_LANE(vn, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "ld1 {" #vn ".s}[" #n "], [%6] \n" -// clang-format on - -void ScaleARGBCols_NEON(uint8* dst_argb, - const uint8* src_argb, +#define LOAD1_DATA32_LANE(vn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "ld1 {" #vn ".s}[" #n "], [%6] \n" + +void ScaleARGBCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, int dst_width, int x, int dx) { - const uint8* src_tmp = src_argb; - int64 dst_width64 = (int64)dst_width; // Work around ios 64 bit warning. 
- int64 x64 = (int64)x; - int64 dx64 = (int64)dx; - int64 tmp64; - asm volatile ( - "1: \n" - LOAD1_DATA32_LANE(v0, 0) - LOAD1_DATA32_LANE(v0, 1) - LOAD1_DATA32_LANE(v0, 2) - LOAD1_DATA32_LANE(v0, 3) - LOAD1_DATA32_LANE(v1, 0) - LOAD1_DATA32_LANE(v1, 1) - LOAD1_DATA32_LANE(v1, 2) - LOAD1_DATA32_LANE(v1, 3) - - MEMACCESS(0) - "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop - "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width64), // %2 - "+r"(x64), // %3 - "+r"(dx64), // %4 - "=&r"(tmp64), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "v0", "v1" - ); + const uint8_t* src_tmp = src_argb; + int64_t x64 = (int64_t)x; // NOLINT + int64_t dx64 = (int64_t)dx; // NOLINT + int64_t tmp64; + asm volatile( + "1: \n" + // clang-format off + LOAD1_DATA32_LANE(v0, 0) + LOAD1_DATA32_LANE(v0, 1) + LOAD1_DATA32_LANE(v0, 2) + LOAD1_DATA32_LANE(v0, 3) + LOAD1_DATA32_LANE(v1, 0) + LOAD1_DATA32_LANE(v1, 1) + LOAD1_DATA32_LANE(v1, 2) + LOAD1_DATA32_LANE(v1, 3) + // clang-format on + "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "=&r"(tmp64), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1"); } #undef LOAD1_DATA32_LANE -// clang-format off // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD2_DATA32_LANE(vn1, vn2, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ +#define LOAD2_DATA32_LANE(vn1, vn2, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n" -// clang-format on -void ScaleARGBFilterCols_NEON(uint8* dst_argb, - const uint8* src_argb, +void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, int dst_width, int x, int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; - const uint8* src_tmp = src_argb; - int64 dst_width64 = (int64)dst_width; // Work around ios 64 bit warning. - int64 x64 = (int64)x; - int64 dx64 = (int64)dx; + const uint8_t* src_tmp = src_argb; + int64_t x64 = (int64_t)x; // NOLINT + int64_t dx64 = (int64_t)dx; // NOLINT asm volatile ( "dup v0.4s, %w3 \n" // x "dup v1.4s, %w4 \n" // dx @@ -1070,14 +946,13 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, "shrn v0.8b, v16.8h, #7 \n" "shrn2 v0.16b, v17.8h, #7 \n" - MEMACCESS(0) "st1 {v0.4s}, [%0], #16 \n" // store pixels "add v5.4s, v5.4s, v6.4s \n" "subs %w2, %w2, #4 \n" // 4 processed per loop "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 - "+r"(dst_width64), // %2 + "+r"(dst_width), // %2 "+r"(x64), // %3 "+r"(dx64), // %4 "+r"(tmp), // %5 @@ -1090,6 +965,85 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, #undef LOAD2_DATA32_LANE +// Read 16x2 average down and write 8x1. 
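In scalar form the new 16-bit box filter is a rounded 2x2 mean; a minimal sketch, assuming src_stride is counted in uint16_t elements, as the pointer arithmetic in the NEON version below implies (name illustrative only):

#include <stddef.h>
#include <stdint.h>

static void ScaleRowDown2Box_16_Sketch(const uint16_t* src_ptr,
                                       ptrdiff_t src_stride,  // in shorts
                                       uint16_t* dst,
                                       int dst_width) {
  const uint16_t* s = src_ptr;
  const uint16_t* t = src_ptr + src_stride;  // second source row
  int x;
  for (x = 0; x < dst_width; ++x) {
    // Widen to 32 bits (the NEON code uses uaddlp/uadalp for this), then
    // divide by 4 with rounding (rshrn #2).
    uint32_t sum = (uint32_t)s[0] + s[1] + t[0] + t[1];
    dst[x] = (uint16_t)((sum + 2) >> 2);
    s += 2;
    t += 2;
  }
}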
+void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 + "1: \n" + "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc + "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc + "subs %w3, %w3, #8 \n" // 8 processed per loop + "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent + "uaddlp v1.4s, v1.8h \n" + "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent + "uadalp v1.4s, v3.8h \n" + "rshrn v0.4h, v0.4s, #2 \n" // round and pack + "rshrn2 v0.8h, v1.4s, #2 \n" + "st1 {v0.8h}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// Read 8x2 upsample with filtering and write 16x1. +// Actually reads an extra pixel, so 9x2. +void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + asm volatile( + "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 + "movi v0.8h, #9 \n" // constants + "movi v1.4s, #3 \n" + + "1: \n" + "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8 + "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1 + "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row + "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1 + "subs %w3, %w3, #16 \n" // 16 dst pixels per loop + "umull v16.4s, v3.4h, v0.4h \n" + "umull2 v7.4s, v3.8h, v0.8h \n" + "umull v18.4s, v4.4h, v0.4h \n" + "umull2 v17.4s, v4.8h, v0.8h \n" + "uaddw v16.4s, v16.4s, v6.4h \n" + "uaddl2 v19.4s, v6.8h, v3.8h \n" + "uaddl v3.4s, v6.4h, v3.4h \n" + "uaddw2 v6.4s, v7.4s, v6.8h \n" + "uaddl2 v7.4s, v5.8h, v4.8h \n" + "uaddl v4.4s, v5.4h, v4.4h \n" + "uaddw v18.4s, v18.4s, v5.4h \n" + "mla v16.4s, v4.4s, v1.4s \n" + "mla v18.4s, v3.4s, v1.4s \n" + "mla v6.4s, v7.4s, v1.4s \n" + "uaddw2 v4.4s, v17.4s, v5.8h \n" + "uqrshrn v16.4h, v16.4s, #4 \n" + "mla v4.4s, v19.4s, v1.4s \n" + "uqrshrn2 v16.8h, v6.4s, #4 \n" + "uqrshrn v17.4h, v18.4s, #4 \n" + "uqrshrn2 v17.8h, v4.4s, #4 \n" + "st2 {v16.8h-v17.8h}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : "r"(2LL), // %4 + "r"(14LL) // %5 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19" // Clobber List + ); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/files/source/scale_win.cc b/files/source/scale_win.cc index 0c5b3a1e..c5fc86f3 100644 --- a/files/source/scale_win.cc +++ b/files/source/scale_win.cc @@ -17,80 +17,81 @@ extern "C" { #endif // This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) // Offsets for source bytes 0 to 9 -static uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, - 128, 128, 128, 128, 128, 128, 128, 128}; +static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -static uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, - 128, 128, 128, 128, 128, 128, 128, 128}; +static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
-static uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, - 128, 128, 128, 128, 128, 128, 128, 128}; +static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 0 to 10 -static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; +static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13}; +static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, + 8, 9, 9, 10, 10, 11, 12, 13}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, - 10, 11, 12, 13, 13, 14, 14, 15}; +static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, + 10, 11, 12, 13, 13, 14, 14, 15}; // Coefficients for source bytes 0 to 10 -static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; +static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; // Coefficients for source bytes 10 to 21 -static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; +static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; // Coefficients for source bytes 21 to 31 -static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; +static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; // Coefficients for source bytes 21 to 31 -static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; +static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; -static uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128}; +static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; -static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, - 6, 8, 11, 14, 128, 128, 128, 128}; +static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, + 6, 8, 11, 14, 128, 128, 128, 128}; // Arrange words 0,3,6 into 0,1,2 -static uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128}; +static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; // Arrange words 0,3,6 into 3,4,5 -static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, - 6, 7, 12, 13, 128, 128, 128, 128}; +static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, + 6, 7, 12, 13, 128, 128, 128, 128}; // Scaling values for boxes of 3x3 and 2x3 -static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, - 65536 / 9, 65536 / 6, 0, 0}; +static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, + 65536 / 9, 65536 / 6, 0, 0}; // Arrange first value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, - 11, 128, 14, 128, 128, 128, 128, 128}; +static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, + 11, 128, 14, 128, 128, 128, 128, 128}; // Arrange second value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, - 12, 128, 15, 128, 128, 128, 128, 128}; +static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, + 12, 128, 15, 128, 128, 128, 128, 128}; // Arrange third value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, - 13, 128, 128, 128, 128, 128, 128, 128}; +static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, + 13, 128, 128, 128, 128, 128, 128, 128}; // Scaling values for boxes of 3x2 and 2x2 -static 
uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, - 65536 / 3, 65536 / 2, 0, 0}; +static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, + 65536 / 3, 65536 / 2, 0, 0}; // Reads 32 pixels, throws half away and writes 16 pixels. -__declspec(naked) void ScaleRowDown2_SSSE3(const uint8* src_ptr, +__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr @@ -115,9 +116,9 @@ __declspec(naked) void ScaleRowDown2_SSSE3(const uint8* src_ptr, } // Blends 32x1 rectangle to 16x1. -__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, +__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr @@ -149,9 +150,9 @@ __declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, } // Blends 32x2 rectangle to 16x1. -__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, +__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { __asm { push esi @@ -194,9 +195,9 @@ __declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, #ifdef HAS_SCALEROWDOWN2_AVX2 // Reads 64 pixels, throws half away and writes 32 pixels. -__declspec(naked) void ScaleRowDown2_AVX2(const uint8* src_ptr, +__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr @@ -223,9 +224,9 @@ __declspec(naked) void ScaleRowDown2_AVX2(const uint8* src_ptr, } // Blends 64x1 rectangle to 32x1. -__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, +__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr @@ -261,9 +262,9 @@ __declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, // For rounding, average = (sum + 2) / 4 // becomes average((sum >> 1), 0) // Blends 64x2 rectangle to 32x1. -__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8* src_ptr, +__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { __asm { push esi @@ -308,9 +309,9 @@ __declspec(naked) void ScaleRowDown2Box_AVX2(const uint8* src_ptr, #endif // HAS_SCALEROWDOWN2_AVX2 // Point samples 32 pixels to 8 pixels. -__declspec(naked) void ScaleRowDown4_SSSE3(const uint8* src_ptr, +__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr @@ -340,9 +341,9 @@ __declspec(naked) void ScaleRowDown4_SSSE3(const uint8* src_ptr, } // Blends 32x4 rectangle to 8x1. -__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, +__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { __asm { push esi @@ -399,9 +400,9 @@ __declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, #ifdef HAS_SCALEROWDOWN4_AVX2 // Point samples 64 pixels to 16 pixels. 
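Point sampling means no filtering: one source pixel out of every group of four is kept (the NEON variant earlier in this change stores lane 2 of each ld4 group, i.e. src_ptr[2], src_ptr[6], ...). A minimal scalar sketch with an illustrative name:

#include <stdint.h>

static void ScaleRowDown4_Point_Sketch(const uint8_t* src_ptr,
                                       uint8_t* dst_ptr,
                                       int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 4 + 2];  // keep one pixel per group of four
  }
}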
-__declspec(naked) void ScaleRowDown4_AVX2(const uint8* src_ptr, +__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr @@ -434,9 +435,9 @@ __declspec(naked) void ScaleRowDown4_AVX2(const uint8* src_ptr, } // Blends 64x4 rectangle to 16x1. -__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8* src_ptr, +__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { __asm { push esi @@ -498,9 +499,9 @@ __declspec(naked) void ScaleRowDown4Box_AVX2(const uint8* src_ptr, // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. // Then shuffled to do the scaling. -__declspec(naked) void ScaleRowDown34_SSSE3(const uint8* src_ptr, +__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr @@ -546,9 +547,9 @@ __declspec(naked) void ScaleRowDown34_SSSE3(const uint8* src_ptr, // xmm7 kRound34 // Note that movdqa+palign may be better than movdqu. -__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, +__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { __asm { push esi @@ -603,9 +604,9 @@ __declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, } // Note that movdqa+palign may be better than movdqu. -__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, +__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { __asm { push esi @@ -665,9 +666,9 @@ __declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, // 3/8 point sampler // Scale 32 pixels to 12 -__declspec(naked) void ScaleRowDown38_SSSE3(const uint8* src_ptr, +__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr @@ -697,9 +698,9 @@ __declspec(naked) void ScaleRowDown38_SSSE3(const uint8* src_ptr, } // Scale 16x3 pixels to 6x1 with interpolation -__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, +__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { __asm { push esi @@ -762,9 +763,9 @@ __declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, } // Scale 16x2 pixels to 6x1 with interpolation -__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, +__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, + uint8_t* dst_ptr, int dst_width) { __asm { push esi @@ -807,8 +808,8 @@ __declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, } // Reads 16 bytes and accumulates to 16 shorts at a time. 
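The 3/8 box filters above divide sums of 6 or 9 pixels, which are not powers of two; the kScaleAc33/kScaleAb2 tables here (and kMult38_Div6/kMult38_Div9 in the NEON file, halved because sqrdmulh doubles its product) hold 65536 / n reciprocals so the division becomes a multiply that keeps the high 16 bits. A scalar model of the trick (illustrative helper, not library code):

#include <stdint.h>

// (sum * (65536 / n)) >> 16 is approximately sum / n for the small sums the
// box filters produce (at most 9 * 255), without needing a real divide.
static uint16_t FixedDiv_Sketch(uint32_t sum, uint32_t n) {
  uint32_t recip = 65536u / n;  // e.g. 65536 / 9 == 7281
  return (uint16_t)((sum * recip) >> 16);
}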
-__declspec(naked) void ScaleAddRow_SSE2(const uint8* src_ptr, - uint16* dst_ptr, +__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, int src_width) { __asm { mov eax, [esp + 4] // src_ptr @@ -816,7 +817,7 @@ __declspec(naked) void ScaleAddRow_SSE2(const uint8* src_ptr, mov ecx, [esp + 12] // src_width pxor xmm5, xmm5 - // sum rows + // sum rows xloop: movdqu xmm3, [eax] // read 16 bytes lea eax, [eax + 16] @@ -838,8 +839,8 @@ __declspec(naked) void ScaleAddRow_SSE2(const uint8* src_ptr, #ifdef HAS_SCALEADDROW_AVX2 // Reads 32 bytes and accumulates to 32 shorts at a time. -__declspec(naked) void ScaleAddRow_AVX2(const uint8* src_ptr, - uint16* dst_ptr, +__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, int src_width) { __asm { mov eax, [esp + 4] // src_ptr @@ -847,7 +848,7 @@ __declspec(naked) void ScaleAddRow_AVX2(const uint8* src_ptr, mov ecx, [esp + 12] // src_width vpxor ymm5, ymm5, ymm5 - // sum rows + // sum rows xloop: vmovdqu ymm3, [eax] // read 32 bytes lea eax, [eax + 32] @@ -870,16 +871,16 @@ __declspec(naked) void ScaleAddRow_AVX2(const uint8* src_ptr, // Constant for making pixels signed to avoid pmaddubsw // saturation. -static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; +static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // Constant for making pixels unsigned and adding .5 for rounding. -static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, - 0x4040, 0x4040, 0x4040, 0x4040}; +static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, + 0x4040, 0x4040, 0x4040, 0x4040}; // Bilinear column filtering. SSSE3 version. -__declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr, - const uint8* src_ptr, +__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, int dst_width, int x, int dx) { @@ -939,7 +940,7 @@ __declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr, add ecx, 2 - 1 jl xloop99 - // 1 pixel remainder + // 1 pixel remainder movzx ebx, word ptr [esi + eax] // 2 source x0 pixels movd xmm0, ebx psrlw xmm2, 9 // 7 bit fractions. @@ -964,8 +965,8 @@ __declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr, } // Reads 16 pixels, duplicates them and writes 32 pixels. -__declspec(naked) void ScaleColsUp2_SSE2(uint8* dst_ptr, - const uint8* src_ptr, +__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr, + const uint8_t* src_ptr, int dst_width, int x, int dx) { @@ -991,9 +992,9 @@ __declspec(naked) void ScaleColsUp2_SSE2(uint8* dst_ptr, } // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) -__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8* src_argb, +__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, + uint8_t* dst_argb, int dst_width) { __asm { mov eax, [esp + 4] // src_argb @@ -1016,9 +1017,9 @@ __declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8* src_argb, } // Blends 8x1 rectangle to 4x1. -__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, +__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, + uint8_t* dst_argb, int dst_width) { __asm { mov eax, [esp + 4] // src_argb @@ -1044,9 +1045,9 @@ __declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, } // Blends 8x2 rectangle to 4x1. 
-__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
                                                  ptrdiff_t src_stride,
-                                                 uint8* dst_argb,
+                                                 uint8_t* dst_argb,
                                                  int dst_width) {
   __asm {
     push       esi
@@ -1078,10 +1079,10 @@ __declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
 }

 // Reads 4 pixels at a time.
-__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
+__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
                                                  ptrdiff_t src_stride,
                                                  int src_stepx,
-                                                 uint8* dst_argb,
+                                                 uint8_t* dst_argb,
                                                  int dst_width) {
   __asm {
     push       ebx
@@ -1115,10 +1116,10 @@ __declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
 }

 // Blends four 2x2 to 4x1.
-__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
                                                     ptrdiff_t src_stride,
                                                     int src_stepx,
-                                                    uint8* dst_argb,
+                                                    uint8_t* dst_argb,
                                                     int dst_width) {
   __asm {
     push       ebx
@@ -1163,8 +1164,8 @@ __declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
 }

 // Column scaling unfiltered. SSE2 version.
-__declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
-                                          const uint8* src_argb,
+__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb,
+                                          const uint8_t* src_argb,
                                           int dst_width,
                                           int x,
                                           int dx) {
@@ -1194,7 +1195,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
     sub        ecx, 4
     jl         xloop49

-    // 4 Pixel loop.
+        // 4 Pixel loop.
   xloop4:
     movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
     movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
@@ -1218,7 +1219,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
     test       ecx, 2
     je         xloop29

-    // 2 Pixels.
+        // 2 Pixels.
     movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
     movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
     pextrw     eax, xmm2, 5  // get x2 integer.
@@ -1231,7 +1232,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
     test       ecx, 1
     je         xloop99

-    // 1 Pixels.
+        // 1 Pixels.
     movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
     movd       dword ptr [edi], xmm0
   xloop99:
@@ -1246,18 +1247,18 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,

 // TODO(fbarchard): Port to Neon
 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
-static uvec8 kShuffleColARGB = {
+static const uvec8 kShuffleColARGB = {
     0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,        // bbggrraa 1st pixel
     8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u   // bbggrraa 2nd pixel
 };

 // Shuffle table for duplicating 2 fractions into 8 bytes each
-static uvec8 kShuffleFractions = {
+static const uvec8 kShuffleFractions = {
     0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
 };

-__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
-                                                 const uint8* src_argb,
+__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
+                                                 const uint8_t* src_argb,
                                                  int dst_width,
                                                  int x,
                                                  int dx) {
@@ -1309,7 +1310,7 @@ __declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
     add        ecx, 2 - 1
     jl         xloop99

-    // 1 pixel remainder
+        // 1 pixel remainder
     psrlw      xmm2, 9  // 7 bit fractions.
     movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
     pshufb     xmm2, xmm5  // 00000000
@@ -1329,8 +1330,8 @@ __declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
 }

 // Reads 4 pixels, duplicates them and writes 8 pixels.
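Before the SIMD diff continues, a small C sketch of the pixel duplication described above: every 4-byte ARGB pixel is written twice, i.e. a 2x nearest-neighbour horizontal upscale, with x and dx unused in this fixed-ratio special case. The name is illustrative, not the shipped C path.

#include <stdint.h>
#include <string.h>

// Illustrative 2x column upscale: read one ARGB pixel, write it twice.
static void ScaleARGBColsUp2_sketch(uint8_t* dst_argb,
                                    const uint8_t* src_argb,
                                    int dst_width,
                                    int x,
                                    int dx) {
  int j;
  (void)x;   // unused for the fixed 2x ratio
  (void)dx;  // unused for the fixed 2x ratio
  for (j = 0; j < dst_width - 1; j += 2) {
    memcpy(dst_argb, src_argb, 4);      // original pixel
    memcpy(dst_argb + 4, src_argb, 4);  // duplicated pixel
    src_argb += 4;
    dst_argb += 8;
  }
  if (dst_width & 1) {
    memcpy(dst_argb, src_argb, 4);  // odd trailing pixel
  }
}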
-__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
-                                             const uint8* src_argb,
+__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
+                                             const uint8_t* src_argb,
                                              int dst_width,
                                              int x,
                                              int dx) {
diff --git a/files/source/video_common.cc b/files/source/video_common.cc
index 3e9c6a29..92384c05 100644
--- a/files/source/video_common.cc
+++ b/files/source/video_common.cc
@@ -15,14 +15,13 @@ namespace libyuv {
 extern "C" {
 #endif

-#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0]))
-
 struct FourCCAliasEntry {
-  uint32 alias;
-  uint32 canonical;
+  uint32_t alias;
+  uint32_t canonical;
 };

-static const struct FourCCAliasEntry kFourCCAliases[] = {
+#define NUM_ALIASES 18
+static const struct FourCCAliasEntry kFourCCAliases[NUM_ALIASES] = {
     {FOURCC_IYUV, FOURCC_I420},
     {FOURCC_YU12, FOURCC_I420},
     {FOURCC_YU16, FOURCC_I422},
@@ -46,9 +45,9 @@ static const struct FourCCAliasEntry kFourCCAliases[] = {
 // {FOURCC_BGRA, FOURCC_ARGB},  // kCMPixelFormat_32BGRA

 LIBYUV_API
-uint32 CanonicalFourCC(uint32 fourcc) {
+uint32_t CanonicalFourCC(uint32_t fourcc) {
   int i;
-  for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) {
+  for (i = 0; i < NUM_ALIASES; ++i) {
     if (kFourCCAliases[i].alias == fourcc) {
       return kFourCCAliases[i].canonical;
     }
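The video_common.cc hunk above replaces the local ARRAY_SIZE macro with an explicit NUM_ALIASES bound, so the constant now has to be kept in sync with the initializer list by hand. Caller-side usage is unchanged: a fourcc is normalized before comparison, as in the small sketch below. The wrapper function is hypothetical; CanonicalFourCC and the FOURCC_* constants come from libyuv/video_common.h as shown in the diff.

#include <stdint.h>
#include "libyuv/video_common.h"

// Hypothetical helper: FOURCC_IYUV and FOURCC_YU12 both canonicalize to
// FOURCC_I420, so a single comparison covers the aliases.
static int IsI420Fourcc(uint32_t fourcc) {
  return libyuv::CanonicalFourCC(fourcc) == libyuv::FOURCC_I420;
}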