Diffstat (limited to 'files/source')
50 files changed, 40530 insertions(+), 16723 deletions(-)
diff --git a/files/source/compare.cc b/files/source/compare.cc index 5aa3a4db..d4713b60 100644 --- a/files/source/compare.cc +++ b/files/source/compare.cc @@ -69,13 +69,13 @@ static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) { if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB. return FOURCC_BGRA; } - if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA. + if (argb[3] != 255) { // Fourth byte is not Alpha of 255, so not BGRA. return FOURCC_ARGB; } if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255. return FOURCC_BGRA; } - if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255. + if (argb[7] != 255) { // Second pixel fourth byte is not Alpha of 255. return FOURCC_ARGB; } argb += 8; @@ -154,11 +154,6 @@ uint64_t ComputeHammingDistance(const uint8_t* src_a, HammingDistance = HammingDistance_MSA; } #endif -#if defined(HAS_HAMMINGDISTANCE_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - HammingDistance = HammingDistance_MMI; - } -#endif #ifdef _OPENMP #pragma omp parallel for reduction(+ : diff) @@ -216,11 +211,6 @@ uint64_t ComputeSumSquareError(const uint8_t* src_a, SumSquareError = SumSquareError_MSA; } #endif -#if defined(HAS_SUMSQUAREERROR_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SumSquareError = SumSquareError_MMI; - } -#endif #ifdef _OPENMP #pragma omp parallel for reduction(+ : sse) #endif diff --git a/files/source/compare_common.cc b/files/source/compare_common.cc index d4b170ad..d1cab8d2 100644 --- a/files/source/compare_common.cc +++ b/files/source/compare_common.cc @@ -17,36 +17,6 @@ namespace libyuv { extern "C" { #endif -#if ORIGINAL_OPT -uint32_t HammingDistance_C1(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - - int i; - for (i = 0; i < count; ++i) { - int x = src_a[i] ^ src_b[i]; - if (x & 1) - ++diff; - if (x & 2) - ++diff; - if (x & 4) - ++diff; - if (x & 8) - ++diff; - if (x & 16) - ++diff; - if (x & 32) - ++diff; - if (x & 64) - ++diff; - if (x & 128) - ++diff; - } - return diff; -} -#endif - // Hakmem method for hamming distance. uint32_t HammingDistance_C(const uint8_t* src_a, const uint8_t* src_b, diff --git a/files/source/compare_gcc.cc b/files/source/compare_gcc.cc index 676527c1..b834b42a 100644 --- a/files/source/compare_gcc.cc +++ b/files/source/compare_gcc.cc @@ -19,8 +19,7 @@ extern "C" { #endif // This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) #if defined(__x86_64__) uint32_t HammingDistance_SSE42(const uint8_t* src_a, @@ -29,38 +28,38 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, uint64_t diff = 0u; asm volatile( - "xor %3,%3 \n" - "xor %%r8,%%r8 \n" - "xor %%r9,%%r9 \n" - "xor %%r10,%%r10 \n" + "xor %3,%3 \n" + "xor %%r8,%%r8 \n" + "xor %%r9,%%r9 \n" + "xor %%r10,%%r10 \n" // Process 32 bytes per loop. 
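// The x86 Hamming-distance kernels in this hunk all compute the same
// reduction: XOR the two buffers and count the set bits. The SSE42 path
// uses the popcnt instruction on 64-bit words; the SSSE3/AVX2 paths that
// follow emulate popcount with a 4-bit pshufb lookup table. A minimal
// portable sketch (not libyuv code; assumes GCC/Clang __builtin_popcount):
static uint32_t HammingDistanceRef(const uint8_t* src_a,
                                   const uint8_t* src_b,
                                   int count) {
  uint32_t diff = 0u;
  for (int i = 0; i < count; ++i) {
    diff += __builtin_popcount(src_a[i] ^ src_b[i]);  // bits that differ
  }
  return diff;
}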
LABELALIGN "1: \n" - "mov (%0),%%rcx \n" - "mov 0x8(%0),%%rdx \n" - "xor (%1),%%rcx \n" - "xor 0x8(%1),%%rdx \n" - "popcnt %%rcx,%%rcx \n" - "popcnt %%rdx,%%rdx \n" - "mov 0x10(%0),%%rsi \n" - "mov 0x18(%0),%%rdi \n" - "xor 0x10(%1),%%rsi \n" - "xor 0x18(%1),%%rdi \n" - "popcnt %%rsi,%%rsi \n" - "popcnt %%rdi,%%rdi \n" - "add $0x20,%0 \n" - "add $0x20,%1 \n" - "add %%rcx,%3 \n" - "add %%rdx,%%r8 \n" - "add %%rsi,%%r9 \n" - "add %%rdi,%%r10 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "mov (%0),%%rcx \n" + "mov 0x8(%0),%%rdx \n" + "xor (%1),%%rcx \n" + "xor 0x8(%1),%%rdx \n" + "popcnt %%rcx,%%rcx \n" + "popcnt %%rdx,%%rdx \n" + "mov 0x10(%0),%%rsi \n" + "mov 0x18(%0),%%rdi \n" + "xor 0x10(%1),%%rsi \n" + "xor 0x18(%1),%%rdi \n" + "popcnt %%rsi,%%rsi \n" + "popcnt %%rdi,%%rdi \n" + "add $0x20,%0 \n" + "add $0x20,%1 \n" + "add %%rcx,%3 \n" + "add %%rdx,%%r8 \n" + "add %%rsi,%%r9 \n" + "add %%rdi,%%r10 \n" + "sub $0x20,%2 \n" + "jg 1b \n" - "add %%r8, %3 \n" - "add %%r9, %3 \n" - "add %%r10, %3 \n" + "add %%r8, %3 \n" + "add %%r9, %3 \n" + "add %%r10, %3 \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 "+r"(count), // %2 @@ -80,26 +79,26 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, // Process 16 bytes per loop. LABELALIGN "1: \n" - "mov (%0),%%ecx \n" - "mov 0x4(%0),%%edx \n" - "xor (%1),%%ecx \n" - "xor 0x4(%1),%%edx \n" - "popcnt %%ecx,%%ecx \n" - "add %%ecx,%3 \n" - "popcnt %%edx,%%edx \n" - "add %%edx,%3 \n" - "mov 0x8(%0),%%ecx \n" - "mov 0xc(%0),%%edx \n" - "xor 0x8(%1),%%ecx \n" - "xor 0xc(%1),%%edx \n" - "popcnt %%ecx,%%ecx \n" - "add %%ecx,%3 \n" - "popcnt %%edx,%%edx \n" - "add %%edx,%3 \n" - "add $0x10,%0 \n" - "add $0x10,%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "mov (%0),%%ecx \n" + "mov 0x4(%0),%%edx \n" + "xor (%1),%%ecx \n" + "xor 0x4(%1),%%edx \n" + "popcnt %%ecx,%%ecx \n" + "add %%ecx,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "mov 0x8(%0),%%ecx \n" + "mov 0xc(%0),%%edx \n" + "xor 0x8(%1),%%ecx \n" + "xor 0xc(%1),%%edx \n" + "popcnt %%ecx,%%ecx \n" + "add %%ecx,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "add $0x10,%0 \n" + "add $0x10,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 "+r"(count), // %2 @@ -121,46 +120,46 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a, uint32_t diff = 0u; asm volatile( - "movdqa %4,%%xmm2 \n" - "movdqa %5,%%xmm3 \n" - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "sub %0,%1 \n" + "movdqa %4,%%xmm2 \n" + "movdqa %5,%%xmm3 \n" + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub %0,%1 \n" LABELALIGN "1: \n" - "movdqa (%0),%%xmm4 \n" - "movdqa 0x10(%0), %%xmm5 \n" - "pxor (%0,%1), %%xmm4 \n" - "movdqa %%xmm4,%%xmm6 \n" - "pand %%xmm2,%%xmm6 \n" - "psrlw $0x4,%%xmm4 \n" - "movdqa %%xmm3,%%xmm7 \n" - "pshufb %%xmm6,%%xmm7 \n" - "pand %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "paddb %%xmm7,%%xmm6 \n" - "pxor 0x10(%0,%1),%%xmm5 \n" - "add $0x20,%0 \n" - "movdqa %%xmm5,%%xmm4 \n" - "pand %%xmm2,%%xmm5 \n" - "psrlw $0x4,%%xmm4 \n" - "movdqa %%xmm3,%%xmm7 \n" - "pshufb %%xmm5,%%xmm7 \n" - "pand %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "pshufb %%xmm4,%%xmm5 \n" - "paddb %%xmm7,%%xmm5 \n" - "paddb %%xmm5,%%xmm6 \n" - "psadbw %%xmm1,%%xmm6 \n" - "paddd %%xmm6,%%xmm0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "movdqa (%0),%%xmm4 \n" + "movdqa 0x10(%0), %%xmm5 \n" + "pxor (%0,%1), %%xmm4 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pand %%xmm2,%%xmm6 \n" + "psrlw $0x4,%%xmm4 \n" + "movdqa %%xmm3,%%xmm7 \n" + "pshufb %%xmm6,%%xmm7 \n" + "pand %%xmm2,%%xmm4 \n" 
+ "movdqa %%xmm3,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "paddb %%xmm7,%%xmm6 \n" + "pxor 0x10(%0,%1),%%xmm5 \n" + "add $0x20,%0 \n" + "movdqa %%xmm5,%%xmm4 \n" + "pand %%xmm2,%%xmm5 \n" + "psrlw $0x4,%%xmm4 \n" + "movdqa %%xmm3,%%xmm7 \n" + "pshufb %%xmm5,%%xmm7 \n" + "pand %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufb %%xmm4,%%xmm5 \n" + "paddb %%xmm7,%%xmm5 \n" + "paddb %%xmm5,%%xmm6 \n" + "psadbw %%xmm1,%%xmm6 \n" + "paddd %%xmm6,%%xmm0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" - "pshufd $0xaa,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0, %3 \n" + "pshufd $0xaa,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0, %3 \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 "+r"(count), // %2 @@ -182,40 +181,40 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a, asm volatile( "vbroadcastf128 %4,%%ymm2 \n" "vbroadcastf128 %5,%%ymm3 \n" - "vpxor %%ymm0,%%ymm0,%%ymm0 \n" - "vpxor %%ymm1,%%ymm1,%%ymm1 \n" - "sub %0,%1 \n" + "vpxor %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm1,%%ymm1,%%ymm1 \n" + "sub %0,%1 \n" LABELALIGN "1: \n" - "vmovdqa (%0),%%ymm4 \n" - "vmovdqa 0x20(%0), %%ymm5 \n" - "vpxor (%0,%1), %%ymm4, %%ymm4 \n" - "vpand %%ymm2,%%ymm4,%%ymm6 \n" - "vpsrlw $0x4,%%ymm4,%%ymm4 \n" - "vpshufb %%ymm6,%%ymm3,%%ymm6 \n" - "vpand %%ymm2,%%ymm4,%%ymm4 \n" - "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" - "vpaddb %%ymm4,%%ymm6,%%ymm6 \n" - "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n" - "add $0x40,%0 \n" - "vpand %%ymm2,%%ymm4,%%ymm5 \n" - "vpsrlw $0x4,%%ymm4,%%ymm4 \n" - "vpshufb %%ymm5,%%ymm3,%%ymm5 \n" - "vpand %%ymm2,%%ymm4,%%ymm4 \n" - "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" - "vpaddb %%ymm5,%%ymm4,%%ymm4 \n" - "vpaddb %%ymm6,%%ymm4,%%ymm4 \n" - "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n" - "vpaddd %%ymm0,%%ymm4,%%ymm0 \n" - "sub $0x40,%2 \n" - "jg 1b \n" + "vmovdqa (%0),%%ymm4 \n" + "vmovdqa 0x20(%0), %%ymm5 \n" + "vpxor (%0,%1), %%ymm4, %%ymm4 \n" + "vpand %%ymm2,%%ymm4,%%ymm6 \n" + "vpsrlw $0x4,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm6 \n" + "vpand %%ymm2,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" + "vpaddb %%ymm4,%%ymm6,%%ymm6 \n" + "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n" + "add $0x40,%0 \n" + "vpand %%ymm2,%%ymm4,%%ymm5 \n" + "vpsrlw $0x4,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm5 \n" + "vpand %%ymm2,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" + "vpaddb %%ymm5,%%ymm4,%%ymm4 \n" + "vpaddb %%ymm6,%%ymm4,%%ymm4 \n" + "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n" + "vpaddd %%ymm0,%%ymm4,%%ymm0 \n" + "sub $0x40,%2 \n" + "jg 1b \n" - "vpermq $0xb1,%%ymm0,%%ymm1 \n" - "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xaa,%%ymm0,%%ymm1 \n" - "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" - "vmovd %%xmm0, %3 \n" + "vpermq $0xb1,%%ymm0,%%ymm1 \n" + "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xaa,%%ymm0,%%ymm1 \n" + "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" + "vmovd %%xmm0, %3 \n" "vzeroupper \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 @@ -234,34 +233,34 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a, int count) { uint32_t sse; asm volatile( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm5 \n" + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm2 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psubusb %%xmm2,%%xmm1 \n" - "psubusb %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm2 \n" - "pmaddwd %%xmm1,%%xmm1 \n" - "pmaddwd %%xmm2,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" - "paddd %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu 
(%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psubusb %%xmm2,%%xmm1 \n" + "psubusb %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm2 \n" + "pmaddwd %%xmm1,%%xmm1 \n" + "pmaddwd %%xmm2,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" + "paddd %%xmm2,%%xmm0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" - "pshufd $0xee,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "pshufd $0x1,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0,%3 \n" + "pshufd $0xee,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "pshufd $0x1,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0,%3 \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 @@ -301,44 +300,44 @@ static const uvec32 kHashMul3 = { uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { uint32_t hash; asm volatile( - "movd %2,%%xmm0 \n" - "pxor %%xmm7,%%xmm7 \n" - "movdqa %4,%%xmm6 \n" + "movd %2,%%xmm0 \n" + "pxor %%xmm7,%%xmm7 \n" + "movdqa %4,%%xmm6 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "pmulld %%xmm6,%%xmm0 \n" - "movdqa %5,%%xmm5 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm7,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm7,%%xmm3 \n" - "pmulld %%xmm5,%%xmm3 \n" - "movdqa %6,%%xmm5 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpckhwd %%xmm7,%%xmm4 \n" - "pmulld %%xmm5,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "punpckhbw %%xmm7,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm7,%%xmm2 \n" - "pmulld %%xmm5,%%xmm2 \n" - "movdqa %8,%%xmm5 \n" - "punpckhwd %%xmm7,%%xmm1 \n" - "pmulld %%xmm5,%%xmm1 \n" - "paddd %%xmm4,%%xmm3 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm1 \n" - "pshufd $0xe,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "pshufd $0x1,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "sub $0x10,%1 \n" - "jg 1b \n" - "movd %%xmm0,%3 \n" + "movdqu (%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pmulld %%xmm6,%%xmm0 \n" + "movdqa %5,%%xmm5 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "pmulld %%xmm5,%%xmm3 \n" + "movdqa %6,%%xmm5 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpckhwd %%xmm7,%%xmm4 \n" + "pmulld %%xmm5,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "pmulld %%xmm5,%%xmm2 \n" + "movdqa %8,%%xmm5 \n" + "punpckhwd %%xmm7,%%xmm1 \n" + "pmulld %%xmm5,%%xmm1 \n" + "paddd %%xmm4,%%xmm3 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm1 \n" + "pshufd $0xe,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "pshufd $0x1,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "sub $0x10,%1 \n" + "jg 1b \n" + "movd %%xmm0,%3 \n" : "+r"(src), // %0 "+r"(count), // %1 "+rm"(seed), // %2 diff --git a/files/source/compare_neon.cc b/files/source/compare_neon.cc index 2a2181e0..afdd6012 100644 --- a/files/source/compare_neon.cc +++ b/files/source/compare_neon.cc @@ -29,24 +29,24 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a, uint32_t diff; asm volatile( - "vmov.u16 q4, #0 \n" // accumulator + "vmov.u16 q4, #0 \n" // accumulator "1: \n" - "vld1.8 {q0, q1}, [%0]! \n" - "vld1.8 {q2, q3}, [%1]! \n" - "veor.32 q0, q0, q2 \n" - "veor.32 q1, q1, q3 \n" - "vcnt.i8 q0, q0 \n" - "vcnt.i8 q1, q1 \n" - "subs %2, %2, #32 \n" - "vadd.u8 q0, q0, q1 \n" // 16 byte counts - "vpadal.u8 q4, q0 \n" // 8 shorts - "bgt 1b \n" + "vld1.8 {q0, q1}, [%0]! 
\n" + "vld1.8 {q2, q3}, [%1]! \n" + "veor.32 q0, q0, q2 \n" + "veor.32 q1, q1, q3 \n" + "vcnt.i8 q0, q0 \n" + "vcnt.i8 q1, q1 \n" + "subs %2, %2, #32 \n" + "vadd.u8 q0, q0, q1 \n" // 16 byte counts + "vpadal.u8 q4, q0 \n" // 8 shorts + "bgt 1b \n" - "vpaddl.u16 q0, q4 \n" // 4 ints - "vpadd.u32 d0, d0, d1 \n" - "vpadd.u32 d0, d0, d0 \n" - "vmov.32 %3, d0[0] \n" + "vpaddl.u16 q0, q4 \n" // 4 ints + "vpadd.u32 d0, d0, d1 \n" + "vpadd.u32 d0, d0, d0 \n" + "vmov.32 %3, d0[0] \n" : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) : @@ -59,29 +59,29 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a, int count) { uint32_t sse; asm volatile( - "vmov.u8 q8, #0 \n" - "vmov.u8 q10, #0 \n" - "vmov.u8 q9, #0 \n" - "vmov.u8 q11, #0 \n" + "vmov.u8 q8, #0 \n" + "vmov.u8 q10, #0 \n" + "vmov.u8 q9, #0 \n" + "vmov.u8 q11, #0 \n" "1: \n" - "vld1.8 {q0}, [%0]! \n" - "vld1.8 {q1}, [%1]! \n" - "subs %2, %2, #16 \n" - "vsubl.u8 q2, d0, d2 \n" - "vsubl.u8 q3, d1, d3 \n" - "vmlal.s16 q8, d4, d4 \n" - "vmlal.s16 q9, d6, d6 \n" - "vmlal.s16 q10, d5, d5 \n" - "vmlal.s16 q11, d7, d7 \n" - "bgt 1b \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q1}, [%1]! \n" + "subs %2, %2, #16 \n" + "vsubl.u8 q2, d0, d2 \n" + "vsubl.u8 q3, d1, d3 \n" + "vmlal.s16 q8, d4, d4 \n" + "vmlal.s16 q9, d6, d6 \n" + "vmlal.s16 q10, d5, d5 \n" + "vmlal.s16 q11, d7, d7 \n" + "bgt 1b \n" - "vadd.u32 q8, q8, q9 \n" - "vadd.u32 q10, q10, q11 \n" - "vadd.u32 q11, q8, q10 \n" - "vpaddl.u32 q1, q11 \n" - "vadd.u64 d0, d2, d3 \n" - "vmov.32 %3, d0[0] \n" + "vadd.u32 q8, q8, q9 \n" + "vadd.u32 q10, q10, q11 \n" + "vadd.u32 q11, q8, q10 \n" + "vpaddl.u32 q1, q11 \n" + "vadd.u64 d0, d2, d3 \n" + "vmov.32 %3, d0[0] \n" : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) : : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); diff --git a/files/source/compare_neon64.cc b/files/source/compare_neon64.cc index 6e8f672a..70fb9b91 100644 --- a/files/source/compare_neon64.cc +++ b/files/source/compare_neon64.cc @@ -27,22 +27,24 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a, int count) { uint32_t diff; asm volatile( - "movi v4.8h, #0 \n" + "movi v4.8h, #0 \n" "1: \n" - "ld1 {v0.16b, v1.16b}, [%0], #32 \n" - "ld1 {v2.16b, v3.16b}, [%1], #32 \n" - "eor v0.16b, v0.16b, v2.16b \n" - "eor v1.16b, v1.16b, v3.16b \n" - "cnt v0.16b, v0.16b \n" - "cnt v1.16b, v1.16b \n" - "subs %w2, %w2, #32 \n" - "add v0.16b, v0.16b, v1.16b \n" - "uadalp v4.8h, v0.16b \n" - "b.gt 1b \n" + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" + "eor v0.16b, v0.16b, v2.16b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "eor v1.16b, v1.16b, v3.16b \n" + "cnt v0.16b, v0.16b \n" + "prfm pldl1keep, [%1, 448] \n" + "cnt v1.16b, v1.16b \n" + "subs %w2, %w2, #32 \n" + "add v0.16b, v0.16b, v1.16b \n" + "uadalp v4.8h, v0.16b \n" + "b.gt 1b \n" - "uaddlv s4, v4.8h \n" - "fmov %w3, s4 \n" + "uaddlv s4, v4.8h \n" + "fmov %w3, s4 \n" : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) : : "cc", "v0", "v1", "v2", "v3", "v4"); @@ -54,28 +56,30 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a, int count) { uint32_t sse; asm volatile( - "eor v16.16b, v16.16b, v16.16b \n" - "eor v18.16b, v18.16b, v18.16b \n" - "eor v17.16b, v17.16b, v17.16b \n" - "eor v19.16b, v19.16b, v19.16b \n" + "eor v16.16b, v16.16b, v16.16b \n" + "eor v18.16b, v18.16b, v18.16b \n" + "eor v17.16b, v17.16b, v17.16b \n" + "eor v19.16b, v19.16b, v19.16b \n" "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" - "ld1 {v1.16b}, [%1], #16 \n" - "subs %w2, %w2, #16 \n" - "usubl v2.8h, v0.8b, 
v1.8b \n" - "usubl2 v3.8h, v0.16b, v1.16b \n" - "smlal v16.4s, v2.4h, v2.4h \n" - "smlal v17.4s, v3.4h, v3.4h \n" - "smlal2 v18.4s, v2.8h, v2.8h \n" - "smlal2 v19.4s, v3.8h, v3.8h \n" - "b.gt 1b \n" + "ld1 {v0.16b}, [%0], #16 \n" + "ld1 {v1.16b}, [%1], #16 \n" + "subs %w2, %w2, #16 \n" + "usubl v2.8h, v0.8b, v1.8b \n" + "usubl2 v3.8h, v0.16b, v1.16b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "smlal v16.4s, v2.4h, v2.4h \n" + "smlal v17.4s, v3.4h, v3.4h \n" + "prfm pldl1keep, [%1, 448] \n" + "smlal2 v18.4s, v2.8h, v2.8h \n" + "smlal2 v19.4s, v3.8h, v3.8h \n" + "b.gt 1b \n" - "add v16.4s, v16.4s, v17.4s \n" - "add v18.4s, v18.4s, v19.4s \n" - "add v19.4s, v16.4s, v18.4s \n" - "addv s0, v19.4s \n" - "fmov %w3, s0 \n" + "add v16.4s, v16.4s, v17.4s \n" + "add v18.4s, v18.4s, v19.4s \n" + "add v19.4s, v16.4s, v18.4s \n" + "addv s0, v19.4s \n" + "fmov %w3, s0 \n" : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) : : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); diff --git a/files/source/compare_win.cc b/files/source/compare_win.cc index d57d3d9d..9bb27f1d 100644 --- a/files/source/compare_win.cc +++ b/files/source/compare_win.cc @@ -22,8 +22,9 @@ namespace libyuv { extern "C" { #endif -// This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +// This module is for 32 bit Visual C x86 +#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ + !defined(__clang__) && defined(_M_IX86) uint32_t HammingDistance_SSE42(const uint8_t* src_a, const uint8_t* src_b, @@ -77,8 +78,7 @@ __declspec(naked) uint32_t } } -// Visual C 2012 required for AVX2. -#if _MSC_VER >= 1700 +#ifdef HAS_SUMSQUAREERROR_AVX2 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. #pragma warning(disable : 4752) __declspec(naked) uint32_t @@ -118,7 +118,7 @@ __declspec(naked) uint32_t ret } } -#endif // _MSC_VER >= 1700 +#endif // HAS_SUMSQUAREERROR_AVX2 uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 uvec32 kHashMul0 = { @@ -196,7 +196,7 @@ __declspec(naked) uint32_t } // Visual C 2012 required for AVX2. 
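// The new "prfm pldl1keep" hints above fetch 448 bytes ahead of each source
// pointer -- seven 64-byte cache lines, matching the "prefetch 7 lines
// ahead" comments -- so later iterations load from L1. A rough C equivalent
// (illustrative only; assumes GCC/Clang __builtin_prefetch):
static inline void PrefetchAhead(const uint8_t* p) {
  __builtin_prefetch(p + 448, /*rw=*/0, /*locality=*/3);  // read, keep cached
}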
-#if _MSC_VER >= 1700 +#ifdef HAS_HASHDJB2_AVX2 __declspec(naked) uint32_t HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) { __asm { @@ -231,7 +231,7 @@ __declspec(naked) uint32_t ret } } -#endif // _MSC_VER >= 1700 +#endif // HAS_HASHDJB2_AVX2 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) diff --git a/files/source/convert.cc b/files/source/convert.cc index 614fa482..7178580f 100644 --- a/files/source/convert.cc +++ b/files/source/convert.cc @@ -15,7 +15,9 @@ #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" #include "libyuv/row.h" -#include "libyuv/scale.h" // For ScalePlane() +#include "libyuv/scale.h" // For ScalePlane() +#include "libyuv/scale_row.h" // For FixedDiv +#include "libyuv/scale_uv.h" // For UVScale() #ifdef __cplusplus namespace libyuv { @@ -48,7 +50,7 @@ static int I4xxToI420(const uint8_t* src_y, const int dst_y_height = Abs(src_y_height); const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1); - if (src_uv_width == 0 || src_uv_height == 0) { + if (src_uv_width <= 0 || src_uv_height == 0) { return -1; } if (dst_y) { @@ -82,7 +84,8 @@ int I420Copy(const uint8_t* src_y, int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -124,7 +127,8 @@ int I010Copy(const uint16_t* src_y, int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -148,6 +152,53 @@ int I010Copy(const uint16_t* src_y, return 0; } +static int Planar16bitTo8bit(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int subsample_x, + int subsample_y, + int depth) { + int uv_width = SUBSAMPLE(width, subsample_x, subsample_x); + int uv_height = SUBSAMPLE(height, subsample_y, subsample_y); + int scale = 1 << (24 - depth); + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + uv_height = -uv_height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (uv_height - 1) * src_stride_u; + src_v = src_v + (uv_height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. + Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, + height); + // Convert UV planes. + Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, scale, uv_width, + uv_height); + Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, scale, uv_width, + uv_height); + return 0; +} + // Convert 10 bit YUV to 8 bit. 
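// Planar16bitTo8bit passes scale = 1 << (24 - depth) to Convert16To8Plane,
// whose per-pixel arithmetic (sketched here after libyuv's C reference row;
// shown as an assumption) reduces to a right shift by (depth - 8):
//   depth 10: scale = 16384, (1023 * 16384) >> 16 = 255
//   depth 12: scale =  4096, (4095 *  4096) >> 16 = 255
static uint8_t Convert16To8(uint16_t v, int scale) {
  uint32_t x = ((uint32_t)v * (uint32_t)scale) >> 16;
  return x > 255 ? 255 : (uint8_t)x;  // clamp255
}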
LIBYUV_API int I010ToI420(const uint16_t* src_y, @@ -164,34 +215,344 @@ int I010ToI420(const uint16_t* src_y, int dst_stride_v, int width, int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height, 1, + 1, 10); +} + +LIBYUV_API +int I210ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + const int depth = 10; + const int scale = 1 << (24 - depth); + + if (width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; - halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; src_stride_y = -src_stride_y; src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } - // Convert Y plane. - Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, 16384, width, - height); - // Convert UV planes. - Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, 16384, halfwidth, - halfheight); - Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, 16384, halfwidth, - halfheight); + { + const int uv_width = SUBSAMPLE(width, 1, 1); + const int uv_height = SUBSAMPLE(height, 1, 1); + const int dy = FixedDiv(height, uv_height); + + Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, + height); + ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_u, + dst_stride_u, src_u, dst_u, 0, 32768, dy, + /*bpp=*/1, scale, kFilterBilinear); + ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_v, + dst_stride_v, src_v, dst_v, 0, 32768, dy, + /*bpp=*/1, scale, kFilterBilinear); + } return 0; } +LIBYUV_API +int I210ToI422(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height, 1, + 0, 10); +} + +LIBYUV_API +int I410ToI444(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height, 0, + 0, 10); +} + +LIBYUV_API +int I012ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return 
Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height, 1, + 1, 12); +} + +LIBYUV_API +int I212ToI422(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height, 1, + 0, 12); +} + +LIBYUV_API +int I412ToI444(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height, 0, + 0, 12); +} + +// Any Ix10 To I010 format with mirroring. +static int Ix10ToI010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + int subsample_x, + int subsample_y) { + const int dst_y_width = Abs(width); + const int dst_y_height = Abs(height); + const int src_uv_width = SUBSAMPLE(width, subsample_x, subsample_x); + const int src_uv_height = SUBSAMPLE(height, subsample_y, subsample_y); + const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); + const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1); + if (width <= 0 || height == 0) { + return -1; + } + if (dst_y) { + ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + dst_y_width, dst_y_height, kFilterBilinear); + } + ScalePlane_12(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, + dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); + ScalePlane_12(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, + dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); + return 0; +} + +LIBYUV_API +int I410ToI010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Ix10ToI010(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 0, 0); +} + +LIBYUV_API +int I210ToI010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Ix10ToI010(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 1, 0); +} + +// Any I[420]1[02] to P[420]1[02] format with mirroring. 
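// The P010/P012 targets below store each sample MSB-justified in 16 bits,
// while the I010/I012 sources are LSB-justified, so IxxxToPxxx shifts by
// (16 - depth) and interleaves U and V into one plane. A per-pixel sketch
// (helper name hypothetical, not a libyuv function):
static void MergeUVRowMsb(const uint16_t* u, const uint16_t* v,
                          uint16_t* uv, int width, int depth) {
  int shift = 16 - depth;  // 6 for 10-bit, 4 for 12-bit
  for (int x = 0; x < width; ++x) {
    uv[2 * x + 0] = (uint16_t)(u[x] << shift);
    uv[2 * x + 1] = (uint16_t)(v[x] << shift);
  }
}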
+static int IxxxToPxxx(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height, + int subsample_x, + int subsample_y, + int depth) { + const int uv_width = SUBSAMPLE(width, subsample_x, subsample_x); + const int uv_height = SUBSAMPLE(height, subsample_y, subsample_y); + if (width <= 0 || height == 0) { + return -1; + } + + ConvertToMSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height, + depth); + MergeUVPlane_16(src_u, src_stride_u, src_v, src_stride_v, dst_uv, + dst_stride_uv, uv_width, uv_height, depth); + return 0; +} + +LIBYUV_API +int I010ToP010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, + width, height, 1, 1, 10); +} + +LIBYUV_API +int I210ToP210(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, + width, height, 1, 0, 10); +} + +LIBYUV_API +int I012ToP012(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, + width, height, 1, 1, 12); +} + +LIBYUV_API +int I212ToP212(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, + width, height, 1, 0, 12); +} + // 422 chroma is 1/2 width, 1x height // 420 chroma is 1/2 width, 1/2 height LIBYUV_API @@ -215,6 +576,48 @@ int I422ToI420(const uint8_t* src_y, dst_v, dst_stride_v, width, height, src_uv_width, height); } +LIBYUV_API +int I422ToI210(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. 
+ Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width, + height); + // Convert UV planes. + Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth, + height); + Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth, + height); + return 0; +} + // TODO(fbarchard): Implement row conversion. LIBYUV_API int I422ToNV21(const uint8_t* src_y, @@ -256,6 +659,60 @@ int I422ToNV21(const uint8_t* src_y, return 0; } +LIBYUV_API +int MM21ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (!src_uv || !dst_uv || width <= 0) { + return -1; + } + + int sign = height < 0 ? -1 : 1; + + if (dst_y) { + DetilePlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height, 32); + } + DetilePlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, (width + 1) & ~1, + (height + sign) / 2, 16); + + return 0; +} + +LIBYUV_API +int MM21ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int sign = height < 0 ? -1 : 1; + + if (!src_uv || !dst_u || !dst_v || width <= 0) { + return -1; + } + + if (dst_y) { + DetilePlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height, 32); + } + DetileSplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, (width + 1) & ~1, (height + sign) / 2, 16); + + return 0; +} + #ifdef I422TONV21_ROW_VERSION // Unittest fails for this version. // 422 chroma is 1/2 width, 1x height @@ -328,11 +785,11 @@ int I422ToNV21(const uint8_t* src_y, } } #endif -#if defined(HAS_MERGEUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MergeUVRow = MergeUVRow_Any_MMI; - if (IS_ALIGNED(halfwidth, 8)) { - MergeUVRow = MergeUVRow_MMI; +#if defined(HAS_MERGEUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MergeUVRow = MergeUVRow_Any_LSX; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow = MergeUVRow_LSX; } } #endif @@ -368,11 +825,11 @@ int I422ToNV21(const uint8_t* src_y, } } #endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - InterpolateRow = InterpolateRow_MMI; +#if defined(HAS_INTERPOLATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow = InterpolateRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_LSX; } } #endif @@ -426,9 +883,8 @@ int I444ToI420(const uint8_t* src_y, dst_v, dst_stride_v, width, height, width, height); } -// TODO(fbarchard): Implement row conversion. LIBYUV_API -int I444ToNV21(const uint8_t* src_y, +int I444ToNV12(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, @@ -436,16 +892,16 @@ int I444ToNV21(const uint8_t* src_y, int src_stride_v, uint8_t* dst_y, int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, + uint8_t* dst_uv, + int dst_stride_uv, int width, int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || !dst_uv || width <= 0 || height == 0) { + return -1; + } // Negative height means invert the image. 
if (height < 0) { height = -height; - halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; src_u = src_u + (height - 1) * src_stride_u; src_v = src_v + (height - 1) * src_stride_v; @@ -453,19 +909,32 @@ int I444ToNV21(const uint8_t* src_y, src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } - // Allocate u and v buffers - align_buffer_64(plane_u, halfwidth * halfheight * 2); - uint8_t* plane_v = plane_u + halfwidth * halfheight; - - I444ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, - dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width, - height); - MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu, - halfwidth, halfheight); - free_aligned_buffer_64(plane_u); + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + HalfMergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, + dst_stride_uv, width, height); return 0; } +LIBYUV_API +int I444ToNV21(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + return I444ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu, + width, height); +} + // I400 is greyscale typically used in MJPG LIBYUV_API int I400ToI420(const uint8_t* src_y, @@ -527,70 +996,21 @@ int I400ToNV21(const uint8_t* src_y, return 0; } -static void CopyPlane2(const uint8_t* src, - int src_stride_0, - int src_stride_1, - uint8_t* dst, - int dst_stride, - int width, - int height) { - int y; - void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; -#if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; - } -#endif -#if defined(HAS_COPYROW_AVX) - if (TestCpuFlag(kCpuHasAVX)) { - CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; - } -#endif -#if defined(HAS_COPYROW_ERMS) - if (TestCpuFlag(kCpuHasERMS)) { - CopyRow = CopyRow_ERMS; - } -#endif -#if defined(HAS_COPYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; - } -#endif - - // Copy plane - for (y = 0; y < height - 1; y += 2) { - CopyRow(src, dst, width); - CopyRow(src + src_stride_0, dst + dst_stride, width); - src += src_stride_0 + src_stride_1; - dst += dst_stride * 2; - } - if (height & 1) { - CopyRow(src, dst, width); - } -} - -// Support converting from FOURCC_M420 -// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for -// easy conversion to I420. -// M420 format description: -// M420 is row biplanar 420: 2 rows of Y and 1 row of UV. -// Chroma is half width / half height. (420) -// src_stride_m420 is row planar. Normally this will be the width in pixels. -// The UV plane is half width, but 2 values, so src_stride_m420 applies to -// this as well as the two Y planes. -static int X420ToI420(const uint8_t* src_y, - int src_stride_y0, - int src_stride_y1, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { +// Convert NV12 to I420. +// TODO(fbarchard): Consider inverting destination. Faster on ARM with prfm. 
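// With the M420 path (CopyPlane2 / X420ToI420) removed, NV12ToI420 below is
// a straight Y-plane copy plus a deinterleave of the shared UV plane via
// SplitUVPlane; negative height flips the source by starting at the last
// row and negating the strides. Per row, the split is equivalent to:
static void SplitUVRowRef(const uint8_t* uv, uint8_t* u, uint8_t* v,
                          int width) {
  for (int x = 0; x < width; ++x) {
    u[x] = uv[2 * x + 0];  // NV12 stores U first; NV21 swaps the planes
    v[x] = uv[2 * x + 1];
  }
}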
+LIBYUV_API +int NV12ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -600,21 +1020,16 @@ static int X420ToI420(const uint8_t* src_y, if (height < 0) { height = -height; halfheight = (height + 1) >> 1; - if (dst_y) { - dst_y = dst_y + (height - 1) * dst_stride_y; - } - dst_u = dst_u + (halfheight - 1) * dst_stride_u; - dst_v = dst_v + (halfheight - 1) * dst_stride_v; - dst_stride_y = -dst_stride_y; - dst_stride_u = -dst_stride_u; - dst_stride_v = -dst_stride_v; + src_y = src_y + (height - 1) * src_stride_y; + src_uv = src_uv + (halfheight - 1) * src_stride_uv; + src_stride_y = -src_stride_y; + src_stride_uv = -src_stride_uv; } // Coalesce rows. - if (src_stride_y0 == width && src_stride_y1 == width && - dst_stride_y == width) { + if (src_stride_y == width && dst_stride_y == width) { width *= height; height = 1; - src_stride_y0 = src_stride_y1 = dst_stride_y = 0; + src_stride_y = dst_stride_y = 0; } // Coalesce rows. if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth && @@ -625,12 +1040,7 @@ static int X420ToI420(const uint8_t* src_y, } if (dst_y) { - if (src_stride_y0 == src_stride_y1) { - CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height); - } else { - CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y, - width, height); - } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } // Split UV plane - NV12 / NV21 @@ -640,12 +1050,12 @@ static int X420ToI420(const uint8_t* src_y, return 0; } -// Convert NV12 to I420. +// Convert NV21 to I420. Same as NV12 but u and v pointers swapped. LIBYUV_API -int NV12ToI420(const uint8_t* src_y, +int NV21ToI420(const uint8_t* src_y, int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, + const uint8_t* src_vu, + int src_stride_vu, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, @@ -654,46 +1064,107 @@ int NV12ToI420(const uint8_t* src_y, int dst_stride_v, int width, int height) { - return X420ToI420(src_y, src_stride_y, src_stride_y, src_uv, src_stride_uv, - dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, - dst_stride_v, width, height); + return NV12ToI420(src_y, src_stride_y, src_vu, src_stride_vu, dst_y, + dst_stride_y, dst_v, dst_stride_v, dst_u, dst_stride_u, + width, height); } -// Convert NV21 to I420. Same as NV12 but u and v pointers swapped. LIBYUV_API -int NV21ToI420(const uint8_t* src_y, +int NV12ToNV24(const uint8_t* src_y, int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, + const uint8_t* src_uv, + int src_stride_uv, uint8_t* dst_y, int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, + uint8_t* dst_uv, + int dst_stride_uv, int width, int height) { - return X420ToI420(src_y, src_stride_y, src_stride_y, src_vu, src_stride_vu, - dst_y, dst_stride_y, dst_v, dst_stride_v, dst_u, - dst_stride_u, width, height); + if (width <= 0 || height == 0) { + return -1; + } + + if (dst_y) { + ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + } + UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), + SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width), + Abs(height), kFilterBilinear); + return 0; } -// Convert M420 to I420. 
LIBYUV_API -int M420ToI420(const uint8_t* src_m420, - int src_stride_m420, +int NV16ToNV24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, uint8_t* dst_y, int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, + uint8_t* dst_uv, + int dst_stride_uv, int width, int height) { - return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2, - src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, dst_y, - dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, - width, height); + if (width <= 0 || height == 0) { + return -1; + } + + if (dst_y) { + ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + } + UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv, + dst_stride_uv, Abs(width), Abs(height), kFilterBilinear); + return 0; +} + +LIBYUV_API +int P010ToP410(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (width <= 0 || height == 0) { + return -1; + } + + if (dst_y) { + ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + } + UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), + SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width), + Abs(height), kFilterBilinear); + return 0; +} + +LIBYUV_API +int P210ToP410(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (width <= 0 || height == 0) { + return -1; + } + + if (dst_y) { + ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + } + UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv, + dst_stride_uv, Abs(width), Abs(height), kFilterBilinear); + return 0; } // Convert YUY2 to I420. 
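// The NV12ToNV24/NV16ToNV24 and P010ToP410/P210ToP410 additions above share
// one pattern: copy or scale Y, then bilinearly upsample the interleaved UV
// plane to full resolution with UVScale / UVScale_16. A usage sketch
// (buffer and stride names hypothetical):
//   // 4:2:0 10-bit to 4:4:4 10-bit: UV grows 2x in both directions.
//   int r = P010ToP410(src_y, src_stride_y, src_uv, src_stride_uv,
//                      dst_y, dst_stride_y, dst_uv, dst_stride_uv,
//                      width, height);
//   // r == 0 on success; -1 on bad arguments (width <= 0 or height == 0).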
@@ -750,7 +1221,7 @@ int YUY2ToI420(const uint8_t* src_yuy2, } } #endif -#if defined(HAS_YUY2TOYROW_MSA) +#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { YUY2ToYRow = YUY2ToYRow_Any_MSA; YUY2ToUVRow = YUY2ToUVRow_Any_MSA; @@ -760,15 +1231,13 @@ int YUY2ToI420(const uint8_t* src_yuy2, } } #endif -#if defined(HAS_YUY2TOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - YUY2ToYRow = YUY2ToYRow_Any_MMI; - YUY2ToUVRow = YUY2ToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - YUY2ToYRow = YUY2ToYRow_MMI; - if (IS_ALIGNED(width, 16)) { - YUY2ToUVRow = YUY2ToUVRow_MMI; - } +#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + YUY2ToYRow = YUY2ToYRow_Any_LASX; + YUY2ToUVRow = YUY2ToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_LASX; + YUY2ToUVRow = YUY2ToUVRow_LASX; } } #endif @@ -853,13 +1322,13 @@ int UYVYToI420(const uint8_t* src_uyvy, } } #endif -#if defined(HAS_UYVYTOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - UYVYToYRow = UYVYToYRow_Any_MMI; - UYVYToUVRow = UYVYToUVRow_Any_MMI; - if (IS_ALIGNED(width, 16)) { - UYVYToYRow = UYVYToYRow_MMI; - UYVYToUVRow = UYVYToUVRow_MMI; +#if defined(HAS_UYVYTOYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + UYVYToYRow = UYVYToYRow_Any_LASX; + UYVYToUVRow = UYVYToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_LASX; + UYVYToUVRow = UYVYToUVRow_LASX; } } #endif @@ -1045,30 +1514,10 @@ int ARGBToI420(const uint8_t* src_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_NEON; } } @@ -1081,35 +1530,57 @@ int ARGBToI420(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; + ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToUVRow = ARGBToUVRow_Any_MSA; +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; + ARGBToYRow = ARGBToYRow_AVX2; } } #endif -#if defined(HAS_ARGBTOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if 
(IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; } } #endif -#if defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToUVRow = ARGBToUVRow_Any_MMI; +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + ARGBToUVRow = ARGBToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; + ARGBToYRow = ARGBToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; } } #endif @@ -1170,7 +1641,7 @@ int BGRAToI420(const uint8_t* src_bgra, #if defined(HAS_BGRATOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { BGRAToYRow = BGRAToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { BGRAToYRow = BGRAToYRow_NEON; } } @@ -1183,35 +1654,23 @@ int BGRAToI420(const uint8_t* src_bgra, } } #endif -#if defined(HAS_BGRATOYROW_MSA) +#if defined(HAS_BGRATOYROW_MSA) && defined(HAS_BGRATOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { BGRAToYRow = BGRAToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - BGRAToYRow = BGRAToYRow_MSA; - } - } -#endif -#if defined(HAS_BGRATOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { BGRAToUVRow = BGRAToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { + BGRAToYRow = BGRAToYRow_MSA; BGRAToUVRow = BGRAToUVRow_MSA; } } #endif -#if defined(HAS_BGRATOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - BGRAToYRow = BGRAToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - BGRAToYRow = BGRAToYRow_MMI; - } - } -#endif -#if defined(HAS_BGRATOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - BGRAToUVRow = BGRAToUVRow_Any_MMI; +#if defined(HAS_BGRATOYROW_LSX) && defined(HAS_BGRATOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + BGRAToYRow = BGRAToYRow_Any_LSX; + BGRAToUVRow = BGRAToUVRow_Any_LSX; if (IS_ALIGNED(width, 16)) { - BGRAToUVRow = BGRAToUVRow_MMI; + BGRAToYRow = BGRAToYRow_LSX; + BGRAToUVRow = BGRAToUVRow_LSX; } } #endif @@ -1259,20 +1718,42 @@ int ABGRToI420(const uint8_t* src_abgr, src_abgr = src_abgr + (height - 1) * src_stride_abgr; src_stride_abgr = -src_stride_abgr; } -#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3) +#if defined(HAS_ABGRTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToUVRow = ABGRToUVRow_Any_SSSE3; ABGRToYRow = ABGRToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_SSSE3; ABGRToYRow = ABGRToYRow_SSSE3; } } #endif +#if defined(HAS_ABGRTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVRow = ABGRToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToYRow = ABGRToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToYRow = ABGRToYRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToUVRow = ABGRToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToUVRow = ABGRToUVRow_AVX2; + } + } +#endif #if defined(HAS_ABGRTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ABGRToYRow = ABGRToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ABGRToYRow = ABGRToYRow_NEON; } } @@ -1285,35 +1766,23 @@ int ABGRToI420(const uint8_t* src_abgr, } } #endif -#if defined(HAS_ABGRTOYROW_MSA) +#if defined(HAS_ABGRTOYROW_MSA) && 
defined(HAS_ABGRTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ABGRToYRow = ABGRToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ABGRToYRow = ABGRToYRow_MSA; - } - } -#endif -#if defined(HAS_ABGRTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { ABGRToUVRow = ABGRToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_MSA; ABGRToUVRow = ABGRToUVRow_MSA; } } #endif -#if defined(HAS_ABGRTOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ABGRToYRow = ABGRToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ABGRToYRow = ABGRToYRow_MMI; - } - } -#endif -#if defined(HAS_ABGRTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ABGRToUVRow = ABGRToUVRow_Any_MMI; +#if defined(HAS_ABGRTOYROW_LSX) && defined(HAS_ABGRTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ABGRToYRow = ABGRToYRow_Any_LSX; + ABGRToUVRow = ABGRToUVRow_Any_LSX; if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_MMI; + ABGRToYRow = ABGRToYRow_LSX; + ABGRToUVRow = ABGRToUVRow_LSX; } } #endif @@ -1361,20 +1830,26 @@ int RGBAToI420(const uint8_t* src_rgba, src_rgba = src_rgba + (height - 1) * src_stride_rgba; src_stride_rgba = -src_stride_rgba; } -#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3) +#if defined(HAS_RGBATOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - RGBAToUVRow = RGBAToUVRow_Any_SSSE3; RGBAToYRow = RGBAToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGBAToUVRow = RGBAToUVRow_SSSE3; RGBAToYRow = RGBAToYRow_SSSE3; } } #endif +#if defined(HAS_RGBATOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGBAToUVRow = RGBAToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_SSSE3; + } + } +#endif #if defined(HAS_RGBATOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGBAToYRow = RGBAToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { RGBAToYRow = RGBAToYRow_NEON; } } @@ -1387,35 +1862,23 @@ int RGBAToI420(const uint8_t* src_rgba, } } #endif -#if defined(HAS_RGBATOYROW_MSA) +#if defined(HAS_RGBATOYROW_MSA) && defined(HAS_RGBATOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RGBAToYRow = RGBAToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGBAToYRow = RGBAToYRow_MSA; - } - } -#endif -#if defined(HAS_RGBATOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { RGBAToUVRow = RGBAToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { + RGBAToYRow = RGBAToYRow_MSA; RGBAToUVRow = RGBAToUVRow_MSA; } } #endif -#if defined(HAS_RGBATOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGBAToYRow = RGBAToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RGBAToYRow = RGBAToYRow_MMI; - } - } -#endif -#if defined(HAS_RGBATOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGBAToUVRow = RGBAToUVRow_Any_MMI; +#if defined(HAS_RGBATOYROW_LSX) && defined(HAS_RGBATOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGBAToYRow = RGBAToYRow_Any_LSX; + RGBAToUVRow = RGBAToUVRow_Any_LSX; if (IS_ALIGNED(width, 16)) { - RGBAToUVRow = RGBAToUVRow_MMI; + RGBAToYRow = RGBAToYRow_LSX; + RGBAToUVRow = RGBAToUVRow_LSX; } } #endif @@ -1436,6 +1899,12 @@ int RGBAToI420(const uint8_t* src_rgba, return 0; } +// Enabled if 1 pass is available +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ + defined(HAS_RGB24TOYROW_LSX)) +#define HAS_RGB24TOYROW +#endif + // Convert RGB24 to I420. 
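// The dispatch rewrites above all follow one idiom: start at the C
// fallback, promote to an Any_ variant (which handles the remainder pixels)
// when the CPU flag is set, then to the full SIMD kernel once the width
// meets its alignment. A schematic sketch with hypothetical names:
//   void (*RowFn)(const uint8_t* src, uint8_t* dst, int width) = RowFn_C;
//   #if defined(HAS_ROWFN_NEON)
//   if (TestCpuFlag(kCpuHasNEON)) {
//     RowFn = RowFn_Any_NEON;     // any width, scalar tail
//     if (IS_ALIGNED(width, 16)) {
//       RowFn = RowFn_NEON;       // full vector width only
//     }
//   }
//   #endif
// Splitting Y and UV into separate #if blocks lets a platform that
// implements only one of the two kernels still take the SIMD path for it.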
LIBYUV_API int RGB24ToI420(const uint8_t* src_rgb24, @@ -1449,8 +1918,7 @@ int RGB24ToI420(const uint8_t* src_rgb24, int width, int height) { int y; -#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ - defined(HAS_RGB24TOYROW_MMI)) +#if defined(HAS_RGB24TOYROW) void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, uint8_t* dst_v, int width) = RGB24ToUVRow_C; @@ -1475,19 +1943,20 @@ int RGB24ToI420(const uint8_t* src_rgb24, src_stride_rgb24 = -src_stride_rgb24; } +#if defined(HAS_RGB24TOYROW) + // Neon version does direct RGB24 to YUV. -#if defined(HAS_RGB24TOYROW_NEON) +#if defined(HAS_RGB24TOYROW_NEON) && defined(HAS_RGB24TOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGB24ToUVRow = RGB24ToUVRow_Any_NEON; RGB24ToYRow = RGB24ToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { RGB24ToYRow = RGB24ToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - RGB24ToUVRow = RGB24ToUVRow_NEON; - } + RGB24ToUVRow = RGB24ToUVRow_NEON; } } -#elif defined(HAS_RGB24TOYROW_MSA) +#endif +#if defined(HAS_RGB24TOYROW_MSA) && defined(HAS_RGB24TOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RGB24ToUVRow = RGB24ToUVRow_Any_MSA; RGB24ToYRow = RGB24ToYRow_Any_MSA; @@ -1496,19 +1965,31 @@ int RGB24ToI420(const uint8_t* src_rgb24, RGB24ToUVRow = RGB24ToUVRow_MSA; } } -#elif defined(HAS_RGB24TOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGB24ToUVRow = RGB24ToUVRow_Any_MMI; - RGB24ToYRow = RGB24ToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RGB24ToYRow = RGB24ToYRow_MMI; - if (IS_ALIGNED(width, 16)) { - RGB24ToUVRow = RGB24ToUVRow_MMI; - } +#endif +#if defined(HAS_RGB24TOYROW_LSX) && defined(HAS_RGB24TOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGB24ToUVRow = RGB24ToUVRow_Any_LSX; + RGB24ToYRow = RGB24ToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGB24ToYRow = RGB24ToYRow_LSX; + RGB24ToUVRow = RGB24ToUVRow_LSX; + } + } +#endif +#if defined(HAS_RGB24TOYROW_LASX) && defined(HAS_RGB24TOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGB24ToUVRow = RGB24ToUVRow_Any_LASX; + RGB24ToYRow = RGB24ToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGB24ToYRow = RGB24ToYRow_LASX; + RGB24ToUVRow = RGB24ToUVRow_LASX; } } +#endif + // Other platforms do intermediate conversion from RGB24 to ARGB. 
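// On platforms without a one-pass RGB24 row kernel, the #else branch below
// expands two RGB24 rows at a time into a 32-byte-aligned ARGB scratch
// buffer and reuses the ARGB kernels. Condensed from the loop further down
// (two rows share one UV pass, matching 4:2:0 subsampling):
//   RGB24ToARGBRow(src_rgb24, row, width);
//   RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
//   ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
//   ARGBToYRow(row, dst_y, width);
//   ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);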
-#else +#else // HAS_RGB24TOYROW + #if defined(HAS_RGB24TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; @@ -1517,39 +1998,49 @@ int RGB24ToI420(const uint8_t* src_rgb24, } } #endif -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) +#if defined(HAS_ARGBTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; ARGBToYRow = ARGBToYRow_AVX2; } } #endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } #endif +#endif // HAS_RGB24TOYROW { -#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ - defined(HAS_RGB24TOYROW_MMI)) +#if !defined(HAS_RGB24TOYROW) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ - defined(HAS_RGB24TOYROW_MMI)) +#if defined(HAS_RGB24TOYROW) RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); RGB24ToYRow(src_rgb24, dst_y, width); RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); @@ -1566,8 +2057,7 @@ int RGB24ToI420(const uint8_t* src_rgb24, dst_v += dst_stride_v; } if (height & 1) { -#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ - defined(HAS_RGB24TOYROW_MMI)) +#if defined(HAS_RGB24TOYROW) RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); RGB24ToYRow(src_rgb24, dst_y, width); #else @@ -1576,15 +2066,19 @@ int RGB24ToI420(const uint8_t* src_rgb24, ARGBToYRow(row, dst_y, width); #endif } -#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ - defined(HAS_RGB24TOYROW_MMI)) +#if !defined(HAS_RGB24TOYROW) free_aligned_buffer_64(row); #endif } return 0; } +#undef HAS_RGB24TOYROW + +// Enabled if 1 pass is available +#if defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) +#define HAS_RGB24TOYJROW +#endif -// TODO(fbarchard): Use Matrix version to implement I420 and J420. // Convert RGB24 to J420. LIBYUV_API int RGB24ToJ420(const uint8_t* src_rgb24, @@ -1598,8 +2092,7 @@ int RGB24ToJ420(const uint8_t* src_rgb24, int width, int height) { int y; -#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ - defined(HAS_RGB24TOYJROW_MMI)) +#if defined(HAS_RGB24TOYJROW) void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, uint8_t* dst_v, int width) = RGB24ToUVJRow_C; @@ -1624,19 +2117,20 @@ int RGB24ToJ420(const uint8_t* src_rgb24, src_stride_rgb24 = -src_stride_rgb24; } +#if defined(HAS_RGB24TOYJROW) + // Neon version does direct RGB24 to YUV. 
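The J-suffixed kernels selected next produce full-range (JPEG) YUV: luma spans 0..255 with no +16 offset, unlike the I420 path above, which maps into the studio range 16..235. An illustrative fixed-point form using the textbook BT.601 weights (the exact constants live in the row kernels and may differ slightly):

  #include <stdint.h>

  // Full-range (JPEG) luma: Y = 0.299 R + 0.587 G + 0.114 B in 8.8 fixed point.
  static uint8_t RGBToYJ_Sketch(uint8_t r, uint8_t g, uint8_t b) {
    return (uint8_t)((77 * r + 150 * g + 29 * b + 128) >> 8);  // 77+150+29 == 256
  }

  // Studio-range (I420) luma maps the same mix into [16, 235]:
  //   Y = ((66 R + 129 G + 25 B + 128) >> 8) + 16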
-#if defined(HAS_RGB24TOYJROW_NEON) +#if defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON; RGB24ToYJRow = RGB24ToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { RGB24ToYJRow = RGB24ToYJRow_NEON; - if (IS_ALIGNED(width, 16)) { - RGB24ToUVJRow = RGB24ToUVJRow_NEON; - } + RGB24ToUVJRow = RGB24ToUVJRow_NEON; } } -#elif defined(HAS_RGB24TOYJROW_MSA) +#endif +#if defined(HAS_RGB24TOYJROW_MSA) && defined(HAS_RGB24TOUVJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RGB24ToUVJRow = RGB24ToUVJRow_Any_MSA; RGB24ToYJRow = RGB24ToYJRow_Any_MSA; @@ -1645,19 +2139,11 @@ int RGB24ToJ420(const uint8_t* src_rgb24, RGB24ToUVJRow = RGB24ToUVJRow_MSA; } } -#elif defined(HAS_RGB24TOYJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGB24ToUVJRow = RGB24ToUVJRow_Any_MMI; - RGB24ToYJRow = RGB24ToYJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RGB24ToYJRow = RGB24ToYJRow_MMI; - if (IS_ALIGNED(width, 16)) { - RGB24ToUVJRow = RGB24ToUVJRow_MMI; - } - } - } +#endif + // Other platforms do intermediate conversion from RGB24 to ARGB. -#else +#else // HAS_RGB24TOYJROW + #if defined(HAS_RGB24TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; @@ -1666,39 +2152,49 @@ int RGB24ToJ420(const uint8_t* src_rgb24, } } #endif -#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) +#if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2) +#if defined(HAS_ARGBTOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; ARGBToYJRow = ARGBToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - ARGBToUVJRow = ARGBToUVJRow_AVX2; ARGBToYJRow = ARGBToYJRow_AVX2; } } #endif +#if defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; + } + } #endif +#endif // HAS_RGB24TOYJROW { -#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ - defined(HAS_RGB24TOYJROW_MMI)) +#if !defined(HAS_RGB24TOYJROW) // Allocate 2 rows of ARGB. 
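Two scratch rows, not one, because 4:2:0 chroma is averaged over 2x2 pixel blocks: each UV row call consumes a pair of ARGB rows (hence its stride argument) and emits one half-width U row and V row. A portable sketch of that sampling, with hypothetical per-sample converters (the real coefficient sets differ between the I and J paths):

  #include <stdint.h>

  uint8_t RGBToU(int r, int g, int b);  // assumed helpers, not libyuv's names
  uint8_t RGBToV(int r, int g, int b);

  // Average each 2x2 ARGB block (two rows, two columns) into one U and one V
  // sample. Assumes width is even; ARGB is B,G,R,A in memory.
  static void ToUVRow_Sketch(const uint8_t* argb, int stride,
                             uint8_t* dst_u, uint8_t* dst_v, int width) {
    for (int x = 0; x < width; x += 2) {
      const uint8_t* p = argb + x * 4;
      int b = (p[0] + p[4] + p[stride + 0] + p[stride + 4] + 2) >> 2;
      int g = (p[1] + p[5] + p[stride + 1] + p[stride + 5] + 2) >> 2;
      int r = (p[2] + p[6] + p[stride + 2] + p[stride + 6] + 2) >> 2;
      *dst_u++ = RGBToU(r, g, b);
      *dst_v++ = RGBToV(r, g, b);
    }
  }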
const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ - defined(HAS_RGB24TOYJROW_MMI)) +#if defined(HAS_RGB24TOYJROW) RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); RGB24ToYJRow(src_rgb24, dst_y, width); RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); @@ -1715,8 +2211,7 @@ int RGB24ToJ420(const uint8_t* src_rgb24, dst_v += dst_stride_v; } if (height & 1) { -#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ - defined(HAS_RGB24TOYJROW_MMI)) +#if defined(HAS_RGB24TOYJROW) RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width); RGB24ToYJRow(src_rgb24, dst_y, width); #else @@ -1725,13 +2220,19 @@ int RGB24ToJ420(const uint8_t* src_rgb24, ARGBToYJRow(row, dst_y, width); #endif } -#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ - defined(HAS_RGB24TOYJROW_MMI)) +#if !defined(HAS_RGB24TOYJROW) free_aligned_buffer_64(row); #endif } return 0; } +#undef HAS_RGB24TOYJROW + +// Enabled if 1 pass is available +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ + defined(HAS_RAWTOYROW_LSX)) +#define HAS_RAWTOYROW +#endif // Convert RAW to I420. LIBYUV_API @@ -1746,8 +2247,7 @@ int RAWToI420(const uint8_t* src_raw, int width, int height) { int y; -#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ - defined(HAS_RAWTOYROW_MMI)) +#if defined(HAS_RAWTOYROW) void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, uint8_t* dst_v, int width) = RAWToUVRow_C; void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = @@ -1771,19 +2271,20 @@ int RAWToI420(const uint8_t* src_raw, src_stride_raw = -src_stride_raw; } +#if defined(HAS_RAWTOYROW) + // Neon version does direct RAW to YUV. -#if defined(HAS_RAWTOYROW_NEON) +#if defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RAWToUVRow = RAWToUVRow_Any_NEON; RAWToYRow = RAWToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { RAWToYRow = RAWToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - RAWToUVRow = RAWToUVRow_NEON; - } + RAWToUVRow = RAWToUVRow_NEON; } } -#elif defined(HAS_RAWTOYROW_MSA) +#endif +#if defined(HAS_RAWTOYROW_MSA) && defined(HAS_RAWTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RAWToUVRow = RAWToUVRow_Any_MSA; RAWToYRow = RAWToYRow_Any_MSA; @@ -1792,19 +2293,31 @@ int RAWToI420(const uint8_t* src_raw, RAWToUVRow = RAWToUVRow_MSA; } } -#elif defined(HAS_RAWTOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RAWToUVRow = RAWToUVRow_Any_MMI; - RAWToYRow = RAWToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RAWToYRow = RAWToYRow_MMI; - if (IS_ALIGNED(width, 16)) { - RAWToUVRow = RAWToUVRow_MMI; - } +#endif +#if defined(HAS_RAWTOYROW_LSX) && defined(HAS_RAWTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RAWToUVRow = RAWToUVRow_Any_LSX; + RAWToYRow = RAWToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RAWToYRow = RAWToYRow_LSX; + RAWToUVRow = RAWToUVRow_LSX; + } + } +#endif +#if defined(HAS_RAWTOYROW_LASX) && defined(HAS_RAWTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RAWToUVRow = RAWToUVRow_Any_LASX; + RAWToYRow = RAWToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RAWToYRow = RAWToYRow_LASX; + RAWToUVRow = RAWToUVRow_LASX; } } +#endif + // Other platforms do intermediate conversion from RAW to ARGB. 
-#else +#else // HAS_RAWTOYROW + #if defined(HAS_RAWTOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RAWToARGBRow = RAWToARGBRow_Any_SSSE3; @@ -1813,39 +2326,49 @@ int RAWToI420(const uint8_t* src_raw, } } #endif -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) +#if defined(HAS_ARGBTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; ARGBToYRow = ARGBToYRow_AVX2; } } #endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } #endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif +#endif // HAS_RAWTOYROW { -#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ - defined(HAS_RAWTOYROW_MMI)) +#if !defined(HAS_RAWTOYROW) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ - defined(HAS_RAWTOYROW_MMI)) +#if defined(HAS_RAWTOYROW) RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); RAWToYRow(src_raw, dst_y, width); RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); @@ -1862,8 +2385,7 @@ int RAWToI420(const uint8_t* src_raw, dst_v += dst_stride_v; } if (height & 1) { -#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ - defined(HAS_RAWTOYROW_MMI)) +#if defined(HAS_RAWTOYROW) RAWToUVRow(src_raw, 0, dst_u, dst_v, width); RAWToYRow(src_raw, dst_y, width); #else @@ -1872,13 +2394,167 @@ int RAWToI420(const uint8_t* src_raw, ARGBToYRow(row, dst_y, width); #endif } -#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ - defined(HAS_RAWTOYROW_MMI)) +#if !defined(HAS_RAWTOYROW) free_aligned_buffer_64(row); #endif } return 0; } +#undef HAS_RAWTOYROW + +// Enabled if 1 pass is available +#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) +#define HAS_RAWTOYJROW +#endif + +// Convert RAW to J420. +LIBYUV_API +int RAWToJ420(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; +#if defined(HAS_RAWTOYJROW) + void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RAWToUVJRow_C; + void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = + RAWToYJRow_C; +#else + void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = + RAWToARGBRow_C; + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYJRow_C; +#endif + if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
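As the check below shows, a negative height is the API-level vertical flip: the source pointer is moved to the last row and the stride negated, so the normal top-to-bottom loop walks the image bottom-up at no extra cost. A caller-side sketch (buffer names are hypothetical):

  // Flip vertically while converting by passing -height.
  // raw_frame is assumed to hold width x height RAW pixels (3 bytes each).
  RAWToJ420(raw_frame, width * 3,
            y_plane, width,
            u_plane, (width + 1) / 2,
            v_plane, (width + 1) / 2,
            width, -height);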
+ if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + +#if defined(HAS_RAWTOYJROW) + +// Neon version does direct RAW to YUV. +#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToUVJRow = RAWToUVJRow_Any_NEON; + RAWToYJRow = RAWToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_NEON; + RAWToUVJRow = RAWToUVJRow_NEON; + } + } +#endif +#if defined(HAS_RAWTOYJROW_MSA) && defined(HAS_RAWTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToUVJRow = RAWToUVJRow_Any_MSA; + RAWToYJRow = RAWToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_MSA; + RAWToUVJRow = RAWToUVJRow_MSA; + } + } +#endif + +// Other platforms do intermediate conversion from RAW to ARGB. +#else // HAS_RAWTOYJROW + +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; + } + } +#endif +#endif // HAS_RAWTOYJROW + + { +#if !defined(HAS_RAWTOYJROW) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RAWTOYJROW) + RAWToUVJRow(src_raw, src_stride_raw, dst_u, dst_v, width); + RAWToYJRow(src_raw, dst_y, width); + RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); +#else + RAWToARGBRow(src_raw, row, width); + RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); + ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYJRow(row, dst_y, width); + ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_raw += src_stride_raw * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_RAWTOYJROW) + RAWToUVJRow(src_raw, 0, dst_u, dst_v, width); + RAWToYJRow(src_raw, dst_y, width); +#else + RAWToARGBRow(src_raw, row, width); + ARGBToUVJRow(row, 0, dst_u, dst_v, width); + ARGBToYJRow(row, dst_y, width); +#endif + } +#if !defined(HAS_RAWTOYJROW) + free_aligned_buffer_64(row); +#endif + } + return 0; +} +#undef HAS_RAWTOYJROW // Convert RGB565 to I420. 
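RGB565ToI420 below fits the same pattern, with one extra wrinkle: the 5- and 6-bit channels must first be widened to 8 bits. The standard widening replicates the top bits into the low bits so that 0 maps to 0 and full scale maps to 255 exactly; a per-pixel sketch, simplified from the row kernels:

  #include <stdint.h>

  // Widen one little-endian RGB565 pixel to 8-bit B, G, R.
  static void RGB565ToBGR_Sketch(uint16_t rgb565,
                                 uint8_t* b, uint8_t* g, uint8_t* r) {
    uint8_t b5 = rgb565 & 0x1f;
    uint8_t g6 = (rgb565 >> 5) & 0x3f;
    uint8_t r5 = (rgb565 >> 11) & 0x1f;
    *b = (uint8_t)((b5 << 3) | (b5 >> 2));  // 31 -> 255
    *g = (uint8_t)((g6 << 2) | (g6 >> 4));  // 63 -> 255
    *r = (uint8_t)((r5 << 3) | (r5 >> 2));
  }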
LIBYUV_API @@ -1894,7 +2570,7 @@ int RGB565ToI420(const uint8_t* src_rgb565, int height) { int y; #if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ - defined(HAS_RGB565TOYROW_MMI)) + defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX)) void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width) = RGB565ToUVRow_C; @@ -1931,7 +2607,10 @@ int RGB565ToI420(const uint8_t* src_rgb565, } } } -#elif defined(HAS_RGB565TOYROW_MSA) +// MSA version does direct RGB565 to YUV. +#elif (defined(HAS_RGB565TOYROW_MSA) || defined(HAS_RGB565TOYROW_LSX) || \ + defined(HAS_RGB565TOYROW_LASX)) +#if defined(HAS_RGB565TOYROW_MSA) && defined(HAS_RGB565TOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RGB565ToUVRow = RGB565ToUVRow_Any_MSA; RGB565ToYRow = RGB565ToYRow_Any_MSA; @@ -1940,17 +2619,27 @@ int RGB565ToI420(const uint8_t* src_rgb565, RGB565ToUVRow = RGB565ToUVRow_MSA; } } -#elif defined(HAS_RGB565TOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGB565ToUVRow = RGB565ToUVRow_Any_MMI; - RGB565ToYRow = RGB565ToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RGB565ToYRow = RGB565ToYRow_MMI; - if (IS_ALIGNED(width, 16)) { - RGB565ToUVRow = RGB565ToUVRow_MMI; - } +#endif +#if defined(HAS_RGB565TOYROW_LSX) && defined(HAS_RGB565TOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGB565ToUVRow = RGB565ToUVRow_Any_LSX; + RGB565ToYRow = RGB565ToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGB565ToYRow = RGB565ToYRow_LSX; + RGB565ToUVRow = RGB565ToUVRow_LSX; + } + } +#endif +#if defined(HAS_RGB565TOYROW_LASX) && defined(HAS_RGB565TOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGB565ToUVRow = RGB565ToUVRow_Any_LASX; + RGB565ToYRow = RGB565ToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGB565ToYRow = RGB565ToYRow_LASX; + RGB565ToUVRow = RGB565ToUVRow_LASX; } } +#endif // Other platforms do intermediate conversion from RGB565 to ARGB. #else #if defined(HAS_RGB565TOARGBROW_SSE2) @@ -1969,37 +2658,49 @@ int RGB565ToI420(const uint8_t* src_rgb565, } } #endif -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; ARGBToYRow = ARGBToYRow_AVX2; } } #endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif #endif { #if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ - defined(HAS_RGB565TOYROW_MMI)) + defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX)) // Allocate 2 rows of ARGB. 
const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { #if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ - defined(HAS_RGB565TOYROW_MMI)) + defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX)) RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); RGB565ToYRow(src_rgb565, dst_y, width); RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); @@ -2017,7 +2718,7 @@ int RGB565ToI420(const uint8_t* src_rgb565, } if (height & 1) { #if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ - defined(HAS_RGB565TOYROW_MMI)) + defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX)) RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); RGB565ToYRow(src_rgb565, dst_y, width); #else @@ -2027,7 +2728,7 @@ int RGB565ToI420(const uint8_t* src_rgb565, #endif } #if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ - defined(HAS_RGB565TOYROW_MMI)) + defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX)) free_aligned_buffer_64(row); #endif } @@ -2048,7 +2749,7 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, int height) { int y; #if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ - defined(HAS_ARGB1555TOYROW_MMI)) + defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX)) void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width) = ARGB1555ToUVRow_C; @@ -2086,7 +2787,10 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, } } } -#elif defined(HAS_ARGB1555TOYROW_MSA) +// MSA version does direct ARGB1555 to YUV. +#elif (defined(HAS_ARGB1555TOYROW_MSA) || defined(HAS_ARGB1555TOYROW_LSX) || \ + defined(HAS_ARGB1555TOYROW_LASX)) +#if defined(HAS_ARGB1555TOYROW_MSA) && defined(HAS_ARGB1555TOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA; ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA; @@ -2095,17 +2799,27 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, ARGB1555ToUVRow = ARGB1555ToUVRow_MSA; } } -#elif defined(HAS_ARGB1555TOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MMI; - ARGB1555ToYRow = ARGB1555ToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGB1555ToYRow = ARGB1555ToYRow_MMI; - if (IS_ALIGNED(width, 16)) { - ARGB1555ToUVRow = ARGB1555ToUVRow_MMI; - } +#endif +#if defined(HAS_ARGB1555TOYROW_LSX) && defined(HAS_ARGB1555TOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_LSX; + ARGB1555ToYRow = ARGB1555ToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToYRow = ARGB1555ToYRow_LSX; + ARGB1555ToUVRow = ARGB1555ToUVRow_LSX; } } +#endif +#if defined(HAS_ARGB1555TOYROW_LASX) && defined(HAS_ARGB1555TOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_LASX; + ARGB1555ToYRow = ARGB1555ToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGB1555ToYRow = ARGB1555ToYRow_LASX; + ARGB1555ToUVRow = ARGB1555ToUVRow_LASX; + } + } +#endif // Other platforms do intermediate conversion from ARGB1555 to ARGB. 
#else #if defined(HAS_ARGB1555TOARGBROW_SSE2) @@ -2124,30 +2838,42 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, } } #endif -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; ARGBToYRow = ARGBToYRow_AVX2; } } #endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif #endif { #if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ - defined(HAS_ARGB1555TOYROW_MMI)) + defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX)) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); @@ -2155,7 +2881,7 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, for (y = 0; y < height - 1; y += 2) { #if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ - defined(HAS_ARGB1555TOYROW_MMI)) + defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX)) ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); ARGB1555ToYRow(src_argb1555, dst_y, width); ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, @@ -2175,7 +2901,7 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, } if (height & 1) { #if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ - defined(HAS_ARGB1555TOYROW_MMI)) + defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX)) ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); ARGB1555ToYRow(src_argb1555, dst_y, width); #else @@ -2185,7 +2911,7 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, #endif } #if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ - defined(HAS_ARGB1555TOYROW_MMI)) + defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX)) free_aligned_buffer_64(row); #endif } @@ -2205,7 +2931,7 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, int width, int height) { int y; -#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) +#if defined(HAS_ARGB4444TOYROW_NEON) void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width) = ARGB4444ToUVRow_C; @@ -2243,17 +2969,6 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, } } } -#elif defined(HAS_ARGB4444TOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGB4444ToUVRow = ARGB4444ToUVRow_Any_MMI; - ARGB4444ToYRow = ARGB4444ToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGB4444ToYRow = ARGB4444ToYRow_MMI; - if (IS_ALIGNED(width, 16)) { - ARGB4444ToUVRow = ARGB4444ToUVRow_MMI; - } - } - } // Other platforms do intermediate conversion from ARGB4444 to ARGB. 
#else #if defined(HAS_ARGB4444TOARGBROW_SSE2) @@ -2280,27 +2995,55 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, } } #endif -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) +#if defined(HAS_ARGB4444TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_LSX; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_LASX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; ARGBToYRow = ARGBToYRow_AVX2; } } #endif -#if defined(HAS_ARGBTOYROW_MSA) +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToUVRow = ARGBToUVRow_Any_MSA; ARGBToYRow = ARGBToYRow_Any_MSA; @@ -2312,29 +3055,27 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, } } #endif -#if defined(HAS_ARGBTOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToUVRow = ARGBToUVRow_Any_MMI; - ARGBToYRow = ARGBToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; - } +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; } } #endif #endif { -#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) +#if !(defined(HAS_ARGB4444TOYROW_NEON)) // Allocate 2 rows of ARGB. 
const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) +#if defined(HAS_ARGB4444TOYROW_NEON) ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width); ARGB4444ToYRow(src_argb4444, dst_y, width); ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, @@ -2353,7 +3094,7 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, dst_v += dst_stride_v; } if (height & 1) { -#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) +#if defined(HAS_ARGB4444TOYROW_NEON) ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width); ARGB4444ToYRow(src_argb4444, dst_y, width); #else @@ -2362,7 +3103,7 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, ARGBToYRow(row, dst_y, width); #endif } -#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) +#if !(defined(HAS_ARGB4444TOYROW_NEON)) free_aligned_buffer_64(row); #endif } @@ -2378,125 +3119,129 @@ int RGB24ToJ400(const uint8_t* src_rgb24, int width, int height) { int y; -#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ - defined(HAS_RGB24TOYJROW_MMI)) void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) = RGB24ToYJRow_C; -#else - void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = - RGB24ToARGBRow_C; - void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = - ARGBToYJRow_C; -#endif if (!src_rgb24 || !dst_yj || width <= 0 || height == 0) { return -1; } - // Negative height means invert the image. if (height < 0) { height = -height; src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; src_stride_rgb24 = -src_stride_rgb24; } - -// Neon version does direct RGB24 to YUV. + // Coalesce rows. + if (src_stride_rgb24 == width * 3 && dst_stride_yj == width) { + width *= height; + height = 1; + src_stride_rgb24 = dst_stride_yj = 0; + } +#if defined(HAS_RGB24TOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGB24ToYJRow = RGB24ToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24ToYJRow = RGB24ToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB24ToYJRow = RGB24ToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RGB24ToYJRow = RGB24ToYJRow_AVX2; + } + } +#endif #if defined(HAS_RGB24TOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGB24ToYJRow = RGB24ToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { RGB24ToYJRow = RGB24ToYJRow_NEON; } } -#elif defined(HAS_RGB24TOYJROW_MSA) +#endif +#if defined(HAS_RGB24TOYJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RGB24ToYJRow = RGB24ToYJRow_Any_MSA; if (IS_ALIGNED(width, 16)) { RGB24ToYJRow = RGB24ToYJRow_MSA; } } -#elif defined(HAS_RGB24TOYJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGB24ToYJRow = RGB24ToYJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RGB24ToYJRow = RGB24ToYJRow_MMI; - } +#endif + + for (y = 0; y < height; ++y) { + RGB24ToYJRow(src_rgb24, dst_yj, width); + src_rgb24 += src_stride_rgb24; + dst_yj += dst_stride_yj; } -// Other platforms do intermediate conversion from RGB24 to ARGB. -#else -#if defined(HAS_RGB24TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; - } + return 0; +} + +// Convert RAW to J400. 
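RAWToJ400 below mirrors RGB24ToJ400 above: in libyuv, RGB24 is stored B,G,R in memory while RAW is R,G,B, so the two differ only in which single-pass YJ kernel they dispatch to. Both are plain grayscale extractors with the same row-coalescing fast path. A usage sketch (buffer names are hypothetical):

  // J400 = full-range grayscale; gray plane stride == width here.
  RGB24ToJ400(bgr_pixels, width * 3, gray, width, width, height);  // B,G,R order
  RAWToJ400(rgb_pixels, width * 3, gray, width, width, height);    // R,G,B order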
+LIBYUV_API +int RAWToJ400(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height) { + int y; + void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_yj, int width) = + RAWToYJRow_C; + if (!src_raw || !dst_yj || width <= 0 || height == 0) { + return -1; } -#endif -#if defined(HAS_ARGBTOYJROW_SSSE3) + + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + // Coalesce rows. + if (src_stride_raw == width * 3 && dst_stride_yj == width) { + width *= height; + height = 1; + src_stride_raw = dst_stride_yj = 0; + } + +#if defined(HAS_RAWTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + RAWToYJRow = RAWToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; + RAWToYJRow = RAWToYJRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOYJROW_AVX2) +#if defined(HAS_RAWTOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYJRow = ARGBToYJRow_Any_AVX2; + RAWToYJRow = RAWToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - ARGBToYJRow = ARGBToYJRow_AVX2; + RAWToYJRow = RAWToYJRow_AVX2; } } #endif -#endif - - { -#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ - defined(HAS_RGB24TOYJROW_MMI)) - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); -#endif - - for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ - defined(HAS_RGB24TOYJROW_MMI)) - RGB24ToYJRow(src_rgb24, dst_yj, width); - RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_yj + dst_stride_yj, width); -#else - RGB24ToARGBRow(src_rgb24, row, width); - RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); - ARGBToYJRow(row, dst_yj, width); - ARGBToYJRow(row + kRowSize, dst_yj + dst_stride_yj, width); -#endif - src_rgb24 += src_stride_rgb24 * 2; - dst_yj += dst_stride_yj * 2; +#if defined(HAS_RAWTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToYJRow = RAWToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_NEON; } - if (height & 1) { -#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ - defined(HAS_RGB24TOYJROW_MMI)) - RGB24ToYJRow(src_rgb24, dst_yj, width); -#else - RGB24ToARGBRow(src_rgb24, row, width); - ARGBToYJRow(row, dst_yj, width); + } #endif +#if defined(HAS_RAWTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToYJRow = RAWToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_MSA; } -#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ - defined(HAS_RGB24TOYJROW_MMI)) - free_aligned_buffer_64(row); -#endif } - return 0; -} +#endif -static void SplitPixels(const uint8_t* src_u, - int src_pixel_stride_uv, - uint8_t* dst_u, - int width) { - int i; - for (i = 0; i < width; ++i) { - *dst_u = *src_u; - ++dst_u; - src_u += src_pixel_stride_uv; + for (y = 0; y < height; ++y) { + RAWToYJRow(src_raw, dst_yj, width); + src_raw += src_stride_raw; + dst_yj += dst_stride_yj; } + return 0; } // Convert Android420 to I420. @@ -2516,58 +3261,10 @@ int Android420ToI420(const uint8_t* src_y, int dst_stride_v, int width, int height) { - int y; - const ptrdiff_t vu_off = src_v - src_u; - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - - // Copy UV planes as is - I420 - if (src_pixel_stride_uv == 1) { - CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); - CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); - return 0; - // Split UV planes - NV21 - } - if (src_pixel_stride_uv == 2 && vu_off == -1 && - src_stride_u == src_stride_v) { - SplitUVPlane(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u, - halfwidth, halfheight); - return 0; - // Split UV planes - NV12 - } - if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) { - SplitUVPlane(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v, - halfwidth, halfheight); - return 0; - } - - for (y = 0; y < halfheight; ++y) { - SplitPixels(src_u, src_pixel_stride_uv, dst_u, halfwidth); - SplitPixels(src_v, src_pixel_stride_uv, dst_v, halfwidth); - src_u += src_stride_u; - src_v += src_stride_v; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - return 0; + return Android420ToI420Rotate(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_pixel_stride_uv, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, width, height, kRotate0); } #ifdef __cplusplus diff --git a/files/source/convert_argb.cc b/files/source/convert_argb.cc index 54050333..71ef8c10 100644 --- a/files/source/convert_argb.cc +++ b/files/source/convert_argb.cc @@ -7,7 +7,6 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ - #include "libyuv/convert_argb.h" #include "libyuv/cpu_id.h" @@ -17,6 +16,7 @@ #include "libyuv/planar_functions.h" // For CopyPlane and ARGBShuffle. #include "libyuv/rotate_argb.h" #include "libyuv/row.h" +#include "libyuv/scale_row.h" // For ScaleRowUp2_Linear and ScaleRowUp2_Bilinear #include "libyuv/video_common.h" #ifdef __cplusplus @@ -47,18 +47,19 @@ int ARGBCopy(const uint8_t* src_argb, return 0; } -// Convert I420 to ARGB with matrix -static int I420ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { +// Convert I420 to ARGB with matrix. 
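I420ToARGBMatrix, previously file-static, is exported here so callers can supply any YuvConstants directly instead of going through a named wrapper such as H420ToARGB. A caller-side sketch (buffer names are hypothetical):

  // BT.709 limited-range I420 -> ARGB, equivalent to calling H420ToARGB.
  I420ToARGBMatrix(y, y_stride, u, u_stride, v, v_stride,
                   argb, width * 4, &kYuvH709Constants, width, height);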
+LIBYUV_API +int I420ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, @@ -89,6 +90,15 @@ static int I420ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) == + (kCpuHasAVX512BW | kCpuHasAVX512VL)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX512BW; + if (IS_ALIGNED(width, 32)) { + I422ToARGBRow = I422ToARGBRow_AVX512BW; + } + } +#endif #if defined(HAS_I422TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToARGBRow = I422ToARGBRow_Any_NEON; @@ -105,6 +115,14 @@ static int I420ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToARGBRow = I422ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToARGBRow = I422ToARGBRow_LASX; + } + } +#endif for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -226,18 +244,55 @@ int H420ToABGR(const uint8_t* src_y, width, height); } -// Convert I422 to ARGB with matrix -static int I422ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { +// Convert U420 to ARGB. +LIBYUV_API +int U420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuv2020Constants, width, height); +} + +// Convert U420 to ABGR. +LIBYUV_API +int U420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvu2020Constants, // Use Yvu matrix + width, height); +} + +// Convert I422 to ARGB with matrix. 
+LIBYUV_API +int I422ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, @@ -275,6 +330,15 @@ static int I422ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) == + (kCpuHasAVX512BW | kCpuHasAVX512VL)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX512BW; + if (IS_ALIGNED(width, 32)) { + I422ToARGBRow = I422ToARGBRow_AVX512BW; + } + } +#endif #if defined(HAS_I422TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToARGBRow = I422ToARGBRow_Any_NEON; @@ -291,6 +355,14 @@ static int I422ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToARGBRow = I422ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToARGBRow = I422ToARGBRow_LASX; + } + } +#endif for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -410,20 +482,286 @@ int H422ToABGR(const uint8_t* src_y, width, height); } -// Convert 10 bit YUV to ARGB with matrix +// Convert U422 to ARGB. +LIBYUV_API +int U422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuv2020Constants, width, height); +} + +// Convert U422 to ABGR. +LIBYUV_API +int U422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvu2020Constants, // Use Yvu matrix + width, height); +} + +// Convert I444 to ARGB with matrix. +LIBYUV_API +int I444ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. 
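The coalescing step that follows is a common libyuv fast path: when every plane is packed, the whole image is one contiguous run and can be handed to the row kernel as a single long row, so per-row dispatch and tail handling happen once instead of once per row. It applies to I444 here because 4:4:4 planes all share the luma width. A generalized sketch of the test (the real check is inline below):

  // Rows can be fused only when source and destination are both packed.
  static bool CanCoalesce(int src_stride, int src_bpp,
                          int dst_stride, int dst_bpp, int width) {
    return src_stride == width * src_bpp && dst_stride == width * dst_bpp;
  }
  // e.g. packed 640x480 I444: width becomes 307200, height 1, and the row
  // kernel runs once instead of 480 times.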
+ if (src_stride_y == width && src_stride_u == width && src_stride_v == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_I444TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToARGBRow = I444ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToARGBRow = I444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I444ToARGBRow = I444ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToARGBRow = I444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444ToARGBRow = I444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_I444TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I444ToARGBRow = I444ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I444ToARGBRow = I444ToARGBRow_LSX; + } + } +#endif + + for (y = 0; y < height; ++y) { + I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I444 to ARGB. +LIBYUV_API +int I444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); +} + +// Convert I444 to ABGR. +LIBYUV_API +int I444ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert J444 to ARGB. +LIBYUV_API +int J444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvJPEGConstants, width, height); +} + +// Convert J444 to ABGR. +LIBYUV_API +int J444ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuJPEGConstants, // Use Yvu matrix + width, height); +} + +// Convert H444 to ARGB. 
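H444ToARGB and the surrounding wrappers follow libyuv's colorspace-prefix convention: I is BT.601 limited range, J is JPEG (BT.601 full range), H is BT.709, and U is BT.2020; each ABGR variant reuses the ARGB kernel by swapping the U and V planes and passing the mirrored Yvu constants. A sketch of the constant selection (the kYuv* symbols are libyuv's, the helper itself is hypothetical):

  // Assumes libyuv headers are included for struct YuvConstants and kYuv*.
  const struct YuvConstants* PickYuvConstants(char prefix) {
    switch (prefix) {
      case 'I': return &kYuvI601Constants;   // BT.601 limited range
      case 'J': return &kYuvJPEGConstants;   // BT.601 full range
      case 'H': return &kYuvH709Constants;   // BT.709 limited range
      case 'U': return &kYuv2020Constants;   // BT.2020 limited range
      default:  return &kYuvI601Constants;
    }
  }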
+LIBYUV_API +int H444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); +} + +// Convert H444 to ABGR. +LIBYUV_API +int H444ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + +// Convert U444 to ARGB. +LIBYUV_API +int U444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuv2020Constants, width, height); +} + +// Convert U444 to ABGR. +LIBYUV_API +int U444ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvu2020Constants, // Use Yvu matrix + width, height); +} + +// Convert 10 bit YUV to ARGB with matrix. // TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to // multiply 10 bit yuv into high bits to allow any number of bits. -static int I010ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height) { +LIBYUV_API +int I010ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* rgb_buf, @@ -500,6 +838,23 @@ int H010ToAR30(const uint16_t* src_y, &kYuvH709Constants, width, height); } +// Convert U010 to AR30. +LIBYUV_API +int U010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuv2020Constants, width, height); +} + // Convert I010 to AB30. 
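AR30 and AB30 are 32-bit 2:10:10:10 formats: in libyuv's AR30 the blue channel occupies the low 10 bits of the little-endian dword and alpha the top 2, and AB30 is the same layout with R and B exchanged, which is why the AB30 entry points can reuse I010ToAR30Matrix with the chroma planes swapped. A packing sketch (bit layout as documented in the libyuv headers; treat as illustrative):

  #include <stdint.h>

  // Pack 10-bit channels into AR30: B 0..9, G 10..19, R 20..29, A 30..31.
  static uint32_t PackAR30(uint32_t r10, uint32_t g10, uint32_t b10) {
    return b10 | (g10 << 10) | (r10 << 20) | (3u << 30);  // opaque alpha
  }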
LIBYUV_API int I010ToAB30(const uint16_t* src_y, @@ -534,18 +889,302 @@ int H010ToAB30(const uint16_t* src_y, &kYvuH709Constants, width, height); } -// Convert 10 bit YUV to ARGB with matrix -static int I010ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { +// Convert U010 to AB30. +LIBYUV_API +int U010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYuv2020Constants, width, height); +} + +// Convert 12 bit YUV to ARGB with matrix. +// TODO(fbarchard): Consider passing scale multiplier to I212ToARGB to +// multiply 12 bit yuv into high bits to allow any number of bits. +LIBYUV_API +int I012ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I212ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I212ToAR30Row_C; + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_I212TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I212ToAR30Row = I212ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I212ToAR30Row = I212ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I212TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I212ToAR30Row = I212ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I212ToAR30Row = I212ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I212ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert 10 bit YUV to ARGB with matrix. +// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to +// multiply 10 bit yuv into high bits to allow any number of bits. +LIBYUV_API +int I210ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I210ToAR30Row_C; + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_I210TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210ToAR30Row = I210ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210ToAR30Row = I210ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I210TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210ToAR30Row = I210ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210ToAR30Row = I210ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I210 to AR30. +LIBYUV_API +int I210ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvI601Constants, width, height); +} + +// Convert H210 to AR30. +LIBYUV_API +int H210ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvH709Constants, width, height); +} + +// Convert U210 to AR30. +LIBYUV_API +int U210ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuv2020Constants, width, height); +} + +// Convert I210 to AB30. +LIBYUV_API +int I210ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuI601Constants, width, height); +} + +// Convert H210 to AB30. +LIBYUV_API +int H210ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuH709Constants, width, height); +} + +// Convert U210 to AB30. 
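The 16-bit planar names decode as: first digit = chroma subsampling (0 for 4:2:0, 2 for 4:2:2, 4 for 4:4:4), last two digits = bit depth. So I010/I210/I410 are the 10-bit variants and I012/I212 the 12-bit ones, and the H and U prefixes select BT.709 and BT.2020 exactly as for the 8-bit formats. Samples occupy the low bits of each uint16_t:

  #include <stdint.h>

  // Valid sample ceiling for an N-bit plane stored in uint16_t (N = 10 or 12).
  static inline uint16_t MaxSample(int bits) {
    return (uint16_t)((1u << bits) - 1);  // 1023 for I010/I210, 4095 for I012/I212
  }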
+LIBYUV_API +int U210ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYuv2020Constants, width, height); +} + +LIBYUV_API +int I410ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I410ToAR30Row_C; + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_I410TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410ToAR30Row = I410ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410ToAR30Row = I410ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I410TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410ToAR30Row = I410ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410ToAR30Row = I410ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I410ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert 10 bit YUV to ARGB with matrix. +LIBYUV_API +int I010ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* rgb_buf, @@ -660,23 +1299,60 @@ int H010ToABGR(const uint16_t* src_y, width, height); } -// Convert I444 to ARGB with matrix -static int I444ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { +// Convert U010 to ARGB. +LIBYUV_API +int U010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuv2020Constants, width, height); +} + +// Convert U010 to ABGR. +LIBYUV_API +int U010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvu2020Constants, // Use Yvu matrix + width, height); +} + +// Convert 12 bit YUV to ARGB with matrix. 
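I012ToARGBMatrix below shows how vertical subsampling is handled outside the row kernels: within a single row, 4:2:0 and 4:2:2 have the same half-width chroma, so the 12-bit 4:2:0 converter reuses I212ToARGBRow and simply advances the chroma pointers only after odd rows, making each U/V row serve two luma rows; the 4:2:2 converters advance chroma on every row instead. A stripped-down sketch of the 4:2:0 loop shape (row function hypothetical):

  // 4:2:0 row loop: one chroma row per two luma rows.
  for (int y = 0; y < height; ++y) {
    ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
    dst_argb += dst_stride;
    src_y += y_stride;
    if (y & 1) {  // advance chroma after every second luma row
      src_u += u_stride;
      src_v += v_stride;
    }
  }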
+LIBYUV_API +int I012ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; - void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, + void (*I212ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = - I444ToARGBRow_C; + I212ToARGBRow_C; if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -686,48 +1362,79 @@ static int I444ToARGBMatrix(const uint8_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - // Coalesce rows. - if (src_stride_y == width && src_stride_u == width && src_stride_v == width && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; - } -#if defined(HAS_I444TOARGBROW_SSSE3) +#if defined(HAS_I212TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - I444ToARGBRow = I444ToARGBRow_Any_SSSE3; + I212ToARGBRow = I212ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I444ToARGBRow = I444ToARGBRow_SSSE3; + I212ToARGBRow = I212ToARGBRow_SSSE3; } } #endif -#if defined(HAS_I444TOARGBROW_AVX2) +#if defined(HAS_I212TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - I444ToARGBRow = I444ToARGBRow_Any_AVX2; + I212ToARGBRow = I212ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { - I444ToARGBRow = I444ToARGBRow_AVX2; + I212ToARGBRow = I212ToARGBRow_AVX2; } } #endif -#if defined(HAS_I444TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I444ToARGBRow = I444ToARGBRow_Any_NEON; + for (y = 0; y < height; ++y) { + I212ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert 10 bit 422 YUV to ARGB with matrix. +LIBYUV_API +int I210ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I210ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I210TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210ToARGBRow = I210ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I444ToARGBRow = I444ToARGBRow_NEON; + I210ToARGBRow = I210ToARGBRow_SSSE3; } } #endif -#if defined(HAS_I444TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I444ToARGBRow = I444ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I444ToARGBRow = I444ToARGBRow_MSA; +#if defined(HAS_I210TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210ToARGBRow = I210ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210ToARGBRow = I210ToARGBRow_AVX2; } } #endif - for (y = 0; y < height; ++y) { - I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; src_u += src_stride_u; @@ -736,74 +1443,378 @@ static int I444ToARGBMatrix(const uint8_t* src_y, return 0; } -// Convert I444 to ARGB. +// Convert I210 to ARGB. LIBYUV_API -int I444ToARGB(const uint8_t* src_y, +int I210ToARGB(const uint16_t* src_y, int src_stride_y, - const uint8_t* src_u, + const uint16_t* src_u, int src_stride_u, - const uint8_t* src_v, + const uint16_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { - return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_argb, dst_stride_argb, &kYuvI601Constants, width, height); } -// Convert I444 to ABGR. +// Convert I210 to ABGR. LIBYUV_API -int I444ToABGR(const uint8_t* src_y, +int I210ToABGR(const uint16_t* src_y, int src_stride_y, - const uint8_t* src_u, + const uint16_t* src_u, int src_stride_u, - const uint8_t* src_v, + const uint16_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { - return I444ToARGBMatrix(src_y, src_stride_y, src_v, + return I210ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, // Swap U and V src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuI601Constants, // Use Yvu matrix width, height); } -// Convert J444 to ARGB. +// Convert H210 to ARGB. LIBYUV_API -int J444ToARGB(const uint8_t* src_y, +int H210ToARGB(const uint16_t* src_y, int src_stride_y, - const uint8_t* src_u, + const uint16_t* src_u, int src_stride_u, - const uint8_t* src_v, + const uint16_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { - return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_argb, dst_stride_argb, - &kYuvJPEGConstants, width, height); + &kYuvH709Constants, width, height); } -// Convert I420 with Alpha to preattenuated ARGB. -static int I420AlphaToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate) { +// Convert H210 to ABGR. 
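H210ToABGR below, like I210ToABGR above, needs no dedicated row code: passing the V plane where U is expected and selecting the mirrored kYvu* constants flips the red and blue lanes of the output. A sketch of that equivalence, assuming the libyuv/convert_argb.h declarations; the wrapper function name is hypothetical:

    #include "libyuv/convert_argb.h"

    // Equivalent to H210ToABGR(): swap the chroma pointers and strides and
    // use the YVU variant of the BT.709 constants.
    int H210ToABGRViaMatrix(const uint16_t* y, int stride_y,
                            const uint16_t* u, int stride_u,
                            const uint16_t* v, int stride_v, uint8_t* abgr,
                            int stride_abgr, int width, int height) {
      return libyuv::I210ToARGBMatrix(y, stride_y, v, stride_v, u, stride_u,
                                      abgr, stride_abgr,
                                      &libyuv::kYvuH709Constants, width,
                                      height);
    }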
+LIBYUV_API +int H210ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I210ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + +// Convert U210 to ARGB. +LIBYUV_API +int U210ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuv2020Constants, width, height); +} + +// Convert U210 to ABGR. +LIBYUV_API +int U210ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I210ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvu2020Constants, // Use Yvu matrix + width, height); +} + +LIBYUV_API +int I410ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I410ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I410TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410ToARGBRow = I410ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410ToARGBRow = I410ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I410TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410ToARGBRow = I410ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410ToARGBRow = I410ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I410ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +LIBYUV_API +int P010ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P210ToARGBRow)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_P210TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P210ToARGBRow = P210ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P210ToARGBRow = P210ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_P210TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P210ToARGBRow = P210ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P210ToARGBRow = P210ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +LIBYUV_API +int P210ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P210ToARGBRow)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_P210TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P210ToARGBRow = P210ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P210ToARGBRow = P210ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_P210TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P210ToARGBRow = P210ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P210ToARGBRow = P210ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_uv += src_stride_uv; + } + return 0; +} + +LIBYUV_API +int P010ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P210ToAR30Row)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C; + if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_P210TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P210ToAR30Row = P210ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P210ToAR30Row = P210ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_P210TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P210ToAR30Row = P210ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P210ToAR30Row = P210ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +LIBYUV_API +int P210ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P210ToAR30Row)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C; + if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_P210TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P210ToAR30Row = P210ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P210ToAR30Row = P210ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_P210TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P210ToAR30Row = P210ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P210ToAR30Row = P210ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + src_uv += src_stride_uv; + } + return 0; +} + +// Convert I420 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I420AlphaToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { int y; void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, const uint8_t* a_buf, @@ -812,7 +1823,8 @@ static int I420AlphaToARGBMatrix(const uint8_t* src_y, int width) = I422AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. 
@@ -853,6 +1865,14 @@ static int I420AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422ALPHATOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_LASX; + } + } +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -885,14 +1905,6 @@ static int I420AlphaToARGBMatrix(const uint8_t* src_y, } } #endif -#if defined(HAS_ARGBATTENUATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBAttenuateRow = ARGBAttenuateRow_MMI; - } - } -#endif for (y = 0; y < height; ++y) { I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -911,6 +1923,242 @@ static int I420AlphaToARGBMatrix(const uint8_t* src_y, return 0; } +// Convert I422 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I422AlphaToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I422AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I422ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_MSA; + } + } +#endif +#if defined(HAS_I422ALPHATOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_LASX; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I444 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I444AlphaToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I444AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I444AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I444ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_MSA; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I444AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + // Convert I420 with Alpha to ARGB. LIBYUV_API int I420AlphaToARGB(const uint8_t* src_y, @@ -954,16 +2202,400 @@ int I420AlphaToABGR(const uint8_t* src_y, width, height, attenuate); } -// Convert I400 to ARGB. +// Convert I422 with Alpha to ARGB. LIBYUV_API -int I400ToARGB(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { +int I422AlphaToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int attenuate) { + return I422AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_a, src_stride_a, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, + height, attenuate); +} + +// Convert I422 with Alpha to ABGR. 
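In all of the alpha-aware converters above, a nonzero attenuate runs ARGBAttenuateRow over each finished row, scaling R, G and B by the pixel's alpha so the output is premultiplied; zero leaves the channels independent. The wrappers above and below simply pass the flag through. A minimal caller, with hypothetical buffer names and packed strides:

    #include "libyuv/convert_argb.h"

    // I420 plus a full-resolution alpha plane, converted to premultiplied
    // ARGB with the BT.601 constants the wrapper binds.
    int ConvertI420AlphaPremultiplied(const uint8_t* y, const uint8_t* u,
                                      const uint8_t* v, const uint8_t* a,
                                      uint8_t* argb, int width, int height) {
      int half_width = (width + 1) / 2;
      return libyuv::I420AlphaToARGB(y, width, u, half_width, v, half_width,
                                     a, width, argb, width * 4, width, height,
                                     /*attenuate=*/1);
    }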
+LIBYUV_API +int I422AlphaToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height, + int attenuate) { + return I422AlphaToARGBMatrix( + src_y, src_stride_y, src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height, attenuate); +} + +// Convert I444 with Alpha to ARGB. +LIBYUV_API +int I444AlphaToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int attenuate) { + return I444AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_a, src_stride_a, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, + height, attenuate); +} + +// Convert I444 with Alpha to ABGR. +LIBYUV_API +int I444AlphaToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height, + int attenuate) { + return I444AlphaToARGBMatrix( + src_y, src_stride_y, src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height, attenuate); +} + +// Convert I010 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I010AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { int y; - void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, int width) = + void (*I210AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I210AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I210ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I210ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I210 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I210AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I210AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I210AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I210ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I210ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I410 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I410AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I410AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I410AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I410ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I410ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I410AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I400 to ARGB with matrix. +LIBYUV_API +int I400ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = I400ToARGBRow_C; if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1012,23 +2644,35 @@ int I400ToARGB(const uint8_t* src_y, } } #endif -#if defined(HAS_I400TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I400ToARGBRow = I400ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - I400ToARGBRow = I400ToARGBRow_MMI; +#if defined(HAS_I400TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I400ToARGBRow = I400ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I400ToARGBRow = I400ToARGBRow_LSX; } } #endif for (y = 0; y < height; ++y) { - I400ToARGBRow(src_y, dst_argb, width); + I400ToARGBRow(src_y, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; } return 0; } +// Convert I400 to ARGB. +LIBYUV_API +int I400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I400ToARGBMatrix(src_y, src_stride_y, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); +} + // Convert J400 to ARGB. 
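I400ToARGBMatrix above expands a lone luma plane to gray ARGB through whichever constants the caller provides, and the I400ToARGB wrapper fixes those to limited-range BT.601; J400ToARGB below handles the full-range case without a matrix argument. A short sketch of both calls, names hypothetical:

    #include "libyuv/convert_argb.h"

    // Limited-range grayscale: luma 16..235 maps to dark-to-light gray.
    int GrayToARGB(const uint8_t* y, uint8_t* argb, int width, int height) {
      return libyuv::I400ToARGB(y, width, argb, width * 4, width, height);
    }

    // The same path with the matrix spelled out explicitly.
    int GrayToARGBExplicit(const uint8_t* y, uint8_t* argb, int width,
                           int height) {
      return libyuv::I400ToARGBMatrix(y, width, argb, width * 4,
                                      &libyuv::kYuvI601Constants, width,
                                      height);
    }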
LIBYUV_API int J400ToARGB(const uint8_t* src_y, @@ -1087,11 +2731,11 @@ int J400ToARGB(const uint8_t* src_y, } } #endif -#if defined(HAS_J400TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - J400ToARGBRow = J400ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - J400ToARGBRow = J400ToARGBRow_MMI; +#if defined(HAS_J400TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + J400ToARGBRow = J400ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + J400ToARGBRow = J400ToARGBRow_LSX; } } #endif @@ -1115,6 +2759,10 @@ static const uvec8 kShuffleMaskABGRToARGB = { static const uvec8 kShuffleMaskRGBAToARGB = { 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u}; +// Shuffle table for converting AR64 to AB64. +static const uvec8 kShuffleMaskAR64ToAB64 = { + 4u, 5u, 2u, 3u, 0u, 1u, 6u, 7u, 12u, 13u, 10u, 11u, 8u, 9u, 14u, 15u}; + // Convert BGRA to ARGB. LIBYUV_API int BGRAToARGB(const uint8_t* src_bgra, @@ -1124,7 +2772,7 @@ int BGRAToARGB(const uint8_t* src_bgra, int width, int height) { return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, - (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height); + (const uint8_t*)&kShuffleMaskBGRAToARGB, width, height); } // Convert ARGB to BGRA (same as BGRAToARGB). @@ -1136,7 +2784,7 @@ int ARGBToBGRA(const uint8_t* src_bgra, int width, int height) { return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, - (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height); + (const uint8_t*)&kShuffleMaskBGRAToARGB, width, height); } // Convert ABGR to ARGB. @@ -1148,7 +2796,7 @@ int ABGRToARGB(const uint8_t* src_abgr, int width, int height) { return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, - (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height); + (const uint8_t*)&kShuffleMaskABGRToARGB, width, height); } // Convert ARGB to ABGR to (same as ABGRToARGB). @@ -1160,7 +2808,7 @@ int ARGBToABGR(const uint8_t* src_abgr, int width, int height) { return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, - (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height); + (const uint8_t*)&kShuffleMaskABGRToARGB, width, height); } // Convert RGBA to ARGB. @@ -1172,7 +2820,19 @@ int RGBAToARGB(const uint8_t* src_rgba, int width, int height) { return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb, - (const uint8_t*)(&kShuffleMaskRGBAToARGB), width, height); + (const uint8_t*)&kShuffleMaskRGBAToARGB, width, height); +} + +// Convert AR64 To AB64. +LIBYUV_API +int AR64ToAB64(const uint16_t* src_ar64, + int src_stride_ar64, + uint16_t* dst_ab64, + int dst_stride_ab64, + int width, + int height) { + return AR64Shuffle(src_ar64, src_stride_ar64, dst_ab64, dst_stride_ab64, + (const uint8_t*)&kShuffleMaskAR64ToAB64, width, height); } // Convert RGB24 to ARGB. 
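The shuffle tables above are byte-gather indices: within each 16-byte group, output byte i takes input byte mask[i]. For kShuffleMaskAR64ToAB64 that trades the two bytes of the 16-bit B channel with the two bytes of the 16-bit R channel in each 8-byte pixel while G and A stay put, which is exactly the AR64 to AB64 swap. A scalar model of the gather that pshufb or vtbl performs per 16-byte lane, illustrative only:

    #include <stdint.h>

    // Output byte i = input byte mask[i]; applied independently to every
    // 16-byte group of the row.
    static void ShuffleBytes16(const uint8_t* src, const uint8_t mask[16],
                               uint8_t* dst) {
      for (int i = 0; i < 16; ++i) {
        dst[i] = src[mask[i]];
      }
    }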
@@ -1225,11 +2885,19 @@ int RGB24ToARGB(const uint8_t* src_rgb24, } } #endif -#if defined(HAS_RGB24TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - RGB24ToARGBRow = RGB24ToARGBRow_MMI; +#if defined(HAS_RGB24TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_LSX; + } + } +#endif +#if defined(HAS_RGB24TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGB24ToARGBRow = RGB24ToARGBRow_LASX; } } #endif @@ -1292,11 +2960,19 @@ int RAWToARGB(const uint8_t* src_raw, } } #endif -#if defined(HAS_RAWTOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RAWToARGBRow = RAWToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - RAWToARGBRow = RAWToARGBRow_MMI; +#if defined(HAS_RAWTOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RAWToARGBRow = RAWToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_LSX; + } + } +#endif +#if defined(HAS_RAWTOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RAWToARGBRow = RAWToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RAWToARGBRow = RAWToARGBRow_LASX; } } #endif @@ -1309,6 +2985,57 @@ int RAWToARGB(const uint8_t* src_raw, return 0; } +// Convert RAW to RGBA. +LIBYUV_API +int RAWToRGBA(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + int y; + void (*RAWToRGBARow)(const uint8_t* src_rgb, uint8_t* dst_rgba, int width) = + RAWToRGBARow_C; + if (!src_raw || !dst_rgba || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + // Coalesce rows. + if (src_stride_raw == width * 3 && dst_stride_rgba == width * 4) { + width *= height; + height = 1; + src_stride_raw = dst_stride_rgba = 0; + } +#if defined(HAS_RAWTORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToRGBARow = RAWToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToRGBARow = RAWToRGBARow_SSSE3; + } + } +#endif +#if defined(HAS_RAWTORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToRGBARow = RAWToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToRGBARow = RAWToRGBARow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + RAWToRGBARow(src_raw, dst_rgba, width); + src_raw += src_stride_raw; + dst_rgba += dst_stride_rgba; + } + return 0; +} + // Convert RGB565 to ARGB. 
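RAWToRGBA above, like RGB24ToARGB and the converters that follow, coalesces rows before dispatch: when every stride equals the packed row width, the buffers are contiguous and the whole image can be handed to the row function as one long row, amortizing per-row overhead. The guard in isolation, with hypothetical parameter names:

    // True when both buffers are packed, so setting width *= height and
    // height = 1 is legal and the strides can be zeroed.
    static bool CanCoalesceRows(int src_stride, int src_bytes_per_pixel,
                                int dst_stride, int dst_bytes_per_pixel,
                                int width) {
      return src_stride == width * src_bytes_per_pixel &&
             dst_stride == width * dst_bytes_per_pixel;
    }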
LIBYUV_API int RGB565ToARGB(const uint8_t* src_rgb565, @@ -1367,11 +3094,19 @@ int RGB565ToARGB(const uint8_t* src_rgb565, } } #endif -#if defined(HAS_RGB565TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - RGB565ToARGBRow = RGB565ToARGBRow_MMI; +#if defined(HAS_RGB565TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_LSX; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGB565ToARGBRow = RGB565ToARGBRow_LASX; } } #endif @@ -1442,11 +3177,19 @@ int ARGB1555ToARGB(const uint8_t* src_argb1555, } } #endif -#if defined(HAS_ARGB1555TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_MMI; +#if defined(HAS_ARGB1555TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_LSX; + } + } +#endif +#if defined(HAS_ARGB1555TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_LASX; } } #endif @@ -1517,11 +3260,19 @@ int ARGB4444ToARGB(const uint8_t* src_argb4444, } } #endif -#if defined(HAS_ARGB4444TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_MMI; +#if defined(HAS_ARGB4444TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_LSX; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_LASX; } } #endif @@ -1630,16 +3381,135 @@ int AR30ToAB30(const uint8_t* src_ar30, return 0; } -// Convert NV12 to ARGB with matrix -static int NV12ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { +// Convert AR64 to ARGB. +LIBYUV_API +int AR64ToARGB(const uint16_t* src_ar64, + int src_stride_ar64, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*AR64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb, + int width) = AR64ToARGBRow_C; + if (!src_ar64 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar64 = src_ar64 + (height - 1) * src_stride_ar64; + src_stride_ar64 = -src_stride_ar64; + } + // Coalesce rows. 
+ if (src_stride_ar64 == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_ar64 = dst_stride_argb = 0; + } +#if defined(HAS_AR64TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + AR64ToARGBRow = AR64ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + AR64ToARGBRow = AR64ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_AR64TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AR64ToARGBRow = AR64ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + AR64ToARGBRow = AR64ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_AR64TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AR64ToARGBRow = AR64ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + AR64ToARGBRow = AR64ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + AR64ToARGBRow(src_ar64, dst_argb, width); + src_ar64 += src_stride_ar64; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert AB64 to ARGB. +LIBYUV_API +int AB64ToARGB(const uint16_t* src_ab64, + int src_stride_ab64, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*AB64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb, + int width) = AB64ToARGBRow_C; + if (!src_ab64 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ab64 = src_ab64 + (height - 1) * src_stride_ab64; + src_stride_ab64 = -src_stride_ab64; + } + // Coalesce rows. + if (src_stride_ab64 == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_ab64 = dst_stride_argb = 0; + } +#if defined(HAS_AB64TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + AB64ToARGBRow = AB64ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + AB64ToARGBRow = AB64ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_AB64TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AB64ToARGBRow = AB64ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + AB64ToARGBRow = AB64ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_AB64TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AB64ToARGBRow = AB64ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + AB64ToARGBRow = AB64ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + AB64ToARGBRow(src_ab64, dst_argb, width); + src_ab64 += src_stride_ab64; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert NV12 to ARGB with matrix. 
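NV12, handled below, is biplanar: a full-resolution Y plane plus a single half-resolution plane of interleaved UV pairs, so one src_stride_uv covers both chroma channels. A hedged caller sketch for the matrix form; the buffer names are hypothetical and an even width is assumed so the packed UV stride equals width:

    #include "libyuv/convert_argb.h"

    // Packed strides: Y is width bytes per row; UV is also width bytes per
    // row (width / 2 pairs of two bytes); ARGB is width * 4.
    int ConvertNV12Frame(const uint8_t* y, const uint8_t* uv, uint8_t* argb,
                         int width, int height) {
      return libyuv::NV12ToARGBMatrix(y, width, uv, width, argb, width * 4,
                                      &libyuv::kYuvI601Constants, width,
                                      height);
    }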
+LIBYUV_API +int NV12ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; void (*NV12ToARGBRow)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, @@ -1685,6 +3555,22 @@ static int NV12ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_NV12TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + NV12ToARGBRow = NV12ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_LSX; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + NV12ToARGBRow = NV12ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + NV12ToARGBRow = NV12ToARGBRow_LASX; + } + } +#endif for (y = 0; y < height; ++y) { NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); @@ -1697,16 +3583,17 @@ static int NV12ToARGBMatrix(const uint8_t* src_y, return 0; } -// Convert NV21 to ARGB with matrix -static int NV21ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { +// Convert NV21 to ARGB with matrix. +LIBYUV_API +int NV21ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; void (*NV21ToARGBRow)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, @@ -1752,6 +3639,22 @@ static int NV21ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_NV21TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + NV21ToARGBRow = NV21ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_LSX; + } + } +#endif +#if defined(HAS_NV21TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + NV21ToARGBRow = NV21ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + NV21ToARGBRow = NV21ToARGBRow_LASX; + } + } +#endif for (y = 0; y < height; ++y) { NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width); @@ -1823,16 +3726,17 @@ int NV21ToABGR(const uint8_t* src_y, } // TODO(fbarchard): Consider SSSE3 2 step conversion. -// Convert NV12 to RGB24 with matrix -static int NV12ToRGB24Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - const struct YuvConstants* yuvconstants, - int width, - int height) { +// Convert NV12 to RGB24 with matrix. +LIBYUV_API +int NV12ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; void (*NV12ToRGB24Row)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, @@ -1882,16 +3786,17 @@ static int NV12ToRGB24Matrix(const uint8_t* src_y, return 0; } -// Convert NV21 to RGB24 with matrix -static int NV21ToRGB24Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - const struct YuvConstants* yuvconstants, - int width, - int height) { +// Convert NV21 to RGB24 with matrix. 
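The LSX and LASX blocks added above follow the dispatch idiom used throughout this file: start from the portable C row function, upgrade to the _Any_ SIMD variant when the CPU flag is set (it handles any width, finishing the remainder in C), then to the full-speed kernel when the width meets its alignment. A sketch of that selection shape for the NV12 path, assuming the kernels and flags declared in libyuv/row.h and libyuv/cpu_id.h; only the SSSE3 tier is shown:

    #include "libyuv/cpu_id.h"
    #include "libyuv/row.h"

    typedef void (*NV12Row)(const uint8_t* y_buf, const uint8_t* uv_buf,
                            uint8_t* rgb_buf,
                            const libyuv::YuvConstants* yuvconstants,
                            int width);

    static NV12Row PickNV12ToARGBRow(int width) {
      NV12Row row = libyuv::NV12ToARGBRow_C;  // portable fallback
    #if defined(HAS_NV12TOARGBROW_SSSE3)
      if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3)) {
        row = libyuv::NV12ToARGBRow_Any_SSSE3;  // any width, C remainder
        if (IS_ALIGNED(width, 8)) {
          row = libyuv::NV12ToARGBRow_SSSE3;  // multiples of 8 pixels
        }
      }
    #endif
      return row;
    }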
+LIBYUV_API +int NV21ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; void (*NV21ToRGB24Row)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, @@ -2028,6 +3933,14 @@ int NV21ToYUV24(const uint8_t* src_y, } } #endif +#if defined(HAS_NV21TOYUV24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV21ToYUV24Row = NV21ToYUV24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + NV21ToYUV24Row = NV21ToYUV24Row_SSSE3; + } + } +#endif #if defined(HAS_NV21TOYUV24ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { NV21ToYUV24Row = NV21ToYUV24Row_Any_AVX2; @@ -2047,75 +3960,6 @@ int NV21ToYUV24(const uint8_t* src_y, return 0; } -// Convert M420 to ARGB. -LIBYUV_API -int M420ToARGB(const uint8_t* src_m420, - int src_stride_m420, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*NV12ToARGBRow)( - const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; - if (!src_m420 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_NV12TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - NV12ToARGBRow = NV12ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_NV12TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - NV12ToARGBRow = NV12ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - NV12ToARGBRow = NV12ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_NV12TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - NV12ToARGBRow = NV12ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - NV12ToARGBRow = NV12ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_NV12TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - NV12ToARGBRow = NV12ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - NV12ToARGBRow = NV12ToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, - &kYuvI601Constants, width); - NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2, - dst_argb + dst_stride_argb, &kYuvI601Constants, width); - dst_argb += dst_stride_argb * 2; - src_m420 += src_stride_m420 * 3; - } - if (height & 1) { - NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, - &kYuvI601Constants, width); - } - return 0; -} - // Convert YUY2 to ARGB. 
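YUY2, converted below, is packed 4:2:2 in a single plane: the byte sequence Y0 U Y1 V describes two pixels, so each pixel averages two bytes and a packed row stride is width * 2. UYVY, further below, is the same layout with the luma and chroma bytes transposed. A caller sketch with hypothetical names:

    #include "libyuv/convert_argb.h"

    // Packed YUY2 to ARGB with the BT.601 constants the wrapper binds.
    int ConvertYUY2Frame(const uint8_t* yuy2, uint8_t* argb, int width,
                         int height) {
      return libyuv::YUY2ToARGB(yuy2, width * 2, argb, width * 4, width,
                                height);
    }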
LIBYUV_API int YUY2ToARGB(const uint8_t* src_yuy2, @@ -2175,6 +4019,14 @@ int YUY2ToARGB(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_YUY2TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + YUY2ToARGBRow = YUY2ToARGBRow_LSX; + } + } +#endif for (y = 0; y < height; ++y) { YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width); src_yuy2 += src_stride_yuy2; @@ -2242,6 +4094,14 @@ int UYVYToARGB(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_UYVYTOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + UYVYToARGBRow = UYVYToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + UYVYToARGBRow = UYVYToARGBRow_LSX; + } + } +#endif for (y = 0; y < height; ++y) { UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width); src_uyvy += src_stride_uyvy; @@ -2264,7 +4124,7 @@ static void WeavePixels(const uint8_t* src_u, } } -// Convert Android420 to ARGB. +// Convert Android420 to ARGB with matrix. LIBYUV_API int Android420ToARGBMatrix(const uint8_t* src_y, int src_stride_y, @@ -2365,6 +4225,3144 @@ int Android420ToABGR(const uint8_t* src_y, height); } +// Convert I422 to RGBA with matrix. +LIBYUV_API +int I422ToRGBAMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; + } +#if defined(HAS_I422TORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGBAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGBARow = I422ToRGBARow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_AVX2; + } + } +#endif +#if defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGBARow = I422ToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_NEON; + } + } +#endif +#if defined(HAS_I422TORGBAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGBARow = I422ToRGBARow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_MSA; + } + } +#endif +#if defined(HAS_I422TORGBAROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToRGBARow = I422ToRGBARow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToRGBARow = I422ToRGBARow_LASX; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); + dst_rgba += dst_stride_rgba; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to RGBA. 
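I422ToRGBAMatrix above steps src_u and src_v on every row; the I420 variant later in the file advances them only on odd rows, because 4:2:0 shares one chroma row between two luma rows. That odd-row guard is the only difference between the two loop shapes, sketched here with a hypothetical row-function typedef:

    #include <stdint.h>
    #include "libyuv/convert_argb.h"  // for the YuvConstants declaration

    typedef void (*Row422)(const uint8_t* y, const uint8_t* u,
                           const uint8_t* v, uint8_t* dst,
                           const struct libyuv::YuvConstants* c, int width);

    // 4:2:0 driver shape: one chroma row feeds two luma rows.
    static void Drive420(Row422 row, const uint8_t* src_y, int stride_y,
                         const uint8_t* src_u, int stride_u,
                         const uint8_t* src_v, int stride_v, uint8_t* dst,
                         int stride_dst,
                         const struct libyuv::YuvConstants* c, int width,
                         int height) {
      for (int y = 0; y < height; ++y) {
        row(src_y, src_u, src_v, dst, c, width);
        dst += stride_dst;
        src_y += stride_y;
        if (y & 1) {  // advance chroma after every second luma row
          src_u += stride_u;
          src_v += stride_v;
        }
      }
    }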
+LIBYUV_API
+int I422ToRGBA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height) {
+  return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_rgba, dst_stride_rgba,
+                          &kYuvI601Constants, width, height);
+}
+
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height) {
+  return I422ToRGBAMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_bgra, dst_stride_bgra,
+                          &kYvuI601Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert NV12 to RGB565 with matrix.
+LIBYUV_API
+int NV12ToRGB565Matrix(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_uv,
+                       int src_stride_uv,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const struct YuvConstants* yuvconstants,
+                       int width,
+                       int height) {
+  int y;
+  void (*NV12ToRGB565Row)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C;
+  if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_MSA;
+    }
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_LSX;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_LSX;
+    }
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_LASX;
+    if (IS_ALIGNED(width, 16)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_LASX;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    NV12ToRGB565Row(src_y, src_uv, dst_rgb565, yuvconstants, width);
+    dst_rgb565 += dst_stride_rgb565;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_uv += src_stride_uv;
+    }
+  }
+  return 0;
+}
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_uv,
+                 int src_stride_uv,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height) {
+  return NV12ToRGB565Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+                            dst_rgb565, dst_stride_rgb565, &kYuvI601Constants,
+                            width, height);
+}
+
+// Convert I420 to RGBA with matrix.
+LIBYUV_API +int I420ToRGBAMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; + } +#if defined(HAS_I422TORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGBAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGBARow = I422ToRGBARow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_AVX2; + } + } +#endif +#if defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGBARow = I422ToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_NEON; + } + } +#endif +#if defined(HAS_I422TORGBAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGBARow = I422ToRGBARow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_MSA; + } + } +#endif +#if defined(HAS_I422TORGBAROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToRGBARow = I422ToRGBARow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToRGBARow = I422ToRGBARow_LASX; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); + dst_rgba += dst_stride_rgba; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGBA. +LIBYUV_API +int I420ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgba, dst_stride_rgba, + &kYuvI601Constants, width, height); +} + +// Convert I420 to BGRA. +LIBYUV_API +int I420ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height) { + return I420ToRGBAMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_bgra, dst_stride_bgra, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert I420 to RGB24 with matrix. +LIBYUV_API +int I420ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB24Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
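// ---------------------------------------------------------------------------
// [Editor's sketch; not part of the patch.] The comment above names the
// convention shared by every converter here: a negative height requests a
// vertically flipped output, implemented by pointing the destination at its
// last row and negating the stride so the row loop itself never changes.
// Caller's view, with hypothetical buffers:
#include <cstdint>
#include <vector>
#include "libyuv/convert_argb.h"  // I420ToARGB

// Renders an I420 frame upside down into dst simply by passing -height.
void RenderFlipped(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                   int width, int height, std::vector<uint8_t>* dst) {
  dst->resize((size_t)width * height * 4);
  libyuv::I420ToARGB(y, width,            // Y plane, tightly packed
                     u, (width + 1) / 2,  // U plane, half width
                     v, (width + 1) / 2,  // V plane, half width
                     dst->data(), width * 4, width, -height);
}
// The `if (height < 0)` block that follows is the entire implementation.
// ---------------------------------------------------------------------------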
+ if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_I422TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB24Row = I422ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToRGB24Row = I422ToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGB24Row = I422ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB24Row = I422ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB24Row = I422ToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_MSA; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToRGB24Row = I422ToRGB24Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToRGB24Row = I422ToRGB24Row_LASX; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGB24. +LIBYUV_API +int I420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvI601Constants, width, height); +} + +// Convert I420 to RAW. +LIBYUV_API +int I420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert J420 to RGB24. +LIBYUV_API +int J420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvJPEGConstants, width, height); +} + +// Convert J420 to RAW. +LIBYUV_API +int J420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuJPEGConstants, // Use Yvu matrix + width, height); +} + +// Convert H420 to RGB24. 
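// ---------------------------------------------------------------------------
// [Editor's sketch; not part of the patch.] The I/J/H prefixes in the
// wrappers above and below encode colorimetry only; the plane layout is
// identical. I = BT.601 limited range, J = full-range BT.601 (JPEG),
// H = BT.709 limited range. A chooser over the three matrices this section
// uses, assuming the declarations in libyuv/row.h:
#include "libyuv/row.h"

const libyuv::YuvConstants* PickYuvConstants(bool bt709, bool full_range) {
  using namespace libyuv;
  if (bt709) return &kYuvH709Constants;    // 'H' wrappers (limited range)
  return full_range ? &kYuvJPEGConstants   // 'J' wrappers
                    : &kYuvI601Constants;  // 'I' wrappers
}
// ---------------------------------------------------------------------------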
+LIBYUV_API +int H420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvH709Constants, width, height); +} + +// Convert H420 to RAW. +LIBYUV_API +int H420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + +// Convert I420 to ARGB1555. +LIBYUV_API +int I420ToARGB1555(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height) { + int y; + void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) = I422ToARGB1555Row_C; + if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555; + dst_stride_argb1555 = -dst_stride_argb1555; + } +#if defined(HAS_I422TOARGB1555ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGB1555Row = I422ToARGB1555Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_NEON; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_MSA; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_LASX; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_LASX; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants, + width); + dst_argb1555 += dst_stride_argb1555; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to ARGB4444. 
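// ---------------------------------------------------------------------------
// [Editor's sketch; not part of the patch.] Output layouts for the two
// 16-bit formats handled around this point, as little-endian words from MSB
// to LSB: ARGB1555 = A RRRRR GGGGG BBBBB; ARGB4444 = AAAA RRRR GGGG BBBB.
// Scalar equivalents of the packing the row functions perform:
#include <stdint.h>

static inline uint16_t PackARGB1555(uint8_t a, uint8_t r, uint8_t g,
                                    uint8_t b) {
  return (uint16_t)(((a >> 7) << 15) | ((r >> 3) << 10) | ((g >> 3) << 5) |
                    (b >> 3));
}

static inline uint16_t PackARGB4444(uint8_t a, uint8_t r, uint8_t g,
                                    uint8_t b) {
  return (uint16_t)(((a >> 4) << 12) | ((r >> 4) << 8) | ((g >> 4) << 4) |
                    (b >> 4));
}
// ---------------------------------------------------------------------------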
+LIBYUV_API +int I420ToARGB4444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height) { + int y; + void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) = I422ToARGB4444Row_C; + if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444; + dst_stride_argb4444 = -dst_stride_argb4444; + } +#if defined(HAS_I422TOARGB4444ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGB4444ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGB4444Row = I422ToARGB4444Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGB4444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_NEON; + } + } +#endif +#if defined(HAS_I422TOARGB4444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_MSA; + } + } +#endif +#if defined(HAS_I422TOARGB4444ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_LASX; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_LASX; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants, + width); + dst_argb4444 += dst_stride_argb4444; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGB565 with specified color matrix. +LIBYUV_API +int I420ToRGB565Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB565Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_I422TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGB565Row = I422ToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGB565Row = I422ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_NEON; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB565Row = I422ToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_MSA; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToRGB565Row = I422ToRGB565Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToRGB565Row = I422ToRGB565Row_LASX; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGB565. +LIBYUV_API +int I420ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb565, dst_stride_rgb565, + &kYuvI601Constants, width, height); +} + +// Convert J420 to RGB565. +LIBYUV_API +int J420ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb565, dst_stride_rgb565, + &kYuvJPEGConstants, width, height); +} + +// Convert H420 to RGB565. +LIBYUV_API +int H420ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb565, dst_stride_rgb565, + &kYuvH709Constants, width, height); +} + +// Convert I422 to RGB565. +LIBYUV_API +int I422ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + int y; + void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB565Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
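// ---------------------------------------------------------------------------
// [Editor's sketch; not part of the patch.] The only structural difference
// between the I420 converters above and the I422 converter below is chroma
// row stepping: 4:2:0 shares one chroma row between two luma rows (the
// `if (y & 1)` advance), while 4:2:2 advances chroma on every row. As index
// arithmetic:
static inline int ChromaRow420(int y) { return y >> 1; }  // 2 luma rows per chroma row
static inline int ChromaRow422(int y) { return y; }       // 1 luma row per chroma row
// ---------------------------------------------------------------------------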
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB565Row = I422ToRGB565Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_MSA;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToRGB565Row = I422ToRGB565Row_LASX;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
+    dst_rgb565 += dst_stride_rgb565;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
+static const uint8_t kDither565_4x4[16] = {
+    0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
+};
+
+// Convert I420 to RGB565 with dithering.
+LIBYUV_API
+int I420ToRGB565Dither(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_u,
+                       int src_stride_u,
+                       const uint8_t* src_v,
+                       int src_stride_v,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const uint8_t* dither4x4,
+                       int width,
+                       int height) {
+  int y;
+  void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToARGBRow_C;
+  void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
+                                const uint32_t dither4, int width) =
+      ARGBToRGB565DitherRow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
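// ---------------------------------------------------------------------------
// [Editor's sketch; not part of the patch.] The dither path converts each
// row to ARGB first, then repacks to 565 while adding a small position-
// dependent bias before truncation, trading banding for fine noise. The row
// kernel receives 4 table bytes picked by `(y & 3) << 2`; per pixel it does
// the equivalent of the following (modeled on ARGBToRGB565DitherRow_C):
#include <stdint.h>

static inline uint16_t Dither565Pixel(uint8_t r, uint8_t g, uint8_t b,
                                      const uint8_t* dither_row,  // 4 bytes
                                      int x) {
  int d = dither_row[x & 3];  // bias 0..7, under one 5-bit quantization step
  int rd = r + d > 255 ? 255 : r + d;
  int gd = g + d > 255 ? 255 : g + d;
  int bd = b + d > 255 ? 255 : b + d;
  return (uint16_t)(((rd >> 3) << 11) | ((gd >> 2) << 5) | (bd >> 3));
}
// ---------------------------------------------------------------------------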
+ if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } + if (!dither4x4) { + dither4x4 = kDither565_4x4; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) == + (kCpuHasAVX512BW | kCpuHasAVX512VL)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX512BW; + if (IS_ALIGNED(width, 32)) { + I422ToARGBRow = I422ToARGBRow_AVX512BW; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_I422TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToARGBRow = I422ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToARGBRow = I422ToARGBRow_LASX; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LASX; + } + } +#endif + { + // Allocate a row of argb. + align_buffer_64(row_argb, width * 4); + for (y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width); + ARGBToRGB565DitherRow(row_argb, dst_rgb565, + *(const uint32_t*)(dither4x4 + ((y & 3) << 2)), + width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + free_aligned_buffer_64(row_argb); + } + return 0; +} + +// Convert I420 to AR30 with matrix. 
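// ---------------------------------------------------------------------------
// [Editor's sketch; not part of the patch.] I420ToRGB565Dither above shows
// the fallback used whenever no fused YUV-to-target row kernel exists: run
// the YUV-to-ARGB kernel into one aligned scratch row, then repack that row
// into the target format. A generic version of the shape (hypothetical
// kernel typedefs; align_buffer_64 stood in for by posix_memalign):
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

typedef void (*ExpandRowFn)(const uint8_t* src, uint8_t* argb, int width);
typedef void (*RepackRowFn)(const uint8_t* argb, uint8_t* dst, int width);

static int TwoStageConvert(const uint8_t* src, int src_stride, uint8_t* dst,
                           int dst_stride, int width, int height,
                           ExpandRowFn expand, RepackRowFn repack) {
  void* row = NULL;  // 64-byte aligned so SIMD kernels can use aligned stores
  if (posix_memalign(&row, 64, (size_t)width * 4)) return -1;
  for (int y = 0; y < height; ++y) {
    expand(src + (ptrdiff_t)y * src_stride, (uint8_t*)row, width);
    repack((const uint8_t*)row, dst + (ptrdiff_t)y * dst_stride, width);
  }
  free(row);
  return 0;
}
// ---------------------------------------------------------------------------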
+LIBYUV_API +int I420ToAR30Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToAR30Row_C; + + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } + +#if defined(HAS_I422TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToAR30Row = I422ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToAR30Row = I422ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToAR30Row = I422ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToAR30Row = I422ToAR30Row_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to AR30. +LIBYUV_API +int I420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvI601Constants, width, height); +} + +// Convert H420 to AR30. +LIBYUV_API +int H420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYvuH709Constants, width, height); +} + +// Convert I420 to AB30. +LIBYUV_API +int I420ToAB30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuI601Constants, width, height); +} + +// Convert H420 to AB30. 
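// ---------------------------------------------------------------------------
// [Editor's sketch; not part of the patch.] AR30 packs one pixel per 32-bit
// little-endian word: 10 bits blue in the LSBs, then 10 green, 10 red, and
// 2 alpha bits at the top; AB30 swaps the red and blue fields. 8-bit inputs
// are widened by replicating their top two bits so 0xFF maps to full-scale
// 0x3FF (modeled on libyuv's ARGBToAR30Row_C; worth verifying against
// row_common.cc):
#include <stdint.h>

static inline uint32_t PackAR30(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
  uint32_t r10 = ((uint32_t)r << 2) | (r >> 6);  // replicate top 2 bits
  uint32_t g10 = ((uint32_t)g << 2) | (g >> 6);
  uint32_t b10 = ((uint32_t)b << 2) | (b >> 6);
  return b10 | (g10 << 10) | (r10 << 20) | ((uint32_t)(a >> 6) << 30);
}
// ---------------------------------------------------------------------------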
+LIBYUV_API +int H420ToAB30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuH709Constants, width, height); +} + +static int I420ToARGBMatrixBilinear(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToARGBRow_C; + void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleRowUp2_Bilinear_Any_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I444TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToARGBRow = I444ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToARGBRow = I444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I444ToARGBRow = I444ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToARGBRow = I444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444ToARGBRow = I444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_I444TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I444ToARGBRow = I444ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I444ToARGBRow = I444ToARGBRow_LASX; + } + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2; + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3; + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2; + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON; + } +#endif + + // alloc 4 lines temp + const int kRowSize = (width + 31) & ~31; + align_buffer_64(row, kRowSize * 4); + uint8_t* temp_u_1 = row; + uint8_t* temp_u_2 = row + kRowSize; + uint8_t* temp_v_1 = row + kRowSize * 2; + uint8_t* temp_v_2 = row + kRowSize * 3; + + Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); + Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + + for (y = 0; y < height - 2; y += 2) { + Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width); + Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width); + I444ToARGBRow(src_y, temp_u_1, temp_v_1, 
dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + I444ToARGBRow(src_y, temp_u_2, temp_v_2, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + + if (!(height & 1)) { + Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); + Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); + } + + free_aligned_buffer_64(row); + return 0; +} + +static int I422ToARGBMatrixLinear(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToARGBRow_C; + void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) = + ScaleRowUp2_Linear_Any_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I444TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToARGBRow = I444ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToARGBRow = I444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I444ToARGBRow = I444ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToARGBRow = I444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444ToARGBRow = I444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_MSA; + } + } +#endif +#if defined(HAS_I444TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I444ToARGBRow = I444ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I444ToARGBRow = I444ToARGBRow_LASX; + } + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_NEON; + } +#endif + + // alloc 2 lines temp + const int kRowSize = (width + 31) & ~31; + align_buffer_64(row, kRowSize * 2); + uint8_t* temp_u = row; + uint8_t* temp_v = row + kRowSize; + + for (y = 0; y < height; ++y) { + ScaleRowUp(src_u, temp_u, width); + ScaleRowUp(src_v, temp_v, width); + I444ToARGBRow(src_y, temp_u, temp_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + + free_aligned_buffer_64(row); + return 0; +} + +static int I010ToAR30MatrixBilinear(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const 
uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I410ToAR30Row_C; + void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleRowUp2_Bilinear_16_Any_C; + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_I410TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410ToAR30Row = I410ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410ToAR30Row = I410ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I410TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410ToAR30Row = I410ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410ToAR30Row = I410ToAR30Row_AVX2; + } + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3; + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2; + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON; + } +#endif + + // alloc 4 lines temp + const int kRowSize = (width + 31) & ~31; + align_buffer_64(row, kRowSize * 4 * sizeof(uint16_t)); + uint16_t* temp_u_1 = (uint16_t*)(row); + uint16_t* temp_u_2 = (uint16_t*)(row) + kRowSize; + uint16_t* temp_v_1 = (uint16_t*)(row) + kRowSize * 2; + uint16_t* temp_v_2 = (uint16_t*)(row) + kRowSize * 3; + + Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); + Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + + for (y = 0; y < height - 2; y += 2) { + Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width); + Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width); + I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + I410ToAR30Row(src_y, temp_u_2, temp_v_2, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + + if (!(height & 1)) { + Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); + Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width); + } + + free_aligned_buffer_64(row); + + return 0; +} + +static int I210ToAR30MatrixLinear(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I410ToAR30Row_C; + void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + if (!src_y || !src_u || 
!src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_I410TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410ToAR30Row = I410ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410ToAR30Row = I410ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I410TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410ToAR30Row = I410ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410ToAR30Row = I410ToAR30Row_AVX2; + } + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON; + } +#endif + + // alloc 2 lines temp + const int kRowSize = (width + 31) & ~31; + align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t)); + uint16_t* temp_u = (uint16_t*)(row); + uint16_t* temp_v = (uint16_t*)(row) + kRowSize; + + for (y = 0; y < height; ++y) { + ScaleRowUp(src_u, temp_u, width); + ScaleRowUp(src_v, temp_v, width); + I410ToAR30Row(src_y, temp_u, temp_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + free_aligned_buffer_64(row); + return 0; +} + +static int I010ToARGBMatrixBilinear(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I410ToARGBRow_C; + void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleRowUp2_Bilinear_16_Any_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
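// ---------------------------------------------------------------------------
// [Editor's sketch; not part of the patch.] Two recurring details in these
// 10-bit helpers: the temp chroma rows are padded to a multiple of 32
// elements so the _Any_ kernels may safely run a full vector past `width`
// without clobbering the next temp row, and the 10-bit paths reuse the
// 12-bit-safe upsamplers (ScaleRowUp2_*_12_*) since 10-bit samples fit with
// headroom. The padding arithmetic:
static inline int RoundUpTo32(int width) {
  return (width + 31) & ~31;  // e.g. 100 -> 128, 64 -> 64
}
// ---------------------------------------------------------------------------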
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I410TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410ToARGBRow = I410ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410ToARGBRow = I410ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I410TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410ToARGBRow = I410ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410ToARGBRow = I410ToARGBRow_AVX2; + } + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3; + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2; + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON; + } +#endif + + // alloc 4 lines temp + const int kRowSize = (width + 31) & ~31; + align_buffer_64(row, kRowSize * 4 * sizeof(uint16_t)); + uint16_t* temp_u_1 = (uint16_t*)(row); + uint16_t* temp_u_2 = (uint16_t*)(row) + kRowSize; + uint16_t* temp_v_1 = (uint16_t*)(row) + kRowSize * 2; + uint16_t* temp_v_2 = (uint16_t*)(row) + kRowSize * 3; + + Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); + Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + + for (y = 0; y < height - 2; y += 2) { + Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width); + Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width); + I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + I410ToARGBRow(src_y, temp_u_2, temp_v_2, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + + if (!(height & 1)) { + Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); + Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); + } + + free_aligned_buffer_64(row); + return 0; +} + +static int I210ToARGBMatrixLinear(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I410ToARGBRow_C; + void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
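// ---------------------------------------------------------------------------
// [Editor's sketch; not part of the patch.] These helpers upgrade chroma to
// 4:4:4 and then run a 444 row kernel: 4:2:0 sources need a 2x upsample in
// both directions (the *Bilinear variants), while 4:2:2 sources need only
// the horizontal pass (the *Linear variants). Simplified from libyuv's
// ScaleRowUp2_Linear_C: each source sample yields two outputs, blended 3:1
// and 1:3 with its right neighbor. As written it reads one sample past the
// last pair; the edge handling lives in the _Any_ wrappers upstream.
#include <stdint.h>

static void LinearUp2Row(const uint8_t* src, uint8_t* dst, int dst_width) {
  int src_width = dst_width >> 1;
  for (int x = 0; x < src_width; ++x) {
    dst[2 * x + 0] = (uint8_t)((src[x] * 3 + src[x + 1] + 2) >> 2);
    dst[2 * x + 1] = (uint8_t)((src[x] + src[x + 1] * 3 + 2) >> 2);
  }
}
// ---------------------------------------------------------------------------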
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I410TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410ToARGBRow = I410ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410ToARGBRow = I410ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I410TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410ToARGBRow = I410ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410ToARGBRow = I410ToARGBRow_AVX2; + } + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON; + } +#endif + + // alloc 2 lines temp + const int kRowSize = (width + 31) & ~31; + align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t)); + uint16_t* temp_u = (uint16_t*)(row); + uint16_t* temp_v = (uint16_t*)(row) + kRowSize; + + for (y = 0; y < height; ++y) { + ScaleRowUp(src_u, temp_u, width); + ScaleRowUp(src_v, temp_v, width); + I410ToARGBRow(src_y, temp_u, temp_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + + free_aligned_buffer_64(row); + return 0; +} + +static int I420AlphaToARGBMatrixBilinear( + const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I444AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I444AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleRowUp2_Bilinear_Any_C; + if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. 
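// ---------------------------------------------------------------------------
// [Editor's sketch; not part of the patch.] When `attenuate` is nonzero the
// Alpha converters premultiply each output pixel, scaling the color channels
// by alpha so the result composites with a plain add. Per-pixel model of
// ARGBAttenuateRow (libyuv's C path uses (c * a + 255) >> 8 as a cheap
// stand-in for exact division by 255; bytes are B,G,R,A in memory):
#include <stdint.h>

static inline void AttenuatePixel(uint8_t* argb) {
  uint32_t a = argb[3];
  argb[0] = (uint8_t)((argb[0] * a + 255) >> 8);  // B
  argb[1] = (uint8_t)((argb[1] * a + 255) >> 8);  // G
  argb[2] = (uint8_t)((argb[2] * a + 255) >> 8);  // R
}
// ---------------------------------------------------------------------------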
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I444ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_MSA; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_LASX; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2; + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3; + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2; + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON; + } +#endif + + // alloc 4 lines temp + const int kRowSize = (width + 31) & ~31; + align_buffer_64(row, kRowSize * 4); + uint8_t* temp_u_1 = row; + uint8_t* temp_u_2 = row + kRowSize; + uint8_t* temp_v_1 = row + kRowSize * 2; + uint8_t* temp_v_2 = row + kRowSize * 3; + + Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); + Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_a += src_stride_a; + + for (y = 0; y < height - 2; y += 2) { + Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width); + Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width); + I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, 
dst_argb, width); + } + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_a += src_stride_a; + I444AlphaToARGBRow(src_y, temp_u_2, temp_v_2, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + src_a += src_stride_a; + } + + if (!(height & 1)) { + Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); + Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + } + + free_aligned_buffer_64(row); + return 0; +} + +static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I444AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I444AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) = + ScaleRowUp2_Linear_Any_C; + if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I444ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_MSA; + } + } +#endif +#if defined(HAS_I444ALPHATOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_LASX; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; 
+ } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_NEON; + } +#endif + + // alloc 2 lines temp + const int kRowSize = (width + 31) & ~31; + align_buffer_64(row, kRowSize * 2); + uint8_t* temp_u = row; + uint8_t* temp_v = row + kRowSize; + + for (y = 0; y < height; ++y) { + ScaleRowUp(src_u, temp_u, width); + ScaleRowUp(src_v, temp_v, width); + I444AlphaToARGBRow(src_y, temp_u, temp_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + + free_aligned_buffer_64(row); + return 0; +} + +static int I010AlphaToARGBMatrixBilinear( + const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I410AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I410AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleRowUp2_Bilinear_16_Any_C; + if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. 
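// ---------------------------------------------------------------------------
// [Editor's sketch; not part of the patch.] All the *Bilinear helpers share
// one loop shape: a single upsample call fills TWO full-width chroma rows
// (kRowSize apart), which feed two output rows before the source chroma
// advances one row; the first and last rows pass a source stride of 0 so the
// kernel interpolates against a replicated edge row. Skeleton with
// hypothetical kernel typedefs:
#include <stddef.h>
#include <stdint.h>

typedef void (*Up2Fn)(const uint8_t* src, ptrdiff_t src_stride, uint8_t* dst,
                      ptrdiff_t dst_stride, int width);
typedef void (*RowFn)(const uint8_t* y, const uint8_t* chroma444,
                      uint8_t* dst, int width);

static void BilinearLoop(Up2Fn up2, RowFn row, const uint8_t* src_y, int sy,
                         const uint8_t* src_c, int sc, uint8_t* dst, int sd,
                         uint8_t* t1, uint8_t* t2, ptrdiff_t trow, int width,
                         int height) {
  up2(src_c, 0, t1, trow, width);  // top edge: row 0 replicated
  row(src_y, t1, dst, width); src_y += sy; dst += sd;
  for (int y = 0; y < height - 2; y += 2) {
    up2(src_c, sc, t1, trow, width);  // fills both t1 and t2
    row(src_y, t1, dst, width); src_y += sy; dst += sd;
    row(src_y, t2, dst, width); src_y += sy; dst += sd;
    src_c += sc;  // chroma advances one row per two luma rows
  }
  if (!(height & 1)) {               // even height: one output row left
    up2(src_c, 0, t1, trow, width);  // bottom edge replicated
    row(src_y, t1, dst, width);
  }
}
// ---------------------------------------------------------------------------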
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I410ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I410ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3; + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2; + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON; + } +#endif + + // alloc 4 lines temp + const int kRowSize = (width + 31) & ~31; + align_buffer_64(row, kRowSize * 4 * sizeof(uint16_t)); + uint16_t* temp_u_1 = (uint16_t*)(row); + uint16_t* temp_u_2 = (uint16_t*)(row) + kRowSize; + uint16_t* temp_v_1 = (uint16_t*)(row) + kRowSize * 2; + uint16_t* temp_v_2 = (uint16_t*)(row) + kRowSize * 3; + + Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); + Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_a += src_stride_a; + + for (y = 0; y < height - 2; y += 2) { + Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width); + Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width); + I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_a += src_stride_a; + I410AlphaToARGBRow(src_y, temp_u_2, temp_v_2, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_a += src_stride_a; + src_u += src_stride_u; + src_v += src_stride_v; + } + + if (!(height & 1)) { + Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); + Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + } + + 
free_aligned_buffer_64(row); + return 0; +} + +static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I410AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I410AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I410ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I410ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif + +#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON; + } +#endif + + // alloc 2 lines temp + const int kRowSize = (width + 31) & ~31; + align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t)); + uint16_t* temp_u = (uint16_t*)(row); + uint16_t* temp_v = (uint16_t*)(row) + kRowSize; + + for (y = 0; y < height; ++y) { + ScaleRowUp(src_u, temp_u, width); + ScaleRowUp(src_v, temp_v, width); + I410AlphaToARGBRow(src_y, temp_u, temp_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + free_aligned_buffer_64(row); + return 0; +} + +static int 
P010ToARGBMatrixBilinear(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P410ToARGBRow)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P410ToARGBRow_C; + void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleUVRowUp2_Bilinear_16_Any_C; + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_P410TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P410ToARGBRow = P410ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P410ToARGBRow = P410ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_P410TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P410ToARGBRow = P410ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P410ToARGBRow = P410ToARGBRow_AVX2; + } + } +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 + if (TestCpuFlag(kCpuHasSSE41)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41; + } +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON; + } +#endif + + // alloc 2 lines temp + const int kRowSize = (2 * width + 31) & ~31; + align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t)); + uint16_t* temp_uv_1 = (uint16_t*)(row); + uint16_t* temp_uv_2 = (uint16_t*)(row) + kRowSize; + + Scale2RowUp(src_uv, 0, temp_uv_1, kRowSize, width); + P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + + for (y = 0; y < height - 2; y += 2) { + Scale2RowUp(src_uv, src_stride_uv, temp_uv_1, kRowSize, width); + P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + P410ToARGBRow(src_y, temp_uv_2, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_uv += src_stride_uv; + } + + if (!(height & 1)) { + Scale2RowUp(src_uv, 0, temp_uv_1, kRowSize, width); + P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width); + } + + free_aligned_buffer_64(row); + return 0; +} + +static int P210ToARGBMatrixLinear(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P410ToARGBRow)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P410ToARGBRow_C; + void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) = + ScaleUVRowUp2_Linear_16_Any_C; + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
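+  // Inversion is handled on the destination side: the first output row is
+  // aimed at the bottom of the buffer and the stride is negated, so the
+  // source planes are still read top to bottom (e.g. a 4-row image is
+  // written to rows 3, 2, 1, 0).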
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_P410TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P410ToARGBRow = P410ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P410ToARGBRow = P410ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_P410TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P410ToARGBRow = P410ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P410ToARGBRow = P410ToARGBRow_AVX2; + } + } +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 + if (TestCpuFlag(kCpuHasSSE41)) { + ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41; + } +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON; + } +#endif + + const int kRowSize = (2 * width + 31) & ~31; + align_buffer_64(row, kRowSize * sizeof(uint16_t)); + uint16_t* temp_uv = (uint16_t*)(row); + + for (y = 0; y < height; ++y) { + ScaleRowUp(src_uv, temp_uv, width); + P410ToARGBRow(src_y, temp_uv, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_uv += src_stride_uv; + } + + free_aligned_buffer_64(row); + return 0; +} + +static int P010ToAR30MatrixBilinear(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P410ToAR30Row)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P410ToAR30Row_C; + void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleUVRowUp2_Bilinear_16_Any_C; + if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
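+  // The bilinear path doubles UV both horizontally and vertically: each pass
+  // reads two source UV rows and writes two interpolated rows into the temp
+  // buffer (kRowSize elements apart), weighting the nearer sample 3:1 per
+  // axis, roughly (9 * a + 3 * b + 3 * c + d + 8) >> 4 per component (a
+  // sketch of the C kernel, not the exact SIMD code).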
+ if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_P410TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P410ToAR30Row = P410ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P410ToAR30Row = P410ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_P410TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P410ToAR30Row = P410ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P410ToAR30Row = P410ToAR30Row_AVX2; + } + } +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 + if (TestCpuFlag(kCpuHasSSE41)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41; + } +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON; + } +#endif + + // alloc 2 lines temp + const int kRowSize = (2 * width + 31) & ~31; + align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t)); + uint16_t* temp_uv_1 = (uint16_t*)(row); + uint16_t* temp_uv_2 = (uint16_t*)(row) + kRowSize; + + Scale2RowUp(src_uv, 0, temp_uv_1, kRowSize, width); + P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + + for (y = 0; y < height - 2; y += 2) { + Scale2RowUp(src_uv, src_stride_uv, temp_uv_1, kRowSize, width); + P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + P410ToAR30Row(src_y, temp_uv_2, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + src_uv += src_stride_uv; + } + + if (!(height & 1)) { + Scale2RowUp(src_uv, 0, temp_uv_1, kRowSize, width); + P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width); + } + + free_aligned_buffer_64(row); + return 0; +} + +static int P210ToAR30MatrixLinear(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P410ToAR30Row)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P410ToAR30Row_C; + void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) = + ScaleUVRowUp2_Linear_16_Any_C; + if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
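+  // The linear path widens UV horizontally only, since 422 already has full
+  // vertical chroma resolution; each output pair blends neighbouring samples
+  // 3:1 with rounding, roughly
+  //   dst[2 * i + 0] = (3 * src[i] + src[i + 1] + 2) >> 2;
+  //   dst[2 * i + 1] = (src[i] + 3 * src[i + 1] + 2) >> 2;
+  // per component (a sketch of the C kernel, not the exact SIMD code).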
+  if (height < 0) {
+    height = -height;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+    dst_stride_ar30 = -dst_stride_ar30;
+  }
+#if defined(HAS_P410TOAR30ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    P410ToAR30Row = P410ToAR30Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      P410ToAR30Row = P410ToAR30Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_P410TOAR30ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    P410ToAR30Row = P410ToAR30Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      P410ToAR30Row = P410ToAR30Row_AVX2;
+    }
+  }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
+  if (TestCpuFlag(kCpuHasSSE41)) {
+    ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41;
+  }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2;
+  }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON;
+  }
+#endif
+
+  const int kRowSize = (2 * width + 31) & ~31;
+  align_buffer_64(row, kRowSize * sizeof(uint16_t));
+  uint16_t* temp_uv = (uint16_t*)(row);
+
+  for (y = 0; y < height; ++y) {
+    ScaleRowUp(src_uv, temp_uv, width);
+    P410ToAR30Row(src_y, temp_uv, dst_ar30, yuvconstants, width);
+    dst_ar30 += dst_stride_ar30;
+    src_y += src_stride_y;
+    src_uv += src_stride_uv;
+  }
+
+  free_aligned_buffer_64(row);
+  return 0;
+}
+
+LIBYUV_API
+int I420ToARGBMatrixFilter(const uint8_t* src_y,
+                           int src_stride_y,
+                           const uint8_t* src_u,
+                           int src_stride_u,
+                           const uint8_t* src_v,
+                           int src_stride_v,
+                           uint8_t* dst_argb,
+                           int dst_stride_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height,
+                           enum FilterMode filter) {
+  switch (filter) {
+    case kFilterNone:
+      return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                              src_stride_v, dst_argb, dst_stride_argb,
+                              yuvconstants, width, height);
+    case kFilterBilinear:
+    case kFilterBox:
+      return I420ToARGBMatrixBilinear(
+          src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+          dst_argb, dst_stride_argb, yuvconstants, width, height);
+    case kFilterLinear:
+      // Linear filtering is feasible here, but no caller is known to need it.
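+      // kFilterLinear would widen 420 chroma horizontally but leave the
+      // vertical direction unfiltered (each chroma row reused for two output
+      // rows); callers wanting smooth chroma should use the kFilterBilinear
+      // case above.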
+ return -1; + } + + return -1; +} + +LIBYUV_API +int I422ToARGBMatrixFilter(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + yuvconstants, width, height); + case kFilterBilinear: + case kFilterBox: + case kFilterLinear: + return I422ToARGBMatrixLinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + dst_argb, dst_stride_argb, yuvconstants, width, height); + } + + return -1; +} + +LIBYUV_API +int I010ToAR30MatrixFilter(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + yuvconstants, width, height); + case kFilterBilinear: + case kFilterBox: + return I010ToAR30MatrixBilinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + dst_ar30, dst_stride_ar30, yuvconstants, width, height); + case kFilterLinear: + return -1; + } + + return -1; +} + +LIBYUV_API +int I210ToAR30MatrixFilter(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + yuvconstants, width, height); + case kFilterBilinear: + case kFilterBox: + case kFilterLinear: + return I210ToAR30MatrixLinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + dst_ar30, dst_stride_ar30, yuvconstants, width, height); + } + + return -1; +} + +LIBYUV_API +int I010ToARGBMatrixFilter(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + yuvconstants, width, height); + case kFilterBilinear: + case kFilterBox: + return I010ToARGBMatrixBilinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + dst_argb, dst_stride_argb, yuvconstants, width, height); + case kFilterLinear: + return -1; + } + + return -1; +} + +LIBYUV_API +int I210ToARGBMatrixFilter(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + 
yuvconstants, width, height); + case kFilterBilinear: + case kFilterBox: + case kFilterLinear: + return I210ToARGBMatrixLinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + dst_argb, dst_stride_argb, yuvconstants, width, height); + } + + return -1; +} + +LIBYUV_API +int I420AlphaToARGBMatrixFilter(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, + src_v, src_stride_v, src_a, src_stride_a, + dst_argb, dst_stride_argb, yuvconstants, + width, height, attenuate); + case kFilterBilinear: + case kFilterBox: + return I420AlphaToARGBMatrixBilinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a, + src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height, + attenuate); + case kFilterLinear: + return -1; + } + + return -1; +} + +LIBYUV_API +int I422AlphaToARGBMatrixFilter(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I422AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, + src_v, src_stride_v, src_a, src_stride_a, + dst_argb, dst_stride_argb, yuvconstants, + width, height, attenuate); + case kFilterBilinear: + case kFilterBox: + case kFilterLinear: + return I422AlphaToARGBMatrixLinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a, + src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height, + attenuate); + } + + return -1; +} + +LIBYUV_API +int I010AlphaToARGBMatrixFilter(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I010AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, + src_v, src_stride_v, src_a, src_stride_a, + dst_argb, dst_stride_argb, yuvconstants, + width, height, attenuate); + case kFilterBilinear: + case kFilterBox: + return I010AlphaToARGBMatrixBilinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a, + src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height, + attenuate); + case kFilterLinear: + return -1; + } + + return -1; +} + +LIBYUV_API +int I210AlphaToARGBMatrixFilter(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I210AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, + src_v, src_stride_v, src_a, src_stride_a, + dst_argb, dst_stride_argb, 
yuvconstants, + width, height, attenuate); + case kFilterBilinear: + case kFilterBox: + case kFilterLinear: + return I210AlphaToARGBMatrixLinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a, + src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height, + attenuate); + } + + return -1; +} + +LIBYUV_API +int P010ToARGBMatrixFilter(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return P010ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, + dst_argb, dst_stride_argb, yuvconstants, width, + height); + case kFilterBilinear: + case kFilterBox: + return P010ToARGBMatrixBilinear(src_y, src_stride_y, src_uv, + src_stride_uv, dst_argb, dst_stride_argb, + yuvconstants, width, height); + case kFilterLinear: + return -1; + } + + return -1; +} + +LIBYUV_API +int P210ToARGBMatrixFilter(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return P210ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, + dst_argb, dst_stride_argb, yuvconstants, width, + height); + case kFilterBilinear: + case kFilterBox: + case kFilterLinear: + return P210ToARGBMatrixLinear(src_y, src_stride_y, src_uv, src_stride_uv, + dst_argb, dst_stride_argb, yuvconstants, + width, height); + } + + return -1; +} + +LIBYUV_API +int P010ToAR30MatrixFilter(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return P010ToAR30Matrix(src_y, src_stride_y, src_uv, src_stride_uv, + dst_ar30, dst_stride_ar30, yuvconstants, width, + height); + case kFilterBilinear: + case kFilterBox: + return P010ToAR30MatrixBilinear(src_y, src_stride_y, src_uv, + src_stride_uv, dst_ar30, dst_stride_ar30, + yuvconstants, width, height); + case kFilterLinear: + return -1; + } + + return -1; +} + +LIBYUV_API +int P210ToAR30MatrixFilter(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return P210ToAR30Matrix(src_y, src_stride_y, src_uv, src_stride_uv, + dst_ar30, dst_stride_ar30, yuvconstants, width, + height); + case kFilterBilinear: + case kFilterBox: + case kFilterLinear: + return P210ToAR30MatrixLinear(src_y, src_stride_y, src_uv, src_stride_uv, + dst_ar30, dst_stride_ar30, yuvconstants, + width, height); + } + + return -1; +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/convert_from.cc b/files/source/convert_from.cc index 60140cb4..8bd07e4c 100644 --- a/files/source/convert_from.cc +++ b/files/source/convert_from.cc @@ -30,6 +30,8 @@ static __inline int Abs(int v) { } // I420 To any I4xx YUV format with mirroring. 
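+// A same-size ScalePlane call already reduces to a straight copy internally,
+// so the TODO below is about making that explicit for the Y plane
+// (kFilterNone, or CopyPlane directly) instead of relying on the scaler's
+// fast path.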
+// TODO(fbarchard): Consider kFilterNone for Y, or CopyPlane + static int I420ToI4xx(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, @@ -83,7 +85,8 @@ int I420ToI010(const uint8_t* src_y, int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -109,6 +112,51 @@ int I420ToI010(const uint8_t* src_y, return 0; } +// Convert 8 bit YUV to 12 bit. +LIBYUV_API +int I420ToI012(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. + Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 4096, width, + height); + // Convert UV planes. + Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 4096, halfwidth, + halfheight); + Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 4096, halfwidth, + halfheight); + return 0; +} + // 420 chroma is 1/2 width, 1/2 height // 422 chroma is 1/2 width, 1x height LIBYUV_API @@ -159,6 +207,102 @@ int I420ToI444(const uint8_t* src_y, dst_uv_height); } +// 420 chroma to 444 chroma, 10/12 bit version +LIBYUV_API +int I010ToI410(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + if (width == 0 || height == 0) { + return -1; + } + + if (dst_y) { + ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + } + ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), + SUBSAMPLE(height, 1, 1), dst_u, dst_stride_u, Abs(width), + Abs(height), kFilterBilinear); + ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), + SUBSAMPLE(height, 1, 1), dst_v, dst_stride_v, Abs(width), + Abs(height), kFilterBilinear); + return 0; +} + +// 422 chroma to 444 chroma, 10/12 bit version +LIBYUV_API +int I210ToI410(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + if (width == 0 || height == 0) { + return -1; + } + + if (dst_y) { + ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + } + ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u, + dst_stride_u, Abs(width), Abs(height), kFilterBilinear); + 
ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v, + dst_stride_v, Abs(width), Abs(height), kFilterBilinear); + return 0; +} + +// 422 chroma is 1/2 width, 1x height +// 444 chroma is 1x width, 1x height +LIBYUV_API +int I422ToI444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + if (width == 0 || height == 0) { + return -1; + } + + if (dst_y) { + ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + } + ScalePlane(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u, + dst_stride_u, Abs(width), Abs(height), kFilterBilinear); + ScalePlane(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v, + dst_stride_v, Abs(width), Abs(height), kFilterBilinear); + return 0; +} + // Copy to I400. Source can be I420,422,444,400,NV12,NV21 LIBYUV_API int I400Copy(const uint8_t* src_y, @@ -302,11 +446,11 @@ int I420ToYUY2(const uint8_t* src_y, } } #endif -#if defined(HAS_I422TOYUY2ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToYUY2Row = I422ToYUY2Row_Any_MMI; - if (IS_ALIGNED(width, 8)) { - I422ToYUY2Row = I422ToYUY2Row_MMI; +#if defined(HAS_I422TOYUY2ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToYUY2Row = I422ToYUY2Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_LASX; } } #endif @@ -389,11 +533,11 @@ int I422ToUYVY(const uint8_t* src_y, } } #endif -#if defined(HAS_I422TOUYVYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToUYVYRow = I422ToUYVYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - I422ToUYVYRow = I422ToUYVYRow_MMI; +#if defined(HAS_I422TOUYVYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_LASX; } } #endif @@ -464,11 +608,11 @@ int I420ToUYVY(const uint8_t* src_y, } } #endif -#if defined(HAS_I422TOUYVYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToUYVYRow = I422ToUYVYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - I422ToUYVYRow = I422ToUYVYRow_MMI; +#if defined(HAS_I422TOUYVYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_LASX; } } #endif @@ -488,7 +632,6 @@ int I420ToUYVY(const uint8_t* src_y, return 0; } -// TODO(fbarchard): test negative height for invert. LIBYUV_API int I420ToNV12(const uint8_t* src_y, int src_stride_y, @@ -502,12 +645,22 @@ int I420ToNV12(const uint8_t* src_y, int dst_stride_uv, int width, int height) { - if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 || - height == 0) { + int halfwidth = (width + 1) / 2; + int halfheight = (height + 1) / 2; + if (!src_y || !src_u || !src_v || !dst_uv || width <= 0 || height == 0) { return -1; } - int halfwidth = (width + 1) / 2; - int halfheight = height > 0 ? (height + 1) / 2 : (height - 1) / 2; + // Negative height means invert the image. 
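+  // halfwidth/halfheight round up so odd dimensions keep their final chroma
+  // column/row, e.g. width 5 -> (5 + 1) / 2 = 3. For a negative height the
+  // source planes are flipped below and halfheight is recomputed from the
+  // positive height.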
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } if (dst_y) { CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } @@ -534,807 +687,6 @@ int I420ToNV21(const uint8_t* src_y, width, height); } -// Convert I422 to RGBA with matrix -static int I420ToRGBAMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToRGBARow_C; - if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; - dst_stride_rgba = -dst_stride_rgba; - } -#if defined(HAS_I422TORGBAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGBARow = I422ToRGBARow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_SSSE3; - } - } -#endif -#if defined(HAS_I422TORGBAROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRGBARow = I422ToRGBARow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToRGBARow = I422ToRGBARow_AVX2; - } - } -#endif -#if defined(HAS_I422TORGBAROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGBARow = I422ToRGBARow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_NEON; - } - } -#endif -#if defined(HAS_I422TORGBAROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToRGBARow = I422ToRGBARow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); - dst_rgba += dst_stride_rgba; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to RGBA. -LIBYUV_API -int I420ToRGBA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - int width, - int height) { - return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgba, dst_stride_rgba, - &kYuvI601Constants, width, height); -} - -// Convert I420 to BGRA. 
-LIBYUV_API -int I420ToBGRA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_bgra, - int dst_stride_bgra, - int width, - int height) { - return I420ToRGBAMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_bgra, dst_stride_bgra, - &kYvuI601Constants, // Use Yvu matrix - width, height); -} - -// Convert I420 to RGB24 with matrix -static int I420ToRGB24Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToRGB24Row_C; - if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; - dst_stride_rgb24 = -dst_stride_rgb24; - } -#if defined(HAS_I422TORGB24ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - I422ToRGB24Row = I422ToRGB24Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TORGB24ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRGB24Row = I422ToRGB24Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - I422ToRGB24Row = I422ToRGB24Row_AVX2; - } - } -#endif -#if defined(HAS_I422TORGB24ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGB24Row = I422ToRGB24Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRGB24Row = I422ToRGB24Row_NEON; - } - } -#endif -#if defined(HAS_I422TORGB24ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToRGB24Row = I422ToRGB24Row_Any_MSA; - if (IS_ALIGNED(width, 16)) { - I422ToRGB24Row = I422ToRGB24Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); - dst_rgb24 += dst_stride_rgb24; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to RGB24. -LIBYUV_API -int I420ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb24, dst_stride_rgb24, - &kYuvI601Constants, width, height); -} - -// Convert I420 to RAW. -LIBYUV_API -int I420ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_raw, dst_stride_raw, - &kYvuI601Constants, // Use Yvu matrix - width, height); -} - -// Convert H420 to RGB24. 
-LIBYUV_API -int H420ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb24, dst_stride_rgb24, - &kYuvH709Constants, width, height); -} - -// Convert H420 to RAW. -LIBYUV_API -int H420ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_raw, dst_stride_raw, - &kYvuH709Constants, // Use Yvu matrix - width, height); -} - -// Convert I420 to ARGB1555. -LIBYUV_API -int I420ToARGB1555(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb1555, - int dst_stride_argb1555, - int width, - int height) { - int y; - void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToARGB1555Row_C; - if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 || - height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555; - dst_stride_argb1555 = -dst_stride_argb1555; - } -#if defined(HAS_I422TOARGB1555ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToARGB1555Row = I422ToARGB1555Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TOARGB1555ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToARGB1555Row = I422ToARGB1555Row_AVX2; - } - } -#endif -#if defined(HAS_I422TOARGB1555ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToARGB1555Row = I422ToARGB1555Row_NEON; - } - } -#endif -#if defined(HAS_I422TOARGB1555ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToARGB1555Row = I422ToARGB1555Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants, - width); - dst_argb1555 += dst_stride_argb1555; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to ARGB4444. -LIBYUV_API -int I420ToARGB4444(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb4444, - int dst_stride_argb4444, - int width, - int height) { - int y; - void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToARGB4444Row_C; - if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 || - height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444; - dst_stride_argb4444 = -dst_stride_argb4444; - } -#if defined(HAS_I422TOARGB4444ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToARGB4444Row = I422ToARGB4444Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TOARGB4444ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToARGB4444Row = I422ToARGB4444Row_AVX2; - } - } -#endif -#if defined(HAS_I422TOARGB4444ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToARGB4444Row = I422ToARGB4444Row_NEON; - } - } -#endif -#if defined(HAS_I422TOARGB4444ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToARGB4444Row = I422ToARGB4444Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants, - width); - dst_argb4444 += dst_stride_argb4444; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to RGB565 with specified color matrix. -LIBYUV_API -int I420ToRGB565Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToRGB565Row_C; - if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; - dst_stride_rgb565 = -dst_stride_rgb565; - } -#if defined(HAS_I422TORGB565ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToRGB565Row = I422ToRGB565Row_AVX2; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGB565Row = I422ToRGB565Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_NEON; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToRGB565Row = I422ToRGB565Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width); - dst_rgb565 += dst_stride_rgb565; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to RGB565. 
-LIBYUV_API -int I420ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { - return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb565, dst_stride_rgb565, - &kYuvI601Constants, width, height); -} - -// Convert J420 to RGB565. -LIBYUV_API -int J420ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { - return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb565, dst_stride_rgb565, - &kYuvJPEGConstants, width, height); -} - -// Convert H420 to RGB565. -LIBYUV_API -int H420ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { - return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb565, dst_stride_rgb565, - &kYuvH709Constants, width, height); -} - -// Convert I422 to RGB565. -LIBYUV_API -int I422ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { - int y; - void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToRGB565Row_C; - if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; - dst_stride_rgb565 = -dst_stride_rgb565; - } -#if defined(HAS_I422TORGB565ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToRGB565Row = I422ToRGB565Row_AVX2; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGB565Row = I422ToRGB565Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_NEON; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToRGB565Row = I422ToRGB565Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width); - dst_rgb565 += dst_stride_rgb565; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Ordered 8x8 dither for 888 to 565. Values from 0 to 7. -static const uint8_t kDither565_4x4[16] = { - 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, -}; - -// Convert I420 to RGB565 with dithering. 
-LIBYUV_API -int I420ToRGB565Dither(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - const uint8_t* dither4x4, - int width, - int height) { - int y; - void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToARGBRow_C; - void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, int width) = - ARGBToRGB565DitherRow_C; - if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; - dst_stride_rgb565 = -dst_stride_rgb565; - } - if (!dither4x4) { - dither4x4 = kDither565_4x4; - } -#if defined(HAS_I422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I422TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGBRow = I422ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToARGBRow = I422ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_I422TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGBRow = I422ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_MSA; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; - } - } -#endif - { - // Allocate a row of argb. 
- align_buffer_64(row_argb, width * 4); - for (y = 0; y < height; ++y) { - I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width); - ARGBToRGB565DitherRow(row_argb, dst_rgb565, - *(const uint32_t*)(dither4x4 + ((y & 3) << 2)), - width); - dst_rgb565 += dst_stride_rgb565; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - free_aligned_buffer_64(row_argb); - } - return 0; -} - -// Convert I420 to AR30 with matrix -static int I420ToAR30Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToAR30Row_C; - - if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; - dst_stride_ar30 = -dst_stride_ar30; - } - -#if defined(HAS_I422TOAR30ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToAR30Row = I422ToAR30Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToAR30Row = I422ToAR30Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TOAR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToAR30Row = I422ToAR30Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToAR30Row = I422ToAR30Row_AVX2; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); - dst_ar30 += dst_stride_ar30; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to AR30. -LIBYUV_API -int I420ToAR30(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_ar30, dst_stride_ar30, - &kYuvI601Constants, width, height); -} - -// Convert H420 to AR30. -LIBYUV_API -int H420ToAR30(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_ar30, dst_stride_ar30, - &kYvuH709Constants, width, height); -} - // Convert I420 to specified format LIBYUV_API int ConvertFromI420(const uint8_t* y, @@ -1421,7 +773,8 @@ int ConvertFromI420(const uint8_t* y, height); break; case FOURCC_NV12: { - uint8_t* dst_uv = dst_sample + width * height; + int dst_y_stride = dst_sample_stride ? dst_sample_stride : width; + uint8_t* dst_uv = dst_sample + dst_y_stride * height; r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width, dst_uv, dst_sample_stride ? dst_sample_stride : width, width, @@ -1429,14 +782,14 @@ int ConvertFromI420(const uint8_t* y, break; } case FOURCC_NV21: { - uint8_t* dst_vu = dst_sample + width * height; + int dst_y_stride = dst_sample_stride ? 
dst_sample_stride : width; + uint8_t* dst_vu = dst_sample + dst_y_stride * height; r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width, dst_vu, dst_sample_stride ? dst_sample_stride : width, width, height); break; } - // TODO(fbarchard): Add M420. // Triplanar formats case FOURCC_I420: case FOURCC_YV12: { diff --git a/files/source/convert_from_argb.cc b/files/source/convert_from_argb.cc index fbcd039d..e50c2af3 100644 --- a/files/source/convert_from_argb.cc +++ b/files/source/convert_from_argb.cc @@ -76,11 +76,11 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOUV444ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToUV444Row = ARGBToUV444Row_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToUV444Row = ARGBToUV444Row_MMI; +#if defined(HAS_ARGBTOUV444ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToUV444Row = ARGBToUV444Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToUV444Row = ARGBToUV444Row_LASX; } } #endif @@ -103,7 +103,7 @@ int ARGBToI444(const uint8_t* src_argb, #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_NEON; } } @@ -116,11 +116,11 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; +#if defined(HAS_ARGBTOYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; } } #endif @@ -170,30 +170,42 @@ int ARGBToI422(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; } -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; ARGBToYRow = ARGBToYRow_AVX2; } } #endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_NEON; } } @@ -206,37 +218,26 @@ int ARGBToI422(const uint8_t* src_argb, } } #endif - -#if defined(HAS_ARGBTOYROW_MSA) +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYRow = ARGBToYRow_Any_MSA; + ARGBToUVRow = ARGBToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_MSA; } - } -#endif -#if defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToUVRow = ARGBToUVRow_Any_MSA; if (IS_ALIGNED(width, 32)) { ARGBToUVRow = ARGBToUVRow_MSA; } } #endif -#if 
defined(HAS_ARGBTOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToUVRow = ARGBToUVRow_Any_MMI; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; } } #endif @@ -279,30 +280,10 @@ int ARGBToNV12(const uint8_t* src_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_NEON; } } @@ -315,35 +296,57 @@ int ARGBToNV12(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; + ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToUVRow = ARGBToUVRow_Any_MSA; +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; + ARGBToYRow = ARGBToYRow_AVX2; } } #endif -#if defined(HAS_ARGBTOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; } } #endif -#if defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToUVRow = ARGBToUVRow_Any_MMI; +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + ARGBToUVRow = ARGBToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; + ARGBToYRow = ARGBToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; } } #endif @@ -379,11 +382,11 @@ int 
ARGBToNV12(const uint8_t* src_argb, } } #endif -#if defined(HAS_MERGEUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MergeUVRow_ = MergeUVRow_Any_MMI; - if (IS_ALIGNED(halfwidth, 8)) { - MergeUVRow_ = MergeUVRow_MMI; +#if defined(HAS_MERGEUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MergeUVRow_ = MergeUVRow_Any_LSX; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_LSX; } } #endif @@ -439,30 +442,42 @@ int ARGBToNV21(const uint8_t* src_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; ARGBToYRow = ARGBToYRow_AVX2; } } #endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_NEON; } } @@ -475,39 +490,28 @@ int ARGBToNV21(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_MSA) +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYRow = ARGBToYRow_Any_MSA; + ARGBToUVRow = ARGBToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_MSA; } - } -#endif -#if defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToUVRow = ARGBToUVRow_Any_MSA; if (IS_ALIGNED(width, 32)) { ARGBToUVRow = ARGBToUVRow_MSA; } } #endif -#if defined(HAS_ARGBTOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToUVRow = ARGBToUVRow_Any_MMI; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; } } #endif - #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -540,11 +544,11 @@ int ARGBToNV21(const uint8_t* src_argb, } } #endif -#if defined(HAS_MERGEUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MergeUVRow_ = MergeUVRow_Any_MMI; - if (IS_ALIGNED(halfwidth, 8)) { - MergeUVRow_ = MergeUVRow_MMI; +#if defined(HAS_MERGEUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MergeUVRow_ = MergeUVRow_Any_LSX; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_LSX; } } #endif @@ -599,30 +603,42 @@ int ABGRToNV12(const uint8_t* src_abgr, src_abgr = src_abgr + (height - 1) * src_stride_abgr; src_stride_abgr = 
-src_stride_abgr; } -#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3) +#if defined(HAS_ABGRTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToUVRow = ABGRToUVRow_Any_SSSE3; ABGRToYRow = ABGRToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_SSSE3; ABGRToYRow = ABGRToYRow_SSSE3; } } #endif -#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2) +#if defined(HAS_ABGRTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVRow = ABGRToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ABGRToUVRow = ABGRToUVRow_Any_AVX2; ABGRToYRow = ABGRToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - ABGRToUVRow = ABGRToUVRow_AVX2; ABGRToYRow = ABGRToYRow_AVX2; } } #endif +#if defined(HAS_ABGRTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToUVRow = ABGRToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToUVRow = ABGRToUVRow_AVX2; + } + } +#endif #if defined(HAS_ABGRTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ABGRToYRow = ABGRToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ABGRToYRow = ABGRToYRow_NEON; } } @@ -635,35 +651,167 @@ int ABGRToNV12(const uint8_t* src_abgr, } } #endif -#if defined(HAS_ABGRTOYROW_MSA) +#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ABGRToYRow = ABGRToYRow_Any_MSA; + ABGRToUVRow = ABGRToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { ABGRToYRow = ABGRToYRow_MSA; } + if (IS_ALIGNED(width, 32)) { + ABGRToUVRow = ABGRToUVRow_MSA; + } + } +#endif +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeUVRow_ = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow_ = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow_ = MergeUVRow_AVX2; + } } #endif -#if defined(HAS_ABGRTOUVROW_MSA) +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow_ = MergeUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_NEON; + } + } +#endif +#if defined(HAS_MERGEUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { - ABGRToUVRow = ABGRToUVRow_Any_MSA; + MergeUVRow_ = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } +#endif +#if defined(HAS_MERGEUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MergeUVRow_ = MergeUVRow_Any_LSX; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_LSX; + } + } +#endif + { + // Allocate a rows of uv. + align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); + uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); + + for (y = 0; y < height - 1; y += 2) { + ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width); + MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + ABGRToYRow(src_abgr, dst_y, width); + ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); + src_abgr += src_stride_abgr * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + ABGRToUVRow(src_abgr, 0, row_u, row_v, width); + MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + ABGRToYRow(src_abgr, dst_y, width); + } + free_aligned_buffer_64(row_u); + } + return 0; +} + +// Same as NV12 but U and V swapped. 
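// A sketch, not part of the commit: NV12 and NV21 differ only in the byte
// order of the interleaved chroma plane, so the converters above and below
// reuse one merge kernel and simply swap its U/V source arguments:
//
//   MergeUVRow_(row_u, row_v, dst_uv, halfwidth);  // NV12: U,V,U,V,...
//   MergeUVRow_(row_v, row_u, dst_vu, halfwidth);  // NV21: V,U,V,U,...
//
// The scratch chroma rows are sized with (halfwidth + 31) & ~31 so that
// row_v begins on a 32-byte boundary inside the same allocation, e.g.
// width 1918 -> halfwidth 959 -> rounded up to 960, buffer 1920 bytes.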
+LIBYUV_API +int ABGRToNV21(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ABGRToUVRow_C; + void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) = + ABGRToYRow_C; + void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_vu, int width) = MergeUVRow_C; + if (!src_abgr || !dst_y || !dst_vu || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } +#if defined(HAS_ABGRTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToYRow = ABGRToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVRow = ABGRToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToYRow = ABGRToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - ABGRToUVRow = ABGRToUVRow_MSA; + ABGRToYRow = ABGRToYRow_AVX2; } } #endif -#if defined(HAS_ABGRTOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ABGRToYRow = ABGRToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ABGRToYRow = ABGRToYRow_MMI; +#if defined(HAS_ABGRTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToUVRow = ABGRToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToUVRow = ABGRToUVRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToYRow = ABGRToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToUVRow = ABGRToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_NEON; } } #endif -#if defined(HAS_ABGRTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ABGRToUVRow = ABGRToUVRow_Any_MMI; +#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYRow = ABGRToYRow_Any_MSA; + ABGRToUVRow = ABGRToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_MMI; + ABGRToYRow = ABGRToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ABGRToUVRow = ABGRToUVRow_MSA; } } #endif @@ -699,11 +847,11 @@ int ABGRToNV12(const uint8_t* src_abgr, } } #endif -#if defined(HAS_MERGEUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MergeUVRow_ = MergeUVRow_Any_MMI; - if (IS_ALIGNED(halfwidth, 8)) { - MergeUVRow_ = MergeUVRow_MMI; +#if defined(HAS_MERGEUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MergeUVRow_ = MergeUVRow_Any_LSX; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_LSX; } } #endif @@ -714,16 +862,16 @@ int ABGRToNV12(const uint8_t* src_abgr, for (y = 0; y < height - 1; y += 2) { ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width); - MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + MergeUVRow_(row_v, row_u, dst_vu, halfwidth); ABGRToYRow(src_abgr, dst_y, width); ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); src_abgr += src_stride_abgr * 2; dst_y += dst_stride_y * 2; - dst_uv += dst_stride_uv; + dst_vu += dst_stride_vu; } if (height & 1) { ABGRToUVRow(src_abgr, 0, row_u, row_v, width); - 
MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + MergeUVRow_(row_v, row_u, dst_vu, halfwidth); ABGRToYRow(src_abgr, dst_y, width); } free_aligned_buffer_64(row_u); @@ -764,30 +912,42 @@ int ARGBToYUY2(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_yuy2 = 0; } -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; ARGBToYRow = ARGBToYRow_AVX2; } } #endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_NEON; } } @@ -800,35 +960,25 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_MSA) +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYRow = ARGBToYRow_Any_MSA; + ARGBToUVRow = ARGBToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_MSA; } - } -#endif -#if defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToUVRow = ARGBToUVRow_Any_MSA; if (IS_ALIGNED(width, 32)) { ARGBToUVRow = ARGBToUVRow_MSA; } } #endif -#if defined(HAS_ARGBTOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToUVRow = ARGBToUVRow_Any_MMI; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; } } #endif @@ -864,11 +1014,11 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif -#if defined(HAS_I422TOYUY2ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToYUY2Row = I422ToYUY2Row_Any_MMI; - if (IS_ALIGNED(width, 8)) { - I422ToYUY2Row = I422ToYUY2Row_MMI; +#if defined(HAS_I422TOYUY2ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToYUY2Row = I422ToYUY2Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_LASX; } } #endif @@ -925,30 +1075,42 @@ int ARGBToUYVY(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_uyvy = 0; } -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOYROW_AVX2) && 
defined(HAS_ARGBTOUVROW_AVX2) +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; ARGBToYRow = ARGBToYRow_AVX2; } } #endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_NEON; } } @@ -961,35 +1123,25 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_MSA) +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYRow = ARGBToYRow_Any_MSA; + ARGBToUVRow = ARGBToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_MSA; } - } -#endif -#if defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToUVRow = ARGBToUVRow_Any_MSA; if (IS_ALIGNED(width, 32)) { ARGBToUVRow = ARGBToUVRow_MSA; } } #endif -#if defined(HAS_ARGBTOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToUVRow = ARGBToUVRow_Any_MMI; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; } } #endif @@ -1025,11 +1177,11 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif -#if defined(HAS_I422TOUYVYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToUYVYRow = I422ToUYVYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - I422ToUYVYRow = I422ToUYVYRow_MMI; +#if defined(HAS_I422TOUYVYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_LASX; } } #endif @@ -1097,7 +1249,7 @@ int ARGBToI400(const uint8_t* src_argb, #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_NEON; } } @@ -1110,11 +1262,11 @@ int ARGBToI400(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; +#if defined(HAS_ARGBTOYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; } } #endif @@ -1195,7 +1347,7 @@ int ARGBToRGB24(const uint8_t* src_argb, #if defined(HAS_ARGBTORGB24ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToRGB24Row = ARGBToRGB24Row_NEON; } } @@ -1208,11 +1360,11 @@ int ARGBToRGB24(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTORGB24ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToRGB24Row = 
ARGBToRGB24Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB24Row = ARGBToRGB24Row_MMI; +#if defined(HAS_ARGBTORGB24ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToRGB24Row = ARGBToRGB24Row_LASX; } } #endif @@ -1282,11 +1434,11 @@ int ARGBToRAW(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTORAWROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToRAWRow = ARGBToRAWRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBToRAWRow = ARGBToRAWRow_MMI; +#if defined(HAS_ARGBTORAWROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToRAWRow = ARGBToRAWRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToRAWRow = ARGBToRAWRow_LASX; } } #endif @@ -1360,11 +1512,11 @@ int ARGBToRGB565Dither(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTORGB565DITHERROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI; +#if defined(HAS_ARGBTORGB565DITHERROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LASX; } } #endif @@ -1437,11 +1589,11 @@ int ARGBToRGB565(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTORGB565ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToRGB565Row = ARGBToRGB565Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB565Row = ARGBToRGB565Row_MMI; +#if defined(HAS_ARGBTORGB565ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB565Row = ARGBToRGB565Row_LASX; } } #endif @@ -1511,11 +1663,11 @@ int ARGBToARGB1555(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOARGB1555ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_MMI; +#if defined(HAS_ARGBTOARGB1555ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_LASX; } } #endif @@ -1585,11 +1737,11 @@ int ARGBToARGB4444(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOARGB4444ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_MMI; +#if defined(HAS_ARGBTOARGB4444ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_LASX; } } #endif @@ -1727,16 +1879,22 @@ int ARGBToJ420(const uint8_t* src_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } -#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) +#if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif +#if defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + } + } +#endif #if defined(HAS_ARGBTOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToYJRow = ARGBToYJRow_Any_AVX2; @@ -1748,7 +1906,7 @@ int ARGBToJ420(const uint8_t* src_argb, #if 
defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYJRow = ARGBToYJRow_NEON; } } @@ -1761,35 +1919,35 @@ int ARGBToJ420(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYJROW_MSA) +#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYJRow = ARGBToYJRow_Any_MSA; + ARGBToUVJRow = ARGBToUVJRow_Any_MSA; if (IS_ALIGNED(width, 16)) { ARGBToYJRow = ARGBToYJRow_MSA; } - } -#endif -#if defined(HAS_ARGBTOYJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYJRow = ARGBToYJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOUVJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToUVJRow = ARGBToUVJRow_Any_MSA; if (IS_ALIGNED(width, 32)) { ARGBToUVJRow = ARGBToUVJRow_MSA; } } #endif -#if defined(HAS_ARGBTOUVJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToUVJRow = ARGBToUVJRow_Any_MMI; +#if defined(HAS_ARGBTOYJROW_LSX) && defined(HAS_ARGBTOUVJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYJRow = ARGBToYJRow_Any_LSX; + ARGBToUVJRow = ARGBToUVJRow_Any_LSX; if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_MMI; + ARGBToYJRow = ARGBToYJRow_LSX; + ARGBToUVJRow = ARGBToUVJRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_LASX) && defined(HAS_ARGBTOUVJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYJRow = ARGBToYJRow_Any_LASX; + ARGBToUVJRow = ARGBToUVJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_LASX; + ARGBToUVJRow = ARGBToUVJRow_LASX; } } #endif @@ -1844,16 +2002,22 @@ int ARGBToJ422(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0; } -#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) +#if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif +#if defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + } + } +#endif #if defined(HAS_ARGBTOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToYJRow = ARGBToYJRow_Any_AVX2; @@ -1865,7 +2029,7 @@ int ARGBToJ422(const uint8_t* src_argb, #if defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYJRow = ARGBToYJRow_NEON; } } @@ -1878,35 +2042,35 @@ int ARGBToJ422(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYJROW_MSA) +#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYJRow = ARGBToYJRow_Any_MSA; + ARGBToUVJRow = ARGBToUVJRow_Any_MSA; if (IS_ALIGNED(width, 16)) { ARGBToYJRow = ARGBToYJRow_MSA; } - } -#endif -#if defined(HAS_ARGBTOYJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYJRow = ARGBToYJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOUVJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToUVJRow = ARGBToUVJRow_Any_MSA; if (IS_ALIGNED(width, 32)) { ARGBToUVJRow = ARGBToUVJRow_MSA; } } #endif -#if defined(HAS_ARGBTOUVJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToUVJRow = ARGBToUVJRow_Any_MMI; +#if defined(HAS_ARGBTOYJROW_LSX) && 
defined(HAS_ARGBTOUVJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYJRow = ARGBToYJRow_Any_LSX; + ARGBToUVJRow = ARGBToUVJRow_Any_LSX; if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_MMI; + ARGBToYJRow = ARGBToYJRow_LSX; + ARGBToUVJRow = ARGBToUVJRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_LASX) && defined(HAS_ARGBTOUVJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYJRow = ARGBToYJRow_Any_LASX; + ARGBToUVJRow = ARGBToUVJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_LASX; + ARGBToUVJRow = ARGBToUVJRow_LASX; } } #endif @@ -1922,6 +2086,124 @@ int ARGBToJ422(const uint8_t* src_argb, return 0; } +// Convert ARGB to AR64. +LIBYUV_API +int ARGBToAR64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height) { + int y; + void (*ARGBToAR64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, + int width) = ARGBToAR64Row_C; + if (!src_argb || !dst_ar64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ar64 = 0; + } +#if defined(HAS_ARGBTOAR64ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAR64Row = ARGBToAR64Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAR64Row = ARGBToAR64Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAR64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAR64Row = ARGBToAR64Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAR64Row = ARGBToAR64Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOAR64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToAR64Row = ARGBToAR64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToAR64Row = ARGBToAR64Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToAR64Row(src_argb, dst_ar64, width); + src_argb += src_stride_argb; + dst_ar64 += dst_stride_ar64; + } + return 0; +} + +// Convert ARGB to AB64. +LIBYUV_API +int ARGBToAB64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ab64, + int dst_stride_ab64, + int width, + int height) { + int y; + void (*ARGBToAB64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, + int width) = ARGBToAB64Row_C; + if (!src_argb || !dst_ab64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
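// The "Coalesce rows" guard just below is a recurring libyuv idiom: when
// both strides equal the row width in elements, the frame is contiguous in
// memory, so it can be processed as a single long row and the strides
// zeroed, saving the per-row loop overhead.
//
// The AR64 widening above keeps full scale by replicating each byte into
// both halves of the 16-bit result. A sketch of the C row kernel (the name
// ARGBToAR64Row_Sketch is illustrative, not the library's):

static void ARGBToAR64Row_Sketch(const uint8_t* src, uint16_t* dst,
                                 int width) {
  for (int i = 0; i < width * 4; ++i) {
    dst[i] = src[i] * 0x0101;  // 0x00 -> 0x0000, 0xFF -> 0xFFFF; (v << 8) | v
  }
}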
+ if (src_stride_argb == width * 4 && dst_stride_ab64 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ab64 = 0; + } +#if defined(HAS_ARGBTOAB64ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAB64Row = ARGBToAB64Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAB64Row = ARGBToAB64Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAB64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAB64Row = ARGBToAB64Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAB64Row = ARGBToAB64Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOAB64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToAB64Row = ARGBToAB64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToAB64Row = ARGBToAB64Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToAB64Row(src_argb, dst_ab64, width); + src_argb += src_stride_argb; + dst_ab64 += dst_stride_ab64; + } + return 0; +} + // Convert ARGB to J400. LIBYUV_API int ARGBToJ400(const uint8_t* src_argb, @@ -1966,7 +2248,7 @@ int ARGBToJ400(const uint8_t* src_argb, #if defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYJRow = ARGBToYJRow_NEON; } } @@ -1979,23 +2261,282 @@ int ARGBToJ400(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYJRow = ARGBToYJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_MMI; + + for (y = 0; y < height; ++y) { + ARGBToYJRow(src_argb, dst_yj, width); + src_argb += src_stride_argb; + dst_yj += dst_stride_yj; + } + return 0; +} + +// Convert RGBA to J400. +LIBYUV_API +int RGBAToJ400(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height) { + int y; + void (*RGBAToYJRow)(const uint8_t* src_rgba, uint8_t* dst_yj, int width) = + RGBAToYJRow_C; + if (!src_rgba || !dst_yj || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_rgba = src_rgba + (height - 1) * src_stride_rgba; + src_stride_rgba = -src_stride_rgba; + } + // Coalesce rows. 
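// (The same coalescing guard follows here for RGBAToJ400.)
// ARGBToAB64 above is the AR64 widening with R and B exchanged, giving
// ABGR channel order in the 16-bit output.
//
// The J400/YJ paths in this file use full-range BT.601 luma with no +16
// offset. From memory of row_common.cc, the fixed-point math is close to
// the following; treat the exact constants as approximate:

static inline int RGBToYJ(int r, int g, int b) {
  return (38 * r + 75 * g + 15 * b + 64) >> 7;  // ~0.299R + 0.587G + 0.114B
}

// versus the studio-range variant, which maps into [16, 235]:
static inline int RGBToY(int r, int g, int b) {
  return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;  // 16 + scaled luma
}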
+ if (src_stride_rgba == width * 4 && dst_stride_yj == width) { + width *= height; + height = 1; + src_stride_rgba = dst_stride_yj = 0; + } +#if defined(HAS_RGBATOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGBAToYJRow = RGBAToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGBAToYJRow = RGBAToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_RGBATOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGBAToYJRow = RGBAToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RGBAToYJRow = RGBAToYJRow_AVX2; + } + } +#endif +#if defined(HAS_RGBATOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGBAToYJRow = RGBAToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGBAToYJRow = RGBAToYJRow_NEON; + } + } +#endif +#if defined(HAS_RGBATOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGBAToYJRow = RGBAToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGBAToYJRow = RGBAToYJRow_MSA; } } #endif for (y = 0; y < height; ++y) { - ARGBToYJRow(src_argb, dst_yj, width); - src_argb += src_stride_argb; + RGBAToYJRow(src_rgba, dst_yj, width); + src_rgba += src_stride_rgba; dst_yj += dst_stride_yj; } return 0; } +// Enabled if 1 pass is available +#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) +#define HAS_RAWTOYJROW +#endif + +// RAW to JNV21 full range NV21 +LIBYUV_API +int RAWToJNV21(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + int y; + int halfwidth = (width + 1) >> 1; +#if defined(HAS_RAWTOYJROW) + void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RAWToUVJRow_C; + void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = + RAWToYJRow_C; +#else + void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = + RAWToARGBRow_C; + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYJRow_C; +#endif + void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_vu, int width) = MergeUVRow_C; + if (!src_raw || !dst_y || !dst_vu || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + +#if defined(HAS_RAWTOYJROW) + +// Neon version does direct RAW to YUV. +#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToUVJRow = RAWToUVJRow_Any_NEON; + RAWToYJRow = RAWToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_NEON; + RAWToUVJRow = RAWToUVJRow_NEON; + } + } +#endif +#if defined(HAS_RAWTOYJROW_MSA) && defined(HAS_RAWTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToUVJRow = RAWToUVJRow_Any_MSA; + RAWToYJRow = RAWToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_MSA; + RAWToUVJRow = RAWToUVJRow_MSA; + } + } +#endif + +// Other platforms do intermediate conversion from RAW to ARGB. 
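// RAWToJNV21 picks its strategy at compile time: with a one-pass RAW->YJ
// kernel available (NEON or MSA, per HAS_RAWTOYJROW above) it converts RAW
// directly; otherwise it expands each RAW row into a scratch ARGB row and
// runs the ARGB kernels on that. Simplified shape of the per-row step,
// mirroring the loop body later in this hunk:

#if defined(HAS_RAWTOYJROW)
  RAWToUVJRow(src_raw, src_stride_raw, row_u, row_v, width);  // one pass
  RAWToYJRow(src_raw, dst_y, width);
#else
  RAWToARGBRow(src_raw, row, width);                 // scratch ARGB row
  ARGBToUVJRow(row, kRowSize, row_u, row_v, width);  // then ARGB kernels
  ARGBToYJRow(row, dst_y, width);
#endif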
+#else // HAS_RAWTOYJROW + +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; + } + } +#endif +#endif // HAS_RAWTOYJROW +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeUVRow_ = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow_ = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow_ = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow_ = MergeUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_NEON; + } + } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow_ = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } +#endif +#if defined(HAS_MERGEUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MergeUVRow_ = MergeUVRow_Any_LSX; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_LSX; + } + } +#endif + { + // Allocate a row of uv. + align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); + uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); +#if !defined(HAS_RAWTOYJROW) + // Allocate 2 rows of ARGB. 
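// Buffer sizing for the scratch rows, worked through: kRowSize rounds the
// ARGB row (width * 4 bytes) up to a multiple of 32 so each of the two rows
// starts 32-byte aligned, e.g. width 641 -> 2564 bytes -> kRowSize 2592.
// Likewise halfwidth = (width + 1) >> 1 rounds up, so an odd-width image
// still gets a chroma sample for its last column (width 641 -> 321).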
+ const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RAWTOYJROW) + RAWToUVJRow(src_raw, src_stride_raw, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + RAWToYJRow(src_raw, dst_y, width); + RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); +#else + RAWToARGBRow(src_raw, row, width); + RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); + ARGBToUVJRow(row, kRowSize, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + ARGBToYJRow(row, dst_y, width); + ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_raw += src_stride_raw * 2; + dst_y += dst_stride_y * 2; + dst_vu += dst_stride_vu; + } + if (height & 1) { +#if defined(HAS_RAWTOYJROW) + RAWToUVJRow(src_raw, 0, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + RAWToYJRow(src_raw, dst_y, width); +#else + RAWToARGBRow(src_raw, row, width); + ARGBToUVJRow(row, 0, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + ARGBToYJRow(row, dst_y, width); +#endif + } +#if !defined(HAS_RAWTOYJROW) + free_aligned_buffer_64(row); +#endif + free_aligned_buffer_64(row_u); + } + return 0; +} +#undef HAS_RAWTOYJROW + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/convert_jpeg.cc b/files/source/convert_jpeg.cc index f440c7c2..d7556ee9 100644 --- a/files/source/convert_jpeg.cc +++ b/files/source/convert_jpeg.cc @@ -328,6 +328,140 @@ int MJPGToNV21(const uint8_t* src_mjpg, return ret ? 0 : 1; } +static void JpegI420ToNV12(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + NV21Buffers* dest = (NV21Buffers*)(opaque); + // Use NV21 with VU swapped. + I420ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1], + dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->vu += ((rows + 1) >> 1) * dest->vu_stride; + dest->h -= rows; +} + +static void JpegI422ToNV12(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + NV21Buffers* dest = (NV21Buffers*)(opaque); + // Use NV21 with VU swapped. + I422ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1], + dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->vu += ((rows + 1) >> 1) * dest->vu_stride; + dest->h -= rows; +} + +static void JpegI444ToNV12(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + NV21Buffers* dest = (NV21Buffers*)(opaque); + // Use NV21 with VU swapped. + I444ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1], + dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->vu += ((rows + 1) >> 1) * dest->vu_stride; + dest->h -= rows; +} + +static void JpegI400ToNV12(void* opaque, + const uint8_t* const* data, + const int* strides, + int rows) { + NV21Buffers* dest = (NV21Buffers*)(opaque); + // Use NV21 since there is no UV plane. + I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu, + dest->vu_stride, dest->w, rows); + dest->y += rows * dest->y_stride; + dest->vu += ((rows + 1) >> 1) * dest->vu_stride; + dest->h -= rows; +} + +// MJPG (Motion JPEG) to NV12. 
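// The Jpeg*ToNV12 callbacks above reuse the NV21 writers by swapping the
// chroma plane pointers: libjpeg hands planes as data[0]=Y, data[1]=Cb(U),
// data[2]=Cr(V), and I420ToNV21 interleaves (V,U) pairs, so passing (V,U)
// into its (U,V) parameter slots makes it emit (U,V) pairs, i.e. NV12.
//
// MJPGToNV12 below then classifies the JPEG by per-component sampling
// factors (horizontal x vertical, with chroma fixed at 1x1):
//
//   Y 2x2                    ->  4:2:0, JpegI420ToNV12
//   Y 2x1                    ->  4:2:2, JpegI422ToNV12
//   Y 1x1                    ->  4:4:4, JpegI444ToNV12
//   one grayscale component  ->  4:0:0, JpegI400ToNV12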
+LIBYUV_API +int MJPGToNV12(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int src_width, + int src_height, + int dst_width, + int dst_height) { + if (sample_size == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port MJpeg to C. + MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret && (mjpeg_decoder.GetWidth() != src_width || + mjpeg_decoder.GetHeight() != src_height)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + // Use NV21Buffers but with UV instead of VU. + NV21Buffers bufs = {dst_y, dst_stride_y, dst_uv, + dst_stride_uv, dst_width, dst_height}; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV12, &bufs, dst_width, + dst_height); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV12, &bufs, dst_width, + dst_height); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV12, &bufs, dst_width, + dst_height); + // YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV12, &bufs, dst_width, + dst_height); + } else { + // Unknown colorspace. + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return ret ? 
0 : 1; +} + struct ARGBBuffers { uint8_t* argb; int argb_stride; diff --git a/files/source/convert_to_argb.cc b/files/source/convert_to_argb.cc index bde1aa88..84df16c8 100644 --- a/files/source/convert_to_argb.cc +++ b/files/source/convert_to_argb.cc @@ -32,9 +32,6 @@ extern "C" { // TODO(fbarchard): Add the following: // H010ToARGB // I010ToARGB -// J400ToARGB -// J422ToARGB -// J444ToARGB LIBYUV_API int ConvertToARGB(const uint8_t* sample, @@ -161,6 +158,11 @@ int ConvertToARGB(const uint8_t* sample, r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; + case FOURCC_J400: + src = sample + src_width * crop_y + crop_x; + r = J400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; // Biplanar formats case FOURCC_NV12: @@ -178,12 +180,6 @@ int ConvertToARGB(const uint8_t* sample, r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; - case FOURCC_M420: - src = sample + (src_width * crop_y) * 12 / 8 + crop_x; - r = M420ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, - inv_crop_height); - break; - // Triplanar formats case FOURCC_I420: case FOURCC_YV12: { @@ -208,6 +204,19 @@ int ConvertToARGB(const uint8_t* sample, break; } + case FOURCC_J420: { + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + const uint8_t* src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + case FOURCC_H420: { int halfwidth = (src_width + 1) / 2; int halfheight = (abs_src_height + 1) / 2; @@ -221,7 +230,7 @@ int ConvertToARGB(const uint8_t* sample, break; } - case FOURCC_J420: { + case FOURCC_U420: { int halfwidth = (src_width + 1) / 2; int halfheight = (abs_src_height + 1) / 2; const uint8_t* src_y = sample + (src_width * crop_y + crop_x); @@ -229,7 +238,7 @@ int ConvertToARGB(const uint8_t* sample, (halfwidth * crop_y + crop_x) / 2; const uint8_t* src_v = sample + src_width * abs_src_height + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; - r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + r = U420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } @@ -256,6 +265,18 @@ int ConvertToARGB(const uint8_t* sample, break; } + case FOURCC_J422: { + int halfwidth = (src_width + 1) / 2; + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u = + sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2; + const uint8_t* src_v = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + r = J422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + case FOURCC_H422: { int halfwidth = (src_width + 1) / 2; const uint8_t* src_y = sample + src_width * crop_y + crop_x; @@ -268,6 +289,18 @@ int ConvertToARGB(const uint8_t* sample, break; } + case FOURCC_U422: { + int halfwidth = (src_width + 1) / 2; + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u = + sample + src_width * abs_src_height + halfwidth * 
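// Crop addressing for the 4:2:2 cases in this switch, spelled out: the U
// plane starts right after the full Y plane (src_width * abs_src_height
// bytes); within it, cropping skips crop_y chroma rows of halfwidth bytes
// plus crop_x/2 columns (chroma is subsampled horizontally only). For a
// 640x480 frame cropped at (16, 32):
//   src_u = sample + 640*480 + 320*32 + 16/2 = sample + 317448
// One apparent slip: the r = H422ToARGB(...) call just below sits in the
// FOURCC_U422 case, where symmetry with the J422/H422 cases suggests
// U422ToARGB (the BT.2020 converter) was intended.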
crop_y + crop_x / 2; + const uint8_t* src_v = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + r = H422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + case FOURCC_I444: case FOURCC_YV24: { const uint8_t* src_y = sample + src_width * crop_y + crop_x; @@ -284,6 +317,40 @@ int ConvertToARGB(const uint8_t* sample, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } + + case FOURCC_J444: { + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; + src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + r = J444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + + case FOURCC_H444: { + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; + src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + r = H444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + + case FOURCC_U444: { + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; + src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + r = U444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); + break; + } + #ifdef HAVE_JPEG case FOURCC_MJPG: r = MJPGToARGB(sample, sample_size, dst_argb, dst_stride_argb, src_width, diff --git a/files/source/convert_to_i420.cc b/files/source/convert_to_i420.cc index 584be0ac..5869ecd7 100644 --- a/files/source/convert_to_i420.cc +++ b/files/source/convert_to_i420.cc @@ -89,18 +89,26 @@ int ConvertToI420(const uint8_t* sample, switch (format) { // Single plane formats - case FOURCC_YUY2: + case FOURCC_YUY2: { // TODO(fbarchard): Find better odd crop fix. + uint8_t* u = (crop_x & 1) ? dst_v : dst_u; + uint8_t* v = (crop_x & 1) ? dst_u : dst_v; + int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u; + int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v; src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); + r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u, + stride_u, v, stride_v, crop_width, inv_crop_height); break; - case FOURCC_UYVY: + } + case FOURCC_UYVY: { + uint8_t* u = (crop_x & 1) ? dst_v : dst_u; + uint8_t* v = (crop_x & 1) ? dst_u : dst_v; + int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u; + int stride_v = (crop_x & 1) ? 
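// Why the (crop_x & 1) swap in this YUY2/UYVY cropping code: packed 4:2:2
// shares one U,V pair across each even/odd pixel pair. Cropping at an odd
// x shifts the chroma phase, so the bytes the converter reads as "U" are
// really V and vice versa; routing them into swapped destination planes
// (and swapped strides) compensates. The selection pattern:
//
//   uint8_t* u = (crop_x & 1) ? dst_v : dst_u;  // odd crop: U samples -> V
//   uint8_t* v = (crop_x & 1) ? dst_u : dst_v;  // and V samples -> U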
dst_stride_u : dst_stride_v; src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); + r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u, + stride_u, v, stride_v, crop_width, inv_crop_height); break; + } case FOURCC_RGBP: src = sample + (src_width * crop_y + crop_x) * 2; r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, @@ -179,11 +187,6 @@ int ConvertToI420(const uint8_t* sample, dst_stride_y, dst_v, dst_stride_v, dst_u, dst_stride_u, crop_width, inv_crop_height, rotation); break; - case FOURCC_M420: - src = sample + (src_width * crop_y) * 12 / 8 + crop_x; - r = M420ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u, - dst_v, dst_stride_v, crop_width, inv_crop_height); - break; // Triplanar formats case FOURCC_I420: case FOURCC_YV12: { diff --git a/files/source/cpu_id.cc b/files/source/cpu_id.cc index 48e2b615..56fe60e4 100644 --- a/files/source/cpu_id.cc +++ b/files/source/cpu_id.cc @@ -20,7 +20,7 @@ #endif // For ArmCpuCaps() but unittested on all platforms -#include <stdio.h> +#include <stdio.h> // For fopen() #include <string.h> #ifdef __cplusplus @@ -75,9 +75,9 @@ void CpuId(int info_eax, int info_ecx, int* cpu_info) { asm volatile( #if defined(__i386__) && defined(__PIC__) // Preserve ebx for fpic 32 bit. - "mov %%ebx, %%edi \n" + "mov %%ebx, %%edi \n" "cpuid \n" - "xchg %%edi, %%ebx \n" + "xchg %%edi, %%ebx \n" : "=D"(info_ebx), #else "cpuid \n" @@ -133,7 +133,7 @@ int GetXCR0() { #pragma optimize("g", on) #endif -// based on libvpx arm_cpudetect.c +// Based on libvpx arm_cpudetect.c // For Arm, but public to allow testing on any CPU LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { char cpuinfo_line[512]; @@ -163,45 +163,54 @@ LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { } // TODO(fbarchard): Consider read_msa_ir(). -// TODO(fbarchard): Add unittest. -LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name, - const char ase[]) { +LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) { char cpuinfo_line[512]; + int flag = 0x0; FILE* f = fopen(cpuinfo_name, "r"); if (!f) { - // ase enabled if /proc/cpuinfo is unavailable. - if (strcmp(ase, " msa") == 0) { - return kCpuHasMSA; - } - if (strcmp(ase, " mmi") == 0) { - return kCpuHasMMI; - } + // Assume nothing if /proc/cpuinfo is unavailable. + // This will occur for Chrome sandbox for Pepper or Render process. return 0; } while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { - if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) { - char* p = strstr(cpuinfo_line, ase); - if (p) { - fclose(f); - if (strcmp(ase, " msa") == 0) { - return kCpuHasMSA; - } - return 0; + if (memcmp(cpuinfo_line, "cpu model", 9) == 0) { + // Workaround early kernel without MSA in ASEs line. + if (strstr(cpuinfo_line, "Loongson-2K")) { + flag |= kCpuHasMSA; } - } else if (memcmp(cpuinfo_line, "cpu model", 9) == 0) { - char* p = strstr(cpuinfo_line, "Loongson-3"); - if (p) { - fclose(f); - if (strcmp(ase, " mmi") == 0) { - return kCpuHasMMI; - } - return 0; + } + if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) { + if (strstr(cpuinfo_line, "msa")) { + flag |= kCpuHasMSA; } + // ASEs is the last line, so we can break here. + break; } } fclose(f); - return 0; + return flag; +} + +// TODO(fbarchard): Consider read_loongarch_ir(). 
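// LoongarchCpuCaps below probes SIMD support directly from the CPU rather
// than parsing /proc/cpuinfo: the cpucfg instruction reads configuration
// word 2, whose bits 6 and 7 advertise the 128-bit LSX and 256-bit LASX
// vector extensions. Callers then test the translated flags as usual, e.g.:
//
//   if (TestCpuFlag(kCpuHasLASX)) {
//     // select a 256-bit LASX row kernel
//   }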
+#define LOONGARCH_CFG2 0x2 +#define LOONGARCH_CFG2_LSX (1 << 6) +#define LOONGARCH_CFG2_LASX (1 << 7) + +#if defined(__loongarch__) +LIBYUV_API SAFEBUFFERS int LoongarchCpuCaps(void) { + int flag = 0x0; + uint32_t cfg2 = 0; + + __asm__ volatile("cpucfg %0, %1 \n\t" : "+&r"(cfg2) : "r"(LOONGARCH_CFG2)); + + if (cfg2 & LOONGARCH_CFG2_LSX) + flag |= kCpuHasLSX; + + if (cfg2 & LOONGARCH_CFG2_LASX) + flag |= kCpuHasLASX; + return flag; } +#endif static SAFEBUFFERS int GetCpuFlags(void) { int cpu_info = 0; @@ -235,6 +244,7 @@ static SAFEBUFFERS int GetCpuFlags(void) { cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0; cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0; cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0; + cpu_info |= (cpu_info7[2] & 0x00000800) ? kCpuHasAVX512VNNI : 0; cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0; cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0; cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0; @@ -242,13 +252,13 @@ static SAFEBUFFERS int GetCpuFlags(void) { } #endif #if defined(__mips__) && defined(__linux__) -#if defined(__mips_msa) - cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa"); -#elif defined(_MIPS_ARCH_LOONGSON3A) - cpu_info = MipsCpuCaps("/proc/cpuinfo", " mmi"); -#endif + cpu_info = MipsCpuCaps("/proc/cpuinfo"); cpu_info |= kCpuHasMIPS; #endif +#if defined(__loongarch__) && defined(__linux__) + cpu_info = LoongarchCpuCaps(); + cpu_info |= kCpuHasLOONGARCH; +#endif #if defined(__arm__) || defined(__aarch64__) // gcc -mfpu=neon defines __ARM_NEON__ // __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon. diff --git a/files/source/mjpeg_decoder.cc b/files/source/mjpeg_decoder.cc index 5c5e5ead..4ccf00a3 100644 --- a/files/source/mjpeg_decoder.cc +++ b/files/source/mjpeg_decoder.cc @@ -417,7 +417,6 @@ void init_source(j_decompress_ptr cinfo) { boolean fill_input_buffer(j_decompress_ptr cinfo) { BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data); if (buf_vec->pos >= buf_vec->len) { - assert(0 && "No more data"); // ERROR: No more data return FALSE; } @@ -430,7 +429,7 @@ boolean fill_input_buffer(j_decompress_ptr cinfo) { void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT jpeg_source_mgr* src = cinfo->src; size_t bytes = static_cast<size_t>(num_bytes); - if(bytes > src->bytes_in_buffer) { + if (bytes > src->bytes_in_buffer) { src->next_input_byte = nullptr; src->bytes_in_buffer = 0; } else { diff --git a/files/source/planar_functions.cc b/files/source/planar_functions.cc index 9cab230f..169d4a8f 100644 --- a/files/source/planar_functions.cc +++ b/files/source/planar_functions.cc @@ -10,6 +10,7 @@ #include "libyuv/planar_functions.h" +#include <assert.h> #include <string.h> // for memset() #include "libyuv/cpu_id.h" @@ -34,6 +35,9 @@ void CopyPlane(const uint8_t* src_y, int height) { int y; void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; + if (width <= 0 || height == 0) { + return; + } // Negative height means invert the image. if (height < 0) { height = -height; @@ -80,8 +84,6 @@ void CopyPlane(const uint8_t* src_y, } } -// TODO(fbarchard): Consider support for negative height. -// TODO(fbarchard): Consider stride measured in bytes. 
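// CopyPlane_16, rewritten below, forwards to the byte-wise CopyPlane: a
// pure copy is agnostic to sample width, so reinterpreting the plane as
// bytes lets 16-bit copies share the SSE2/ERMS/NEON copy kernels. The *2
// on the stride and width converts from uint16_t elements to bytes, which
// also resolves the removed TODO about strides measured in bytes.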
LIBYUV_API void CopyPlane_16(const uint16_t* src_y, int src_stride_y, @@ -89,36 +91,8 @@ void CopyPlane_16(const uint16_t* src_y, int dst_stride_y, int width, int height) { - int y; - void (*CopyRow)(const uint16_t* src, uint16_t* dst, int width) = CopyRow_16_C; - // Coalesce rows. - if (src_stride_y == width && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_y = dst_stride_y = 0; - } -#if defined(HAS_COPYROW_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { - CopyRow = CopyRow_16_SSE2; - } -#endif -#if defined(HAS_COPYROW_16_ERMS) - if (TestCpuFlag(kCpuHasERMS)) { - CopyRow = CopyRow_16_ERMS; - } -#endif -#if defined(HAS_COPYROW_16_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { - CopyRow = CopyRow_16_NEON; - } -#endif - - // Copy plane - for (y = 0; y < height; ++y) { - CopyRow(src_y, dst_y, width); - src_y += src_stride_y; - dst_y += dst_stride_y; - } + CopyPlane((const uint8_t*)src_y, src_stride_y * 2, (uint8_t*)dst_y, + dst_stride_y * 2, width * 2, height); } // Convert a plane of 16 bit data to 8 bit @@ -134,6 +108,9 @@ void Convert16To8Plane(const uint16_t* src_y, void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) = Convert16To8Row_C; + if (width <= 0 || height == 0) { + return; + } // Negative height means invert the image. if (height < 0) { height = -height; @@ -146,6 +123,14 @@ void Convert16To8Plane(const uint16_t* src_y, height = 1; src_stride_y = dst_stride_y = 0; } +#if defined(HAS_CONVERT16TO8ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Convert16To8Row = Convert16To8Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + Convert16To8Row = Convert16To8Row_NEON; + } + } +#endif #if defined(HAS_CONVERT16TO8ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { Convert16To8Row = Convert16To8Row_Any_SSSE3; @@ -184,6 +169,9 @@ void Convert8To16Plane(const uint8_t* src_y, void (*Convert8To16Row)(const uint8_t* src_y, uint16_t* dst_y, int scale, int width) = Convert8To16Row_C; + if (width <= 0 || height == 0) { + return; + } // Negative height means invert the image. if (height < 0) { height = -height; @@ -238,9 +226,12 @@ int I422Copy(const uint8_t* src_y, int width, int height) { int halfwidth = (width + 1) >> 1; - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } + // Negative height means invert the image. if (height < 0) { height = -height; @@ -276,7 +267,8 @@ int I444Copy(const uint8_t* src_y, int dst_stride_v, int width, int height) { - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -298,6 +290,49 @@ int I444Copy(const uint8_t* src_y, return 0; } +// Copy I210. +LIBYUV_API +int I210Copy(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + + // Negative height means invert the image. 
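// The inversion idiom used throughout libyuv, as in the lines that follow:
// point each source pointer at the last row and negate its stride, so the
// normal top-to-bottom loop walks the image bottom-up. For a 4-row plane
// with stride s, rows are visited at offsets 3s, 2s, s, 0.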
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + // Copy UV planes. + CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height); + CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height); + return 0; +} + // Copy I400. LIBYUV_API int I400ToI400(const uint8_t* src_y, @@ -349,6 +384,54 @@ int I420ToI400(const uint8_t* src_y, return 0; } +// Copy NV12. Supports inverting. +int NV12Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + + if (!src_y || !dst_y || !src_uv || !dst_uv || width <= 0 || height == 0) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_uv = src_uv + (halfheight - 1) * src_stride_uv; + src_stride_y = -src_stride_y; + src_stride_uv = -src_stride_uv; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth * 2, + halfheight); + return 0; +} + +// Copy NV21. Supports inverting. +int NV21Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + return NV12Copy(src_y, src_stride_y, src_vu, src_stride_vu, dst_y, + dst_stride_y, dst_vu, dst_stride_vu, width, height); +} + // Support function for NV12 etc UV channels. // Width and height are plane sizes (typically half pixel width). LIBYUV_API @@ -363,6 +446,9 @@ void SplitUVPlane(const uint8_t* src_uv, int y; void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) = SplitUVRow_C; + if (width <= 0 || height == 0) { + return; + } // Negative height means invert the image. if (height < 0) { height = -height; @@ -410,11 +496,11 @@ void SplitUVPlane(const uint8_t* src_uv, } } #endif -#if defined(HAS_SPLITUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SplitUVRow = SplitUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - SplitUVRow = SplitUVRow_MMI; +#if defined(HAS_SPLITUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + SplitUVRow = SplitUVRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_LSX; } } #endif @@ -440,6 +526,9 @@ void MergeUVPlane(const uint8_t* src_u, int y; void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) = MergeUVRow_C; + if (width <= 0 || height == 0) { + return; + } // Negative height means invert the image. 
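// (The same negative-height guard follows here in SplitUVPlane.)
// Note that NV21Copy above simply calls NV12Copy: a copy moves the
// interleaved chroma plane verbatim, so the byte order within each pair is
// irrelevant; only code that interprets U and V must distinguish the two.
// SplitUVPlane is the workhorse for NV12 -> I420-style deinterleaving; a
// hypothetical call for a 4:2:0 image passes half-size plane dimensions:
//
//   SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u,
//                dst_v, dst_stride_v, (width + 1) / 2, (height + 1) / 2);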
if (height < 0) { height = -height; @@ -485,11 +574,11 @@ void MergeUVPlane(const uint8_t* src_u, } } #endif -#if defined(HAS_MERGEUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MergeUVRow = MergeUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - MergeUVRow = MergeUVRow_MMI; +#if defined(HAS_MERGEUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MergeUVRow = MergeUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + MergeUVRow = MergeUVRow_LSX; } } #endif @@ -503,6 +592,289 @@ void MergeUVPlane(const uint8_t* src_u, } } +// Support function for P010 etc UV channels. +// Width and height are plane sizes (typically half pixel width). +LIBYUV_API +void SplitUVPlane_16(const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + int depth) { + int y; + void (*SplitUVRow_16)(const uint16_t* src_uv, uint16_t* dst_u, + uint16_t* dst_v, int depth, int width) = + SplitUVRow_16_C; + if (width <= 0 || height == 0) { + return; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_u = dst_u + (height - 1) * dst_stride_u; + dst_v = dst_v + (height - 1) * dst_stride_v; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + // Coalesce rows. + if (src_stride_uv == width * 2 && dst_stride_u == width && + dst_stride_v == width) { + width *= height; + height = 1; + src_stride_uv = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_SPLITUVROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitUVRow_16 = SplitUVRow_16_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + SplitUVRow_16 = SplitUVRow_16_AVX2; + } + } +#endif +#if defined(HAS_SPLITUVROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitUVRow_16 = SplitUVRow_16_Any_NEON; + if (IS_ALIGNED(width, 8)) { + SplitUVRow_16 = SplitUVRow_16_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Copy a row of UV. + SplitUVRow_16(src_uv, dst_u, dst_v, depth, width); + dst_u += dst_stride_u; + dst_v += dst_stride_v; + src_uv += src_stride_uv; + } +} + +LIBYUV_API +void MergeUVPlane_16(const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height, + int depth) { + int y; + void (*MergeUVRow_16)(const uint16_t* src_u, const uint16_t* src_v, + uint16_t* dst_uv, int depth, int width) = + MergeUVRow_16_C; + assert(depth >= 8); + assert(depth <= 16); + if (width <= 0 || height == 0) { + return; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_uv = dst_uv + (height - 1) * dst_stride_uv; + dst_stride_uv = -dst_stride_uv; + } + // Coalesce rows. + if (src_stride_u == width && src_stride_v == width && + dst_stride_uv == width * 2) { + width *= height; + height = 1; + src_stride_u = src_stride_v = dst_stride_uv = 0; + } +#if defined(HAS_MERGEUVROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow_16 = MergeUVRow_16_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeUVRow_16 = MergeUVRow_16_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow_16 = MergeUVRow_16_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeUVRow_16 = MergeUVRow_16_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Merge a row of U and V into a row of UV. 
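A hedged P010-to-I010 chroma sketch for SplitUVPlane_16 above: the depth argument appears to let the 16-bit split rows shift MSB-aligned P010 samples down to LSB alignment while deinterleaving (verify the shift direction against the row implementations before relying on it; names hypothetical):

// Deinterleave P010's UV plane into I010-style U and V planes.
// Strides are in uint16_t units; chroma planes are half size in 4:2:0.
void P010ChromaToI010(const uint16_t* src_uv, int src_stride_uv,
                      uint16_t* dst_u, int dst_stride_u,
                      uint16_t* dst_v, int dst_stride_v,
                      int width, int height) {
  libyuv::SplitUVPlane_16(src_uv, src_stride_uv, dst_u, dst_stride_u,
                          dst_v, dst_stride_v, (width + 1) / 2,
                          (height + 1) / 2, /*depth=*/10);
}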
+ MergeUVRow_16(src_u, src_v, dst_uv, depth, width); + src_u += src_stride_u; + src_v += src_stride_v; + dst_uv += dst_stride_uv; + } +} + +// Convert plane from lsb to msb +LIBYUV_API +void ConvertToMSBPlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height, + int depth) { + int y; + int scale = 1 << (16 - depth); + void (*MultiplyRow_16)(const uint16_t* src_y, uint16_t* dst_y, int scale, + int width) = MultiplyRow_16_C; + if (width <= 0 || height == 0) { + return; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } + +#if defined(HAS_MULTIPLYROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MultiplyRow_16 = MultiplyRow_16_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + MultiplyRow_16 = MultiplyRow_16_AVX2; + } + } +#endif +#if defined(HAS_MULTIPLYROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MultiplyRow_16 = MultiplyRow_16_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MultiplyRow_16 = MultiplyRow_16_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + MultiplyRow_16(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Convert plane from msb to lsb +LIBYUV_API +void ConvertToLSBPlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height, + int depth) { + int y; + int scale = 1 << depth; + void (*DivideRow)(const uint16_t* src_y, uint16_t* dst_y, int scale, + int width) = DivideRow_16_C; + if (width <= 0 || height == 0) { + return; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } + +#if defined(HAS_DIVIDEROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + DivideRow = DivideRow_16_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + DivideRow = DivideRow_16_AVX2; + } + } +#endif +#if defined(HAS_DIVIDEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + DivideRow = DivideRow_16_Any_NEON; + if (IS_ALIGNED(width, 16)) { + DivideRow = DivideRow_16_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + DivideRow(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Swap U and V channels in interleaved UV plane. +LIBYUV_API +void SwapUVPlane(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + int y; + void (*SwapUVRow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) = + SwapUVRow_C; + if (width <= 0 || height == 0) { + return; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uv = src_uv + (height - 1) * src_stride_uv; + src_stride_uv = -src_stride_uv; + } + // Coalesce rows. 
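ConvertToMSBPlane_16 and ConvertToLSBPlane_16 (just above) move samples between the LSB alignment of I010/I012 and the MSB alignment of P010/P012: one multiplies by 1 << (16 - depth), the other divides it back out. A sketch for 10-bit data (hypothetical names):

// I010-style LSB alignment (0..1023) to P010-style MSB alignment:
// each sample is scaled by 1 << (16 - 10) == 64, per the scale computed above.
void AlignLumaToMSB(const uint16_t* src_y, int src_stride_y,
                    uint16_t* dst_y, int dst_stride_y, int width, int height) {
  libyuv::ConvertToMSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y,
                               width, height, /*depth=*/10);
}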
+ if (src_stride_uv == width * 2 && dst_stride_vu == width * 2) { + width *= height; + height = 1; + src_stride_uv = dst_stride_vu = 0; + } + +#if defined(HAS_SWAPUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + SwapUVRow = SwapUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + SwapUVRow = SwapUVRow_SSSE3; + } + } +#endif +#if defined(HAS_SWAPUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SwapUVRow = SwapUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + SwapUVRow = SwapUVRow_AVX2; + } + } +#endif +#if defined(HAS_SWAPUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SwapUVRow = SwapUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SwapUVRow = SwapUVRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + SwapUVRow(src_uv, dst_vu, width); + src_uv += src_stride_uv; + dst_vu += dst_stride_vu; + } +} + // Convert NV21 to NV12. LIBYUV_API int NV21ToNV12(const uint8_t* src_y, @@ -515,49 +887,150 @@ int NV21ToNV12(const uint8_t* src_y, int dst_stride_uv, int width, int height) { - int y; - void (*UVToVURow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) = - UVToVURow_C; - int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; + if (!src_vu || !dst_uv || width <= 0 || height == 0) { return -1; } + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + // Negative height means invert the image. if (height < 0) { height = -height; halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; src_vu = src_vu + (halfheight - 1) * src_stride_vu; - src_stride_y = -src_stride_y; src_stride_vu = -src_stride_vu; } - // Coalesce rows. - if (src_stride_vu == halfwidth * 2 && dst_stride_uv == halfwidth * 2) { - halfwidth *= halfheight; - halfheight = 1; - src_stride_vu = dst_stride_uv = 0; + + SwapUVPlane(src_vu, src_stride_vu, dst_uv, dst_stride_uv, halfwidth, + halfheight); + return 0; +} + +// Detile a plane of data. +// A tile width of 16 is assumed. +// tile_height is 16 or 32 for MM21. +// src_stride_y is bytes per row of the source, ignoring tiling, e.g. 640. +// TODO: More detile row functions. + +LIBYUV_API +void DetilePlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height, + int tile_height) { + const ptrdiff_t src_tile_stride = 16 * tile_height; + int y; + void (*DetileRow)(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, + int width) = DetileRow_C; + assert(tile_height > 0); + assert(src_stride_y > 0); + + if (width <= 0 || height == 0) { + return; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; } -#if defined(HAS_UVToVUROW_NEON) +#if defined(HAS_DETILEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + DetileRow = DetileRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + DetileRow = DetileRow_SSE2; + } + } +#endif +#if defined(HAS_DETILEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - UVToVURow = UVToVURow_Any_NEON; - if (IS_ALIGNED(halfwidth, 16)) { - UVToVURow = UVToVURow_NEON; + DetileRow = DetileRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + DetileRow = DetileRow_NEON; } } #endif - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + + // Detile plane + for (y = 0; y < height; ++y) { + DetileRow(src_y, src_tile_stride, dst_y, width); + dst_y += dst_stride_y; + src_y += 16; + // Advance to next row of tiles.
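A hedged MM21 example for DetilePlane: luma commonly uses 16x32 tiles, so a caller untiles Y as below; the chroma counterpart is DetileSplitUVPlane in the next hunk. The geometry is hypothetical:

// Untile the Y plane of an MM21 capture buffer into a linear plane.
// src_stride_y is the untiled byte width of a row, e.g. 640.
void UntileMM21Luma(const uint8_t* src_y, int src_stride_y,
                    uint8_t* dst_y, int dst_stride_y, int width, int height) {
  libyuv::DetilePlane(src_y, src_stride_y, dst_y, dst_stride_y,
                      width, height, /*tile_height=*/32);
}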
+ if ((y & (tile_height - 1)) == (tile_height - 1)) { + src_y = src_y - src_tile_stride + src_stride_y * tile_height; + } } +} - for (y = 0; y < halfheight; ++y) { - UVToVURow(src_vu, dst_uv, halfwidth); - src_vu += src_stride_vu; - dst_uv += dst_stride_uv; +LIBYUV_API +void DetileSplitUVPlane(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int tile_height) { + const ptrdiff_t src_tile_stride = 16 * tile_height; + int y; + void (*DetileSplitUVRow)(const uint8_t* src, ptrdiff_t src_tile_stride, + uint8_t* dst_u, uint8_t* dst_v, int width) = + DetileSplitUVRow_C; + assert(tile_height > 0); + assert(src_stride_uv > 0); + + if (width <= 0 || height == 0) { + return; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_u = dst_u + (height - 1) * dst_stride_u; + dst_stride_u = -dst_stride_u; + dst_v = dst_v + (height - 1) * dst_stride_v; + dst_stride_v = -dst_stride_v; + } + +#if defined(HAS_DETILESPLITUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + DetileSplitUVRow = DetileSplitUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + DetileSplitUVRow = DetileSplitUVRow_SSSE3; + } + } +#endif +#if defined(HAS_DETILESPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + DetileSplitUVRow = DetileSplitUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + DetileSplitUVRow = DetileSplitUVRow_NEON; + } + } +#endif + + // Detile plane + for (y = 0; y < height; ++y) { + DetileSplitUVRow(src_uv, src_tile_stride, dst_u, dst_v, width); + dst_u += dst_stride_u; + dst_v += dst_stride_v; + src_uv += 16; + // Advance to next row of tiles. + if ((y & (tile_height - 1)) == (tile_height - 1)) { + src_uv = src_uv - src_tile_stride + src_stride_uv * tile_height; + } } - return 0; } // Support function for NV12 etc RGB channels. @@ -576,6 +1049,9 @@ void SplitRGBPlane(const uint8_t* src_rgb, int y; void (*SplitRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width) = SplitRGBRow_C; + if (width <= 0 || height == 0) { + return; + } // Negative height means invert the image. if (height < 0) { height = -height; @@ -609,14 +1085,6 @@ void SplitRGBPlane(const uint8_t* src_rgb, } } #endif -#if defined(HAS_SPLITRGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SplitRGBRow = SplitRGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - SplitRGBRow = SplitRGBRow_MMI; - } - } -#endif for (y = 0; y < height; ++y) { // Copy a row of RGB. @@ -643,6 +1111,9 @@ void MergeRGBPlane(const uint8_t* src_r, void (*MergeRGBRow)(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, uint8_t* dst_rgb, int width) = MergeRGBRow_C; + if (width <= 0 || height == 0) { + return; + } // Coalesce rows. // Negative height means invert the image. if (height < 0) { @@ -673,86 +1144,673 @@ void MergeRGBPlane(const uint8_t* src_r, } } #endif -#if defined(HAS_MERGERGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MergeRGBRow = MergeRGBRow_Any_MMI; + + for (y = 0; y < height; ++y) { + // Merge rows of R, G and B into a row of RGB.
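DetileSplitUVPlane untiles and deinterleaves in one pass, so MM21 chroma can land directly in I420's U and V planes. A sketch under two assumptions worth checking against the header: the chroma tile height is half the luma's, and the width argument counts interleaved UV bytes (two per chroma pixel), matching how each row advances 16 bytes per tile column:

// MM21 chroma: 16x16 tiles of interleaved UV into planar U and V.
void UntileMM21Chroma(const uint8_t* src_uv, int src_stride_uv,
                      uint8_t* dst_u, int dst_stride_u,
                      uint8_t* dst_v, int dst_stride_v,
                      int width, int height) {  // luma-plane dimensions
  libyuv::DetileSplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u,
                             dst_v, dst_stride_v, (width + 1) & ~1,
                             (height + 1) / 2, /*tile_height=*/16);
}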
+ MergeRGBRow(src_r, src_g, src_b, dst_rgb, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_rgb += dst_stride_rgb; + } +} + +LIBYUV_NOINLINE +void SplitARGBPlaneAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height) { + int y; + void (*SplitARGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, + uint8_t* dst_b, uint8_t* dst_a, int width) = + SplitARGBRow_C; + + assert(height > 0); + + if (src_stride_argb == width * 4 && dst_stride_r == width && + dst_stride_g == width && dst_stride_b == width && dst_stride_a == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = + dst_stride_a = 0; + } + +#if defined(HAS_SPLITARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SplitARGBRow = SplitARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - MergeRGBRow = MergeRGBRow_MMI; + SplitARGBRow = SplitARGBRow_SSE2; + } + } +#endif +#if defined(HAS_SPLITARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + SplitARGBRow = SplitARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + SplitARGBRow = SplitARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_SPLITARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitARGBRow = SplitARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + SplitARGBRow = SplitARGBRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitARGBRow = SplitARGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitARGBRow = SplitARGBRow_NEON; } } #endif for (y = 0; y < height; ++y) { - // Merge a row of U and V into a row of RGB. - MergeRGBRow(src_r, src_g, src_b, dst_rgb, width); + SplitARGBRow(src_argb, dst_r, dst_g, dst_b, dst_a, width); + dst_r += dst_stride_r; + dst_g += dst_stride_g; + dst_b += dst_stride_b; + dst_a += dst_stride_a; + src_argb += src_stride_argb; + } +} + +LIBYUV_NOINLINE +void SplitARGBPlaneOpaque(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { + int y; + void (*SplitXRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, + uint8_t* dst_b, int width) = SplitXRGBRow_C; + assert(height > 0); + + if (src_stride_argb == width * 4 && dst_stride_r == width && + dst_stride_g == width && dst_stride_b == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = 0; + } + +#if defined(HAS_SPLITXRGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SplitXRGBRow = SplitXRGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + SplitXRGBRow = SplitXRGBRow_SSE2; + } + } +#endif +#if defined(HAS_SPLITXRGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + SplitXRGBRow = SplitXRGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + SplitXRGBRow = SplitXRGBRow_SSSE3; + } + } +#endif +#if defined(HAS_SPLITXRGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitXRGBRow = SplitXRGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + SplitXRGBRow = SplitXRGBRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITXRGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitXRGBRow = SplitXRGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitXRGBRow = SplitXRGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + SplitXRGBRow(src_argb, dst_r, dst_g, dst_b, width); + dst_r += dst_stride_r; + dst_g += dst_stride_g; + dst_b += 
dst_stride_b; + src_argb += src_stride_argb; + } +} + +LIBYUV_API +void SplitARGBPlane(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height) { + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_r = dst_r + (height - 1) * dst_stride_r; + dst_g = dst_g + (height - 1) * dst_stride_g; + dst_b = dst_b + (height - 1) * dst_stride_b; + dst_a = dst_a + (height - 1) * dst_stride_a; + dst_stride_r = -dst_stride_r; + dst_stride_g = -dst_stride_g; + dst_stride_b = -dst_stride_b; + dst_stride_a = -dst_stride_a; + } + + if (dst_a == NULL) { + SplitARGBPlaneOpaque(src_argb, src_stride_argb, dst_r, dst_stride_r, dst_g, + dst_stride_g, dst_b, dst_stride_b, width, height); + } else { + SplitARGBPlaneAlpha(src_argb, src_stride_argb, dst_r, dst_stride_r, dst_g, + dst_stride_g, dst_b, dst_stride_b, dst_a, dst_stride_a, + width, height); + } +} + +LIBYUV_NOINLINE +void MergeARGBPlaneAlpha(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*MergeARGBRow)(const uint8_t* src_r, const uint8_t* src_g, + const uint8_t* src_b, const uint8_t* src_a, + uint8_t* dst_argb, int width) = MergeARGBRow_C; + + assert(height > 0); + + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + src_stride_a == width && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = src_stride_a = + dst_stride_argb = 0; + } +#if defined(HAS_MERGEARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeARGBRow = MergeARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + MergeARGBRow = MergeARGBRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeARGBRow = MergeARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeARGBRow = MergeARGBRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeARGBRow = MergeARGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MergeARGBRow = MergeARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + MergeARGBRow(src_r, src_g, src_b, src_a, dst_argb, width); src_r += src_stride_r; src_g += src_stride_g; src_b += src_stride_b; - dst_rgb += dst_stride_rgb; + src_a += src_stride_a; + dst_argb += dst_stride_argb; } } -// Mirror a plane of data. 
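SplitARGBPlane (above) treats a NULL dst_a as "no alpha wanted" and routes to the opaque variant, and MergeARGBPlane below mirrors that for src_a. So a caller that only cares about color simply passes NULL (names hypothetical):

// Planarize ARGB, skipping the alpha channel entirely.
void PlanarizeColor(const uint8_t* src_argb, int src_stride_argb,
                    uint8_t* dst_r, int dst_stride_r,
                    uint8_t* dst_g, int dst_stride_g,
                    uint8_t* dst_b, int dst_stride_b, int width, int height) {
  // NULL alpha selects the SplitARGBPlaneOpaque path.
  libyuv::SplitARGBPlane(src_argb, src_stride_argb, dst_r, dst_stride_r,
                         dst_g, dst_stride_g, dst_b, dst_stride_b,
                         /*dst_a=*/NULL, /*dst_stride_a=*/0, width, height);
}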
-void MirrorPlane(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height) { +LIBYUV_NOINLINE +void MergeARGBPlaneOpaque(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; + void (*MergeXRGBRow)(const uint8_t* src_r, const uint8_t* src_g, + const uint8_t* src_b, uint8_t* dst_argb, int width) = + MergeXRGBRow_C; + + assert(height > 0); + + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0; + } +#if defined(HAS_MERGEXRGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeXRGBRow = MergeXRGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + MergeXRGBRow = MergeXRGBRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEXRGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeXRGBRow = MergeXRGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeXRGBRow = MergeXRGBRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEXRGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeXRGBRow = MergeXRGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MergeXRGBRow = MergeXRGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + MergeXRGBRow(src_r, src_g, src_b, dst_argb, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_argb += dst_stride_argb; + } +} + +LIBYUV_API +void MergeARGBPlane(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { // Negative height means invert the image. if (height < 0) { height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_MIRRORROW_NEON) + + if (src_a == NULL) { + MergeARGBPlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, dst_argb, dst_stride_argb, width, + height); + } else { + MergeARGBPlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, src_a, src_stride_a, dst_argb, + dst_stride_argb, width, height); + } +} + +// TODO(yuan): Support 2 bit alpha channel. +LIBYUV_API +void MergeXR30Plane(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height, + int depth) { + int y; + void (*MergeXR30Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, uint8_t* dst_ar30, int depth, + int width) = MergeXR30Row_C; + + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } + // Coalesce rows. 
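MergeXR30Plane packs three 10-bit planes into 2:10:10:10 AR30 pixels; per the TODO above, alpha is currently emitted opaque rather than taken from a fourth plane. A sketch (declarations from libyuv/planar_functions.h; names hypothetical):

// Three LSB-aligned 10-bit planes into 32-bit AR30.
void PackAR30(const uint16_t* src_r, int src_stride_r,
              const uint16_t* src_g, int src_stride_g,
              const uint16_t* src_b, int src_stride_b,
              uint8_t* dst_ar30, int dst_stride_ar30, int width, int height) {
  libyuv::MergeXR30Plane(src_r, src_stride_r, src_g, src_stride_g,
                         src_b, src_stride_b, dst_ar30, dst_stride_ar30,
                         width, height, /*depth=*/10);
}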
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_ar30 == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_ar30 = 0; + } +#if defined(HAS_MERGEXR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeXR30Row = MergeXR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeXR30Row = MergeXR30Row_AVX2; + } + } +#endif +#if defined(HAS_MERGEXR30ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - MirrorRow = MirrorRow_Any_NEON; + if (depth == 10) { + MergeXR30Row = MergeXR30Row_10_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeXR30Row = MergeXR30Row_10_NEON; + } + } else { + MergeXR30Row = MergeXR30Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeXR30Row = MergeXR30Row_NEON; + } + } + } +#endif + + for (y = 0; y < height; ++y) { + MergeXR30Row(src_r, src_g, src_b, dst_ar30, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_ar30 += dst_stride_ar30; + } +} + +LIBYUV_NOINLINE +static void MergeAR64PlaneAlpha(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + const uint16_t* src_a, + int src_stride_a, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height, + int depth) { + int y; + void (*MergeAR64Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, const uint16_t* src_a, + uint16_t* dst_argb, int depth, int width) = + MergeAR64Row_C; + + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + src_stride_a == width && dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = src_stride_a = + dst_stride_ar64 = 0; + } +#if defined(HAS_MERGEAR64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeAR64Row = MergeAR64Row_Any_AVX2; if (IS_ALIGNED(width, 16)) { - MirrorRow = MirrorRow_NEON; + MergeAR64Row = MergeAR64Row_AVX2; } } #endif -#if defined(HAS_MIRRORROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - MirrorRow = MirrorRow_Any_SSSE3; +#if defined(HAS_MERGEAR64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeAR64Row = MergeAR64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeAR64Row = MergeAR64Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + MergeAR64Row(src_r, src_g, src_b, src_a, dst_ar64, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + src_a += src_stride_a; + dst_ar64 += dst_stride_ar64; + } +} + +LIBYUV_NOINLINE +static void MergeAR64PlaneOpaque(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height, + int depth) { + int y; + void (*MergeXR64Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, uint16_t* dst_argb, int depth, + int width) = MergeXR64Row_C; + + // Coalesce rows. 
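The MergeAR64Plane entry point (completed below) follows the same NULL-alpha split as MergeARGBPlane, with depth telling the rows how far to shift samples up to fill 16 bits. A hedged sketch with 10-bit planes:

// Widen 10-bit planar RGB to 16-bit-per-channel AR64; NULL src_a takes
// the opaque path, which synthesizes a full alpha channel.
void PlanarToAR64(const uint16_t* src_r, int src_stride_r,
                  const uint16_t* src_g, int src_stride_g,
                  const uint16_t* src_b, int src_stride_b,
                  uint16_t* dst_ar64, int dst_stride_ar64,
                  int width, int height) {
  libyuv::MergeAR64Plane(src_r, src_stride_r, src_g, src_stride_g,
                         src_b, src_stride_b, /*src_a=*/NULL,
                         /*src_stride_a=*/0, dst_ar64, dst_stride_ar64,
                         width, height, /*depth=*/10);
}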
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_ar64 = 0; + } +#if defined(HAS_MERGEXR64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeXR64Row = MergeXR64Row_Any_AVX2; if (IS_ALIGNED(width, 16)) { - MirrorRow = MirrorRow_SSSE3; + MergeXR64Row = MergeXR64Row_AVX2; } } #endif -#if defined(HAS_MIRRORROW_AVX2) +#if defined(HAS_MERGEXR64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeXR64Row = MergeXR64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeXR64Row = MergeXR64Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + MergeXR64Row(src_r, src_g, src_b, dst_ar64, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_ar64 += dst_stride_ar64; + } +} + +LIBYUV_API +void MergeAR64Plane(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + const uint16_t* src_a, + int src_stride_a, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height, + int depth) { + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar64 = dst_ar64 + (height - 1) * dst_stride_ar64; + dst_stride_ar64 = -dst_stride_ar64; + } + + if (src_a == NULL) { + MergeAR64PlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, dst_ar64, dst_stride_ar64, width, height, + depth); + } else { + MergeAR64PlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, src_a, src_stride_a, dst_ar64, + dst_stride_ar64, width, height, depth); + } +} + +LIBYUV_NOINLINE +static void MergeARGB16To8PlaneAlpha(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int depth) { + int y; + void (*MergeARGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, const uint16_t* src_a, + uint8_t* dst_argb, int depth, int width) = + MergeARGB16To8Row_C; + + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + src_stride_a == width && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = src_stride_a = + dst_stride_argb = 0; + } +#if defined(HAS_MERGEARGB16TO8ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - MirrorRow = MirrorRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - MirrorRow = MirrorRow_AVX2; + MergeARGB16To8Row = MergeARGB16To8Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeARGB16To8Row = MergeARGB16To8Row_AVX2; } } #endif -#if defined(HAS_MIRRORROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MirrorRow = MirrorRow_Any_MSA; - if (IS_ALIGNED(width, 64)) { - MirrorRow = MirrorRow_MSA; +#if defined(HAS_MERGEARGB16TO8ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeARGB16To8Row = MergeARGB16To8Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeARGB16To8Row = MergeARGB16To8Row_NEON; } } #endif -#if defined(HAS_MIRRORROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MirrorRow = MirrorRow_Any_MMI; + + for (y = 0; y < height; ++y) { + MergeARGB16To8Row(src_r, src_g, src_b, src_a, dst_argb, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + src_a += src_stride_a; + dst_argb += dst_stride_argb; + } +} + +LIBYUV_NOINLINE +static void 
MergeARGB16To8PlaneOpaque(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int depth) { + int y; + void (*MergeXRGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, uint8_t* dst_argb, int depth, + int width) = MergeXRGB16To8Row_C; + + // Coalesce rows. + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0; + } +#if defined(HAS_MERGEXRGB16TO8ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeXRGB16To8Row = MergeXRGB16To8Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeXRGB16To8Row = MergeXRGB16To8Row_AVX2; + } + } +#endif +#if defined(HAS_MERGEXRGB16TO8ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeXRGB16To8Row = MergeXRGB16To8Row_Any_NEON; if (IS_ALIGNED(width, 8)) { - MirrorRow = MirrorRow_MMI; + MergeXRGB16To8Row = MergeXRGB16To8Row_NEON; } } #endif - // Mirror plane for (y = 0; y < height; ++y) { - MirrorRow(src_y, dst_y, width); - src_y += src_stride_y; - dst_y += dst_stride_y; + MergeXRGB16To8Row(src_r, src_g, src_b, dst_argb, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_argb += dst_stride_argb; + } +} + +LIBYUV_API +void MergeARGB16To8Plane(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int depth) { + // Negative height means invert the image. 
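MergeARGB16To8Plane is the narrowing sibling: high-bit-depth planes in, ordinary 8-bit interleaved ARGB out, again with NULL alpha selecting the opaque path. A sketch for 10-bit sources (names hypothetical):

// Narrow 10-bit planar RGB directly to 8-bit ARGB in one pass.
void TenBitPlanarToARGB(const uint16_t* src_r, int src_stride_r,
                        const uint16_t* src_g, int src_stride_g,
                        const uint16_t* src_b, int src_stride_b,
                        uint8_t* dst_argb, int dst_stride_argb,
                        int width, int height) {
  libyuv::MergeARGB16To8Plane(src_r, src_stride_r, src_g, src_stride_g,
                              src_b, src_stride_b, /*src_a=*/NULL,
                              /*src_stride_a=*/0, dst_argb, dst_stride_argb,
                              width, height, /*depth=*/10);
}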
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + + if (src_a == NULL) { + MergeARGB16To8PlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, dst_argb, dst_stride_argb, width, + height, depth); + } else { + MergeARGB16To8PlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, src_a, src_stride_a, dst_argb, + dst_stride_argb, width, height, depth); } } @@ -820,7 +1878,7 @@ int YUY2ToI422(const uint8_t* src_yuy2, } } #endif -#if defined(HAS_YUY2TOYROW_MSA) +#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUV422ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { YUY2ToYRow = YUY2ToYRow_Any_MSA; YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA; @@ -830,13 +1888,13 @@ int YUY2ToI422(const uint8_t* src_yuy2, } } #endif -#if defined(HAS_YUY2TOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - YUY2ToYRow = YUY2ToYRow_Any_MMI; - YUY2ToUV422Row = YUY2ToUV422Row_Any_MMI; - if (IS_ALIGNED(width, 8)) { - YUY2ToYRow = YUY2ToYRow_MMI; - YUY2ToUV422Row = YUY2ToUV422Row_MMI; +#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUV422ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + YUY2ToYRow = YUY2ToYRow_Any_LASX; + YUY2ToUV422Row = YUY2ToUV422Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_LASX; + YUY2ToUV422Row = YUY2ToUV422Row_LASX; } } #endif @@ -916,7 +1974,7 @@ int UYVYToI422(const uint8_t* src_uyvy, } } #endif -#if defined(HAS_UYVYTOYROW_MSA) +#if defined(HAS_UYVYTOYROW_MSA) && defined(HAS_UYVYTOUV422ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { UYVYToYRow = UYVYToYRow_Any_MSA; UYVYToUV422Row = UYVYToUV422Row_Any_MSA; @@ -926,13 +1984,13 @@ int UYVYToI422(const uint8_t* src_uyvy, } } #endif -#if defined(HAS_UYVYTOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - UYVYToYRow = UYVYToYRow_Any_MMI; - UYVYToUV422Row = UYVYToUV422Row_Any_MMI; - if (IS_ALIGNED(width, 16)) { - UYVYToYRow = UYVYToYRow_MMI; - UYVYToUV422Row = UYVYToUV422Row_MMI; +#if defined(HAS_UYVYTOYROW_LASX) && defined(HAS_UYVYTOUV422ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + UYVYToYRow = UYVYToYRow_Any_LASX; + UYVYToUV422Row = UYVYToUV422Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_LASX; + UYVYToUV422Row = UYVYToUV422Row_LASX; } } #endif @@ -1006,23 +2064,214 @@ int YUY2ToY(const uint8_t* src_yuy2, } } #endif -#if defined(HAS_YUY2TOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - YUY2ToYRow = YUY2ToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - YUY2ToYRow = YUY2ToYRow_MMI; + + for (y = 0; y < height; ++y) { + YUY2ToYRow(src_yuy2, dst_y, width); + src_yuy2 += src_stride_yuy2; + dst_y += dst_stride_y; + } + return 0; +} + +// Convert UYVY to Y. +LIBYUV_API +int UYVYToY(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = + UYVYToYRow_C; + if (!src_uyvy || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } + // Coalesce rows. 
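YUY2ToY and the new UYVYToY are useful when only luma is needed, for example to feed a grayscale analysis stage, since the chroma bytes are never touched. For instance:

// Extract the Y channel of packed UYVY into a gray plane.
int UYVYToGray(const uint8_t* src_uyvy, int src_stride_uyvy,
               uint8_t* dst_y, int dst_stride_y, int width, int height) {
  return libyuv::UYVYToY(src_uyvy, src_stride_uyvy, dst_y, dst_stride_y,
                         width, height);
}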
+ if (src_stride_uyvy == width * 2 && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_uyvy = dst_stride_y = 0; + } +#if defined(HAS_UYVYTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + UYVYToYRow = UYVYToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_SSE2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + UYVYToYRow = UYVYToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_AVX2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UYVYToYRow = UYVYToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_NEON; + } + } +#endif +#if defined(HAS_UYVYTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToYRow = UYVYToYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_MSA; } } #endif for (y = 0; y < height; ++y) { - YUY2ToYRow(src_yuy2, dst_y, width); - src_yuy2 += src_stride_yuy2; + UYVYToYRow(src_uyvy, dst_y, width); + src_uyvy += src_stride_uyvy; dst_y += dst_stride_y; } return 0; } +// Mirror a plane of data. +// See Also I400Mirror +LIBYUV_API +void MirrorPlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } +#if defined(HAS_MIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MirrorRow = MirrorRow_Any_NEON; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_NEON; + } + } +#endif +#if defined(HAS_MIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + MirrorRow = MirrorRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSSE3; + } + } +#endif +#if defined(HAS_MIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MirrorRow = MirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_AVX2; + } + } +#endif +#if defined(HAS_MIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MirrorRow = MirrorRow_Any_MSA; + if (IS_ALIGNED(width, 64)) { + MirrorRow = MirrorRow_MSA; + } + } +#endif +#if defined(HAS_MIRRORROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + MirrorRow = MirrorRow_Any_LASX; + if (IS_ALIGNED(width, 64)) { + MirrorRow = MirrorRow_LASX; + } + } +#endif + + // Mirror plane + for (y = 0; y < height; ++y) { + MirrorRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Mirror a plane of UV data. +LIBYUV_API +void MirrorUVPlane(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst, int width) = + MirrorUVRow_C; + // Negative height means invert the image. 
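Since MirrorPlane reverses each row and the negative-height convention flips vertically, combining the two rotates a plane by 180 degrees:

// 180-degree rotation of a single plane: horizontal mirror plus
// the negative-height vertical flip.
void Rotate180Plane(const uint8_t* src_y, int src_stride_y,
                    uint8_t* dst_y, int dst_stride_y, int width, int height) {
  libyuv::MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y,
                      width, -height);
}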
+ if (height < 0) { + height = -height; + src_uv = src_uv + (height - 1) * src_stride_uv; + src_stride_uv = -src_stride_uv; + } +#if defined(HAS_MIRRORUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MirrorUVRow = MirrorUVRow_Any_NEON; + if (IS_ALIGNED(width, 32)) { + MirrorUVRow = MirrorUVRow_NEON; + } + } +#endif +#if defined(HAS_MIRRORUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + MirrorUVRow = MirrorUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + MirrorUVRow = MirrorUVRow_SSSE3; + } + } +#endif +#if defined(HAS_MIRRORUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MirrorUVRow = MirrorUVRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MirrorUVRow = MirrorUVRow_AVX2; + } + } +#endif +#if defined(HAS_MIRRORUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MirrorUVRow = MirrorUVRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + MirrorUVRow = MirrorUVRow_MSA; + } + } +#endif +#if defined(HAS_MIRRORUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + MirrorUVRow = MirrorUVRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + MirrorUVRow = MirrorUVRow_LASX; + } + } +#endif + + // MirrorUV plane + for (y = 0; y < height; ++y) { + MirrorUVRow(src_uv, dst_uv, width); + src_uv += src_stride_uv; + dst_uv += dst_stride_uv; + } +} + // Mirror I400 with optional flipping LIBYUV_API int I400Mirror(const uint8_t* src_y, @@ -1063,10 +2312,12 @@ int I420Mirror(const uint8_t* src_y, int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 || + + if (!src_y || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } + // Negative height means invert the image. if (height < 0) { height = -height; @@ -1087,6 +2338,43 @@ int I420Mirror(const uint8_t* src_y, return 0; } +// NV12 mirror. +LIBYUV_API +int NV12Mirror(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + + if (!src_y || !src_uv || !dst_uv || width <= 0 || height == 0) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_uv = src_uv + (halfheight - 1) * src_stride_uv; + src_stride_y = -src_stride_y; + src_stride_uv = -src_stride_uv; + } + + if (dst_y) { + MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + MirrorUVPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth, + halfheight); + return 0; +} + // ARGB mirror. LIBYUV_API int ARGBMirror(const uint8_t* src_argb, @@ -1110,7 +2398,7 @@ int ARGBMirror(const uint8_t* src_argb, #if defined(HAS_ARGBMIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBMirrorRow = ARGBMirrorRow_Any_NEON; - if (IS_ALIGNED(width, 4)) { + if (IS_ALIGNED(width, 8)) { ARGBMirrorRow = ARGBMirrorRow_NEON; } } @@ -1139,11 +2427,11 @@ int ARGBMirror(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBMIRRORROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBMirrorRow = ARGBMirrorRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBMirrorRow = ARGBMirrorRow_MMI; +#if defined(HAS_ARGBMIRRORROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBMirrorRow = ARGBMirrorRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBMirrorRow = ARGBMirrorRow_LASX; } } #endif @@ -1157,6 +2445,52 @@ int ARGBMirror(const uint8_t* src_argb, return 0; } +// RGB24 mirror. 
+LIBYUV_API +int RGB24Mirror(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + int y; + void (*RGB24MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = + RGB24MirrorRow_C; + if (!src_rgb24 || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } +#if defined(HAS_RGB24MIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24MirrorRow = RGB24MirrorRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGB24MirrorRow = RGB24MirrorRow_NEON; + } + } +#endif +#if defined(HAS_RGB24MIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGB24MirrorRow = RGB24MirrorRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24MirrorRow = RGB24MirrorRow_SSSE3; + } + } +#endif + + // Mirror plane + for (y = 0; y < height; ++y) { + RGB24MirrorRow(src_rgb24, dst_rgb24, width); + src_rgb24 += src_stride_rgb24; + dst_rgb24 += dst_stride_rgb24; + } + return 0; +} + // Get a blender that optimized for the CPU and pixel count. // As there are 6 blenders to choose from, the caller should try to use // the same blend function for all pixels if possible. @@ -1180,9 +2514,9 @@ ARGBBlendRow GetARGBBlend() { ARGBBlendRow = ARGBBlendRow_MSA; } #endif -#if defined(HAS_ARGBBLENDROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBBlendRow = ARGBBlendRow_MMI; +#if defined(HAS_ARGBBLENDROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBBlendRow = ARGBBlendRow_LSX; } #endif return ARGBBlendRow; @@ -1277,14 +2611,6 @@ int BlendPlane(const uint8_t* src_y0, } } #endif -#if defined(HAS_BLENDPLANEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - BlendPlaneRow = BlendPlaneRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - BlendPlaneRow = BlendPlaneRow_MMI; - } - } -#endif for (y = 0; y < height; ++y) { BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width); @@ -1329,6 +2655,7 @@ int I420Blend(const uint8_t* src_y0, BlendPlaneRow_C; void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) = ScaleRowDown2Box_C; + if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 || !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; @@ -1361,14 +2688,6 @@ int I420Blend(const uint8_t* src_y0, } } #endif -#if defined(HAS_BLENDPLANEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - BlendPlaneRow = BlendPlaneRow_Any_MMI; - if (IS_ALIGNED(halfwidth, 8)) { - BlendPlaneRow = BlendPlaneRow_MMI; - } - } -#endif if (!IS_ALIGNED(width, 2)) { ScaleRowDown2 = ScaleRowDown2Box_Odd_C; } @@ -1405,17 +2724,6 @@ int I420Blend(const uint8_t* src_y0, } } #endif -#if defined(HAS_SCALEROWDOWN2_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleRowDown2 = ScaleRowDown2Box_Odd_MMI; - if (IS_ALIGNED(width, 2)) { - ScaleRowDown2 = ScaleRowDown2Box_Any_MMI; - if (IS_ALIGNED(halfwidth, 8)) { - ScaleRowDown2 = ScaleRowDown2Box_MMI; - } - } - } -#endif // Row buffer for intermediate alpha pixels. 
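GetARGBBlend exposes the row blender directly, but the plane-level helpers are usually what callers want: BlendPlane mixes two 8-bit planes through a per-pixel alpha plane, and I420Blend (patched here) applies the same idea per frame, box-filtering the alpha plane for the half-size chroma. A hedged sketch:

// Per-pixel cross-fade of two luma planes; dst is roughly
// (fg * a + bg * (255 - a)) / 255 for each pixel.
int CrossFadeLuma(const uint8_t* fg_y, int fg_stride_y,
                  const uint8_t* bg_y, int bg_stride_y,
                  const uint8_t* alpha, int alpha_stride,
                  uint8_t* dst_y, int dst_stride_y, int width, int height) {
  return libyuv::BlendPlane(fg_y, fg_stride_y, bg_y, bg_stride_y,
                            alpha, alpha_stride, dst_y, dst_stride_y,
                            width, height);
}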
align_buffer_64(halfalpha, halfwidth); @@ -1501,11 +2809,11 @@ int ARGBMultiply(const uint8_t* src_argb0, } } #endif -#if defined(HAS_ARGBMULTIPLYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBMultiplyRow = ARGBMultiplyRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBMultiplyRow = ARGBMultiplyRow_MMI; +#if defined(HAS_ARGBMULTIPLYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_LASX; + if (IS_ALIGNED(width, 8)) { + ARGBMultiplyRow = ARGBMultiplyRow_LASX; } } #endif @@ -1549,12 +2857,12 @@ int ARGBAdd(const uint8_t* src_argb0, height = 1; src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } -#if defined(HAS_ARGBADDROW_SSE2) && (defined(_MSC_VER) && !defined(__clang__)) +#if defined(HAS_ARGBADDROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBAddRow = ARGBAddRow_SSE2; } #endif -#if defined(HAS_ARGBADDROW_SSE2) && !(defined(_MSC_VER) && !defined(__clang__)) +#if defined(HAS_ARGBADDROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBAddRow = ARGBAddRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { @@ -1586,11 +2894,11 @@ int ARGBAdd(const uint8_t* src_argb0, } } #endif -#if defined(HAS_ARGBADDROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBAddRow = ARGBAddRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBAddRow = ARGBAddRow_MMI; +#if defined(HAS_ARGBADDROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBAddRow = ARGBAddRow_Any_LASX; + if (IS_ALIGNED(width, 8)) { + ARGBAddRow = ARGBAddRow_LASX; } } #endif @@ -1666,11 +2974,11 @@ int ARGBSubtract(const uint8_t* src_argb0, } } #endif -#if defined(HAS_ARGBSUBTRACTROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBSubtractRow = ARGBSubtractRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBSubtractRow = ARGBSubtractRow_MMI; +#if defined(HAS_ARGBSUBTRACTROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBSubtractRow = ARGBSubtractRow_Any_LASX; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_LASX; } } #endif @@ -1684,177 +2992,6 @@ int ARGBSubtract(const uint8_t* src_argb0, } return 0; } -// Convert I422 to RGBA with matrix -static int I422ToRGBAMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToRGBARow_C; - if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; - dst_stride_rgba = -dst_stride_rgba; - } -#if defined(HAS_I422TORGBAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGBARow = I422ToRGBARow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_SSSE3; - } - } -#endif -#if defined(HAS_I422TORGBAROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRGBARow = I422ToRGBARow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToRGBARow = I422ToRGBARow_AVX2; - } - } -#endif -#if defined(HAS_I422TORGBAROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGBARow = I422ToRGBARow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_NEON; - } - } -#endif -#if defined(HAS_I422TORGBAROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToRGBARow = I422ToRGBARow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); - dst_rgba += dst_stride_rgba; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Convert I422 to RGBA. -LIBYUV_API -int I422ToRGBA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - int width, - int height) { - return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgba, dst_stride_rgba, - &kYuvI601Constants, width, height); -} - -// Convert I422 to BGRA. -LIBYUV_API -int I422ToBGRA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_bgra, - int dst_stride_bgra, - int width, - int height) { - return I422ToRGBAMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_bgra, dst_stride_bgra, - &kYvuI601Constants, // Use Yvu matrix - width, height); -} - -// Convert NV12 to RGB565. -LIBYUV_API -int NV12ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { - int y; - void (*NV12ToRGB565Row)( - const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C; - if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; - dst_stride_rgb565 = -dst_stride_rgb565; - } -#if defined(HAS_NV12TORGB565ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - NV12ToRGB565Row = NV12ToRGB565Row_SSSE3; - } - } -#endif -#if defined(HAS_NV12TORGB565ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - NV12ToRGB565Row = NV12ToRGB565Row_AVX2; - } - } -#endif -#if defined(HAS_NV12TORGB565ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - NV12ToRGB565Row = NV12ToRGB565Row_NEON; - } - } -#endif -#if defined(HAS_NV12TORGB565ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - NV12ToRGB565Row = NV12ToRGB565Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width); - dst_rgb565 += dst_stride_rgb565; - src_y += src_stride_y; - if (y & 1) { - src_uv += src_stride_uv; - } - } - return 0; -} // Convert RAW to RGB24. LIBYUV_API @@ -1906,11 +3043,11 @@ int RAWToRGB24(const uint8_t* src_raw, } } #endif -#if defined(HAS_RAWTORGB24ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RAWToRGB24Row = RAWToRGB24Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - RAWToRGB24Row = RAWToRGB24Row_MMI; +#if defined(HAS_RAWTORGB24ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RAWToRGB24Row = RAWToRGB24Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RAWToRGB24Row = RAWToRGB24Row_LSX; } } #endif @@ -1931,6 +3068,10 @@ void SetPlane(uint8_t* dst_y, uint32_t value) { int y; void (*SetRow)(uint8_t * dst, uint8_t value, int width) = SetRow_C; + + if (width <= 0 || height == 0) { + return; + } if (height < 0) { height = -height; dst_y = dst_y + (height - 1) * dst_stride_y; @@ -1968,6 +3109,14 @@ void SetPlane(uint8_t* dst_y, SetRow = SetRow_MSA; } #endif +#if defined(HAS_SETROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + SetRow = SetRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + SetRow = SetRow_LSX; + } + } +#endif // Set plane for (y = 0; y < height; ++y) { @@ -1996,6 +3145,7 @@ int I420Rect(uint8_t* dst_y, uint8_t* start_y = dst_y + y * dst_stride_y + x; uint8_t* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); uint8_t* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); + if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 || y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 || value_v < 0 || value_v > 255) { @@ -2057,6 +3207,14 @@ int ARGBRect(uint8_t* dst_argb, } } #endif +#if defined(HAS_ARGBSETROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBSetRow = ARGBSetRow_Any_LSX; + if (IS_ALIGNED(width, 4)) { + ARGBSetRow = ARGBSetRow_LSX; + } + } +#endif // Set plane for (y = 0; y < height; ++y) { @@ -2135,11 +3293,11 @@ int ARGBAttenuate(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBATTENUATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBAttenuateRow = ARGBAttenuateRow_MMI; +#if defined(HAS_ARGBATTENUATEROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBAttenuateRow = ARGBAttenuateRow_LASX; } } #endif @@ -2243,9 +3401,9 @@ int ARGBGrayTo(const uint8_t* src_argb, ARGBGrayRow = ARGBGrayRow_MSA; } #endif -#if defined(HAS_ARGBGRAYROW_MMI) - 
if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { - ARGBGrayRow = ARGBGrayRow_MMI; +#if defined(HAS_ARGBGRAYROW_LASX) + if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) { + ARGBGrayRow = ARGBGrayRow_LASX; } #endif @@ -2293,9 +3451,9 @@ int ARGBGray(uint8_t* dst_argb, ARGBGrayRow = ARGBGrayRow_MSA; } #endif -#if defined(HAS_ARGBGRAYROW_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { - ARGBGrayRow = ARGBGrayRow_MMI; +#if defined(HAS_ARGBGRAYROW_LASX) + if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) { + ARGBGrayRow = ARGBGrayRow_LASX; } #endif @@ -2341,9 +3499,9 @@ int ARGBSepia(uint8_t* dst_argb, ARGBSepiaRow = ARGBSepiaRow_MSA; } #endif -#if defined(HAS_ARGBSEPIAROW_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { - ARGBSepiaRow = ARGBSepiaRow_MMI; +#if defined(HAS_ARGBSEPIAROW_LASX) + if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) { + ARGBSepiaRow = ARGBSepiaRow_LASX; } #endif @@ -2397,9 +3555,9 @@ int ARGBColorMatrix(const uint8_t* src_argb, ARGBColorMatrixRow = ARGBColorMatrixRow_MSA; } #endif -#if defined(HAS_ARGBCOLORMATRIXROW_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { - ARGBColorMatrixRow = ARGBColorMatrixRow_MMI; +#if defined(HAS_ARGBCOLORMATRIXROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_LSX; } #endif for (y = 0; y < height; ++y) { @@ -2567,6 +3725,11 @@ int ARGBQuantize(uint8_t* dst_argb, ARGBQuantizeRow = ARGBQuantizeRow_MSA; } #endif +#if defined(HAS_ARGBQUANTIZEROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) { + ARGBQuantizeRow = ARGBQuantizeRow_LSX; + } +#endif for (y = 0; y < height; ++y) { ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width); dst += dst_stride_argb; @@ -2596,11 +3759,6 @@ int ARGBComputeCumulativeSum(const uint8_t* src_argb, ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; } #endif -#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI; - } -#endif memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel. for (y = 0; y < height; ++y) { @@ -2651,7 +3809,7 @@ int ARGBBlur(const uint8_t* src_argb, if (radius > (width / 2 - 1)) { radius = width / 2 - 1; } - if (radius <= 0) { + if (radius <= 0 || height <= 1) { return -1; } #if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) @@ -2660,11 +3818,6 @@ int ARGBBlur(const uint8_t* src_argb, CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2; } #endif -#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI; - } -#endif // Compute enough CumulativeSum for first row to be blurred. After this // one row of CumulativeSum is updated at a time. 
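ARGBBlur (above) relies on a caller-supplied cumulative-sum table; the header describes it as width * (height + 1) * 16 bytes, 16-byte aligned, with a stride of width * 4 int32s. Treat that sizing as an assumption to double-check against planar_functions.h; a sketch:

#include <stdint.h>
#include <stdlib.h>

// Box-blur an ARGB image using caller-owned cumulative-sum scratch.
int BlurARGB(const uint8_t* src_argb, int src_stride_argb,
             uint8_t* dst_argb, int dst_stride_argb,
             int width, int height, int radius) {
  // One extra row of sums, 16 bytes (4 int32s) per pixel; malloc is
  // 16-byte aligned on typical 64-bit platforms.
  int32_t* cumsum = (int32_t*)malloc((size_t)width * (height + 1) * 16);
  if (!cumsum) {
    return -1;
  }
  int err = libyuv::ARGBBlur(src_argb, src_stride_argb, dst_argb,
                             dst_stride_argb, cumsum, width * 4,
                             width, height, radius);
  free(cumsum);
  return err;
}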
ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum, @@ -2771,9 +3924,9 @@ int ARGBShade(const uint8_t* src_argb, ARGBShadeRow = ARGBShadeRow_MSA; } #endif -#if defined(HAS_ARGBSHADEROW_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { - ARGBShadeRow = ARGBShadeRow_MMI; +#if defined(HAS_ARGBSHADEROW_LASX) + if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 8)) { + ARGBShadeRow = ARGBShadeRow_LASX; } #endif @@ -2847,11 +4000,11 @@ int InterpolatePlane(const uint8_t* src0, } } #endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - InterpolateRow = InterpolateRow_MMI; +#if defined(HAS_INTERPOLATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow = InterpolateRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_LSX; } } #endif @@ -2865,6 +4018,86 @@ int InterpolatePlane(const uint8_t* src0, return 0; } +// Interpolate 2 planes by specified amount (0 to 255). +LIBYUV_API +int InterpolatePlane_16(const uint16_t* src0, + int src_stride0, + const uint16_t* src1, + int src_stride1, + uint16_t* dst, + int dst_stride, + int width, + int height, + int interpolation) { + int y; + void (*InterpolateRow_16)(uint16_t * dst_ptr, const uint16_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_16_C; + if (!src0 || !src1 || !dst || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst = dst + (height - 1) * dst_stride; + dst_stride = -dst_stride; + } + // Coalesce rows. + if (src_stride0 == width && src_stride1 == width && dst_stride == width) { + width *= height; + height = 1; + src_stride0 = src_stride1 = dst_stride = 0; + } +#if defined(HAS_INTERPOLATEROW_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow_16 = InterpolateRow_16_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + InterpolateRow_16 = InterpolateRow_16_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow_16 = InterpolateRow_16_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + InterpolateRow_16 = InterpolateRow_16_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow_16 = InterpolateRow_16_Any_NEON; + if (IS_ALIGNED(width, 8)) { + InterpolateRow_16 = InterpolateRow_16_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow_16 = InterpolateRow_16_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow_16 = InterpolateRow_16_MSA; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow_16 = InterpolateRow_16_Any_LSX; + if (IS_ALIGNED(width, 32)) { + InterpolateRow_16 = InterpolateRow_16_LSX; + } + } +#endif + + for (y = 0; y < height; ++y) { + InterpolateRow_16(dst, src0, src1 - src0, width, interpolation); + src0 += src_stride0; + src1 += src_stride1; + dst += dst_stride; + } + return 0; +} + // Interpolate 2 ARGB images by specified amount (0 to 255). 
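The interpolation argument is the same across this family: 0 yields the first source, 128 an even mix, and 255 (nearly) the second. For example, a simple temporal average of two 16-bit luma planes:

// 50/50 temporal blend of two frames' luma.
int AverageLuma16(const uint16_t* prev_y, int prev_stride_y,
                  const uint16_t* next_y, int next_stride_y,
                  uint16_t* dst_y, int dst_stride_y, int width, int height) {
  return libyuv::InterpolatePlane_16(prev_y, prev_stride_y, next_y,
                                     next_stride_y, dst_y, dst_stride_y,
                                     width, height, /*interpolation=*/128);
}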
LIBYUV_API int ARGBInterpolate(const uint8_t* src_argb0, @@ -2906,10 +4139,12 @@ int I420Interpolate(const uint8_t* src0_y, int interpolation) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; + if (!src0_y || !src0_u || !src0_v || !src1_y || !src1_u || !src1_v || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } + InterpolatePlane(src0_y, src0_stride_y, src1_y, src1_stride_y, dst_y, dst_stride_y, width, height, interpolation); InterpolatePlane(src0_u, src0_stride_u, src1_u, src1_stride_u, dst_u, @@ -2978,11 +4213,11 @@ int ARGBShuffle(const uint8_t* src_bgra, } } #endif -#if defined(HAS_ARGBSHUFFLEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBShuffleRow = ARGBShuffleRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBShuffleRow = ARGBShuffleRow_MMI; +#if defined(HAS_ARGBSHUFFLEROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBShuffleRow = ARGBShuffleRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBShuffleRow = ARGBShuffleRow_LASX; } } #endif @@ -2995,6 +4230,142 @@ int ARGBShuffle(const uint8_t* src_bgra, return 0; } +// Shuffle AR64 channel order. e.g. AR64 to AB64. +LIBYUV_API +int AR64Shuffle(const uint16_t* src_ar64, + int src_stride_ar64, + uint16_t* dst_ar64, + int dst_stride_ar64, + const uint8_t* shuffler, + int width, + int height) { + int y; + void (*AR64ShuffleRow)(const uint8_t* src_ar64, uint8_t* dst_ar64, + const uint8_t* shuffler, int width) = AR64ShuffleRow_C; + if (!src_ar64 || !dst_ar64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar64 = src_ar64 + (height - 1) * src_stride_ar64; + src_stride_ar64 = -src_stride_ar64; + } + // Coalesce rows. + if (src_stride_ar64 == width * 4 && dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_ar64 = dst_stride_ar64 = 0; + } + // The ARGBShuffleRow assembly can be reused: a 16-bit channel shuffle is still a byte shuffle, run with a widened shuffler at width * 2. +#if defined(HAS_ARGBSHUFFLEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + AR64ShuffleRow = ARGBShuffleRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + AR64ShuffleRow = ARGBShuffleRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AR64ShuffleRow = ARGBShuffleRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + AR64ShuffleRow = ARGBShuffleRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AR64ShuffleRow = ARGBShuffleRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + AR64ShuffleRow = ARGBShuffleRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + AR64ShuffleRow((uint8_t*)(src_ar64), (uint8_t*)(dst_ar64), shuffler, + width * 2); + src_ar64 += src_stride_ar64; + dst_ar64 += dst_stride_ar64; + } + return 0; +} + +// Gauss blur a float plane using a 5x5 Gaussian filter with +// coefficients 1, 4, 6, 4, 1. The filter is separable: a vertical pass +// (GaussCol_F32) followed by a horizontal pass (GaussRow_F32) applies +// the full 5x5 kernel. +// Each destination pixel is a blur of the 5x5 +// pixels from the source. +// Source edges are clamped. +// The edge is 2 pixels on each side, and the interior is a multiple of 4. +LIBYUV_API +int GaussPlane_F32(const float* src, + int src_stride, + float* dst, + int dst_stride, + int width, + int height) { + int y; + void (*GaussCol_F32)(const float* src0, const float* src1, const float* src2, + const float* src3, const float* src4, float* dst, + int width) = GaussCol_F32_C; + void (*GaussRow_F32)(const float* src, float* dst, int width) = + GaussRow_F32_C; + if (!src || !dst || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image.
+ if (height < 0) { + height = -height; + src = src + (height - 1) * src_stride; + src_stride = -src_stride; + } + +#if defined(HAS_GAUSSCOL_F32_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + GaussCol_F32 = GaussCol_F32_NEON; + } +#endif +#if defined(HAS_GAUSSROW_F32_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + GaussRow_F32 = GaussRow_F32_NEON; + } +#endif + { + // 2 pixels on each side, but aligned out to 16 bytes. + align_buffer_64(rowbuf, (4 + width + 4) * 4); + memset(rowbuf, 0, 16); + memset(rowbuf + (4 + width) * 4, 0, 16); + float* row = (float*)(rowbuf + 16); + const float* src0 = src; + const float* src1 = src; + const float* src2 = src; + const float* src3 = src2 + ((height > 1) ? src_stride : 0); + const float* src4 = src3 + ((height > 2) ? src_stride : 0); + + for (y = 0; y < height; ++y) { + GaussCol_F32(src0, src1, src2, src3, src4, row, width); + + // Extrude edge by 2 floats + row[-2] = row[-1] = row[0]; + row[width + 1] = row[width] = row[width - 1]; + + GaussRow_F32(row - 2, dst, width); + + src0 = src1; + src1 = src2; + src2 = src3; + src3 = src4; + if ((y + 2) < (height - 1)) { + src4 += src_stride; + } + dst += dst_stride; + } + free_aligned_buffer_64(rowbuf); + } + return 0; +} + // Sobel ARGB effect. static int ARGBSobelize(const uint8_t* src_argb, int src_stride_argb, @@ -3044,7 +4415,7 @@ static int ARGBSobelize(const uint8_t* src_argb, #if defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYJRow = ARGBToYJRow_NEON; } } @@ -3057,11 +4428,19 @@ static int ARGBSobelize(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYJRow = ARGBToYJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_MMI; +#if defined(HAS_ARGBTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYJRow = ARGBToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYJRow = ARGBToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_LASX; } } #endif @@ -3081,11 +4460,6 @@ static int ARGBSobelize(const uint8_t* src_argb, SobelYRow = SobelYRow_MSA; } #endif -#if defined(HAS_SOBELYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SobelYRow = SobelYRow_MMI; - } -#endif #if defined(HAS_SOBELXROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelXRow = SobelXRow_SSE2; @@ -3101,11 +4475,6 @@ static int ARGBSobelize(const uint8_t* src_argb, SobelXRow = SobelXRow_MSA; } #endif -#if defined(HAS_SOBELXROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SobelXRow = SobelXRow_MMI; - } -#endif { // 3 rows with edges before/after. 
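kRowSize below rounds the padded row width up to a multiple of 32 with the usual power-of-two mask idiom, which keeps the edge-padded rows aligned for the SIMD row functions. The idiom in isolation (illustrative helper, not part of the library):

// Round x up to the next multiple of a, where a is a power of two.
static int RoundUpPow2(int x, int a) {
  return (x + a - 1) & ~(a - 1);
}
// kRowSize = RoundUpPow2(width + kEdge, 32) matches (width + kEdge + 31) & ~31.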
const int kRowSize = (width + kEdge + 31) & ~31; @@ -3188,11 +4557,11 @@ int ARGBSobel(const uint8_t* src_argb, } } #endif -#if defined(HAS_SOBELROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SobelRow = SobelRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - SobelRow = SobelRow_MMI; +#if defined(HAS_SOBELROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + SobelRow = SobelRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + SobelRow = SobelRow_LSX; } } #endif @@ -3234,11 +4603,11 @@ int ARGBSobelToPlane(const uint8_t* src_argb, } } #endif -#if defined(HAS_SOBELTOPLANEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SobelToPlaneRow = SobelToPlaneRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - SobelToPlaneRow = SobelToPlaneRow_MMI; +#if defined(HAS_SOBELTOPLANEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + SobelToPlaneRow = SobelToPlaneRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + SobelToPlaneRow = SobelToPlaneRow_LSX; } } #endif @@ -3281,11 +4650,11 @@ int ARGBSobelXY(const uint8_t* src_argb, } } #endif -#if defined(HAS_SOBELXYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SobelXYRow = SobelXYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - SobelXYRow = SobelXYRow_MMI; +#if defined(HAS_SOBELXYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + SobelXYRow = SobelXYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + SobelXYRow = SobelXYRow_LSX; } } #endif @@ -3412,6 +4781,14 @@ int HalfFloatPlane(const uint16_t* src_y, } } #endif +#if defined(HAS_HALFFLOATROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + HalfFloatRow = HalfFloatRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + HalfFloatRow = HalfFloatRow_LSX; + } + } +#endif for (y = 0; y < height; ++y) { HalfFloatRow(src_y, dst_y, scale, width); @@ -3526,14 +4903,6 @@ int ARGBCopyAlpha(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBCOPYALPHAROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBCopyAlphaRow = ARGBCopyAlphaRow_MMI; - } - } -#endif for (y = 0; y < height; ++y) { ARGBCopyAlphaRow(src_argb, dst_argb, width); @@ -3592,10 +4961,10 @@ int ARGBExtractAlpha(const uint8_t* src_argb, : ARGBExtractAlphaRow_Any_MSA; } #endif -#if defined(HAS_ARGBEXTRACTALPHAROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_MMI - : ARGBExtractAlphaRow_Any_MMI; +#if defined(HAS_ARGBEXTRACTALPHAROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_LSX + : ARGBExtractAlphaRow_Any_LSX; } #endif @@ -3649,14 +5018,6 @@ int ARGBCopyYToAlpha(const uint8_t* src_y, } } #endif -#if defined(HAS_ARGBCOPYYTOALPHAROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_MMI; - } - } -#endif for (y = 0; y < height; ++y) { ARGBCopyYToAlphaRow(src_y, dst_argb, width); @@ -3685,9 +5046,11 @@ int YUY2ToNV12(const uint8_t* src_yuy2, void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; + if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } + // Negative height means invert the image. 
if (height < 0) { height = -height; @@ -3726,11 +5089,11 @@ int YUY2ToNV12(const uint8_t* src_yuy2, } } #endif -#if defined(HAS_SPLITUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SplitUVRow = SplitUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - SplitUVRow = SplitUVRow_MMI; +#if defined(HAS_SPLITUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + SplitUVRow = SplitUVRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_LSX; } } #endif @@ -3766,11 +5129,11 @@ int YUY2ToNV12(const uint8_t* src_yuy2, } } #endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - InterpolateRow = InterpolateRow_MMI; +#if defined(HAS_INTERPOLATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow = InterpolateRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_LSX; } } #endif @@ -3817,9 +5180,11 @@ int UYVYToNV12(const uint8_t* src_uyvy, void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; + if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } + // Negative height means invert the image. if (height < 0) { height = -height; @@ -3858,11 +5223,11 @@ int UYVYToNV12(const uint8_t* src_uyvy, } } #endif -#if defined(HAS_SPLITUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SplitUVRow = SplitUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - SplitUVRow = SplitUVRow_MMI; +#if defined(HAS_SPLITUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + SplitUVRow = SplitUVRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_LSX; } } #endif @@ -3898,11 +5263,11 @@ int UYVYToNV12(const uint8_t* src_uyvy, } } #endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - InterpolateRow = InterpolateRow_MMI; +#if defined(HAS_INTERPOLATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow = InterpolateRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_LSX; } } #endif @@ -3933,6 +5298,56 @@ int UYVYToNV12(const uint8_t* src_uyvy, return 0; } +// width and height are src size allowing odd size handling. +LIBYUV_API +void HalfMergeUVPlane(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + void (*HalfMergeUVRow)(const uint8_t* src_u, int src_stride_u, + const uint8_t* src_v, int src_stride_v, + uint8_t* dst_uv, int width) = HalfMergeUVRow_C; + + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } +#if defined(HAS_HALFMERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { + HalfMergeUVRow = HalfMergeUVRow_NEON; + } +#endif +#if defined(HAS_HALFMERGEUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { + HalfMergeUVRow = HalfMergeUVRow_SSSE3; + } +#endif +#if defined(HAS_HALFMERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { + HalfMergeUVRow = HalfMergeUVRow_AVX2; + } +#endif + for (y = 0; y < height - 1; y += 2) { + // Merge a row of U and V into a row of UV. 
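HalfMergeUVRow, invoked just below, consumes two full-resolution rows each of U and V and emits one half-resolution interleaved UV row. A scalar model of one output pair (a sketch of what HalfMergeUVRow_C computes; exact rounding per the real implementation):

#include <stdint.h>

// Rounded average of a 2x2 block of U and of V, interleaved as NV12-style UV.
static void HalfMergeUVPixel(const uint8_t* u0, const uint8_t* u1,
                             const uint8_t* v0, const uint8_t* v1,
                             uint8_t* dst_uv) {
  dst_uv[0] = (uint8_t)((u0[0] + u0[1] + u1[0] + u1[1] + 2) >> 2);
  dst_uv[1] = (uint8_t)((v0[0] + v0[1] + v1[0] + v1[1] + 2) >> 2);
}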
+ HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width); + src_u += src_stride_u * 2; + src_v += src_stride_v * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + HalfMergeUVRow(src_u, 0, src_v, 0, dst_uv, width); + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/rotate.cc b/files/source/rotate.cc index d414186a..f1e83cbd 100644 --- a/files/source/rotate.cc +++ b/files/source/rotate.cc @@ -29,16 +29,20 @@ void TransposePlane(const uint8_t* src, int width, int height) { int i = height; -#if defined(HAS_TRANSPOSEWX16_MSA) +#if defined(HAS_TRANSPOSEWX16_MSA) || defined(HAS_TRANSPOSEWX16_LSX) void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width) = TransposeWx16_C; #else void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width) = TransposeWx8_C; #endif + #if defined(HAS_TRANSPOSEWX8_NEON) if (TestCpuFlag(kCpuHasNEON)) { - TransposeWx8 = TransposeWx8_NEON; + TransposeWx8 = TransposeWx8_Any_NEON; + if (IS_ALIGNED(width, 8)) { + TransposeWx8 = TransposeWx8_NEON; + } } #endif #if defined(HAS_TRANSPOSEWX8_SSSE3) @@ -49,11 +53,6 @@ void TransposePlane(const uint8_t* src, } } #endif -#if defined(HAS_TRANSPOSEWX8_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - TransposeWx8 = TransposeWx8_MMI; - } -#endif #if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { TransposeWx8 = TransposeWx8_Fast_Any_SSSE3; @@ -70,8 +69,16 @@ void TransposePlane(const uint8_t* src, } } #endif +#if defined(HAS_TRANSPOSEWX16_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + TransposeWx16 = TransposeWx16_Any_LSX; + if (IS_ALIGNED(width, 16)) { + TransposeWx16 = TransposeWx16_LSX; + } + } +#endif -#if defined(HAS_TRANSPOSEWX16_MSA) +#if defined(HAS_TRANSPOSEWX16_MSA) || defined(HAS_TRANSPOSEWX16_LSX) // Work across the source in 16x16 tiles while (i >= 16) { TransposeWx16(src, src_stride, dst, dst_stride, width); @@ -142,7 +149,7 @@ void RotatePlane180(const uint8_t* src, #if defined(HAS_MIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MirrorRow = MirrorRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { + if (IS_ALIGNED(width, 32)) { MirrorRow = MirrorRow_NEON; } } @@ -171,11 +178,11 @@ void RotatePlane180(const uint8_t* src, } } #endif -#if defined(HAS_MIRRORROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MirrorRow = MirrorRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - MirrorRow = MirrorRow_MMI; +#if defined(HAS_MIRRORROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + MirrorRow = MirrorRow_Any_LASX; + if (IS_ALIGNED(width, 64)) { + MirrorRow = MirrorRow_LASX; } } #endif @@ -199,19 +206,14 @@ void RotatePlane180(const uint8_t* src, CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif -#if defined(HAS_COPYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - CopyRow = IS_ALIGNED(width, 8) ? CopyRow_MMI : CopyRow_Any_MMI; - } -#endif // Odd height will harmlessly mirror the middle row twice. 
for (y = 0; y < half_height; ++y) { - MirrorRow(src, row, width); // Mirror first row into a buffer - src += src_stride; + CopyRow(src, row, width); // Copy first row into buffer MirrorRow(src_bot, dst, width); // Mirror last row into first row + MirrorRow(row, dst_bot, width); // Mirror buffer into last row + src += src_stride; dst += dst_stride; - CopyRow(row, dst_bot, width); // Copy first mirrored row into last src_bot -= src_stride; dst_bot -= dst_stride; } @@ -219,24 +221,44 @@ } LIBYUV_API -void TransposeUV(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { +void SplitTransposeUV(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { int i = height; #if defined(HAS_TRANSPOSEUVWX16_MSA) void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width) = TransposeUVWx16_C; +#elif defined(HAS_TRANSPOSEUVWX16_LSX) + void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a, + int dst_stride_a, uint8_t* dst_b, int dst_stride_b, + int width) = TransposeUVWx16_C; #else void (*TransposeUVWx8)(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width) = TransposeUVWx8_C; #endif + +#if defined(HAS_TRANSPOSEUVWX16_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + TransposeUVWx16 = TransposeUVWx16_Any_MSA; + if (IS_ALIGNED(width, 8)) { + TransposeUVWx16 = TransposeUVWx16_MSA; + } + } +#elif defined(HAS_TRANSPOSEUVWX16_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + TransposeUVWx16 = TransposeUVWx16_Any_LSX; + if (IS_ALIGNED(width, 8)) { + TransposeUVWx16 = TransposeUVWx16_LSX; + } + } +#else #if defined(HAS_TRANSPOSEUVWX8_NEON) if (TestCpuFlag(kCpuHasNEON)) { TransposeUVWx8 = TransposeUVWx8_NEON; @@ -250,22 +272,7 @@ void TransposeUV(const uint8_t* src, } } #endif -#if defined(HAS_TRANSPOSEUVWX8_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - TransposeUVWx8 = TransposeUVWx8_Any_MMI; - if (IS_ALIGNED(width, 4)) { - TransposeUVWx8 = TransposeUVWx8_MMI; - } - } -#endif -#if defined(HAS_TRANSPOSEUVWX16_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - TransposeUVWx16 = TransposeUVWx16_Any_MSA; - if (IS_ALIGNED(width, 8)) { - TransposeUVWx16 = TransposeUVWx16_MSA; - } - } -#endif +#endif /* defined(HAS_TRANSPOSEUVWX16_MSA) */ #if defined(HAS_TRANSPOSEUVWX16_MSA) // Work through the source in 16-row tiles. while (i >= 16) { TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width); src += 16 * src_stride; // Go down 16 rows. dst_a += 16; // Move over 16 columns. dst_b += 16; // Move over 16 columns. i -= 16; } +#elif defined(HAS_TRANSPOSEUVWX16_LSX) + // Work through the source in 16-row tiles. + while (i >= 16) { + TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width); + src += 16 * src_stride; // Go down 16 rows. + dst_a += 16; // Move over 16 columns. + dst_b += 16; // Move over 16 columns. + i -= 16; + } #else // Work through the source in 8x8 tiles.
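Every tile loop in this function performs the same underlying operation; a plain C reference for one tile (hypothetical W x H helper; the real TransposeUVWx8_C and TransposeUVWx16_C fix the row count at 8 or 16):

#include <stdint.h>

// Transpose an interleaved-UV tile while splitting it: U bytes land in
// dst_a and V bytes in dst_b, with source rows becoming destination columns.
static void TransposeUVWxHRef(const uint8_t* src, int src_stride,
                              uint8_t* dst_a, int dst_stride_a,
                              uint8_t* dst_b, int dst_stride_b,
                              int width, int height) {
  for (int x = 0; x < width; ++x) {     // UV pairs per source row.
    for (int y = 0; y < height; ++y) {  // Rows of the source tile.
      dst_a[x * dst_stride_a + y] = src[y * src_stride + x * 2 + 0];
      dst_b[x * dst_stride_b + y] = src[y * src_stride + x * 2 + 1];
    }
  }
}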
while (i >= 8) { @@ -296,70 +313,70 @@ void TransposeUV(const uint8_t* src, } LIBYUV_API -void RotateUV90(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { +void SplitRotateUV90(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { src += src_stride * (height - 1); src_stride = -src_stride; - TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, - height); + SplitTransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width, height); } LIBYUV_API -void RotateUV270(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { +void SplitRotateUV270(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { dst_a += dst_stride_a * (width - 1); dst_b += dst_stride_b * (width - 1); dst_stride_a = -dst_stride_a; dst_stride_b = -dst_stride_b; - TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, - height); + SplitTransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width, height); } // Rotate 180 is a horizontal and vertical flip. LIBYUV_API -void RotateUV180(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { +void SplitRotateUV180(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { int i; - void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v, - int width) = MirrorUVRow_C; -#if defined(HAS_MIRRORUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - MirrorUVRow = MirrorUVRow_NEON; + void (*MirrorSplitUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v, + int width) = MirrorSplitUVRow_C; +#if defined(HAS_MIRRORSPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { + MirrorSplitUVRow = MirrorSplitUVRow_NEON; } #endif -#if defined(HAS_MIRRORUVROW_SSSE3) +#if defined(HAS_MIRRORSPLITUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { - MirrorUVRow = MirrorUVRow_SSSE3; + MirrorSplitUVRow = MirrorSplitUVRow_SSSE3; } #endif -#if defined(HAS_MIRRORUVROW_MSA) +#if defined(HAS_MIRRORSPLITUVROW_MSA) if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) { - MirrorUVRow = MirrorUVRow_MSA; + MirrorSplitUVRow = MirrorSplitUVRow_MSA; } #endif -#if defined(HAS_MIRRORUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 8)) { - MirrorUVRow = MirrorUVRow_MMI; +#if defined(HAS_MIRRORSPLITUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 32)) { + MirrorSplitUVRow = MirrorSplitUVRow_LSX; } #endif @@ -367,13 +384,59 @@ void RotateUV180(const uint8_t* src, dst_b += dst_stride_b * (height - 1); for (i = 0; i < height; ++i) { - MirrorUVRow(src, dst_a, dst_b, width); + MirrorSplitUVRow(src, dst_a, dst_b, width); src += src_stride; dst_a -= dst_stride_a; dst_b -= dst_stride_b; } } +// Rotate UV and split into planar. 
+// width and height are expected to be half size for NV12 +LIBYUV_API +int SplitRotateUV(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + if (!src_uv || width <= 0 || height == 0 || !dst_u || !dst_v) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uv = src_uv + (height - 1) * src_stride_uv; + src_stride_uv = -src_stride_uv; + } + + switch (mode) { + case kRotate0: + SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, width, height); + return 0; + case kRotate90: + SplitRotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, width, height); + return 0; + case kRotate270: + SplitRotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, width, height); + return 0; + case kRotate180: + SplitRotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, width, height); + return 0; + default: + break; + } + return -1; +} + LIBYUV_API int RotatePlane(const uint8_t* src, int src_stride, @@ -431,8 +494,8 @@ int I420Rotate(const uint8_t* src_y, enum RotationMode mode) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || - !dst_u || !dst_v) { + if ((!src_y && dst_y) || !src_u || !src_v || width <= 0 || height == 0 || + !dst_y || !dst_u || !dst_v) { return -1; } @@ -482,6 +545,80 @@ int I420Rotate(const uint8_t* src_y, } LIBYUV_API +int I422Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // copy frame + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height); + return 0; + case kRotate90: + // Rotation flips the chroma subsampling axis, so we must rotate and rescale; plane Y is used as temporary storage. + RotatePlane90(src_u, src_stride_u, dst_y, height, halfwidth, height); + ScalePlane(dst_y, height, height, halfwidth, dst_u, halfheight, + halfheight, width, kFilterBilinear); + RotatePlane90(src_v, src_stride_v, dst_y, height, halfwidth, height); + ScalePlane(dst_y, height, height, halfwidth, dst_v, halfheight, + halfheight, width, kFilterLinear); + RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; + case kRotate270: + // Rotation flips the chroma subsampling axis, so we must rotate and rescale; plane Y is used as temporary storage.
+ RotatePlane270(src_u, src_stride_u, dst_y, height, halfwidth, height); + ScalePlane(dst_y, height, height, halfwidth, dst_u, halfheight, + halfheight, width, kFilterBilinear); + RotatePlane270(src_v, src_stride_v, dst_y, height, halfwidth, height); + ScalePlane(dst_y, height, height, halfwidth, dst_v, halfheight, + halfheight, width, kFilterLinear); + RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + + return 0; + case kRotate180: + RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + height); + RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + height); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API int I444Rotate(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, @@ -496,7 +633,7 @@ int I444Rotate(const uint8_t* src_y, int dst_stride_v, int width, int height, - enum libyuv::RotationMode mode) { + enum RotationMode mode) { if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || !dst_u || !dst_v) { return -1; @@ -514,23 +651,23 @@ int I444Rotate(const uint8_t* src_y, } switch (mode) { - case libyuv::kRotate0: + case kRotate0: // copy frame CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height); CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height); return 0; - case libyuv::kRotate90: + case kRotate90: RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, width, height); RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, width, height); return 0; - case libyuv::kRotate270: + case kRotate270: RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, width, height); RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, width, height); return 0; - case libyuv::kRotate180: + case kRotate180: RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, width, height); RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, width, height); @@ -580,18 +717,18 @@ int NV12ToI420Rotate(const uint8_t* src_y, width, height); case kRotate90: RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, - dst_stride_v, halfwidth, halfheight); + SplitRotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); return 0; case kRotate270: RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, - dst_stride_v, halfwidth, halfheight); + SplitRotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); return 0; case kRotate180: RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, - dst_stride_v, halfwidth, halfheight); + SplitRotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); return 0; default: break; @@ -599,6 +736,98 @@ int NV12ToI420Rotate(const uint8_t* src_y, return -1; } +static void SplitPixels(const uint8_t* src_u, + int src_pixel_stride_uv, + uint8_t* dst_u, + int width) { + int i; + for (i = 0; i < width; ++i) { + *dst_u = *src_u; + 
++dst_u; + src_u += src_pixel_stride_uv; + } +} + +// Convert Android420 to I420 with Rotate +LIBYUV_API +int Android420ToI420Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode rotation) { + int y; + const ptrdiff_t vu_off = src_v - src_u; + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + RotatePlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height, + rotation); + } + + // Copy UV planes - I420 + if (src_pixel_stride_uv == 1) { + RotatePlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight, + rotation); + RotatePlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight, + rotation); + return 0; + } + // Split UV planes - NV21 + if (src_pixel_stride_uv == 2 && vu_off == -1 && + src_stride_u == src_stride_v) { + SplitRotateUV(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u, + halfwidth, halfheight, rotation); + return 0; + } + // Split UV planes - NV12 + if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) { + SplitRotateUV(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v, + halfwidth, halfheight, rotation); + return 0; + } + + if (rotation == 0) { + for (y = 0; y < halfheight; ++y) { + SplitPixels(src_u, src_pixel_stride_uv, dst_u, halfwidth); + SplitPixels(src_v, src_pixel_stride_uv, dst_v, halfwidth); + src_u += src_stride_u; + src_v += src_stride_v; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; + } + // unsupported type and/or rotation. 
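The chain of checks above classifies the Android YUV_420_888 layout purely from pointer arithmetic: pixel stride 1 means planar I420 chroma, and pixel stride 2 with the U and V pointers one byte apart means NV12 or NV21 depending on the sign, provided the two chroma strides match. A sketch of that classification (hypothetical helper and enum names mirroring the checks above):

#include <stddef.h>
#include <stdint.h>

enum Android420Layout { kLayoutI420, kLayoutNV12, kLayoutNV21, kLayoutOther };

static enum Android420Layout ClassifyAndroid420(const uint8_t* src_u,
                                                const uint8_t* src_v,
                                                int pixel_stride_uv) {
  ptrdiff_t vu_off = src_v - src_u;  // Distance between the chroma pointers.
  if (pixel_stride_uv == 1) return kLayoutI420;                  // U,U,... V,V,...
  if (pixel_stride_uv == 2 && vu_off == 1) return kLayoutNV12;   // U,V,U,V,...
  if (pixel_stride_uv == 2 && vu_off == -1) return kLayoutNV21;  // V,U,V,U,...
  return kLayoutOther;  // Falls through to the per-pixel SplitPixels path.
}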
+ return -1; +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/rotate_any.cc b/files/source/rotate_any.cc index b3baf084..88ca7876 100644 --- a/files/source/rotate_any.cc +++ b/files/source/rotate_any.cc @@ -35,15 +35,15 @@ TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7) #ifdef HAS_TRANSPOSEWX8_SSSE3 TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7) #endif -#ifdef HAS_TRANSPOSEWX8_MMI -TANY(TransposeWx8_Any_MMI, TransposeWx8_MMI, 7) -#endif #ifdef HAS_TRANSPOSEWX8_FAST_SSSE3 TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15) #endif #ifdef HAS_TRANSPOSEWX16_MSA TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15) #endif +#ifdef HAS_TRANSPOSEWX16_LSX +TANY(TransposeWx16_Any_LSX, TransposeWx16_LSX, 15) +#endif #undef TANY #define TUVANY(NAMEANY, TPOS_SIMD, MASK) \ @@ -65,12 +65,12 @@ TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7) #ifdef HAS_TRANSPOSEUVWX8_SSE2 TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7) #endif -#ifdef HAS_TRANSPOSEUVWX8_MMI -TUVANY(TransposeUVWx8_Any_MMI, TransposeUVWx8_MMI, 7) -#endif #ifdef HAS_TRANSPOSEUVWX16_MSA TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7) #endif +#ifdef HAS_TRANSPOSEUVWX16_LSX +TUVANY(TransposeUVWx16_Any_LSX, TransposeUVWx16_LSX, 7) +#endif #undef TUVANY #ifdef __cplusplus diff --git a/files/source/rotate_argb.cc b/files/source/rotate_argb.cc index a93fd55f..539cf98d 100644 --- a/files/source/rotate_argb.cc +++ b/files/source/rotate_argb.cc @@ -21,17 +21,21 @@ namespace libyuv { extern "C" { #endif -static void ARGBTranspose(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { +static int ARGBTranspose(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int i; int src_pixel_step = src_stride_argb >> 2; void (*ScaleARGBRowDownEven)( const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step, uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C; + // Check stride is a multiple of 4. + if (src_stride_argb & 3) { + return -1; + } #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2; @@ -56,11 +60,11 @@ static void ARGBTranspose(const uint8_t* src_argb, } } #endif -#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MMI; +#if defined(HAS_SCALEARGBROWDOWNEVEN_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_LSX; if (IS_ALIGNED(height, 4)) { // Width of dest. - ScaleARGBRowDownEven = ScaleARGBRowDownEven_MMI; + ScaleARGBRowDownEven = ScaleARGBRowDownEven_LSX; } } #endif @@ -70,44 +74,45 @@ static void ARGBTranspose(const uint8_t* src_argb, dst_argb += dst_stride_argb; src_argb += 4; } + return 0; } -void ARGBRotate90(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { +static int ARGBRotate90(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { // Rotate by 90 is a ARGBTranspose with the source read // from bottom to top. So set the source pointer to the end // of the buffer and flip the sign of the source stride. 
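The comment above is the whole trick: in index form, a 90-degree clockwise rotation is the transpose of a vertically flipped source, which is exactly what negating the source stride achieves. A self-contained model (hypothetical helper; contiguous w x h buffers assumed):

#include <stdint.h>

// dst is h wide and w tall after the rotation.
static void Rotate90Ref(const uint8_t* src, int w, int h, uint8_t* dst) {
  for (int y = 0; y < w; ++y) {
    for (int x = 0; x < h; ++x) {
      dst[y * h + x] = src[(h - 1 - x) * w + y];  // Transpose of flipped rows.
    }
  }
}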
src_argb += src_stride_argb * (height - 1); src_stride_argb = -src_stride_argb; - ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, - height); + return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height); } -void ARGBRotate270(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { +static int ARGBRotate270(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { // Rotate by 270 is a ARGBTranspose with the destination written // from bottom to top. So set the destination pointer to the end // of the buffer and flip the sign of the destination stride. dst_argb += dst_stride_argb * (width - 1); dst_stride_argb = -dst_stride_argb; - ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, - height); + return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height); } -void ARGBRotate180(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { +static int ARGBRotate180(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { // Swap first and last row and mirror the content. Uses a temporary row. align_buffer_64(row, width * 4); const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1); @@ -121,7 +126,7 @@ void ARGBRotate180(const uint8_t* src_argb, #if defined(HAS_ARGBMIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBMirrorRow = ARGBMirrorRow_Any_NEON; - if (IS_ALIGNED(width, 4)) { + if (IS_ALIGNED(width, 8)) { ARGBMirrorRow = ARGBMirrorRow_NEON; } } @@ -150,11 +155,11 @@ void ARGBRotate180(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBMIRRORROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBMirrorRow = ARGBMirrorRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBMirrorRow = ARGBMirrorRow_MMI; +#if defined(HAS_ARGBMIRRORROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBMirrorRow = ARGBMirrorRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBMirrorRow = ARGBMirrorRow_LASX; } } #endif @@ -190,6 +195,7 @@ void ARGBRotate180(const uint8_t* src_argb, dst_bot -= dst_stride_argb; } free_aligned_buffer_64(row); + return 0; } LIBYUV_API @@ -217,17 +223,14 @@ int ARGBRotate(const uint8_t* src_argb, return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height); case kRotate90: - ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, - height); - return 0; + return ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height); case kRotate270: - ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, - height); - return 0; + return ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height); case kRotate180: - ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, - height); - return 0; + return ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height); default: break; } diff --git a/files/source/rotate_dspr2.cc b/files/source/rotate_dspr2.cc deleted file mode 100644 index 5d2338de..00000000 --- a/files/source/rotate_dspr2.cc +++ /dev/null @@ -1,475 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate_row.h" -#include "libyuv/row.h" - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \ - (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32) - -void TransposeWx8_DSPR2(const uint8* src, - int src_stride, - uint8* dst, - int dst_stride, - int width) { - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 - "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 - "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 - "addu $t3, $t2, %[src_stride] \n" - "addu $t5, $t4, %[src_stride] \n" - "addu $t6, $t2, $t4 \n" - "andi $t0, %[dst], 0x3 \n" - "andi $t1, %[dst_stride], 0x3 \n" - "or $t0, $t0, $t1 \n" - "bnez $t0, 11f \n" - " subu $t7, $t9, %[src_stride] \n" - // dst + dst_stride word aligned - "1: \n" - "lbu $t0, 0(%[src]) \n" - "lbux $t1, %[src_stride](%[src]) \n" - "lbux $t8, $t2(%[src]) \n" - "lbux $t9, $t3(%[src]) \n" - "sll $t1, $t1, 16 \n" - "sll $t9, $t9, 16 \n" - "or $t0, $t0, $t1 \n" - "or $t8, $t8, $t9 \n" - "precr.qb.ph $s0, $t8, $t0 \n" - "lbux $t0, $t4(%[src]) \n" - "lbux $t1, $t5(%[src]) \n" - "lbux $t8, $t6(%[src]) \n" - "lbux $t9, $t7(%[src]) \n" - "sll $t1, $t1, 16 \n" - "sll $t9, $t9, 16 \n" - "or $t0, $t0, $t1 \n" - "or $t8, $t8, $t9 \n" - "precr.qb.ph $s1, $t8, $t0 \n" - "sw $s0, 0(%[dst]) \n" - "addiu %[width], -1 \n" - "addiu %[src], 1 \n" - "sw $s1, 4(%[dst]) \n" - "bnez %[width], 1b \n" - " addu %[dst], %[dst], %[dst_stride] \n" - "b 2f \n" - // dst + dst_stride unaligned - "11: \n" - "lbu $t0, 0(%[src]) \n" - "lbux $t1, %[src_stride](%[src]) \n" - "lbux $t8, $t2(%[src]) \n" - "lbux $t9, $t3(%[src]) \n" - "sll $t1, $t1, 16 \n" - "sll $t9, $t9, 16 \n" - "or $t0, $t0, $t1 \n" - "or $t8, $t8, $t9 \n" - "precr.qb.ph $s0, $t8, $t0 \n" - "lbux $t0, $t4(%[src]) \n" - "lbux $t1, $t5(%[src]) \n" - "lbux $t8, $t6(%[src]) \n" - "lbux $t9, $t7(%[src]) \n" - "sll $t1, $t1, 16 \n" - "sll $t9, $t9, 16 \n" - "or $t0, $t0, $t1 \n" - "or $t8, $t8, $t9 \n" - "precr.qb.ph $s1, $t8, $t0 \n" - "swr $s0, 0(%[dst]) \n" - "swl $s0, 3(%[dst]) \n" - "addiu %[width], -1 \n" - "addiu %[src], 1 \n" - "swr $s1, 4(%[dst]) \n" - "swl $s1, 7(%[dst]) \n" - "bnez %[width], 11b \n" - "addu %[dst], %[dst], %[dst_stride] \n" - "2: \n" - ".set pop \n" - : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width) - : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride) - : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1"); -} - -void TransposeWx8_Fast_DSPR2(const uint8* src, - int src_stride, - uint8* dst, - int dst_stride, - int width) { - __asm__ __volatile__( - ".set noat \n" - ".set push \n" - ".set noreorder \n" - "beqz %[width], 2f \n" - " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 - "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 - "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 - "addu $t3, $t2, %[src_stride] \n" - "addu $t5, $t4, %[src_stride] \n" - "addu $t6, $t2, $t4 \n" - - "srl $AT, %[width], 0x2 \n" - "andi $t0, %[dst], 0x3 \n" - "andi $t1, %[dst_stride], 0x3 \n" - "or $t0, $t0, $t1 \n" - "bnez $t0, 11f \n" - " subu $t7, $t9, %[src_stride] \n" - // dst + dst_stride word aligned - "1: \n" - "lw $t0, 0(%[src]) \n" - "lwx $t1, %[src_stride](%[src]) \n" - "lwx $t8, $t2(%[src]) \n" 
- "lwx $t9, $t3(%[src]) \n" - - // t0 = | 30 | 20 | 10 | 00 | - // t1 = | 31 | 21 | 11 | 01 | - // t8 = | 32 | 22 | 12 | 02 | - // t9 = | 33 | 23 | 13 | 03 | - - "precr.qb.ph $s0, $t1, $t0 \n" - "precr.qb.ph $s1, $t9, $t8 \n" - "precrq.qb.ph $s2, $t1, $t0 \n" - "precrq.qb.ph $s3, $t9, $t8 \n" - - // s0 = | 21 | 01 | 20 | 00 | - // s1 = | 23 | 03 | 22 | 02 | - // s2 = | 31 | 11 | 30 | 10 | - // s3 = | 33 | 13 | 32 | 12 | - - "precr.qb.ph $s4, $s1, $s0 \n" - "precrq.qb.ph $s5, $s1, $s0 \n" - "precr.qb.ph $s6, $s3, $s2 \n" - "precrq.qb.ph $s7, $s3, $s2 \n" - - // s4 = | 03 | 02 | 01 | 00 | - // s5 = | 23 | 22 | 21 | 20 | - // s6 = | 13 | 12 | 11 | 10 | - // s7 = | 33 | 32 | 31 | 30 | - - "lwx $t0, $t4(%[src]) \n" - "lwx $t1, $t5(%[src]) \n" - "lwx $t8, $t6(%[src]) \n" - "lwx $t9, $t7(%[src]) \n" - - // t0 = | 34 | 24 | 14 | 04 | - // t1 = | 35 | 25 | 15 | 05 | - // t8 = | 36 | 26 | 16 | 06 | - // t9 = | 37 | 27 | 17 | 07 | - - "precr.qb.ph $s0, $t1, $t0 \n" - "precr.qb.ph $s1, $t9, $t8 \n" - "precrq.qb.ph $s2, $t1, $t0 \n" - "precrq.qb.ph $s3, $t9, $t8 \n" - - // s0 = | 25 | 05 | 24 | 04 | - // s1 = | 27 | 07 | 26 | 06 | - // s2 = | 35 | 15 | 34 | 14 | - // s3 = | 37 | 17 | 36 | 16 | - - "precr.qb.ph $t0, $s1, $s0 \n" - "precrq.qb.ph $t1, $s1, $s0 \n" - "precr.qb.ph $t8, $s3, $s2 \n" - "precrq.qb.ph $t9, $s3, $s2 \n" - - // t0 = | 07 | 06 | 05 | 04 | - // t1 = | 27 | 26 | 25 | 24 | - // t8 = | 17 | 16 | 15 | 14 | - // t9 = | 37 | 36 | 35 | 34 | - - "addu $s0, %[dst], %[dst_stride] \n" - "addu $s1, $s0, %[dst_stride] \n" - "addu $s2, $s1, %[dst_stride] \n" - - "sw $s4, 0(%[dst]) \n" - "sw $t0, 4(%[dst]) \n" - "sw $s6, 0($s0) \n" - "sw $t8, 4($s0) \n" - "sw $s5, 0($s1) \n" - "sw $t1, 4($s1) \n" - "sw $s7, 0($s2) \n" - "sw $t9, 4($s2) \n" - - "addiu $AT, -1 \n" - "addiu %[src], 4 \n" - - "bnez $AT, 1b \n" - " addu %[dst], $s2, %[dst_stride] \n" - "b 2f \n" - // dst + dst_stride unaligned - "11: \n" - "lw $t0, 0(%[src]) \n" - "lwx $t1, %[src_stride](%[src]) \n" - "lwx $t8, $t2(%[src]) \n" - "lwx $t9, $t3(%[src]) \n" - - // t0 = | 30 | 20 | 10 | 00 | - // t1 = | 31 | 21 | 11 | 01 | - // t8 = | 32 | 22 | 12 | 02 | - // t9 = | 33 | 23 | 13 | 03 | - - "precr.qb.ph $s0, $t1, $t0 \n" - "precr.qb.ph $s1, $t9, $t8 \n" - "precrq.qb.ph $s2, $t1, $t0 \n" - "precrq.qb.ph $s3, $t9, $t8 \n" - - // s0 = | 21 | 01 | 20 | 00 | - // s1 = | 23 | 03 | 22 | 02 | - // s2 = | 31 | 11 | 30 | 10 | - // s3 = | 33 | 13 | 32 | 12 | - - "precr.qb.ph $s4, $s1, $s0 \n" - "precrq.qb.ph $s5, $s1, $s0 \n" - "precr.qb.ph $s6, $s3, $s2 \n" - "precrq.qb.ph $s7, $s3, $s2 \n" - - // s4 = | 03 | 02 | 01 | 00 | - // s5 = | 23 | 22 | 21 | 20 | - // s6 = | 13 | 12 | 11 | 10 | - // s7 = | 33 | 32 | 31 | 30 | - - "lwx $t0, $t4(%[src]) \n" - "lwx $t1, $t5(%[src]) \n" - "lwx $t8, $t6(%[src]) \n" - "lwx $t9, $t7(%[src]) \n" - - // t0 = | 34 | 24 | 14 | 04 | - // t1 = | 35 | 25 | 15 | 05 | - // t8 = | 36 | 26 | 16 | 06 | - // t9 = | 37 | 27 | 17 | 07 | - - "precr.qb.ph $s0, $t1, $t0 \n" - "precr.qb.ph $s1, $t9, $t8 \n" - "precrq.qb.ph $s2, $t1, $t0 \n" - "precrq.qb.ph $s3, $t9, $t8 \n" - - // s0 = | 25 | 05 | 24 | 04 | - // s1 = | 27 | 07 | 26 | 06 | - // s2 = | 35 | 15 | 34 | 14 | - // s3 = | 37 | 17 | 36 | 16 | - - "precr.qb.ph $t0, $s1, $s0 \n" - "precrq.qb.ph $t1, $s1, $s0 \n" - "precr.qb.ph $t8, $s3, $s2 \n" - "precrq.qb.ph $t9, $s3, $s2 \n" - - // t0 = | 07 | 06 | 05 | 04 | - // t1 = | 27 | 26 | 25 | 24 | - // t8 = | 17 | 16 | 15 | 14 | - // t9 = | 37 | 36 | 35 | 34 | - - "addu $s0, %[dst], %[dst_stride] \n" - "addu $s1, $s0, 
%[dst_stride] \n" - "addu $s2, $s1, %[dst_stride] \n" - - "swr $s4, 0(%[dst]) \n" - "swl $s4, 3(%[dst]) \n" - "swr $t0, 4(%[dst]) \n" - "swl $t0, 7(%[dst]) \n" - "swr $s6, 0($s0) \n" - "swl $s6, 3($s0) \n" - "swr $t8, 4($s0) \n" - "swl $t8, 7($s0) \n" - "swr $s5, 0($s1) \n" - "swl $s5, 3($s1) \n" - "swr $t1, 4($s1) \n" - "swl $t1, 7($s1) \n" - "swr $s7, 0($s2) \n" - "swl $s7, 3($s2) \n" - "swr $t9, 4($s2) \n" - "swl $t9, 7($s2) \n" - - "addiu $AT, -1 \n" - "addiu %[src], 4 \n" - - "bnez $AT, 11b \n" - " addu %[dst], $s2, %[dst_stride] \n" - "2: \n" - ".set pop \n" - ".set at \n" - : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width) - : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride) - : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1", - "s2", "s3", "s4", "s5", "s6", "s7"); -} - -void TransposeUVWx8_DSPR2(const uint8* src, - int src_stride, - uint8* dst_a, - int dst_stride_a, - uint8* dst_b, - int dst_stride_b, - int width) { - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - "beqz %[width], 2f \n" - " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 - "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 - "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 - "addu $t3, $t2, %[src_stride] \n" - "addu $t5, $t4, %[src_stride] \n" - "addu $t6, $t2, $t4 \n" - "subu $t7, $t9, %[src_stride] \n" - "srl $t1, %[width], 1 \n" - - // check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b - "andi $t0, %[dst_a], 0x3 \n" - "andi $t8, %[dst_b], 0x3 \n" - "or $t0, $t0, $t8 \n" - "andi $t8, %[dst_stride_a], 0x3 \n" - "andi $s5, %[dst_stride_b], 0x3 \n" - "or $t8, $t8, $s5 \n" - "or $t0, $t0, $t8 \n" - "bnez $t0, 11f \n" - " nop \n" - // dst + dst_stride word aligned (both, a & b dst addresses) - "1: \n" - "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| - "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| - "addu $s5, %[dst_a], %[dst_stride_a] \n" - "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| - "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| - "addu $s6, %[dst_b], %[dst_stride_b] \n" - - "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| - "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| - "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| - "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| - - "sll $t0, $t0, 16 \n" - "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| - "sll $t9, $t9, 16 \n" - "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| - - "sw $s3, 0($s5) \n" - "sw $s4, 0($s6) \n" - - "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| - "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| - - "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| - "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| - "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| - "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| - "sw $s3, 0(%[dst_a]) \n" - "sw $s4, 0(%[dst_b]) \n" - - "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| - "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| - "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| - "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| - - "sll $t0, $t0, 16 \n" - "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| - "sll $t9, $t9, 16 \n" - "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| - "sw $s3, 4($s5) \n" - "sw $s4, 4($s6) \n" - - "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| - "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| - - "addiu %[src], 4 \n" - "addiu $t1, -1 \n" - "sll $t0, %[dst_stride_a], 1 \n" - "sll $t8, %[dst_stride_b], 1 \n" - "sw $s3, 4(%[dst_a]) \n" - "sw $s4, 4(%[dst_b]) \n" - "addu %[dst_a], %[dst_a], $t0 \n" - "bnez $t1, 1b \n" - " addu %[dst_b], %[dst_b], $t8 \n" - "b 2f 
\n" - " nop \n" - - // dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned - "11: \n" - "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| - "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| - "addu $s5, %[dst_a], %[dst_stride_a] \n" - "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| - "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| - "addu $s6, %[dst_b], %[dst_stride_b] \n" - - "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| - "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| - "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| - "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| - - "sll $t0, $t0, 16 \n" - "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| - "sll $t9, $t9, 16 \n" - "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| - - "swr $s3, 0($s5) \n" - "swl $s3, 3($s5) \n" - "swr $s4, 0($s6) \n" - "swl $s4, 3($s6) \n" - - "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| - "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| - - "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| - "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| - "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| - "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| - "swr $s3, 0(%[dst_a]) \n" - "swl $s3, 3(%[dst_a]) \n" - "swr $s4, 0(%[dst_b]) \n" - "swl $s4, 3(%[dst_b]) \n" - - "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| - "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| - "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| - "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| - - "sll $t0, $t0, 16 \n" - "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| - "sll $t9, $t9, 16 \n" - "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| - - "swr $s3, 4($s5) \n" - "swl $s3, 7($s5) \n" - "swr $s4, 4($s6) \n" - "swl $s4, 7($s6) \n" - - "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| - "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| - - "addiu %[src], 4 \n" - "addiu $t1, -1 \n" - "sll $t0, %[dst_stride_a], 1 \n" - "sll $t8, %[dst_stride_b], 1 \n" - "swr $s3, 4(%[dst_a]) \n" - "swl $s3, 7(%[dst_a]) \n" - "swr $s4, 4(%[dst_b]) \n" - "swl $s4, 7(%[dst_b]) \n" - "addu %[dst_a], %[dst_a], $t0 \n" - "bnez $t1, 11b \n" - " addu %[dst_b], %[dst_b], $t8 \n" - - "2: \n" - ".set pop \n" - : [src] "+r"(src), [dst_a] "+r"(dst_a), [dst_b] "+r"(dst_b), - [width] "+r"(width), [src_stride] "+r"(src_stride) - : [dst_stride_a] "r"(dst_stride_a), [dst_stride_b] "r"(dst_stride_b) - : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1", - "s2", "s3", "s4", "s5", "s6"); -} - -#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/files/source/rotate_gcc.cc b/files/source/rotate_gcc.cc index 04e19e29..1a3f8cbb 100644 --- a/files/source/rotate_gcc.cc +++ b/files/source/rotate_gcc.cc @@ -17,8 +17,7 @@ extern "C" { #endif // This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) // Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit. #if defined(HAS_TRANSPOSEWX8_SSSE3) @@ -31,75 +30,75 @@ void TransposeWx8_SSSE3(const uint8_t* src, // Read in the data from the source pointer. // First round of bit swap. 
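The SSSE3 kernel that follows transposes 8x8 bytes with three rounds of interleaves (punpcklbw on bytes, then punpcklwd on 16-bit pairs, then punpckldq on 32-bit quads), doubling the granularity of the swap each round. Its net effect per tile, written as plain C (a sketch; the asm additionally walks src and dst by their strides and loops across the width):

#include <stdint.h>

static void TransposeTile8x8(const uint8_t* src, int src_stride,
                             uint8_t* dst, int dst_stride) {
  for (int i = 0; i < 8; ++i) {
    for (int j = 0; j < 8; ++j) {
      dst[j * dst_stride + i] = src[i * src_stride + j];
    }
  }
}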
LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "movq (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "movq (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movq (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "movq (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movq (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "lea 0x8(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "neg %3 \n" + "1: \n" + "movq (%0),%%xmm0 \n" + "movq (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movq (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "movq (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movq (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "movq (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movq (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "lea 0x8(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "neg %3 \n" // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" // Third round of bit swap. // Write to the destination pointer. 
- "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "sub $0x8,%2 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "sub $0x8,%2 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -121,127 +120,127 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src, // Read in the data from the source pointer. // First round of bit swap. LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqu (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm8,%%xmm9 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "palignr $0x8,%%xmm9,%%xmm9 \n" - "movdqu (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm2,%%xmm10 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm10 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm10,%%xmm11 \n" - "movdqu (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "movdqu (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm4,%%xmm12 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm12 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movdqa %%xmm12,%%xmm13 \n" - "movdqu (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movdqu (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm6,%%xmm14 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "punpckhbw %%xmm7,%%xmm14 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "movdqa %%xmm14,%%xmm15 \n" - "lea 0x10(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "neg %3 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqu (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm8,%%xmm9 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "palignr $0x8,%%xmm9,%%xmm9 \n" + "movdqu (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm2,%%xmm10 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm10 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm10,%%xmm11 \n" + "movdqu (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "movdqu (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 
\n" + "movdqa %%xmm4,%%xmm12 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm12 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movdqa %%xmm12,%%xmm13 \n" + "movdqu (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movdqu (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm6,%%xmm14 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "punpckhbw %%xmm7,%%xmm14 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "movdqa %%xmm14,%%xmm15 \n" + "lea 0x10(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "neg %3 \n" // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "punpcklwd %%xmm10,%%xmm8 \n" - "punpcklwd %%xmm11,%%xmm9 \n" - "movdqa %%xmm8,%%xmm10 \n" - "movdqa %%xmm9,%%xmm11 \n" - "palignr $0x8,%%xmm10,%%xmm10 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "punpcklwd %%xmm14,%%xmm12 \n" - "punpcklwd %%xmm15,%%xmm13 \n" - "movdqa %%xmm12,%%xmm14 \n" - "movdqa %%xmm13,%%xmm15 \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm10,%%xmm8 \n" + "punpcklwd %%xmm11,%%xmm9 \n" + "movdqa %%xmm8,%%xmm10 \n" + "movdqa %%xmm9,%%xmm11 \n" + "palignr $0x8,%%xmm10,%%xmm10 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "punpcklwd %%xmm14,%%xmm12 \n" + "punpcklwd %%xmm15,%%xmm13 \n" + "movdqa %%xmm12,%%xmm14 \n" + "movdqa %%xmm13,%%xmm15 \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" // Third round of bit swap. // Write to the destination pointer. 
- "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm12,%%xmm8 \n" - "movq %%xmm8,(%1) \n" - "movdqa %%xmm8,%%xmm12 \n" - "palignr $0x8,%%xmm12,%%xmm12 \n" - "movq %%xmm12,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm14,%%xmm10 \n" - "movdqa %%xmm10,%%xmm14 \n" - "movq %%xmm10,(%1) \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "punpckldq %%xmm13,%%xmm9 \n" - "movq %%xmm14,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm9,%%xmm13 \n" - "movq %%xmm9,(%1) \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movq %%xmm13,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm15,%%xmm11 \n" - "movq %%xmm11,(%1) \n" - "movdqa %%xmm11,%%xmm15 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "sub $0x10,%2 \n" - "movq %%xmm15,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm12,%%xmm8 \n" + "movq %%xmm8,(%1) \n" + "movdqa %%xmm8,%%xmm12 \n" + "palignr $0x8,%%xmm12,%%xmm12 \n" + "movq %%xmm12,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm14,%%xmm10 \n" + "movdqa %%xmm10,%%xmm14 \n" + "movq %%xmm10,(%1) \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "punpckldq %%xmm13,%%xmm9 \n" + "movq %%xmm14,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm9,%%xmm13 \n" + "movq %%xmm9,(%1) \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movq %%xmm13,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm15,%%xmm11 \n" + "movq %%xmm11,(%1) \n" + "movdqa %%xmm11,%%xmm15 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "sub $0x10,%2 \n" + "movq %%xmm15,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -266,95 +265,95 @@ void TransposeUVWx8_SSE2(const uint8_t* src, // Read in the data from the source pointer. // First round of bit swap. 
LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%4),%%xmm1 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqa %%xmm8,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu (%0,%4),%%xmm3 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm8 \n" - "movdqa %%xmm8,%%xmm3 \n" - "movdqu (%0),%%xmm4 \n" - "movdqu (%0,%4),%%xmm5 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm4,%%xmm8 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm8 \n" - "movdqa %%xmm8,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu (%0,%4),%%xmm7 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm6,%%xmm8 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %4 \n" - "lea 0x10(%0,%4,8),%0 \n" - "punpckhbw %%xmm7,%%xmm8 \n" - "movdqa %%xmm8,%%xmm7 \n" - "neg %4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%4),%%xmm1 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa %%xmm8,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu (%0,%4),%%xmm3 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm8 \n" + "movdqa %%xmm8,%%xmm3 \n" + "movdqu (%0),%%xmm4 \n" + "movdqu (%0,%4),%%xmm5 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm4,%%xmm8 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm8 \n" + "movdqa %%xmm8,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu (%0,%4),%%xmm7 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm6,%%xmm8 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %4 \n" + "lea 0x10(%0,%4,8),%0 \n" + "punpckhbw %%xmm7,%%xmm8 \n" + "movdqa %%xmm8,%%xmm7 \n" + "neg %4 \n" // Second round of bit swap. - "movdqa %%xmm0,%%xmm8 \n" - "movdqa %%xmm1,%%xmm9 \n" - "punpckhwd %%xmm2,%%xmm8 \n" - "punpckhwd %%xmm3,%%xmm9 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm8,%%xmm2 \n" - "movdqa %%xmm9,%%xmm3 \n" - "movdqa %%xmm4,%%xmm8 \n" - "movdqa %%xmm5,%%xmm9 \n" - "punpckhwd %%xmm6,%%xmm8 \n" - "punpckhwd %%xmm7,%%xmm9 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm8,%%xmm6 \n" - "movdqa %%xmm9,%%xmm7 \n" + "movdqa %%xmm0,%%xmm8 \n" + "movdqa %%xmm1,%%xmm9 \n" + "punpckhwd %%xmm2,%%xmm8 \n" + "punpckhwd %%xmm3,%%xmm9 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm8,%%xmm2 \n" + "movdqa %%xmm9,%%xmm3 \n" + "movdqa %%xmm4,%%xmm8 \n" + "movdqa %%xmm5,%%xmm9 \n" + "punpckhwd %%xmm6,%%xmm8 \n" + "punpckhwd %%xmm7,%%xmm9 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm8,%%xmm6 \n" + "movdqa %%xmm9,%%xmm7 \n" // Third round of bit swap. // Write to the destination pointer. 
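The movlpd/movhpd stores in the round below split each transposed 16-byte result into its U half (dst_a) and V half (dst_b). A scalar sketch of the overall contract, after TransposeUVWx8_C (illustrative, assuming the usual interleaved-UV source layout):

#include <stdint.h>

// Scalar reference: transpose 8 rows of interleaved UV pairs,
// deinterleaving U into dst_a and V into dst_b.
static void TransposeUVWx8_Ref(const uint8_t* src, int src_stride,
                               uint8_t* dst_a, int dst_stride_a,
                               uint8_t* dst_b, int dst_stride_b, int width) {
  for (int x = 0; x < width; ++x) {
    for (int y = 0; y < 8; ++y) {
      dst_a[x * dst_stride_a + y] = src[y * src_stride + x * 2 + 0];  // U
      dst_b[x * dst_stride_b + y] = src[y * src_stride + x * 2 + 1];  // V
    }
  }
}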
- "movdqa %%xmm0,%%xmm8 \n" - "punpckldq %%xmm4,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" // Write back U channel - "movhpd %%xmm0,(%2) \n" // Write back V channel - "punpckhdq %%xmm4,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movlpd %%xmm2,(%1) \n" - "movhpd %%xmm2,(%2) \n" - "punpckhdq %%xmm6,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm1,%%xmm8 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movlpd %%xmm1,(%1) \n" - "movhpd %%xmm1,(%2) \n" - "punpckhdq %%xmm5,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm3,%%xmm8 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movlpd %%xmm3,(%1) \n" - "movhpd %%xmm3,(%2) \n" - "punpckhdq %%xmm7,%%xmm8 \n" - "sub $0x8,%3 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "jg 1b \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" // Write back U channel + "movhpd %%xmm0,(%2) \n" // Write back V channel + "punpckhdq %%xmm4,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movlpd %%xmm2,(%1) \n" + "movhpd %%xmm2,(%2) \n" + "punpckhdq %%xmm6,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm1,%%xmm8 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movlpd %%xmm1,(%1) \n" + "movhpd %%xmm1,(%2) \n" + "punpckhdq %%xmm5,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm3,%%xmm8 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movlpd %%xmm3,(%1) \n" + "movhpd %%xmm3,(%2) \n" + "punpckhdq %%xmm7,%%xmm8 \n" + "sub $0x8,%3 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst_a), // %1 "+r"(dst_b), // %2 diff --git a/files/source/rotate_lsx.cc b/files/source/rotate_lsx.cc new file mode 100644 index 00000000..94a2b91c --- /dev/null +++ b/files/source/rotate_lsx.cc @@ -0,0 +1,243 @@ +/* + * Copyright 2022 The LibYuv Project Authors. All rights reserved. + * + * Copyright (c) 2022 Loongson Technology Corporation Limited + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/rotate_row.h" + +#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) +#include "libyuv/loongson_intrinsics.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define ILVLH_B(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + DUP2_ARG2(__lsx_vilvl_b, in1, in0, in3, in2, out0, out2); \ + DUP2_ARG2(__lsx_vilvh_b, in1, in0, in3, in2, out1, out3); \ + } + +#define ILVLH_H(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + DUP2_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, out0, out2); \ + DUP2_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, out1, out3); \ + } + +#define ILVLH_W(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + DUP2_ARG2(__lsx_vilvl_w, in1, in0, in3, in2, out0, out2); \ + DUP2_ARG2(__lsx_vilvh_w, in1, in0, in3, in2, out1, out3); \ + } + +#define ILVLH_D(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + DUP2_ARG2(__lsx_vilvl_d, in1, in0, in3, in2, out0, out2); \ + DUP2_ARG2(__lsx_vilvh_d, in1, in0, in3, in2, out1, out3); \ + } + +#define LSX_ST_4(_dst0, _dst1, _dst2, _dst3, _dst, _stride, _stride2, \ + _stride3, _stride4) \ + { \ + __lsx_vst(_dst0, _dst, 0); \ + __lsx_vstx(_dst1, _dst, _stride); \ + __lsx_vstx(_dst2, _dst, _stride2); \ + __lsx_vstx(_dst3, _dst, _stride3); \ + _dst += _stride4; \ + } + +#define LSX_ST_2(_dst0, _dst1, _dst, _stride, _stride2) \ + { \ + __lsx_vst(_dst0, _dst, 0); \ + __lsx_vstx(_dst1, _dst, _stride); \ + _dst += _stride2; \ + } + +void TransposeWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + TransposeWx8_C(src, src_stride, dst, dst_stride, width); + TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride, + width); +} + +void TransposeUVWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width); + TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8), + dst_stride_a, (dst_b + 8), dst_stride_b, width); +} + +void TransposeWx16_LSX(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + int x; + int len = width / 16; + uint8_t* s; + int src_stride2 = src_stride << 1; + int src_stride3 = src_stride + src_stride2; + int src_stride4 = src_stride2 << 1; + int dst_stride2 = dst_stride << 1; + int dst_stride3 = dst_stride + dst_stride2; + int dst_stride4 = dst_stride2 << 1; + __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; + + for (x = 0; x < len; x++) { + s = (uint8_t*)src; + src0 = __lsx_vld(s, 0); + src1 = __lsx_vldx(s, src_stride); + src2 = __lsx_vldx(s, src_stride2); + src3 = __lsx_vldx(s, src_stride3); + s += src_stride4; + ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); + ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3); + src0 = __lsx_vld(s, 0); + src1 = __lsx_vldx(s, src_stride); + src2 = __lsx_vldx(s, src_stride2); + src3 = __lsx_vldx(s, src_stride3); + s += src_stride4; + ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); + ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7); + ILVLH_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); + ILVLH_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); + src0 = __lsx_vld(s, 0); + src1 = __lsx_vldx(s, src_stride); + src2 = __lsx_vldx(s, src_stride2); + src3 = __lsx_vldx(s, src_stride3); + s += 
src_stride4; + ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); + ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3); + src0 = __lsx_vld(s, 0); + src1 = __lsx_vldx(s, src_stride); + src2 = __lsx_vldx(s, src_stride2); + src3 = __lsx_vldx(s, src_stride3); + s += src_stride4; + ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); + ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7); + res8 = __lsx_vilvl_w(reg4, reg0); + res9 = __lsx_vilvh_w(reg4, reg0); + ILVLH_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); + LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3, + dst_stride4); + res8 = __lsx_vilvl_w(reg5, reg1); + res9 = __lsx_vilvh_w(reg5, reg1); + ILVLH_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); + LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3, + dst_stride4); + res8 = __lsx_vilvl_w(reg6, reg2); + res9 = __lsx_vilvh_w(reg6, reg2); + ILVLH_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); + LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3, + dst_stride4); + res8 = __lsx_vilvl_w(reg7, reg3); + res9 = __lsx_vilvh_w(reg7, reg3); + ILVLH_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); + LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3, + dst_stride4); + src += 16; + } +} + +void TransposeUVWx16_LSX(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + int x; + int len = width / 8; + uint8_t* s; + int src_stride2 = src_stride << 1; + int src_stride3 = src_stride + src_stride2; + int src_stride4 = src_stride2 << 1; + int dst_stride_a2 = dst_stride_a << 1; + int dst_stride_b2 = dst_stride_b << 1; + __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; + + for (x = 0; x < len; x++) { + s = (uint8_t*)src; + src0 = __lsx_vld(s, 0); + src1 = __lsx_vldx(s, src_stride); + src2 = __lsx_vldx(s, src_stride2); + src3 = __lsx_vldx(s, src_stride3); + s += src_stride4; + ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); + ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3); + src0 = __lsx_vld(s, 0); + src1 = __lsx_vldx(s, src_stride); + src2 = __lsx_vldx(s, src_stride2); + src3 = __lsx_vldx(s, src_stride3); + s += src_stride4; + ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); + ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7); + ILVLH_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); + ILVLH_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); + src0 = __lsx_vld(s, 0); + src1 = __lsx_vldx(s, src_stride); + src2 = __lsx_vldx(s, src_stride2); + src3 = __lsx_vldx(s, src_stride3); + s += src_stride4; + ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); + ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3); + src0 = __lsx_vld(s, 0); + src1 = __lsx_vldx(s, src_stride); + src2 = __lsx_vldx(s, src_stride2); + src3 = __lsx_vldx(s, src_stride3); + s += src_stride4; + ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3); + ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7); + res8 = __lsx_vilvl_w(reg4, reg0); + res9 = __lsx_vilvh_w(reg4, reg0); + ILVLH_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); + LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2); + LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2); + res8 = __lsx_vilvl_w(reg5, reg1); + res9 = __lsx_vilvh_w(reg5, reg1); + 
ILVLH_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); + LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2); + LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2); + res8 = __lsx_vilvl_w(reg6, reg2); + res9 = __lsx_vilvh_w(reg6, reg2); + ILVLH_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); + LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2); + LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2); + res8 = __lsx_vilvl_w(reg7, reg3); + res9 = __lsx_vilvh_w(reg7, reg3); + ILVLH_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); + LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2); + LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2); + src += 16; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) diff --git a/files/source/rotate_neon.cc b/files/source/rotate_neon.cc index fdc0dd47..844df2bf 100644 --- a/files/source/rotate_neon.cc +++ b/files/source/rotate_neon.cc @@ -38,52 +38,52 @@ void TransposeWx8_NEON(const uint8_t* src, // handle 8x8 blocks. this should be the majority of the plane "1: \n" - "mov %0, %1 \n" - - "vld1.8 {d0}, [%0], %2 \n" - "vld1.8 {d1}, [%0], %2 \n" - "vld1.8 {d2}, [%0], %2 \n" - "vld1.8 {d3}, [%0], %2 \n" - "vld1.8 {d4}, [%0], %2 \n" - "vld1.8 {d5}, [%0], %2 \n" - "vld1.8 {d6}, [%0], %2 \n" - "vld1.8 {d7}, [%0] \n" - - "vtrn.8 d1, d0 \n" - "vtrn.8 d3, d2 \n" - "vtrn.8 d5, d4 \n" - "vtrn.8 d7, d6 \n" - - "vtrn.16 d1, d3 \n" - "vtrn.16 d0, d2 \n" - "vtrn.16 d5, d7 \n" - "vtrn.16 d4, d6 \n" - - "vtrn.32 d1, d5 \n" - "vtrn.32 d0, d4 \n" - "vtrn.32 d3, d7 \n" - "vtrn.32 d2, d6 \n" - - "vrev16.8 q0, q0 \n" - "vrev16.8 q1, q1 \n" - "vrev16.8 q2, q2 \n" - "vrev16.8 q3, q3 \n" - - "mov %0, %3 \n" - - "vst1.8 {d1}, [%0], %4 \n" - "vst1.8 {d0}, [%0], %4 \n" - "vst1.8 {d3}, [%0], %4 \n" - "vst1.8 {d2}, [%0], %4 \n" - "vst1.8 {d5}, [%0], %4 \n" - "vst1.8 {d4}, [%0], %4 \n" - "vst1.8 {d7}, [%0], %4 \n" - "vst1.8 {d6}, [%0] \n" - - "add %1, #8 \n" // src += 8 - "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride - "subs %5, #8 \n" // w -= 8 - "bge 1b \n" + "mov %0, %1 \n" + + "vld1.8 {d0}, [%0], %2 \n" + "vld1.8 {d1}, [%0], %2 \n" + "vld1.8 {d2}, [%0], %2 \n" + "vld1.8 {d3}, [%0], %2 \n" + "vld1.8 {d4}, [%0], %2 \n" + "vld1.8 {d5}, [%0], %2 \n" + "vld1.8 {d6}, [%0], %2 \n" + "vld1.8 {d7}, [%0] \n" + + "vtrn.8 d1, d0 \n" + "vtrn.8 d3, d2 \n" + "vtrn.8 d5, d4 \n" + "vtrn.8 d7, d6 \n" + + "vtrn.16 d1, d3 \n" + "vtrn.16 d0, d2 \n" + "vtrn.16 d5, d7 \n" + "vtrn.16 d4, d6 \n" + + "vtrn.32 d1, d5 \n" + "vtrn.32 d0, d4 \n" + "vtrn.32 d3, d7 \n" + "vtrn.32 d2, d6 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + + "mov %0, %3 \n" + + "vst1.8 {d1}, [%0], %4 \n" + "vst1.8 {d0}, [%0], %4 \n" + "vst1.8 {d3}, [%0], %4 \n" + "vst1.8 {d2}, [%0], %4 \n" + "vst1.8 {d5}, [%0], %4 \n" + "vst1.8 {d4}, [%0], %4 \n" + "vst1.8 {d7}, [%0], %4 \n" + "vst1.8 {d6}, [%0] \n" + + "add %1, #8 \n" // src += 8 + "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride + "subs %5, #8 \n" // w -= 8 + "bge 1b \n" // add 8 back to counter. if the result is 0 there are // no residuals. @@ -208,68 +208,70 @@ void TransposeUVWx8_NEON(const uint8_t* src, // handle 8x8 blocks. 
this should be the majority of the plane "1: \n" - "mov %0, %1 \n" - - "vld2.8 {d0, d1}, [%0], %2 \n" - "vld2.8 {d2, d3}, [%0], %2 \n" - "vld2.8 {d4, d5}, [%0], %2 \n" - "vld2.8 {d6, d7}, [%0], %2 \n" - "vld2.8 {d16, d17}, [%0], %2 \n" - "vld2.8 {d18, d19}, [%0], %2 \n" - "vld2.8 {d20, d21}, [%0], %2 \n" - "vld2.8 {d22, d23}, [%0] \n" - - "vtrn.8 q1, q0 \n" - "vtrn.8 q3, q2 \n" - "vtrn.8 q9, q8 \n" - "vtrn.8 q11, q10 \n" - - "vtrn.16 q1, q3 \n" - "vtrn.16 q0, q2 \n" - "vtrn.16 q9, q11 \n" - "vtrn.16 q8, q10 \n" - - "vtrn.32 q1, q9 \n" - "vtrn.32 q0, q8 \n" - "vtrn.32 q3, q11 \n" - "vtrn.32 q2, q10 \n" - - "vrev16.8 q0, q0 \n" - "vrev16.8 q1, q1 \n" - "vrev16.8 q2, q2 \n" - "vrev16.8 q3, q3 \n" - "vrev16.8 q8, q8 \n" - "vrev16.8 q9, q9 \n" - "vrev16.8 q10, q10 \n" - "vrev16.8 q11, q11 \n" - - "mov %0, %3 \n" - - "vst1.8 {d2}, [%0], %4 \n" - "vst1.8 {d0}, [%0], %4 \n" - "vst1.8 {d6}, [%0], %4 \n" - "vst1.8 {d4}, [%0], %4 \n" - "vst1.8 {d18}, [%0], %4 \n" - "vst1.8 {d16}, [%0], %4 \n" - "vst1.8 {d22}, [%0], %4 \n" - "vst1.8 {d20}, [%0] \n" - - "mov %0, %5 \n" - - "vst1.8 {d3}, [%0], %6 \n" - "vst1.8 {d1}, [%0], %6 \n" - "vst1.8 {d7}, [%0], %6 \n" - "vst1.8 {d5}, [%0], %6 \n" - "vst1.8 {d19}, [%0], %6 \n" - "vst1.8 {d17}, [%0], %6 \n" - "vst1.8 {d23}, [%0], %6 \n" - "vst1.8 {d21}, [%0] \n" - - "add %1, #8*2 \n" // src += 8*2 - "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a - "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b - "subs %7, #8 \n" // w -= 8 - "bge 1b \n" + "mov %0, %1 \n" + + "vld2.8 {d0, d1}, [%0], %2 \n" + "vld2.8 {d2, d3}, [%0], %2 \n" + "vld2.8 {d4, d5}, [%0], %2 \n" + "vld2.8 {d6, d7}, [%0], %2 \n" + "vld2.8 {d16, d17}, [%0], %2 \n" + "vld2.8 {d18, d19}, [%0], %2 \n" + "vld2.8 {d20, d21}, [%0], %2 \n" + "vld2.8 {d22, d23}, [%0] \n" + + "vtrn.8 q1, q0 \n" + "vtrn.8 q3, q2 \n" + "vtrn.8 q9, q8 \n" + "vtrn.8 q11, q10 \n" + + "vtrn.16 q1, q3 \n" + "vtrn.16 q0, q2 \n" + "vtrn.16 q9, q11 \n" + "vtrn.16 q8, q10 \n" + + "vtrn.32 q1, q9 \n" + "vtrn.32 q0, q8 \n" + "vtrn.32 q3, q11 \n" + "vtrn.32 q2, q10 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + "vrev16.8 q8, q8 \n" + "vrev16.8 q9, q9 \n" + "vrev16.8 q10, q10 \n" + "vrev16.8 q11, q11 \n" + + "mov %0, %3 \n" + + "vst1.8 {d2}, [%0], %4 \n" + "vst1.8 {d0}, [%0], %4 \n" + "vst1.8 {d6}, [%0], %4 \n" + "vst1.8 {d4}, [%0], %4 \n" + "vst1.8 {d18}, [%0], %4 \n" + "vst1.8 {d16}, [%0], %4 \n" + "vst1.8 {d22}, [%0], %4 \n" + "vst1.8 {d20}, [%0] \n" + + "mov %0, %5 \n" + + "vst1.8 {d3}, [%0], %6 \n" + "vst1.8 {d1}, [%0], %6 \n" + "vst1.8 {d7}, [%0], %6 \n" + "vst1.8 {d5}, [%0], %6 \n" + "vst1.8 {d19}, [%0], %6 \n" + "vst1.8 {d17}, [%0], %6 \n" + "vst1.8 {d23}, [%0], %6 \n" + "vst1.8 {d21}, [%0] \n" + + "add %1, #8*2 \n" // src += 8*2 + "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * + // dst_stride_a + "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * + // dst_stride_b + "subs %7, #8 \n" // w -= 8 + "bge 1b \n" // add 8 back to counter. if the result is 0 there are // no residuals. diff --git a/files/source/rotate_neon64.cc b/files/source/rotate_neon64.cc index f469baac..43c15817 100644 --- a/files/source/rotate_neon64.cc +++ b/files/source/rotate_neon64.cc @@ -34,58 +34,74 @@ void TransposeWx8_NEON(const uint8_t* src, // loops are on blocks of 8. loop will stop when // counter gets to or below 0. starting the counter // at w-8 allow for this - "sub %w3, %w3, #8 \n" + "sub %w3, %w3, #8 \n" // handle 8x8 blocks. 
this should be the majority of the plane - "1: \n" + "1: \n" + "mov %0, %1 \n" + + "ld1 {v0.8b}, [%0], %5 \n" + "ld1 {v1.8b}, [%0], %5 \n" + "ld1 {v2.8b}, [%0], %5 \n" + "ld1 {v3.8b}, [%0], %5 \n" + "ld1 {v4.8b}, [%0], %5 \n" + "ld1 {v5.8b}, [%0], %5 \n" + "ld1 {v6.8b}, [%0], %5 \n" + "ld1 {v7.8b}, [%0] \n" "mov %0, %1 \n" - "ld1 {v0.8b}, [%0], %5 \n" - "ld1 {v1.8b}, [%0], %5 \n" - "ld1 {v2.8b}, [%0], %5 \n" - "ld1 {v3.8b}, [%0], %5 \n" - "ld1 {v4.8b}, [%0], %5 \n" - "ld1 {v5.8b}, [%0], %5 \n" - "ld1 {v6.8b}, [%0], %5 \n" - "ld1 {v7.8b}, [%0] \n" - - "trn2 v16.8b, v0.8b, v1.8b \n" - "trn1 v17.8b, v0.8b, v1.8b \n" - "trn2 v18.8b, v2.8b, v3.8b \n" - "trn1 v19.8b, v2.8b, v3.8b \n" - "trn2 v20.8b, v4.8b, v5.8b \n" - "trn1 v21.8b, v4.8b, v5.8b \n" - "trn2 v22.8b, v6.8b, v7.8b \n" - "trn1 v23.8b, v6.8b, v7.8b \n" - - "trn2 v3.4h, v17.4h, v19.4h \n" - "trn1 v1.4h, v17.4h, v19.4h \n" - "trn2 v2.4h, v16.4h, v18.4h \n" - "trn1 v0.4h, v16.4h, v18.4h \n" - "trn2 v7.4h, v21.4h, v23.4h \n" - "trn1 v5.4h, v21.4h, v23.4h \n" - "trn2 v6.4h, v20.4h, v22.4h \n" - "trn1 v4.4h, v20.4h, v22.4h \n" - - "trn2 v21.2s, v1.2s, v5.2s \n" - "trn1 v17.2s, v1.2s, v5.2s \n" - "trn2 v20.2s, v0.2s, v4.2s \n" - "trn1 v16.2s, v0.2s, v4.2s \n" - "trn2 v23.2s, v3.2s, v7.2s \n" - "trn1 v19.2s, v3.2s, v7.2s \n" - "trn2 v22.2s, v2.2s, v6.2s \n" - "trn1 v18.2s, v2.2s, v6.2s \n" + "trn2 v16.8b, v0.8b, v1.8b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "trn1 v17.8b, v0.8b, v1.8b \n" + "add %0, %0, %5 \n" + "trn2 v18.8b, v2.8b, v3.8b \n" + "prfm pldl1keep, [%0, 448] \n" // row 1 + "trn1 v19.8b, v2.8b, v3.8b \n" + "add %0, %0, %5 \n" + "trn2 v20.8b, v4.8b, v5.8b \n" + "prfm pldl1keep, [%0, 448] \n" // row 2 + "trn1 v21.8b, v4.8b, v5.8b \n" + "add %0, %0, %5 \n" + "trn2 v22.8b, v6.8b, v7.8b \n" + "prfm pldl1keep, [%0, 448] \n" // row 3 + "trn1 v23.8b, v6.8b, v7.8b \n" + "add %0, %0, %5 \n" + + "trn2 v3.4h, v17.4h, v19.4h \n" + "prfm pldl1keep, [%0, 448] \n" // row 4 + "trn1 v1.4h, v17.4h, v19.4h \n" + "add %0, %0, %5 \n" + "trn2 v2.4h, v16.4h, v18.4h \n" + "prfm pldl1keep, [%0, 448] \n" // row 5 + "trn1 v0.4h, v16.4h, v18.4h \n" + "add %0, %0, %5 \n" + "trn2 v7.4h, v21.4h, v23.4h \n" + "prfm pldl1keep, [%0, 448] \n" // row 6 + "trn1 v5.4h, v21.4h, v23.4h \n" + "add %0, %0, %5 \n" + "trn2 v6.4h, v20.4h, v22.4h \n" + "prfm pldl1keep, [%0, 448] \n" // row 7 + "trn1 v4.4h, v20.4h, v22.4h \n" + + "trn2 v21.2s, v1.2s, v5.2s \n" + "trn1 v17.2s, v1.2s, v5.2s \n" + "trn2 v20.2s, v0.2s, v4.2s \n" + "trn1 v16.2s, v0.2s, v4.2s \n" + "trn2 v23.2s, v3.2s, v7.2s \n" + "trn1 v19.2s, v3.2s, v7.2s \n" + "trn2 v22.2s, v2.2s, v6.2s \n" + "trn1 v18.2s, v2.2s, v6.2s \n" "mov %0, %2 \n" - "st1 {v17.8b}, [%0], %6 \n" - "st1 {v16.8b}, [%0], %6 \n" - "st1 {v19.8b}, [%0], %6 \n" - "st1 {v18.8b}, [%0], %6 \n" - "st1 {v21.8b}, [%0], %6 \n" - "st1 {v20.8b}, [%0], %6 \n" - "st1 {v23.8b}, [%0], %6 \n" - "st1 {v22.8b}, [%0] \n" + "st1 {v17.8b}, [%0], %6 \n" + "st1 {v16.8b}, [%0], %6 \n" + "st1 {v19.8b}, [%0], %6 \n" + "st1 {v18.8b}, [%0], %6 \n" + "st1 {v21.8b}, [%0], %6 \n" + "st1 {v20.8b}, [%0], %6 \n" + "st1 {v23.8b}, [%0], %6 \n" + "st1 {v22.8b}, [%0] \n" "add %1, %1, #8 \n" // src += 8 "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride @@ -94,33 +110,33 @@ void TransposeWx8_NEON(const uint8_t* src, // add 8 back to counter. if the result is 0 there are // no residuals. 
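The counter handling that follows (start at w-8, loop on b.ge, add 8 back, then branch through the 2- and 4-column checks) decomposes the width into full 8x8 blocks plus a residual of 1..7 columns. The asm dispatches the residual with cmp/b.lt branches; the net effect is roughly equivalent to this C sketch (the block helpers are hypothetical stand-ins for the labeled asm paths):

int w = width - 8;             // "sub %w3, %w3, #8"
do {
  Transpose8x8Block();         // main loop body, label 1:
  w -= 8;                      // "subs %w3, %w3, #8"
} while (w >= 0);              // "b.ge 1b"
w += 8;                        // "adds %w3, %w3, #8"
if (w == 0) return;            // "b.eq 4f": no residual
if (w >= 4) { Transpose4x8Block(); w -= 4; }  // 4x8 path
if (w >= 2) { Transpose2x8Block(); w -= 2; }  // 2x8 path
if (w == 1) { Transpose1x8Block(); }          // 1x8 path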
- "adds %w3, %w3, #8 \n" - "b.eq 4f \n" + "adds %w3, %w3, #8 \n" + "b.eq 4f \n" // some residual, so between 1 and 7 lines left to transpose - "cmp %w3, #2 \n" - "b.lt 3f \n" + "cmp %w3, #2 \n" + "b.lt 3f \n" - "cmp %w3, #4 \n" - "b.lt 2f \n" + "cmp %w3, #4 \n" + "b.lt 2f \n" // 4x8 block - "mov %0, %1 \n" - "ld1 {v0.s}[0], [%0], %5 \n" - "ld1 {v0.s}[1], [%0], %5 \n" - "ld1 {v0.s}[2], [%0], %5 \n" - "ld1 {v0.s}[3], [%0], %5 \n" - "ld1 {v1.s}[0], [%0], %5 \n" - "ld1 {v1.s}[1], [%0], %5 \n" - "ld1 {v1.s}[2], [%0], %5 \n" - "ld1 {v1.s}[3], [%0] \n" + "mov %0, %1 \n" + "ld1 {v0.s}[0], [%0], %5 \n" + "ld1 {v0.s}[1], [%0], %5 \n" + "ld1 {v0.s}[2], [%0], %5 \n" + "ld1 {v0.s}[3], [%0], %5 \n" + "ld1 {v1.s}[0], [%0], %5 \n" + "ld1 {v1.s}[1], [%0], %5 \n" + "ld1 {v1.s}[2], [%0], %5 \n" + "ld1 {v1.s}[3], [%0] \n" - "mov %0, %2 \n" + "mov %0, %2 \n" - "ld1 {v2.16b}, [%4] \n" + "ld1 {v2.16b}, [%4] \n" - "tbl v3.16b, {v0.16b}, v2.16b \n" - "tbl v0.16b, {v1.16b}, v2.16b \n" + "tbl v3.16b, {v0.16b}, v2.16b \n" + "tbl v0.16b, {v1.16b}, v2.16b \n" // TODO(frkoenig): Rework shuffle above to // write out with 4 instead of 8 writes. @@ -212,89 +228,90 @@ void TransposeUVWx8_NEON(const uint8_t* src, // loops are on blocks of 8. loop will stop when // counter gets to or below 0. starting the counter // at w-8 allow for this - "sub %w4, %w4, #8 \n" + "sub %w4, %w4, #8 \n" // handle 8x8 blocks. this should be the majority of the plane "1: \n" - "mov %0, %1 \n" - - "ld1 {v0.16b}, [%0], %5 \n" - "ld1 {v1.16b}, [%0], %5 \n" - "ld1 {v2.16b}, [%0], %5 \n" - "ld1 {v3.16b}, [%0], %5 \n" - "ld1 {v4.16b}, [%0], %5 \n" - "ld1 {v5.16b}, [%0], %5 \n" - "ld1 {v6.16b}, [%0], %5 \n" - "ld1 {v7.16b}, [%0] \n" - - "trn1 v16.16b, v0.16b, v1.16b \n" - "trn2 v17.16b, v0.16b, v1.16b \n" - "trn1 v18.16b, v2.16b, v3.16b \n" - "trn2 v19.16b, v2.16b, v3.16b \n" - "trn1 v20.16b, v4.16b, v5.16b \n" - "trn2 v21.16b, v4.16b, v5.16b \n" - "trn1 v22.16b, v6.16b, v7.16b \n" - "trn2 v23.16b, v6.16b, v7.16b \n" - - "trn1 v0.8h, v16.8h, v18.8h \n" - "trn2 v1.8h, v16.8h, v18.8h \n" - "trn1 v2.8h, v20.8h, v22.8h \n" - "trn2 v3.8h, v20.8h, v22.8h \n" - "trn1 v4.8h, v17.8h, v19.8h \n" - "trn2 v5.8h, v17.8h, v19.8h \n" - "trn1 v6.8h, v21.8h, v23.8h \n" - "trn2 v7.8h, v21.8h, v23.8h \n" - - "trn1 v16.4s, v0.4s, v2.4s \n" - "trn2 v17.4s, v0.4s, v2.4s \n" - "trn1 v18.4s, v1.4s, v3.4s \n" - "trn2 v19.4s, v1.4s, v3.4s \n" - "trn1 v20.4s, v4.4s, v6.4s \n" - "trn2 v21.4s, v4.4s, v6.4s \n" - "trn1 v22.4s, v5.4s, v7.4s \n" - "trn2 v23.4s, v5.4s, v7.4s \n" + "mov %0, %1 \n" - "mov %0, %2 \n" + "ld1 {v0.16b}, [%0], %5 \n" + "ld1 {v1.16b}, [%0], %5 \n" + "ld1 {v2.16b}, [%0], %5 \n" + "ld1 {v3.16b}, [%0], %5 \n" + "ld1 {v4.16b}, [%0], %5 \n" + "ld1 {v5.16b}, [%0], %5 \n" + "ld1 {v6.16b}, [%0], %5 \n" + "ld1 {v7.16b}, [%0] \n" + "mov %0, %1 \n" - "st1 {v16.d}[0], [%0], %6 \n" - "st1 {v18.d}[0], [%0], %6 \n" - "st1 {v17.d}[0], [%0], %6 \n" - "st1 {v19.d}[0], [%0], %6 \n" - "st1 {v16.d}[1], [%0], %6 \n" - "st1 {v18.d}[1], [%0], %6 \n" - "st1 {v17.d}[1], [%0], %6 \n" - "st1 {v19.d}[1], [%0] \n" + "trn1 v16.16b, v0.16b, v1.16b \n" + "trn2 v17.16b, v0.16b, v1.16b \n" + "trn1 v18.16b, v2.16b, v3.16b \n" + "trn2 v19.16b, v2.16b, v3.16b \n" + "trn1 v20.16b, v4.16b, v5.16b \n" + "trn2 v21.16b, v4.16b, v5.16b \n" + "trn1 v22.16b, v6.16b, v7.16b \n" + "trn2 v23.16b, v6.16b, v7.16b \n" + + "trn1 v0.8h, v16.8h, v18.8h \n" + "trn2 v1.8h, v16.8h, v18.8h \n" + "trn1 v2.8h, v20.8h, v22.8h \n" + "trn2 v3.8h, v20.8h, v22.8h \n" + "trn1 v4.8h, v17.8h, v19.8h \n" + "trn2 v5.8h, v17.8h, 
v19.8h \n" + "trn1 v6.8h, v21.8h, v23.8h \n" + "trn2 v7.8h, v21.8h, v23.8h \n" + + "trn1 v16.4s, v0.4s, v2.4s \n" + "trn2 v17.4s, v0.4s, v2.4s \n" + "trn1 v18.4s, v1.4s, v3.4s \n" + "trn2 v19.4s, v1.4s, v3.4s \n" + "trn1 v20.4s, v4.4s, v6.4s \n" + "trn2 v21.4s, v4.4s, v6.4s \n" + "trn1 v22.4s, v5.4s, v7.4s \n" + "trn2 v23.4s, v5.4s, v7.4s \n" - "mov %0, %3 \n" + "mov %0, %2 \n" - "st1 {v20.d}[0], [%0], %7 \n" - "st1 {v22.d}[0], [%0], %7 \n" - "st1 {v21.d}[0], [%0], %7 \n" - "st1 {v23.d}[0], [%0], %7 \n" - "st1 {v20.d}[1], [%0], %7 \n" - "st1 {v22.d}[1], [%0], %7 \n" - "st1 {v21.d}[1], [%0], %7 \n" - "st1 {v23.d}[1], [%0] \n" - - "add %1, %1, #16 \n" // src += 8*2 - "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * + "st1 {v16.d}[0], [%0], %6 \n" + "st1 {v18.d}[0], [%0], %6 \n" + "st1 {v17.d}[0], [%0], %6 \n" + "st1 {v19.d}[0], [%0], %6 \n" + "st1 {v16.d}[1], [%0], %6 \n" + "st1 {v18.d}[1], [%0], %6 \n" + "st1 {v17.d}[1], [%0], %6 \n" + "st1 {v19.d}[1], [%0] \n" + + "mov %0, %3 \n" + + "st1 {v20.d}[0], [%0], %7 \n" + "st1 {v22.d}[0], [%0], %7 \n" + "st1 {v21.d}[0], [%0], %7 \n" + "st1 {v23.d}[0], [%0], %7 \n" + "st1 {v20.d}[1], [%0], %7 \n" + "st1 {v22.d}[1], [%0], %7 \n" + "st1 {v21.d}[1], [%0], %7 \n" + "st1 {v23.d}[1], [%0] \n" + + "add %1, %1, #16 \n" // src += 8*2 + "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * // dst_stride_a - "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * + "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * // dst_stride_b - "subs %w4, %w4, #8 \n" // w -= 8 - "b.ge 1b \n" + "subs %w4, %w4, #8 \n" // w -= 8 + "b.ge 1b \n" // add 8 back to counter. if the result is 0 there are // no residuals. - "adds %w4, %w4, #8 \n" - "b.eq 4f \n" + "adds %w4, %w4, #8 \n" + "b.eq 4f \n" // some residual, so between 1 and 7 lines left to transpose - "cmp %w4, #2 \n" - "b.lt 3f \n" + "cmp %w4, #2 \n" + "b.lt 3f \n" - "cmp %w4, #4 \n" - "b.lt 2f \n" + "cmp %w4, #4 \n" + "b.lt 2f \n" // TODO(frkoenig): Clean this up // 4x8 block diff --git a/files/source/rotate_win.cc b/files/source/rotate_win.cc index e887dd52..a78873f8 100644 --- a/files/source/rotate_win.cc +++ b/files/source/rotate_win.cc @@ -16,8 +16,9 @@ namespace libyuv { extern "C" { #endif -// This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +// This module is for 32 bit Visual C x86 +#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ + !defined(__clang__) && defined(_M_IX86) __declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src, int src_stride, diff --git a/files/source/row_any.cc b/files/source/row_any.cc index 06ca723a..3781a9f2 100644 --- a/files/source/row_any.cc +++ b/files/source/row_any.cc @@ -30,6 +30,39 @@ extern "C" { // Subsampled source needs to be increase by 1 of not even. 
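The SS() macro defined just below rounds the subsampled width up, so an odd-width luma tail still gets its final chroma sample (that is what the comment means: the subsampled count is increased by 1 when the width is not even). A few worked values, checkable at compile time:

// Restated from the definition below for a self-contained check.
#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
static_assert(SS(15, 1) == 8, "15 luma pixels need 8 chroma samples");
static_assert(SS(16, 1) == 8, "even widths divide exactly");
static_assert(SS(15, 0) == 15, "shift 0 means no subsampling");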
#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift)) +// Any 4 planes to 1 +#define ANY41(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \ + int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 5]); \ + memset(temp, 0, 64 * 4); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 192, a_buf + n, r); \ + ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ + SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_MERGEARGBROW_SSE2 +ANY41(MergeARGBRow_Any_SSE2, MergeARGBRow_SSE2, 0, 0, 4, 7) +#endif +#ifdef HAS_MERGEARGBROW_AVX2 +ANY41(MergeARGBRow_Any_AVX2, MergeARGBRow_AVX2, 0, 0, 4, 15) +#endif +#ifdef HAS_MERGEARGBROW_NEON +ANY41(MergeARGBRow_Any_NEON, MergeARGBRow_NEON, 0, 0, 4, 15) +#endif + +// Note that odd width replication includes 444 due to implementation +// on arm that subsamples 444 to 422 internally. // Any 4 planes to 1 with yuvconstants #define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ @@ -46,26 +79,163 @@ extern "C" { memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ memcpy(temp + 192, a_buf + n, r); \ + if (width & 1) { \ + temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \ + temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ + } \ ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ yuvconstants, MASK + 1); \ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ SS(r, DUVSHIFT) * BPP); \ } +#ifdef HAS_I444ALPHATOARGBROW_SSSE3 +ANY41C(I444AlphaToARGBRow_Any_SSSE3, I444AlphaToARGBRow_SSSE3, 0, 0, 4, 7) +#endif +#ifdef HAS_I444ALPHATOARGBROW_AVX2 +ANY41C(I444AlphaToARGBRow_Any_AVX2, I444AlphaToARGBRow_AVX2, 0, 0, 4, 15) +#endif #ifdef HAS_I422ALPHATOARGBROW_SSSE3 ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7) #endif #ifdef HAS_I422ALPHATOARGBROW_AVX2 ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15) #endif +#ifdef HAS_I444ALPHATOARGBROW_NEON +ANY41C(I444AlphaToARGBRow_Any_NEON, I444AlphaToARGBRow_NEON, 0, 0, 4, 7) +#endif #ifdef HAS_I422ALPHATOARGBROW_NEON ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7) #endif +#ifdef HAS_I444ALPHATOARGBROW_MSA +ANY41C(I444AlphaToARGBRow_Any_MSA, I444AlphaToARGBRow_MSA, 0, 0, 4, 7) +#endif #ifdef HAS_I422ALPHATOARGBROW_MSA ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7) #endif +#ifdef HAS_I422ALPHATOARGBROW_LASX +ANY41C(I422AlphaToARGBRow_Any_LASX, I422AlphaToARGBRow_LASX, 1, 0, 4, 15) +#endif #undef ANY41C +// Any 4 planes to 1 plane of 8 bit with yuvconstants +#define ANY41CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ + void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, const T* a_buf, \ + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ + int width) { \ + SIMD_ALIGNED(T temp[16 * 4]); \ + SIMD_ALIGNED(uint8_t out[64]); \ + memset(temp, 0, 16 * 4 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + 
ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r * SBPP); \ + memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 48, a_buf + n, r * SBPP); \ + ANY_SIMD(temp, temp + 16, temp + 32, temp + 48, out, yuvconstants, \ + MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_I210ALPHATOARGBROW_SSSE3 +ANY41CT(I210AlphaToARGBRow_Any_SSSE3, + I210AlphaToARGBRow_SSSE3, + 1, + 0, + uint16_t, + 2, + 4, + 7) +#endif + +#ifdef HAS_I210ALPHATOARGBROW_AVX2 +ANY41CT(I210AlphaToARGBRow_Any_AVX2, + I210AlphaToARGBRow_AVX2, + 1, + 0, + uint16_t, + 2, + 4, + 15) +#endif + +#ifdef HAS_I410ALPHATOARGBROW_SSSE3 +ANY41CT(I410AlphaToARGBRow_Any_SSSE3, + I410AlphaToARGBRow_SSSE3, + 0, + 0, + uint16_t, + 2, + 4, + 7) +#endif + +#ifdef HAS_I410ALPHATOARGBROW_AVX2 +ANY41CT(I410AlphaToARGBRow_Any_AVX2, + I410AlphaToARGBRow_AVX2, + 0, + 0, + uint16_t, + 2, + 4, + 15) +#endif + +#undef ANY41CT + +// Any 4 planes to 1 plane with parameter +#define ANY41PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \ + void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \ + const STYPE* a_buf, DTYPE* dst_ptr, int depth, int width) { \ + SIMD_ALIGNED(STYPE temp[16 * 4]); \ + SIMD_ALIGNED(DTYPE out[64]); \ + memset(temp, 0, 16 * 4 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(r_buf, g_buf, b_buf, a_buf, dst_ptr, depth, n); \ + } \ + memcpy(temp, r_buf + n, r * SBPP); \ + memcpy(temp + 16, g_buf + n, r * SBPP); \ + memcpy(temp + 32, b_buf + n, r * SBPP); \ + memcpy(temp + 48, a_buf + n, r * SBPP); \ + ANY_SIMD(temp, temp + 16, temp + 32, temp + 48, out, depth, MASK + 1); \ + memcpy((uint8_t*)dst_ptr + n * BPP, out, r * BPP); \ + } + +#ifdef HAS_MERGEAR64ROW_AVX2 +ANY41PT(MergeAR64Row_Any_AVX2, MergeAR64Row_AVX2, uint16_t, 2, uint16_t, 8, 15) +#endif + +#ifdef HAS_MERGEAR64ROW_NEON +ANY41PT(MergeAR64Row_Any_NEON, MergeAR64Row_NEON, uint16_t, 2, uint16_t, 8, 7) +#endif + +#ifdef HAS_MERGEARGB16TO8ROW_AVX2 +ANY41PT(MergeARGB16To8Row_Any_AVX2, + MergeARGB16To8Row_AVX2, + uint16_t, + 2, + uint8_t, + 4, + 15) +#endif + +#ifdef HAS_MERGEARGB16TO8ROW_NEON +ANY41PT(MergeARGB16To8Row_Any_NEON, + MergeARGB16To8Row_NEON, + uint16_t, + 2, + uint8_t, + 4, + 7) +#endif + +#undef ANY41PT + // Any 3 planes to 1. 
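Before the three-plane variants, a concrete reading of the ANY4* wrappers above: for MergeARGBRow_Any_SSE2 (MASK = 7), a width of 29 splits into a 24-pixel run handled by the full-speed SIMD row function in place and a 5-pixel tail staged through the zero-padded, SIMD_ALIGNED scratch rows. A sketch of the arithmetic (values illustrative):

int width = 29;
int r = width & 7;    // 5: residual pixels for the scratch path
int n = width & ~7;   // 24: pixels the SIMD function handles in place
// The tail is memcpy'd into scratch rows 64 bytes apart, processed as
// one extra MASK + 1 = 8 pixel SIMD call, and only r * BPP bytes of the
// result are copied back to dst, so the over-wide SIMD pass never
// touches the caller's buffers out of bounds.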
#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ @@ -92,8 +262,14 @@ ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15) #ifdef HAS_MERGERGBROW_NEON ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15) #endif -#ifdef HAS_MERGERGBROW_MMI -ANY31(MergeRGBRow_Any_MMI, MergeRGBRow_MMI, 0, 0, 3, 7) +#ifdef HAS_MERGEXRGBROW_SSE2 +ANY31(MergeXRGBRow_Any_SSE2, MergeXRGBRow_SSE2, 0, 0, 4, 7) +#endif +#ifdef HAS_MERGEXRGBROW_AVX2 +ANY31(MergeXRGBRow_Any_AVX2, MergeXRGBRow_AVX2, 0, 0, 4, 15) +#endif +#ifdef HAS_MERGEXRGBROW_NEON +ANY31(MergeXRGBRow_Any_NEON, MergeXRGBRow_NEON, 0, 0, 4, 15) #endif #ifdef HAS_I422TOYUY2ROW_SSE2 ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15) @@ -109,8 +285,8 @@ ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) #ifdef HAS_I422TOYUY2ROW_MSA ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31) #endif -#ifdef HAS_I422TOYUY2ROW_MMI -ANY31(I422ToYUY2Row_Any_MMI, I422ToYUY2Row_MMI, 1, 1, 4, 7) +#ifdef HAS_I422TOYUY2ROW_LASX +ANY31(I422ToYUY2Row_Any_LASX, I422ToYUY2Row_LASX, 1, 1, 4, 31) #endif #ifdef HAS_I422TOUYVYROW_NEON ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) @@ -118,8 +294,8 @@ ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) #ifdef HAS_I422TOUYVYROW_MSA ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31) #endif -#ifdef HAS_I422TOUYVYROW_MMI -ANY31(I422ToUYVYRow_Any_MMI, I422ToUYVYRow_MMI, 1, 1, 4, 7) +#ifdef HAS_I422TOUYVYROW_LASX +ANY31(I422ToUYVYRow_Any_LASX, I422ToUYVYRow_LASX, 1, 1, 4, 31) #endif #ifdef HAS_BLENDPLANEROW_AVX2 ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31) @@ -127,9 +303,6 @@ ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31) #ifdef HAS_BLENDPLANEROW_SSSE3 ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7) #endif -#ifdef HAS_BLENDPLANEROW_MMI -ANY31(BlendPlaneRow_Any_MMI, BlendPlaneRow_MMI, 0, 0, 1, 7) -#endif #undef ANY31 // Note that odd width replication includes 444 due to implementation @@ -162,6 +335,21 @@ ANY31(BlendPlaneRow_Any_MMI, BlendPlaneRow_MMI, 0, 0, 1, 7) #ifdef HAS_I422TOARGBROW_SSSE3 ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7) #endif +#ifdef HAS_I422TORGBAROW_SSSE3 +ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7) +#endif +#ifdef HAS_I422TOARGB4444ROW_SSSE3 +ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7) +#endif +#ifdef HAS_I422TOARGB1555ROW_SSSE3 +ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7) +#endif +#ifdef HAS_I422TORGB565ROW_SSSE3 +ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7) +#endif +#ifdef HAS_I422TORGB24ROW_SSSE3 +ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15) +#endif #ifdef HAS_I422TOAR30ROW_SSSE3 ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7) #endif @@ -170,18 +358,16 @@ ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15) #endif #ifdef HAS_I444TOARGBROW_SSSE3 ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) -ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7) -ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7) -ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7) -ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7) -ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15) -#endif // HAS_I444TOARGBROW_SSSE3 +#endif #ifdef HAS_I422TORGB24ROW_AVX2 
ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31) #endif #ifdef HAS_I422TOARGBROW_AVX2 ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15) #endif +#ifdef HAS_I422TOARGBROW_AVX512BW +ANY31C(I422ToARGBRow_Any_AVX512BW, I422ToARGBRow_AVX512BW, 1, 0, 4, 31) +#endif #ifdef HAS_I422TORGBAROW_AVX2 ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15) #endif @@ -215,6 +401,17 @@ ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7) ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7) ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) #endif +#ifdef HAS_I422TOARGBROW_LASX +ANY31C(I422ToARGBRow_Any_LASX, I422ToARGBRow_LASX, 1, 0, 4, 31) +ANY31C(I422ToRGBARow_Any_LASX, I422ToRGBARow_LASX, 1, 0, 4, 31) +ANY31C(I422ToRGB24Row_Any_LASX, I422ToRGB24Row_LASX, 1, 0, 3, 31) +ANY31C(I422ToRGB565Row_Any_LASX, I422ToRGB565Row_LASX, 1, 0, 2, 31) +ANY31C(I422ToARGB4444Row_Any_LASX, I422ToARGB4444Row_LASX, 1, 0, 2, 31) +ANY31C(I422ToARGB1555Row_Any_LASX, I422ToARGB1555Row_LASX, 1, 0, 2, 31) +#endif +#ifdef HAS_I444TOARGBROW_LSX +ANY31C(I444ToARGBRow_Any_LSX, I444ToARGBRow_LSX, 0, 0, 4, 15) +#endif #undef ANY31C // Any 3 planes of 16 bit to 1 with yuvconstants @@ -250,24 +447,112 @@ ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) #ifdef HAS_I210TOAR30ROW_AVX2 ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) #endif +#ifdef HAS_I410TOAR30ROW_SSSE3 +ANY31CT(I410ToAR30Row_Any_SSSE3, I410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I410TOARGBROW_SSSE3 +ANY31CT(I410ToARGBRow_Any_SSSE3, I410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I410TOARGBROW_AVX2 +ANY31CT(I410ToARGBRow_Any_AVX2, I410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_I410TOAR30ROW_AVX2 +ANY31CT(I410ToAR30Row_Any_AVX2, I410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_I212TOAR30ROW_SSSE3 +ANY31CT(I212ToAR30Row_Any_SSSE3, I212ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I212TOARGBROW_SSSE3 +ANY31CT(I212ToARGBRow_Any_SSSE3, I212ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I212TOARGBROW_AVX2 +ANY31CT(I212ToARGBRow_Any_AVX2, I212ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_I212TOAR30ROW_AVX2 +ANY31CT(I212ToAR30Row_Any_AVX2, I212ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif #undef ANY31CT +// Any 3 planes to 1 plane with parameter +#define ANY31PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \ + void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \ + DTYPE* dst_ptr, int depth, int width) { \ + SIMD_ALIGNED(STYPE temp[16 * 3]); \ + SIMD_ALIGNED(DTYPE out[64]); \ + memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(r_buf, g_buf, b_buf, dst_ptr, depth, n); \ + } \ + memcpy(temp, r_buf + n, r * SBPP); \ + memcpy(temp + 16, g_buf + n, r * SBPP); \ + memcpy(temp + 32, b_buf + n, r * SBPP); \ + ANY_SIMD(temp, temp + 16, temp + 32, out, depth, MASK + 1); \ + memcpy((uint8_t*)dst_ptr + n * BPP, out, r * BPP); \ + } + +#ifdef HAS_MERGEXR30ROW_AVX2 +ANY31PT(MergeXR30Row_Any_AVX2, MergeXR30Row_AVX2, uint16_t, 2, uint8_t, 4, 15) +#endif + +#ifdef HAS_MERGEXR30ROW_NEON +ANY31PT(MergeXR30Row_Any_NEON, MergeXR30Row_NEON, uint16_t, 2, uint8_t, 4, 3) +ANY31PT(MergeXR30Row_10_Any_NEON, + MergeXR30Row_10_NEON, + uint16_t, + 2, + uint8_t, + 4, + 3) +#endif + +#ifdef HAS_MERGEXR64ROW_AVX2 
+ANY31PT(MergeXR64Row_Any_AVX2, MergeXR64Row_AVX2, uint16_t, 2, uint16_t, 8, 15) +#endif + +#ifdef HAS_MERGEXR64ROW_NEON +ANY31PT(MergeXR64Row_Any_NEON, MergeXR64Row_NEON, uint16_t, 2, uint16_t, 8, 7) +#endif + +#ifdef HAS_MERGEXRGB16TO8ROW_AVX2 +ANY31PT(MergeXRGB16To8Row_Any_AVX2, + MergeXRGB16To8Row_AVX2, + uint16_t, + 2, + uint8_t, + 4, + 15) +#endif + +#ifdef HAS_MERGEXRGB16TO8ROW_NEON +ANY31PT(MergeXRGB16To8Row_Any_NEON, + MergeXRGB16To8Row_NEON, + uint16_t, + 2, + uint8_t, + 4, + 7) +#endif + +#undef ANY31PT + // Any 2 planes to 1. #define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ + SIMD_ALIGNED(uint8_t temp[128 * 3]); \ + memset(temp, 0, 128 * 2); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ } \ memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ + memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ SS(r, UVSHIFT) * SBPP2); \ - ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \ } // Merge functions. @@ -283,12 +568,15 @@ ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15) #ifdef HAS_MERGEUVROW_MSA ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15) #endif -#ifdef HAS_MERGEUVROW_MMI -ANY21(MergeUVRow_Any_MMI, MergeUVRow_MMI, 0, 1, 1, 2, 7) +#ifdef HAS_MERGEUVROW_LSX +ANY21(MergeUVRow_Any_LSX, MergeUVRow_LSX, 0, 1, 1, 2, 15) #endif #ifdef HAS_NV21TOYUV24ROW_NEON ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15) #endif +#ifdef HAS_NV21TOYUV24ROW_SSSE3 +ANY21(NV21ToYUV24Row_Any_SSSE3, NV21ToYUV24Row_SSSE3, 1, 1, 2, 3, 15) +#endif #ifdef HAS_NV21TOYUV24ROW_AVX2 ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31) #endif @@ -323,20 +611,20 @@ ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7) #ifdef HAS_ARGBMULTIPLYROW_MSA ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3) #endif -#ifdef HAS_ARGBMULTIPLYROW_MMI -ANY21(ARGBMultiplyRow_Any_MMI, ARGBMultiplyRow_MMI, 0, 4, 4, 4, 1) +#ifdef HAS_ARGBMULTIPLYROW_LASX +ANY21(ARGBMultiplyRow_Any_LASX, ARGBMultiplyRow_LASX, 0, 4, 4, 4, 7) #endif #ifdef HAS_ARGBADDROW_MSA ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7) #endif -#ifdef HAS_ARGBADDROW_MMI -ANY21(ARGBAddRow_Any_MMI, ARGBAddRow_MMI, 0, 4, 4, 4, 1) +#ifdef HAS_ARGBADDROW_LASX +ANY21(ARGBAddRow_Any_LASX, ARGBAddRow_LASX, 0, 4, 4, 4, 7) #endif #ifdef HAS_ARGBSUBTRACTROW_MSA ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7) #endif -#ifdef HAS_ARGBSUBTRACTROW_MMI -ANY21(ARGBSubtractRow_Any_MMI, ARGBSubtractRow_MMI, 0, 4, 4, 4, 1) +#ifdef HAS_ARGBSUBTRACTROW_LASX +ANY21(ARGBSubtractRow_Any_LASX, ARGBSubtractRow_LASX, 0, 4, 4, 4, 7) #endif #ifdef HAS_SOBELROW_SSE2 ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15) @@ -347,8 +635,8 @@ ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7) #ifdef HAS_SOBELROW_MSA ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15) #endif -#ifdef HAS_SOBELROW_MMI -ANY21(SobelRow_Any_MMI, SobelRow_MMI, 0, 1, 1, 4, 7) +#ifdef HAS_SOBELROW_LSX +ANY21(SobelRow_Any_LSX, SobelRow_LSX, 0, 1, 1, 4, 15) #endif #ifdef HAS_SOBELTOPLANEROW_SSE2 ANY21(SobelToPlaneRow_Any_SSE2, 
SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15) @@ -359,8 +647,8 @@ ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15) #ifdef HAS_SOBELTOPLANEROW_MSA ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31) #endif -#ifdef HAS_SOBELTOPLANEROW_MMI -ANY21(SobelToPlaneRow_Any_MMI, SobelToPlaneRow_MMI, 0, 1, 1, 1, 7) +#ifdef HAS_SOBELTOPLANEROW_LSX +ANY21(SobelToPlaneRow_Any_LSX, SobelToPlaneRow_LSX, 0, 1, 1, 1, 31) #endif #ifdef HAS_SOBELXYROW_SSE2 ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15) @@ -371,8 +659,8 @@ ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7) #ifdef HAS_SOBELXYROW_MSA ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15) #endif -#ifdef HAS_SOBELXYROW_MMI -ANY21(SobelXYRow_Any_MMI, SobelXYRow_MMI, 0, 1, 1, 4, 7) +#ifdef HAS_SOBELXYROW_LSX +ANY21(SobelXYRow_Any_LSX, SobelXYRow_LSX, 0, 1, 1, 4, 15) #endif #undef ANY21 @@ -407,6 +695,12 @@ ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7) #ifdef HAS_NV12TOARGBROW_MSA ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7) #endif +#ifdef HAS_NV12TOARGBROW_LSX +ANY21C(NV12ToARGBRow_Any_LSX, NV12ToARGBRow_LSX, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV12TOARGBROW_LASX +ANY21C(NV12ToARGBRow_Any_LASX, NV12ToARGBRow_LASX, 1, 1, 2, 4, 15) +#endif #ifdef HAS_NV21TOARGBROW_SSSE3 ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7) #endif @@ -419,6 +713,12 @@ ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7) #ifdef HAS_NV21TOARGBROW_MSA ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7) #endif +#ifdef HAS_NV21TOARGBROW_LSX +ANY21C(NV21ToARGBRow_Any_LSX, NV21ToARGBRow_LSX, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV21TOARGBROW_LASX +ANY21C(NV21ToARGBRow_Any_LASX, NV21ToARGBRow_LASX, 1, 1, 2, 4, 15) +#endif #ifdef HAS_NV12TORGB24ROW_NEON ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7) #endif @@ -449,8 +749,85 @@ ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7) #ifdef HAS_NV12TORGB565ROW_MSA ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7) #endif +#ifdef HAS_NV12TORGB565ROW_LSX +ANY21C(NV12ToRGB565Row_Any_LSX, NV12ToRGB565Row_LSX, 1, 1, 2, 2, 7) +#endif +#ifdef HAS_NV12TORGB565ROW_LASX +ANY21C(NV12ToRGB565Row_Any_LASX, NV12ToRGB565Row_LASX, 1, 1, 2, 2, 15) +#endif #undef ANY21C +// Any 2 planes of 16 bit to 1 with yuvconstants +#define ANY21CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ + void NAMEANY(const T* y_buf, const T* uv_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(T temp[16 * 3]); \ + SIMD_ALIGNED(uint8_t out[64]); \ + memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r * SBPP); \ + memcpy(temp + 16, uv_buf + 2 * (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP * 2); \ + ANY_SIMD(temp, temp + 16, out, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_P210TOAR30ROW_SSSE3 +ANY21CT(P210ToAR30Row_Any_SSSE3, P210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_P210TOARGBROW_SSSE3 +ANY21CT(P210ToARGBRow_Any_SSSE3, P210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_P210TOARGBROW_AVX2 +ANY21CT(P210ToARGBRow_Any_AVX2, P210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_P210TOAR30ROW_AVX2 +ANY21CT(P210ToAR30Row_Any_AVX2, P210ToAR30Row_AVX2, 1, 0, 
uint16_t, 2, 4, 15) +#endif +#ifdef HAS_P410TOAR30ROW_SSSE3 +ANY21CT(P410ToAR30Row_Any_SSSE3, P410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_P410TOARGBROW_SSSE3 +ANY21CT(P410ToARGBRow_Any_SSSE3, P410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_P410TOARGBROW_AVX2 +ANY21CT(P410ToARGBRow_Any_AVX2, P410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_P410TOAR30ROW_AVX2 +ANY21CT(P410ToAR30Row_Any_AVX2, P410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15) +#endif + +#undef ANY21CT + +// Any 2 16 bit planes with parameter to 1 +#define ANY21PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \ + void NAMEANY(const T* src_u, const T* src_v, T* dst_uv, int depth, \ + int width) { \ + SIMD_ALIGNED(T temp[16 * 4]); \ + memset(temp, 0, 16 * 4 * BPP); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_u, src_v, dst_uv, depth, n); \ + } \ + memcpy(temp, src_u + n, r * BPP); \ + memcpy(temp + 16, src_v + n, r * BPP); \ + ANY_SIMD(temp, temp + 16, temp + 32, depth, MASK + 1); \ + memcpy(dst_uv + n * 2, temp + 32, r * BPP * 2); \ + } + +#ifdef HAS_MERGEUVROW_16_AVX2 +ANY21PT(MergeUVRow_16_Any_AVX2, MergeUVRow_16_AVX2, uint16_t, 2, 15) +#endif +#ifdef HAS_MERGEUVROW_16_NEON +ANY21PT(MergeUVRow_16_Any_NEON, MergeUVRow_16_NEON, uint16_t, 2, 7) +#endif + +#undef ANY21CT + // Any 1 to 1. #define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ @@ -516,12 +893,6 @@ ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7) #if defined(HAS_J400TOARGBROW_AVX2) ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15) #endif -#if defined(HAS_I400TOARGBROW_SSE2) -ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7) -#endif -#if defined(HAS_I400TOARGBROW_AVX2) -ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15) -#endif #if defined(HAS_RGB24TOARGBROW_SSSE3) ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15) ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15) @@ -529,6 +900,9 @@ ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7) ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7) ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7) #endif +#if defined(HAS_RAWTORGBAROW_SSSE3) +ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15) +#endif #if defined(HAS_RAWTORGB24ROW_SSSE3) ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7) #endif @@ -542,13 +916,12 @@ ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15) ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15) #endif #if defined(HAS_ARGBTORGB24ROW_NEON) -ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7) +ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 15) ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7) ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7) ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7) ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7) -ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7) #endif #if defined(HAS_ARGBTORGB24ROW_MSA) ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15) @@ -557,16 +930,16 @@ ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7) ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_MSA, 
ARGBToARGB4444Row_MSA, 0, 4, 2, 7) ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15) -ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15) #endif -#if defined(HAS_ARGBTORGB24ROW_MMI) -ANY11(ARGBToRGB24Row_Any_MMI, ARGBToRGB24Row_MMI, 0, 4, 3, 3) -ANY11(ARGBToRAWRow_Any_MMI, ARGBToRAWRow_MMI, 0, 4, 3, 3) -ANY11(ARGBToRGB565Row_Any_MMI, ARGBToRGB565Row_MMI, 0, 4, 2, 3) -ANY11(ARGBToARGB1555Row_Any_MMI, ARGBToARGB1555Row_MMI, 0, 4, 2, 3) -ANY11(ARGBToARGB4444Row_Any_MMI, ARGBToARGB4444Row_MMI, 0, 4, 2, 3) -ANY11(J400ToARGBRow_Any_MMI, J400ToARGBRow_MMI, 0, 1, 4, 3) -ANY11(I400ToARGBRow_Any_MMI, I400ToARGBRow_MMI, 0, 1, 4, 7) +#if defined(HAS_ARGBTORGB24ROW_LASX) +ANY11(ARGBToRGB24Row_Any_LASX, ARGBToRGB24Row_LASX, 0, 4, 3, 31) +ANY11(ARGBToRAWRow_Any_LASX, ARGBToRAWRow_LASX, 0, 4, 3, 31) +ANY11(ARGBToRGB565Row_Any_LASX, ARGBToRGB565Row_LASX, 0, 4, 2, 15) +ANY11(ARGBToARGB1555Row_Any_LASX, ARGBToARGB1555Row_LASX, 0, 4, 2, 15) +ANY11(ARGBToARGB4444Row_Any_LASX, ARGBToARGB4444Row_LASX, 0, 4, 2, 15) +#endif +#if defined(HAS_J400TOARGBROW_LSX) +ANY11(J400ToARGBRow_Any_LSX, J400ToARGBRow_LSX, 0, 1, 4, 15) #endif #if defined(HAS_RAWTORGB24ROW_NEON) ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7) @@ -574,15 +947,21 @@ ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7) #if defined(HAS_RAWTORGB24ROW_MSA) ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15) #endif -#if defined(HAS_RAWTORGB24ROW_MMI) -ANY11(RAWToRGB24Row_Any_MMI, RAWToRGB24Row_MMI, 0, 3, 3, 3) +#if defined(HAS_RAWTORGB24ROW_LSX) +ANY11(RAWToRGB24Row_Any_LSX, RAWToRGB24Row_LSX, 0, 3, 3, 15) #endif #ifdef HAS_ARGBTOYROW_AVX2 ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) #endif +#ifdef HAS_ABGRTOYROW_AVX2 +ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31) +#endif #ifdef HAS_ARGBTOYJROW_AVX2 ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31) #endif +#ifdef HAS_RGBATOYJROW_AVX2 +ANY11(RGBAToYJRow_Any_AVX2, RGBAToYJRow_AVX2, 0, 4, 1, 31) +#endif #ifdef HAS_UYVYTOYROW_AVX2 ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31) #endif @@ -596,74 +975,109 @@ ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15) ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15) ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15) ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15) +#endif +#ifdef HAS_YUY2TOYROW_SSE2 ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15) ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15) #endif #ifdef HAS_ARGBTOYJROW_SSSE3 ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15) #endif +#ifdef HAS_RGBATOYJROW_SSSE3 +ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15) +#endif #ifdef HAS_ARGBTOYROW_NEON -ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7) +ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 15) #endif #ifdef HAS_ARGBTOYROW_MSA ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15) #endif -#ifdef HAS_ARGBTOYROW_MMI -ANY11(ARGBToYRow_Any_MMI, ARGBToYRow_MMI, 0, 4, 1, 7) +#ifdef HAS_ARGBTOYROW_LASX +ANY11(ARGBToYRow_Any_LASX, ARGBToYRow_LASX, 0, 4, 1, 31) #endif #ifdef HAS_ARGBTOYJROW_NEON -ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7) +ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 15) +#endif +#ifdef HAS_RGBATOYJROW_NEON +ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 15) #endif #ifdef HAS_ARGBTOYJROW_MSA ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15) #endif -#ifdef HAS_ARGBTOYJROW_MMI -ANY11(ARGBToYJRow_Any_MMI, ARGBToYJRow_MMI, 0, 4, 1, 7) +#ifdef 
HAS_ARGBTOYJROW_LSX +ANY11(ARGBToYJRow_Any_LSX, ARGBToYJRow_LSX, 0, 4, 1, 15) +#endif +#ifdef HAS_ARGBTOYJROW_LASX +ANY11(ARGBToYJRow_Any_LASX, ARGBToYJRow_LASX, 0, 4, 1, 31) #endif #ifdef HAS_BGRATOYROW_NEON -ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7) +ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 15) #endif #ifdef HAS_BGRATOYROW_MSA ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15) #endif -#ifdef HAS_BGRATOYROW_MMI -ANY11(BGRAToYRow_Any_MMI, BGRAToYRow_MMI, 0, 4, 1, 7) +#ifdef HAS_BGRATOYROW_LSX +ANY11(BGRAToYRow_Any_LSX, BGRAToYRow_LSX, 0, 4, 1, 15) #endif #ifdef HAS_ABGRTOYROW_NEON -ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7) +ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 15) #endif #ifdef HAS_ABGRTOYROW_MSA ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7) #endif -#ifdef HAS_ABGRTOYROW_MMI -ANY11(ABGRToYRow_Any_MMI, ABGRToYRow_MMI, 0, 4, 1, 7) +#ifdef HAS_ABGRTOYROW_LSX +ANY11(ABGRToYRow_Any_LSX, ABGRToYRow_LSX, 0, 4, 1, 15) #endif #ifdef HAS_RGBATOYROW_NEON -ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7) +ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 15) #endif #ifdef HAS_RGBATOYROW_MSA ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15) #endif -#ifdef HAS_RGBATOYROW_MMI -ANY11(RGBAToYRow_Any_MMI, RGBAToYRow_MMI, 0, 4, 1, 7) +#ifdef HAS_RGBATOYROW_LSX +ANY11(RGBAToYRow_Any_LSX, RGBAToYRow_LSX, 0, 4, 1, 15) #endif #ifdef HAS_RGB24TOYROW_NEON -ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7) +ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 15) +#endif +#ifdef HAS_RGB24TOYJROW_AVX2 +ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31) +#endif +#ifdef HAS_RGB24TOYJROW_SSSE3 +ANY11(RGB24ToYJRow_Any_SSSE3, RGB24ToYJRow_SSSE3, 0, 3, 1, 15) +#endif +#ifdef HAS_RGB24TOYJROW_NEON +ANY11(RGB24ToYJRow_Any_NEON, RGB24ToYJRow_NEON, 0, 3, 1, 15) #endif #ifdef HAS_RGB24TOYROW_MSA ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15) #endif -#ifdef HAS_RGB24TOYROW_MMI -ANY11(RGB24ToYRow_Any_MMI, RGB24ToYRow_MMI, 0, 3, 1, 7) +#ifdef HAS_RGB24TOYROW_LSX +ANY11(RGB24ToYRow_Any_LSX, RGB24ToYRow_LSX, 0, 3, 1, 15) +#endif +#ifdef HAS_RGB24TOYROW_LASX +ANY11(RGB24ToYRow_Any_LASX, RGB24ToYRow_LASX, 0, 3, 1, 31) #endif #ifdef HAS_RAWTOYROW_NEON -ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7) +ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 15) +#endif +#ifdef HAS_RAWTOYJROW_AVX2 +ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31) +#endif +#ifdef HAS_RAWTOYJROW_SSSE3 +ANY11(RAWToYJRow_Any_SSSE3, RAWToYJRow_SSSE3, 0, 3, 1, 15) +#endif +#ifdef HAS_RAWTOYJROW_NEON +ANY11(RAWToYJRow_Any_NEON, RAWToYJRow_NEON, 0, 3, 1, 15) #endif #ifdef HAS_RAWTOYROW_MSA ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15) #endif -#ifdef HAS_RAWTOYROW_MMI -ANY11(RAWToYRow_Any_MMI, RAWToYRow_MMI, 0, 3, 1, 7) +#ifdef HAS_RAWTOYROW_LSX +ANY11(RAWToYRow_Any_LSX, RAWToYRow_LSX, 0, 3, 1, 15) +#endif +#ifdef HAS_RAWTOYROW_LASX +ANY11(RAWToYRow_Any_LASX, RAWToYRow_LASX, 0, 3, 1, 31) #endif #ifdef HAS_RGB565TOYROW_NEON ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7) @@ -671,8 +1085,11 @@ ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7) #ifdef HAS_RGB565TOYROW_MSA ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15) #endif -#ifdef HAS_RGB565TOYROW_MMI -ANY11(RGB565ToYRow_Any_MMI, RGB565ToYRow_MMI, 0, 2, 1, 7) +#ifdef HAS_RGB565TOYROW_LSX +ANY11(RGB565ToYRow_Any_LSX, RGB565ToYRow_LSX, 0, 2, 1, 15) +#endif +#ifdef HAS_RGB565TOYROW_LASX +ANY11(RGB565ToYRow_Any_LASX, RGB565ToYRow_LASX, 0, 2, 1, 31) #endif 
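A note on the MASK column that varies through this hunk: every ANY11 wrapper splits the row with the same two bitwise operations, so MASK must be one less than the SIMD kernel's pixel step (31 for the 32-pixel LASX kernels above, 15 for LSX). A minimal sketch of that split; the names here are illustrative only, not libyuv API:

#include <stdio.h>

/* ANY11-style width split: the bulk goes straight to the SIMD kernel,
   and the remainder is replayed through the same kernel from a zeroed,
   SIMD-aligned temp so no load or store crosses the end of the row. */
int main(void) {
  int width = 100;
  int mask = 31;          /* kernel step of 32 pixels, e.g. the LASX rows */
  int n = width & ~mask;  /* 96 pixels processed in place */
  int r = width & mask;   /* 4 pixels copied to the temp and redone */
  printf("bulk=%d remainder=%d\n", n, r);
  return 0;
}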
#ifdef HAS_ARGB1555TOYROW_NEON ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7) @@ -680,15 +1097,15 @@ ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7) #ifdef HAS_ARGB1555TOYROW_MSA ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15) #endif -#ifdef HAS_ARGB1555TOYROW_MMI -ANY11(ARGB1555ToYRow_Any_MMI, ARGB1555ToYRow_MMI, 0, 2, 1, 7) +#ifdef HAS_ARGB1555TOYROW_LSX +ANY11(ARGB1555ToYRow_Any_LSX, ARGB1555ToYRow_LSX, 0, 2, 1, 15) +#endif +#ifdef HAS_ARGB1555TOYROW_LASX +ANY11(ARGB1555ToYRow_Any_LASX, ARGB1555ToYRow_LASX, 0, 2, 1, 31) #endif #ifdef HAS_ARGB4444TOYROW_NEON ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7) #endif -#ifdef HAS_ARGB4444TOYROW_MMI -ANY11(ARGB4444ToYRow_Any_MMI, ARGB4444ToYRow_MMI, 0, 2, 1, 7) -#endif #ifdef HAS_YUY2TOYROW_NEON ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15) #endif @@ -698,20 +1115,26 @@ ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15) #ifdef HAS_YUY2TOYROW_MSA ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31) #endif -#ifdef HAS_YUY2TOYROW_MMI -ANY11(YUY2ToYRow_Any_MMI, YUY2ToYRow_MMI, 1, 4, 1, 7) +#ifdef HAS_YUY2TOYROW_LASX +ANY11(YUY2ToYRow_Any_LASX, YUY2ToYRow_LASX, 1, 4, 1, 31) #endif #ifdef HAS_UYVYTOYROW_MSA ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31) #endif -#ifdef HAS_UYVYTOYROW_MMI -ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15) +#ifdef HAS_UYVYTOYROW_LASX +ANY11(UYVYToYRow_Any_LASX, UYVYToYRow_LASX, 1, 4, 1, 31) #endif #ifdef HAS_AYUVTOYROW_NEON ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15) #endif -#ifdef HAS_AYUVTOYROW_NEON -ANY11(UVToVURow_Any_NEON, UVToVURow_NEON, 0, 2, 2, 15) +#ifdef HAS_SWAPUVROW_SSSE3 +ANY11(SwapUVRow_Any_SSSE3, SwapUVRow_SSSE3, 0, 2, 2, 15) +#endif +#ifdef HAS_SWAPUVROW_AVX2 +ANY11(SwapUVRow_Any_AVX2, SwapUVRow_AVX2, 0, 2, 2, 31) +#endif +#ifdef HAS_SWAPUVROW_NEON +ANY11(SwapUVRow_Any_NEON, SwapUVRow_NEON, 0, 2, 2, 15) #endif #ifdef HAS_RGB24TOARGBROW_NEON ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7) @@ -719,17 +1142,26 @@ ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7) #ifdef HAS_RGB24TOARGBROW_MSA ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15) #endif -#ifdef HAS_RGB24TOARGBROW_MMI -ANY11(RGB24ToARGBRow_Any_MMI, RGB24ToARGBRow_MMI, 0, 3, 4, 3) +#ifdef HAS_RGB24TOARGBROW_LSX +ANY11(RGB24ToARGBRow_Any_LSX, RGB24ToARGBRow_LSX, 0, 3, 4, 15) +#endif +#ifdef HAS_RGB24TOARGBROW_LASX +ANY11(RGB24ToARGBRow_Any_LASX, RGB24ToARGBRow_LASX, 0, 3, 4, 31) #endif #ifdef HAS_RAWTOARGBROW_NEON ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7) #endif +#ifdef HAS_RAWTORGBAROW_NEON +ANY11(RAWToRGBARow_Any_NEON, RAWToRGBARow_NEON, 0, 3, 4, 7) +#endif #ifdef HAS_RAWTOARGBROW_MSA ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15) #endif -#ifdef HAS_RAWTOARGBROW_MMI -ANY11(RAWToARGBRow_Any_MMI, RAWToARGBRow_MMI, 0, 3, 4, 3) +#ifdef HAS_RAWTOARGBROW_LSX +ANY11(RAWToARGBRow_Any_LSX, RAWToARGBRow_LSX, 0, 3, 4, 15) +#endif +#ifdef HAS_RAWTOARGBROW_LASX +ANY11(RAWToARGBRow_Any_LASX, RAWToARGBRow_LASX, 0, 3, 4, 31) #endif #ifdef HAS_RGB565TOARGBROW_NEON ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7) @@ -737,8 +1169,11 @@ ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7) #ifdef HAS_RGB565TOARGBROW_MSA ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15) #endif -#ifdef HAS_RGB565TOARGBROW_MMI -ANY11(RGB565ToARGBRow_Any_MMI, RGB565ToARGBRow_MMI, 0, 2, 4, 3) +#ifdef HAS_RGB565TOARGBROW_LSX +ANY11(RGB565ToARGBRow_Any_LSX, 
RGB565ToARGBRow_LSX, 0, 2, 4, 15) +#endif +#ifdef HAS_RGB565TOARGBROW_LASX +ANY11(RGB565ToARGBRow_Any_LASX, RGB565ToARGBRow_LASX, 0, 2, 4, 31) #endif #ifdef HAS_ARGB1555TOARGBROW_NEON ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7) @@ -746,8 +1181,11 @@ ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7) #ifdef HAS_ARGB1555TOARGBROW_MSA ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15) #endif -#ifdef HAS_ARGB1555TOARGBROW_MMI -ANY11(ARGB1555ToARGBRow_Any_MMI, ARGB1555ToARGBRow_MMI, 0, 2, 4, 3) +#ifdef HAS_ARGB1555TOARGBROW_LSX +ANY11(ARGB1555ToARGBRow_Any_LSX, ARGB1555ToARGBRow_LSX, 0, 2, 4, 15) +#endif +#ifdef HAS_ARGB1555TOARGBROW_LASX +ANY11(ARGB1555ToARGBRow_Any_LASX, ARGB1555ToARGBRow_LASX, 0, 2, 4, 31) #endif #ifdef HAS_ARGB4444TOARGBROW_NEON ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7) @@ -755,8 +1193,11 @@ ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7) #ifdef HAS_ARGB4444TOARGBROW_MSA ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15) #endif -#ifdef HAS_ARGB4444TOARGBROW_MMI -ANY11(ARGB4444ToARGBRow_Any_MMI, ARGB4444ToARGBRow_MMI, 0, 2, 4, 3) +#ifdef HAS_ARGB4444TOARGBROW_LSX +ANY11(ARGB4444ToARGBRow_Any_LSX, ARGB4444ToARGBRow_LSX, 0, 2, 4, 15) +#endif +#ifdef HAS_ARGB4444TOARGBROW_LASX +ANY11(ARGB4444ToARGBRow_Any_LASX, ARGB4444ToARGBRow_LASX, 0, 2, 4, 31) #endif #ifdef HAS_ARGBATTENUATEROW_SSSE3 ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3) @@ -776,8 +1217,8 @@ ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7) #ifdef HAS_ARGBATTENUATEROW_MSA ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7) #endif -#ifdef HAS_ARGBATTENUATEROW_MMI -ANY11(ARGBAttenuateRow_Any_MMI, ARGBAttenuateRow_MMI, 0, 4, 4, 1) +#ifdef HAS_ARGBATTENUATEROW_LASX +ANY11(ARGBAttenuateRow_Any_LASX, ARGBAttenuateRow_LASX, 0, 4, 4, 15) #endif #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7) @@ -791,8 +1232,8 @@ ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15) #ifdef HAS_ARGBEXTRACTALPHAROW_MSA ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15) #endif -#ifdef HAS_ARGBEXTRACTALPHAROW_MMI -ANY11(ARGBExtractAlphaRow_Any_MMI, ARGBExtractAlphaRow_MMI, 0, 4, 1, 7) +#ifdef HAS_ARGBEXTRACTALPHAROW_LSX +ANY11(ARGBExtractAlphaRow_Any_LSX, ARGBExtractAlphaRow_LSX, 0, 4, 1, 15) #endif #undef ANY11 @@ -818,18 +1259,12 @@ ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15) #ifdef HAS_ARGBCOPYALPHAROW_SSE2 ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7) #endif -#ifdef HAS_ARGBCOPYALPHAROW_MMI -ANY11B(ARGBCopyAlphaRow_Any_MMI, ARGBCopyAlphaRow_MMI, 0, 4, 4, 1) -#endif #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15) #endif #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) #endif -#ifdef HAS_ARGBCOPYYTOALPHAROW_MMI -ANY11B(ARGBCopyYToAlphaRow_Any_MMI, ARGBCopyYToAlphaRow_MMI, 0, 1, 4, 7) -#endif #undef ANY11B // Any 1 to 1 with parameter. 
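For context on how the wrappers collected here are consumed: callers in the conversion layer default to the _Any_ form and substitute the full-SIMD kernel only when the width divides the kernel's step. A hedged sketch of that dispatch, modeled on libyuv's convert code; the flag name and the 16-pixel step are the usual SSSE3 values, shown here as assumptions:

#include "libyuv/cpu_id.h" /* TestCpuFlag, kCpuHasSSSE3 */
#include "libyuv/row.h"    /* row kernels, HAS_ macros, IS_ALIGNED */

/* Convert the Y plane of an ARGB image, picking the widest safe kernel. */
static void ARGBToYPlane(const uint8_t* src_argb, int src_stride_argb,
                         uint8_t* dst_y, int dst_stride_y, int width,
                         int height) {
  void (*ARGBToYRow)(const uint8_t*, uint8_t*, int) = ARGBToYRow_C;
  int y;
#if defined(HAS_ARGBTOYROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBToYRow = ARGBToYRow_Any_SSSE3; /* any width: tail handled in temp */
    if (IS_ALIGNED(width, 16)) {
      ARGBToYRow = ARGBToYRow_SSSE3; /* exact multiple: no tail pass */
    }
  }
#endif
  for (y = 0; y < height; ++y) {
    ARGBToYRow(src_argb, dst_y, width);
    src_argb += src_stride_argb;
    dst_y += dst_stride_y;
  }
}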
@@ -847,6 +1282,47 @@ ANY11B(ARGBCopyYToAlphaRow_Any_MMI, ARGBCopyYToAlphaRow_MMI, 0, 1, 4, 7) memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ } +#if defined(HAS_I400TOARGBROW_SSE2) +ANY11P(I400ToARGBRow_Any_SSE2, + I400ToARGBRow_SSE2, + const struct YuvConstants*, + 1, + 4, + 7) +#endif +#if defined(HAS_I400TOARGBROW_AVX2) +ANY11P(I400ToARGBRow_Any_AVX2, + I400ToARGBRow_AVX2, + const struct YuvConstants*, + 1, + 4, + 15) +#endif +#if defined(HAS_I400TOARGBROW_NEON) +ANY11P(I400ToARGBRow_Any_NEON, + I400ToARGBRow_NEON, + const struct YuvConstants*, + 1, + 4, + 7) +#endif +#if defined(HAS_I400TOARGBROW_MSA) +ANY11P(I400ToARGBRow_Any_MSA, + I400ToARGBRow_MSA, + const struct YuvConstants*, + 1, + 4, + 15) +#endif +#if defined(HAS_I400TOARGBROW_LSX) +ANY11P(I400ToARGBRow_Any_LSX, + I400ToARGBRow_LSX, + const struct YuvConstants*, + 1, + 4, + 15) +#endif + #if defined(HAS_ARGBTORGB565DITHERROW_SSE2) ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2, @@ -879,13 +1355,13 @@ ANY11P(ARGBToRGB565DitherRow_Any_MSA, 2, 7) #endif -#if defined(HAS_ARGBTORGB565DITHERROW_MMI) -ANY11P(ARGBToRGB565DitherRow_Any_MMI, - ARGBToRGB565DitherRow_MMI, +#if defined(HAS_ARGBTORGB565DITHERROW_LASX) +ANY11P(ARGBToRGB565DitherRow_Any_LASX, + ARGBToRGB565DitherRow_LASX, const uint32_t, 4, 2, - 3) + 15) #endif #ifdef HAS_ARGBSHUFFLEROW_SSSE3 ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7) @@ -899,12 +1375,78 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3) #ifdef HAS_ARGBSHUFFLEROW_MSA ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7) #endif -#ifdef HAS_ARGBSHUFFLEROW_MMI -ANY11P(ARGBShuffleRow_Any_MMI, ARGBShuffleRow_MMI, const uint8_t*, 4, 4, 1) +#ifdef HAS_ARGBSHUFFLEROW_LASX +ANY11P(ARGBShuffleRow_Any_LASX, ARGBShuffleRow_LASX, const uint8_t*, 4, 4, 15) #endif #undef ANY11P #undef ANY11P +// Any 1 to 1 with type +#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ + void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[(MASK + 1) * SBPP]); \ + SIMD_ALIGNED(uint8_t out[(MASK + 1) * BPP]); \ + memset(temp, 0, (MASK + 1) * SBPP); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(temp, (uint8_t*)(src_ptr) + n * SBPP, r * SBPP); \ + ANY_SIMD((STYPE*)temp, (DTYPE*)out, MASK + 1); \ + memcpy((uint8_t*)(dst_ptr) + n * BPP, out, r * BPP); \ + } + +#ifdef HAS_ARGBTOAR64ROW_SSSE3 +ANY11T(ARGBToAR64Row_Any_SSSE3, ARGBToAR64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3) +#endif + +#ifdef HAS_ARGBTOAB64ROW_SSSE3 +ANY11T(ARGBToAB64Row_Any_SSSE3, ARGBToAB64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3) +#endif + +#ifdef HAS_AR64TOARGBROW_SSSE3 +ANY11T(AR64ToARGBRow_Any_SSSE3, AR64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3) +#endif + +#ifdef HAS_ARGBTOAR64ROW_SSSE3 +ANY11T(AB64ToARGBRow_Any_SSSE3, AB64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3) +#endif + +#ifdef HAS_ARGBTOAR64ROW_AVX2 +ANY11T(ARGBToAR64Row_Any_AVX2, ARGBToAR64Row_AVX2, 4, 8, uint8_t, uint16_t, 7) +#endif + +#ifdef HAS_ARGBTOAB64ROW_AVX2 +ANY11T(ARGBToAB64Row_Any_AVX2, ARGBToAB64Row_AVX2, 4, 8, uint8_t, uint16_t, 7) +#endif + +#ifdef HAS_AR64TOARGBROW_AVX2 +ANY11T(AR64ToARGBRow_Any_AVX2, AR64ToARGBRow_AVX2, 8, 4, uint16_t, uint8_t, 7) +#endif + +#ifdef HAS_ARGBTOAR64ROW_AVX2 +ANY11T(AB64ToARGBRow_Any_AVX2, AB64ToARGBRow_AVX2, 8, 4, uint16_t, uint8_t, 7) +#endif + +#ifdef HAS_ARGBTOAR64ROW_NEON +ANY11T(ARGBToAR64Row_Any_NEON, 
ARGBToAR64Row_NEON, 4, 8, uint8_t, uint16_t, 7) +#endif + +#ifdef HAS_ARGBTOAB64ROW_NEON +ANY11T(ARGBToAB64Row_Any_NEON, ARGBToAB64Row_NEON, 4, 8, uint8_t, uint16_t, 7) +#endif + +#ifdef HAS_AR64TOARGBROW_NEON +ANY11T(AR64ToARGBRow_Any_NEON, AR64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7) +#endif + +#ifdef HAS_ARGBTOAR64ROW_NEON +ANY11T(AB64ToARGBRow_Any_NEON, AB64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7) +#endif + +#undef ANY11T + // Any 1 to 1 with parameter and shorts. BPP measures in shorts. #define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \ @@ -939,6 +1481,15 @@ ANY11C(Convert16To8Row_Any_AVX2, uint8_t, 31) #endif +#ifdef HAS_CONVERT16TO8ROW_NEON +ANY11C(Convert16To8Row_Any_NEON, + Convert16To8Row_NEON, + 2, + 1, + uint16_t, + uint8_t, + 15) +#endif #ifdef HAS_CONVERT8TO16ROW_SSE2 ANY11C(Convert8To16Row_Any_SSE2, Convert8To16Row_SSE2, @@ -957,6 +1508,30 @@ ANY11C(Convert8To16Row_Any_AVX2, uint16_t, 31) #endif +#ifdef HAS_MULTIPLYROW_16_AVX2 +ANY11C(MultiplyRow_16_Any_AVX2, + MultiplyRow_16_AVX2, + 2, + 2, + uint16_t, + uint16_t, + 31) +#endif +#ifdef HAS_MULTIPLYROW_16_NEON +ANY11C(MultiplyRow_16_Any_NEON, + MultiplyRow_16_NEON, + 2, + 2, + uint16_t, + uint16_t, + 15) +#endif +#ifdef HAS_DIVIDEROW_16_AVX2 +ANY11C(DivideRow_16_Any_AVX2, DivideRow_16_AVX2, 2, 2, uint16_t, uint16_t, 31) +#endif +#ifdef HAS_DIVIDEROW_16_NEON +ANY11C(DivideRow_16_Any_NEON, DivideRow_16_NEON, 2, 2, uint16_t, uint16_t, 15) +#endif #undef ANY11C // Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts. @@ -1007,6 +1582,9 @@ ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31) #ifdef HAS_BYTETOFLOATROW_NEON ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 3, 7) #endif +#ifdef HAS_HALFFLOATROW_LSX +ANY11P16(HalfFloatRow_Any_LSX, HalfFloatRow_LSX, uint16_t, uint16_t, 2, 2, 31) +#endif #undef ANY11P16 // Any 1 to 1 with yuvconstants @@ -1040,41 +1618,107 @@ ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7) ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7) ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7) #endif +#if defined(HAS_YUY2TOARGBROW_LSX) +ANY11C(YUY2ToARGBRow_Any_LSX, YUY2ToARGBRow_LSX, 1, 4, 4, 7) +ANY11C(UYVYToARGBRow_Any_LSX, UYVYToARGBRow_LSX, 1, 4, 4, 7) +#endif #undef ANY11C // Any 1 to 1 interpolate. Takes 2 rows of source via stride. 
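A sketch of the blend the interpolate kernels compute, and why the reworked macro below copies the second source row only when source_y_fraction is non-zero: with fraction 0 the kernel reads a single row, so touching src + src_stride could walk past the allocation. The fraction-zero shortcut and the +128 rounding follow the shape of InterpolateRow_C but should be treated as illustrative:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* dst = row0 * (256 - f) / 256 + row1 * f / 256, with f in [0, 256). */
static void InterpolateRowSketch(uint8_t* dst, const uint8_t* src,
                                 ptrdiff_t src_stride, int width,
                                 int source_y_fraction) {
  int f1 = source_y_fraction;
  int f0 = 256 - f1;
  const uint8_t* src1;
  int x;
  if (f1 == 0) { /* single-row case: never dereference the second row */
    memcpy(dst, src, (size_t)width);
    return;
  }
  src1 = src + src_stride;
  for (x = 0; x < width; ++x) {
    dst[x] = (uint8_t)((src[x] * f0 + src1[x] * f1 + 128) >> 8);
  }
}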
-#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ - void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, \ - ptrdiff_t src_stride_ptr, int width, int source_y_fraction) { \ - SIMD_ALIGNED(uint8_t temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ - } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \ - ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ +#define ANY11I(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \ + void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \ + int width, int source_y_fraction) { \ + SIMD_ALIGNED(TS temps[64 * 2]); \ + SIMD_ALIGNED(TD tempd[64]); \ + memset(temps, 0, sizeof(temps)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \ + } \ + memcpy(temps, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \ + if (source_y_fraction) { \ + memcpy(temps + 64, src_ptr + src_stride + n * SBPP, \ + r * SBPP * sizeof(TS)); \ + } \ + ANY_SIMD(tempd, temps, 64, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, tempd, r * BPP * sizeof(TD)); \ } #ifdef HAS_INTERPOLATEROW_AVX2 -ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31) +ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, uint8_t, uint8_t, 1, 1, 31) #endif #ifdef HAS_INTERPOLATEROW_SSSE3 -ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15) +ANY11I(InterpolateRow_Any_SSSE3, + InterpolateRow_SSSE3, + uint8_t, + uint8_t, + 1, + 1, + 15) #endif #ifdef HAS_INTERPOLATEROW_NEON -ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) +ANY11I(InterpolateRow_Any_NEON, InterpolateRow_NEON, uint8_t, uint8_t, 1, 1, 15) #endif #ifdef HAS_INTERPOLATEROW_MSA -ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31) +ANY11I(InterpolateRow_Any_MSA, InterpolateRow_MSA, uint8_t, uint8_t, 1, 1, 31) #endif -#ifdef HAS_INTERPOLATEROW_MMI -ANY11T(InterpolateRow_Any_MMI, InterpolateRow_MMI, 1, 1, 7) +#ifdef HAS_INTERPOLATEROW_LSX +ANY11I(InterpolateRow_Any_LSX, InterpolateRow_LSX, uint8_t, uint8_t, 1, 1, 31) #endif -#undef ANY11T + +#ifdef HAS_INTERPOLATEROW_16_NEON +ANY11I(InterpolateRow_16_Any_NEON, + InterpolateRow_16_NEON, + uint16_t, + uint16_t, + 1, + 1, + 7) +#endif +#undef ANY11I + +// Any 1 to 1 interpolate with scale param +#define ANY11IS(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \ + void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \ + int scale, int width, int source_y_fraction) { \ + SIMD_ALIGNED(TS temps[64 * 2]); \ + SIMD_ALIGNED(TD tempd[64]); \ + memset(temps, 0, sizeof(temps)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, src_ptr, src_stride, scale, n, source_y_fraction); \ + } \ + memcpy(temps, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \ + if (source_y_fraction) { \ + memcpy(temps + 64, src_ptr + src_stride + n * SBPP, \ + r * SBPP * sizeof(TS)); \ + } \ + ANY_SIMD(tempd, temps, 64, scale, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, tempd, r * BPP * sizeof(TD)); \ + } + +#ifdef HAS_INTERPOLATEROW_16TO8_NEON +ANY11IS(InterpolateRow_16To8_Any_NEON, + InterpolateRow_16To8_NEON, + uint8_t, + uint16_t, + 1, + 1, + 7) +#endif +#ifdef HAS_INTERPOLATEROW_16TO8_AVX2 
+ANY11IS(InterpolateRow_16To8_Any_AVX2, + InterpolateRow_16To8_AVX2, + uint8_t, + uint16_t, + 1, + 1, + 31) +#endif + +#undef ANY11IS // Any 1 to 1 mirror. #define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ @@ -1098,13 +1742,28 @@ ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) #endif #ifdef HAS_MIRRORROW_NEON -ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15) +ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 31) #endif #ifdef HAS_MIRRORROW_MSA ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63) #endif -#ifdef HAS_MIRRORROW_MMI -ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7) +#ifdef HAS_MIRRORROW_LASX +ANY11M(MirrorRow_Any_LASX, MirrorRow_LASX, 1, 63) +#endif +#ifdef HAS_MIRRORUVROW_AVX2 +ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15) +#endif +#ifdef HAS_MIRRORUVROW_SSSE3 +ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7) +#endif +#ifdef HAS_MIRRORUVROW_NEON +ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31) +#endif +#ifdef HAS_MIRRORUVROW_MSA +ANY11M(MirrorUVRow_Any_MSA, MirrorUVRow_MSA, 2, 7) +#endif +#ifdef HAS_MIRRORUVROW_LASX +ANY11M(MirrorUVRow_Any_LASX, MirrorUVRow_LASX, 2, 15) #endif #ifdef HAS_ARGBMIRRORROW_AVX2 ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) @@ -1113,13 +1772,19 @@ ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3) #endif #ifdef HAS_ARGBMIRRORROW_NEON -ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3) +ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7) #endif #ifdef HAS_ARGBMIRRORROW_MSA ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) #endif -#ifdef HAS_ARGBMIRRORROW_MMI -ANY11M(ARGBMirrorRow_Any_MMI, ARGBMirrorRow_MMI, 4, 1) +#ifdef HAS_ARGBMIRRORROW_LASX +ANY11M(ARGBMirrorRow_Any_LASX, ARGBMirrorRow_LASX, 4, 15) +#endif +#ifdef HAS_RGB24MIRRORROW_SSSE3 +ANY11M(RGB24MirrorRow_Any_SSSE3, RGB24MirrorRow_SSSE3, 3, 15) +#endif +#ifdef HAS_RGB24MIRRORROW_NEON +ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15) #endif #undef ANY11M @@ -1127,6 +1792,7 @@ ANY11M(ARGBMirrorRow_Any_MMI, ARGBMirrorRow_MMI, 4, 1) #define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \ SIMD_ALIGNED(uint8_t temp[64]); \ + memset(temp, 0, 64); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ @@ -1142,12 +1808,18 @@ ANY1(SetRow_Any_X86, SetRow_X86, uint8_t, 1, 3) #ifdef HAS_SETROW_NEON ANY1(SetRow_Any_NEON, SetRow_NEON, uint8_t, 1, 15) #endif +#ifdef HAS_SETROW_LSX +ANY1(SetRow_Any_LSX, SetRow_LSX, uint8_t, 1, 15) +#endif #ifdef HAS_ARGBSETROW_NEON ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3) #endif #ifdef HAS_ARGBSETROW_MSA ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3) #endif +#ifdef HAS_ARGBSETROW_LSX +ANY1(ARGBSetRow_Any_LSX, ARGBSetRow_LSX, uint32_t, 4, 3) +#endif #undef ANY1 // Any 1 to 2. Outputs UV planes. 
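The ANY12 wrappers in the hunk that follows cover SplitUVRow, the inverse of MergeUV: one packed UV plane (as in NV12) fans out into planar U and V. A simplified per-pixel sketch; the real SplitUVRow_C may unroll, and the 16-bit ANY12PT variant adds a depth parameter:

#include <stdint.h>

/* De-interleave UVUVUV... into separate U and V planes. */
static void SplitUVRowSketch(const uint8_t* src_uv, uint8_t* dst_u,
                             uint8_t* dst_v, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[0];
    dst_v[x] = src_uv[1];
    src_uv += 2;
  }
}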
@@ -1179,8 +1851,8 @@ ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15) #ifdef HAS_SPLITUVROW_MSA ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31) #endif -#ifdef HAS_SPLITUVROW_MMI -ANY12(SplitUVRow_Any_MMI, SplitUVRow_MMI, 0, 2, 0, 7) +#ifdef HAS_SPLITUVROW_LSX +ANY12(SplitUVRow_Any_LSX, SplitUVRow_LSX, 0, 2, 0, 31) #endif #ifdef HAS_ARGBTOUV444ROW_SSSE3 ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15) @@ -1203,13 +1875,39 @@ ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15) ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31) ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31) #endif -#ifdef HAS_YUY2TOUV422ROW_MMI -ANY12(ARGBToUV444Row_Any_MMI, ARGBToUV444Row_MMI, 0, 4, 0, 7) -ANY12(UYVYToUV422Row_Any_MMI, UYVYToUV422Row_MMI, 1, 4, 1, 15) -ANY12(YUY2ToUV422Row_Any_MMI, YUY2ToUV422Row_MMI, 1, 4, 1, 15) +#ifdef HAS_YUY2TOUV422ROW_LASX +ANY12(ARGBToUV444Row_Any_LASX, ARGBToUV444Row_LASX, 0, 4, 0, 31) +ANY12(YUY2ToUV422Row_Any_LASX, YUY2ToUV422Row_LASX, 1, 4, 1, 31) +ANY12(UYVYToUV422Row_Any_LASX, UYVYToUV422Row_LASX, 1, 4, 1, 31) #endif #undef ANY12 +// Any 1 16 bit plane with parameter to 2 +#define ANY12PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \ + void NAMEANY(const T* src_uv, T* dst_u, T* dst_v, int depth, int width) { \ + SIMD_ALIGNED(T temp[16 * 4]); \ + memset(temp, 0, 16 * 4 * BPP); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_uv, dst_u, dst_v, depth, n); \ + } \ + memcpy(temp, src_uv + n * 2, r * BPP * 2); \ + ANY_SIMD(temp, temp + 32, temp + 48, depth, MASK + 1); \ + memcpy(dst_u + n, temp + 32, r * BPP); \ + memcpy(dst_v + n, temp + 48, r * BPP); \ + } + +#ifdef HAS_SPLITUVROW_16_AVX2 +ANY12PT(SplitUVRow_16_Any_AVX2, SplitUVRow_16_AVX2, uint16_t, 2, 15) +#endif + +#ifdef HAS_SPLITUVROW_16_NEON +ANY12PT(SplitUVRow_16_Any_NEON, SplitUVRow_16_NEON, uint16_t, 2, 7) +#endif + +#undef ANY12PT + // Any 1 to 3. Outputs RGB planes. #define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ @@ -1234,24 +1932,66 @@ ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15) #ifdef HAS_SPLITRGBROW_NEON ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15) #endif -#ifdef HAS_SPLITRGBROW_MMI -ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3) +#ifdef HAS_SPLITXRGBROW_SSE2 +ANY13(SplitXRGBRow_Any_SSE2, SplitXRGBRow_SSE2, 4, 7) +#endif +#ifdef HAS_SPLITXRGBROW_SSSE3 +ANY13(SplitXRGBRow_Any_SSSE3, SplitXRGBRow_SSSE3, 4, 7) +#endif +#ifdef HAS_SPLITXRGBROW_AVX2 +ANY13(SplitXRGBRow_Any_AVX2, SplitXRGBRow_AVX2, 4, 15) +#endif +#ifdef HAS_SPLITXRGBROW_NEON +ANY13(SplitXRGBRow_Any_NEON, SplitXRGBRow_NEON, 4, 15) +#endif + +// Any 1 to 4. Outputs ARGB planes.
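The ANY14 macro defined next fans one ARGB row out into four planes (used by SplitARGBRow). Per pixel it reduces to this sketch; the byte order follows libyuv's little-endian ARGB, B in byte 0 through A in byte 3:

#include <stdint.h>

/* Split packed ARGB into R, G, B and A planes. */
static void SplitARGBRowSketch(const uint8_t* src_argb, uint8_t* dst_r,
                               uint8_t* dst_g, uint8_t* dst_b,
                               uint8_t* dst_a, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_b[x] = src_argb[0]; /* blue is the low byte of ARGB */
    dst_g[x] = src_argb[1];
    dst_r[x] = src_argb[2];
    dst_a[x] = src_argb[3];
    src_argb += 4;
  }
}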
+#define ANY14(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ + uint8_t* dst_b, uint8_t* dst_a, int width) { \ + SIMD_ALIGNED(uint8_t temp[16 * 8]); \ + memset(temp, 0, 16 * 4); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, dst_a, n); \ + } \ + memcpy(temp, src_ptr + n * BPP, r * BPP); \ + ANY_SIMD(temp, temp + 16 * 4, temp + 16 * 5, temp + 16 * 6, temp + 16 * 7, \ + MASK + 1); \ + memcpy(dst_r + n, temp + 16 * 4, r); \ + memcpy(dst_g + n, temp + 16 * 5, r); \ + memcpy(dst_b + n, temp + 16 * 6, r); \ + memcpy(dst_a + n, temp + 16 * 7, r); \ + } + +#ifdef HAS_SPLITARGBROW_SSE2 +ANY14(SplitARGBRow_Any_SSE2, SplitARGBRow_SSE2, 4, 7) +#endif +#ifdef HAS_SPLITARGBROW_SSSE3 +ANY14(SplitARGBRow_Any_SSSE3, SplitARGBRow_SSSE3, 4, 7) +#endif +#ifdef HAS_SPLITARGBROW_AVX2 +ANY14(SplitARGBRow_Any_AVX2, SplitARGBRow_AVX2, 4, 15) +#endif +#ifdef HAS_SPLITARGBROW_NEON +ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15) #endif // Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. // 128 byte row allows for 32 avx ARGB pixels. #define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, \ + void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \ uint8_t* dst_v, int width) { \ SIMD_ALIGNED(uint8_t temp[128 * 4]); \ memset(temp, 0, 128 * 2); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ - ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \ + ANY_SIMD(src_ptr, src_stride, dst_u, dst_v, n); \ } \ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ + memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ SS(r, UVSHIFT) * BPP); \ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ @@ -1267,6 +2007,9 @@ ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3) #ifdef HAS_ARGBTOUVROW_AVX2 ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) #endif +#ifdef HAS_ABGRTOUVROW_AVX2 +ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31) +#endif #ifdef HAS_ARGBTOUVJROW_AVX2 ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31) #endif @@ -1291,8 +2034,8 @@ ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) #ifdef HAS_ARGBTOUVROW_MSA ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31) #endif -#ifdef HAS_ARGBTOUVROW_MMI -ANY12S(ARGBToUVRow_Any_MMI, ARGBToUVRow_MMI, 0, 4, 15) +#ifdef HAS_ARGBTOUVROW_LASX +ANY12S(ARGBToUVRow_Any_LASX, ARGBToUVRow_LASX, 0, 4, 31) #endif #ifdef HAS_ARGBTOUVJROW_NEON ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15) @@ -1300,53 +2043,68 @@ ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15) #ifdef HAS_ARGBTOUVJROW_MSA ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31) #endif -#ifdef HAS_ARGBTOUVJROW_MMI -ANY12S(ARGBToUVJRow_Any_MMI, ARGBToUVJRow_MMI, 0, 4, 15) +#ifdef HAS_ARGBTOUVJROW_LSX +ANY12S(ARGBToUVJRow_Any_LSX, ARGBToUVJRow_LSX, 0, 4, 15) +#endif +#ifdef HAS_ARGBTOUVJROW_LASX +ANY12S(ARGBToUVJRow_Any_LASX, ARGBToUVJRow_LASX, 0, 4, 31) #endif #ifdef HAS_BGRATOUVROW_NEON ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15) #endif #ifdef HAS_BGRATOUVROW_MSA -ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31) +ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 15) #endif -#ifdef 
HAS_BGRATOUVROW_MMI -ANY12S(BGRAToUVRow_Any_MMI, BGRAToUVRow_MMI, 0, 4, 15) +#ifdef HAS_BGRATOUVROW_LSX +ANY12S(BGRAToUVRow_Any_LSX, BGRAToUVRow_LSX, 0, 4, 15) #endif #ifdef HAS_ABGRTOUVROW_NEON ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15) #endif #ifdef HAS_ABGRTOUVROW_MSA -ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31) +ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 15) #endif -#ifdef HAS_ABGRTOUVROW_MMI -ANY12S(ABGRToUVRow_Any_MMI, ABGRToUVRow_MMI, 0, 4, 15) +#ifdef HAS_ABGRTOUVROW_LSX +ANY12S(ABGRToUVRow_Any_LSX, ABGRToUVRow_LSX, 0, 4, 15) #endif #ifdef HAS_RGBATOUVROW_NEON ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15) #endif #ifdef HAS_RGBATOUVROW_MSA -ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31) +ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 15) #endif -#ifdef HAS_RGBATOUVROW_MMI -ANY12S(RGBAToUVRow_Any_MMI, RGBAToUVRow_MMI, 0, 4, 15) +#ifdef HAS_RGBATOUVROW_LSX +ANY12S(RGBAToUVRow_Any_LSX, RGBAToUVRow_LSX, 0, 4, 15) #endif #ifdef HAS_RGB24TOUVROW_NEON ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15) #endif +#ifdef HAS_RGB24TOUVJROW_NEON +ANY12S(RGB24ToUVJRow_Any_NEON, RGB24ToUVJRow_NEON, 0, 3, 15) +#endif #ifdef HAS_RGB24TOUVROW_MSA ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15) #endif -#ifdef HAS_RGB24TOUVROW_MMI -ANY12S(RGB24ToUVRow_Any_MMI, RGB24ToUVRow_MMI, 0, 3, 15) +#ifdef HAS_RGB24TOUVROW_LSX +ANY12S(RGB24ToUVRow_Any_LSX, RGB24ToUVRow_LSX, 0, 3, 15) +#endif +#ifdef HAS_RGB24TOUVROW_LASX +ANY12S(RGB24ToUVRow_Any_LASX, RGB24ToUVRow_LASX, 0, 3, 31) #endif #ifdef HAS_RAWTOUVROW_NEON ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15) #endif +#ifdef HAS_RAWTOUVJROW_NEON +ANY12S(RAWToUVJRow_Any_NEON, RAWToUVJRow_NEON, 0, 3, 15) +#endif #ifdef HAS_RAWTOUVROW_MSA ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15) #endif -#ifdef HAS_RAWTOUVROW_MMI -ANY12S(RAWToUVRow_Any_MMI, RAWToUVRow_MMI, 0, 3, 15) +#ifdef HAS_RAWTOUVROW_LSX +ANY12S(RAWToUVRow_Any_LSX, RAWToUVRow_LSX, 0, 3, 15) +#endif +#ifdef HAS_RAWTOUVROW_LASX +ANY12S(RAWToUVRow_Any_LASX, RAWToUVRow_LASX, 0, 3, 31) #endif #ifdef HAS_RGB565TOUVROW_NEON ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15) @@ -1354,8 +2112,11 @@ ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15) #ifdef HAS_RGB565TOUVROW_MSA ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15) #endif -#ifdef HAS_RGB565TOUVROW_MMI -ANY12S(RGB565ToUVRow_Any_MMI, RGB565ToUVRow_MMI, 0, 2, 15) +#ifdef HAS_RGB565TOUVROW_LSX +ANY12S(RGB565ToUVRow_Any_LSX, RGB565ToUVRow_LSX, 0, 2, 15) +#endif +#ifdef HAS_RGB565TOUVROW_LASX +ANY12S(RGB565ToUVRow_Any_LASX, RGB565ToUVRow_LASX, 0, 2, 31) #endif #ifdef HAS_ARGB1555TOUVROW_NEON ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15) @@ -1363,15 +2124,15 @@ ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15) #ifdef HAS_ARGB1555TOUVROW_MSA ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15) #endif -#ifdef HAS_ARGB1555TOUVROW_MMI -ANY12S(ARGB1555ToUVRow_Any_MMI, ARGB1555ToUVRow_MMI, 0, 2, 15) +#ifdef HAS_ARGB1555TOUVROW_LSX +ANY12S(ARGB1555ToUVRow_Any_LSX, ARGB1555ToUVRow_LSX, 0, 2, 15) +#endif +#ifdef HAS_ARGB1555TOUVROW_LASX +ANY12S(ARGB1555ToUVRow_Any_LASX, ARGB1555ToUVRow_LASX, 0, 2, 31) #endif #ifdef HAS_ARGB4444TOUVROW_NEON ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15) #endif -#ifdef HAS_ARGB4444TOUVROW_MMI -ANY12S(ARGB4444ToUVRow_Any_MMI, ARGB4444ToUVRow_MMI, 0, 2, 15) -#endif #ifdef HAS_YUY2TOUVROW_NEON ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15) #endif @@ -1381,31 
+2142,31 @@ ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15) #ifdef HAS_YUY2TOUVROW_MSA ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31) #endif -#ifdef HAS_YUY2TOUVROW_MMI -ANY12S(YUY2ToUVRow_Any_MMI, YUY2ToUVRow_MMI, 1, 4, 15) +#ifdef HAS_YUY2TOUVROW_LASX +ANY12S(YUY2ToUVRow_Any_LASX, YUY2ToUVRow_LASX, 1, 4, 31) #endif #ifdef HAS_UYVYTOUVROW_MSA ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31) #endif -#ifdef HAS_UYVYTOUVROW_MMI -ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15) +#ifdef HAS_UYVYTOUVROW_LASX +ANY12S(UYVYToUVRow_Any_LASX, UYVYToUVRow_LASX, 1, 4, 31) #endif #undef ANY12S // Any 1 to 1 with source stride (2 rows of source). Outputs UV plane. // 128 byte row allows for 32 avx ARGB pixels. #define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_vu, \ + void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_vu, \ int width) { \ SIMD_ALIGNED(uint8_t temp[128 * 3]); \ memset(temp, 0, 128 * 2); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ - ANY_SIMD(src_ptr, src_stride_ptr, dst_vu, n); \ + ANY_SIMD(src_ptr, src_stride, dst_vu, n); \ } \ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ + memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ SS(r, UVSHIFT) * BPP); \ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ @@ -1423,6 +2184,51 @@ ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15) #endif #undef ANY11S +#define ANYDETILE(NAMEANY, ANY_SIMD, MASK) \ + void NAMEANY(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, \ + int width) { \ + SIMD_ALIGNED(uint8_t temp[16 * 2]); \ + memset(temp, 0, 16); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src, src_tile_stride, dst, n); \ + } \ + memcpy(temp, src + (n / 16) * src_tile_stride, r); \ + ANY_SIMD(temp, src_tile_stride, temp + 16, MASK + 1); \ + memcpy(dst + n, temp + 16, r); \ + } + +#ifdef HAS_DETILEROW_NEON +ANYDETILE(DetileRow_Any_NEON, DetileRow_NEON, 15) +#endif +#ifdef HAS_DETILEROW_SSE2 +ANYDETILE(DetileRow_Any_SSE2, DetileRow_SSE2, 15) +#endif + +#define ANYDETILESPLITUV(NAMEANY, ANY_SIMD, MASK) \ + void NAMEANY(const uint8_t* src_uv, ptrdiff_t src_tile_stride, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + SIMD_ALIGNED(uint8_t temp[16 * 2]); \ + memset(temp, 0, 16 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_uv, src_tile_stride, dst_u, dst_v, n); \ + } \ + memcpy(temp, src_uv + (n / 16) * src_tile_stride, r); \ + ANY_SIMD(temp, src_tile_stride, temp + 16, temp + 24, r); \ + memcpy(dst_u + n / 2, temp + 16, (r + 1) / 2); \ + memcpy(dst_v + n / 2, temp + 24, (r + 1) / 2); \ + } + +#ifdef HAS_DETILESPLITUVROW_NEON +ANYDETILESPLITUV(DetileSplitUVRow_Any_NEON, DetileSplitUVRow_NEON, 15) +#endif +#ifdef HAS_DETILESPLITUVROW_SSSE3 +ANYDETILESPLITUV(DetileSplitUVRow_Any_SSSE3, DetileSplitUVRow_SSSE3, 15) +#endif + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/row_common.cc b/files/source/row_common.cc index 8951d003..83442496 100644 --- a/files/source/row_common.cc +++ b/files/source/row_common.cc @@ -10,34 +10,67 @@ #include "libyuv/row.h" -#include <stdio.h> +#include <assert.h> #include <string.h> // For memcpy and 
memset. #include "libyuv/basic_types.h" +#include "libyuv/convert_argb.h" // For kYuvI601Constants #ifdef __cplusplus namespace libyuv { extern "C" { #endif +// This macro controls YUV to RGB using unsigned math to extend range of +// YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B: +// LIBYUV_UNLIMITED_DATA + +// Macros to enable unlimited data for each colorspace +// LIBYUV_UNLIMITED_BT601 +// LIBYUV_UNLIMITED_BT709 +// LIBYUV_UNLIMITED_BT2020 + +// The following macro from row_win makes the C code match the row_win code, +// which is 7 bit fixed point for ARGBToI420: +#if !defined(LIBYUV_BIT_EXACT) && !defined(LIBYUV_DISABLE_X86) && \ + defined(_MSC_VER) && !defined(__clang__) && \ + (defined(_M_IX86) || defined(_M_X64)) +#define LIBYUV_RGB7 1 +#endif + +#if !defined(LIBYUV_BIT_EXACT) && (defined(__x86_64__) || defined(_M_X64) || \ + defined(__i386__) || defined(_M_IX86)) +#define LIBYUV_ARGBTOUV_PAVGB 1 +#define LIBYUV_RGBTOU_TRUNCATE 1 +#define LIBYUV_ATTENUATE_DUP 1 +#endif +#if defined(LIBYUV_BIT_EXACT) +#define LIBYUV_UNATTENUATE_DUP 1 +#endif + // llvm x86 is poor at ternary operator, so use branchless min/max. #define USE_BRANCHLESS 1 #if USE_BRANCHLESS static __inline int32_t clamp0(int32_t v) { - return ((-(v) >> 31) & (v)); + return -(v >= 0) & v; } - +// TODO(fbarchard): make clamp255 preserve negative values. static __inline int32_t clamp255(int32_t v) { - return (((255 - (v)) >> 31) | (v)) & 255; + return (-(v >= 255) | v) & 255; } static __inline int32_t clamp1023(int32_t v) { - return (((1023 - (v)) >> 31) | (v)) & 1023; + return (-(v >= 1023) | v) & 1023; +} + +// clamp to max +static __inline int32_t ClampMax(int32_t v, int32_t max) { + return (-(v >= max) | v) & max; } static __inline uint32_t Abs(int32_t v) { - int m = v >> 31; + int m = -(v < 0); return (v + m) ^ m; } #else // USE_BRANCHLESS @@ -53,6 +86,10 @@ static __inline int32_t clamp1023(int32_t v) { return (v > 1023) ? 1023 : v; } +static __inline int32_t ClampMax(int32_t v, int32_t max) { + return (v > max) ? max : v; +} + static __inline uint32_t Abs(int32_t v) { return (v < 0) ? 
-v : v; } @@ -111,6 +148,21 @@ void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) { } } +void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t r = src_raw[0]; + uint8_t g = src_raw[1]; + uint8_t b = src_raw[2]; + dst_rgba[0] = 255u; + dst_rgba[1] = b; + dst_rgba[2] = g; + dst_rgba[3] = r; + dst_rgba += 4; + src_raw += 3; + } +} + void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { int x; for (x = 0; x < width; ++x) { @@ -181,7 +233,8 @@ void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444, void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint32_t ar30 = *(const uint32_t*)src_ar30; + uint32_t ar30; + memcpy(&ar30, src_ar30, sizeof ar30); uint32_t b = (ar30 >> 2) & 0xff; uint32_t g = (ar30 >> 12) & 0xff; uint32_t r = (ar30 >> 22) & 0xff; @@ -195,7 +248,8 @@ void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) { void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) { int x; for (x = 0; x < width; ++x) { - uint32_t ar30 = *(const uint32_t*)src_ar30; + uint32_t ar30; + memcpy(&ar30, src_ar30, sizeof ar30); uint32_t b = (ar30 >> 2) & 0xff; uint32_t g = (ar30 >> 12) & 0xff; uint32_t r = (ar30 >> 22) & 0xff; @@ -209,7 +263,8 @@ void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) { void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) { int x; for (x = 0; x < width; ++x) { - uint32_t ar30 = *(const uint32_t*)src_ar30; + uint32_t ar30; + memcpy(&ar30, src_ar30, sizeof ar30); uint32_t b = ar30 & 0x3ff; uint32_t ga = ar30 & 0xc00ffc00; uint32_t r = (ar30 >> 20) & 0x3ff; @@ -291,8 +346,8 @@ void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, uint8_t b1 = clamp255(src_argb[4] + dither1) >> 3; uint8_t g1 = clamp255(src_argb[5] + dither1) >> 2; uint8_t r1 = clamp255(src_argb[6] + dither1) >> 3; - WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | - (r1 << 27)); + *(uint16_t*)(dst_rgb + 0) = b0 | (g0 << 5) | (r0 << 11); + *(uint16_t*)(dst_rgb + 2) = b1 | (g1 << 5) | (r1 << 11); dst_rgb += 4; src_argb += 8; } @@ -316,8 +371,8 @@ void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint8_t g1 = src_argb[5] >> 3; uint8_t r1 = src_argb[6] >> 3; uint8_t a1 = src_argb[7] >> 7; - *(uint32_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) | - (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31); + *(uint16_t*)(dst_rgb + 0) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); + *(uint16_t*)(dst_rgb + 2) = b1 | (g1 << 5) | (r1 << 10) | (a1 << 15); dst_rgb += 4; src_argb += 8; } @@ -341,8 +396,8 @@ void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint8_t g1 = src_argb[5] >> 4; uint8_t r1 = src_argb[6] >> 4; uint8_t a1 = src_argb[7] >> 4; - *(uint32_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | - (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28); + *(uint16_t*)(dst_rgb + 0) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); + *(uint16_t*)(dst_rgb + 2) = b1 | (g1 << 4) | (r1 << 8) | (a1 << 12); dst_rgb += 4; src_argb += 8; } @@ -381,56 +436,208 @@ void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { } } +void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { + int x; + for (x = 0; x < width; ++x) { + dst_ar64[0] = src_argb[0] * 0x0101; + dst_ar64[1] = src_argb[1] * 0x0101; + dst_ar64[2] = src_argb[2] * 0x0101; + 
dst_ar64[3] = src_argb[3] * 0x0101; + dst_ar64 += 4; + src_argb += 4; + } +} + +void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { + int x; + for (x = 0; x < width; ++x) { + dst_ab64[0] = src_argb[2] * 0x0101; + dst_ab64[1] = src_argb[1] * 0x0101; + dst_ab64[2] = src_argb[0] * 0x0101; + dst_ab64[3] = src_argb[3] * 0x0101; + dst_ab64 += 4; + src_argb += 4; + } +} + +void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + dst_argb[0] = src_ar64[0] >> 8; + dst_argb[1] = src_ar64[1] >> 8; + dst_argb[2] = src_ar64[2] >> 8; + dst_argb[3] = src_ar64[3] >> 8; + dst_argb += 4; + src_ar64 += 4; + } +} + +void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + dst_argb[0] = src_ab64[2] >> 8; + dst_argb[1] = src_ab64[1] >> 8; + dst_argb[2] = src_ab64[0] >> 8; + dst_argb[3] = src_ab64[3] >> 8; + dst_argb += 4; + src_ab64 += 4; + } +} + +// TODO(fbarchard): Make shuffle compatible with SIMD versions +void AR64ShuffleRow_C(const uint8_t* src_ar64, + uint8_t* dst_ar64, + const uint8_t* shuffler, + int width) { + const uint16_t* src_ar64_16 = (const uint16_t*)src_ar64; + uint16_t* dst_ar64_16 = (uint16_t*)dst_ar64; + int index0 = shuffler[0] / 2; + int index1 = shuffler[2] / 2; + int index2 = shuffler[4] / 2; + int index3 = shuffler[6] / 2; + // Shuffle a row of AR64. + int x; + for (x = 0; x < width / 2; ++x) { + // To support in-place conversion. + uint16_t b = src_ar64_16[index0]; + uint16_t g = src_ar64_16[index1]; + uint16_t r = src_ar64_16[index2]; + uint16_t a = src_ar64_16[index3]; + dst_ar64_16[0] = b; + dst_ar64_16[1] = g; + dst_ar64_16[2] = r; + dst_ar64_16[3] = a; + src_ar64_16 += 4; + dst_ar64_16 += 4; + } +} + +#ifdef LIBYUV_RGB7 +// Old 7 bit math for compatibility on unsupported platforms. +static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { + return ((33 * r + 65 * g + 13 * b) >> 7) + 16; +} +#else +// 8 bit +// Intel SSE/AVX uses the following equivalent formula +// 0x7e80 = (66 + 129 + 25) * 128 + 0x1000 (for +16) and 0x0080 for round, +// cancelling the -128 offsets below: +// return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) + +// 0x7e80) >> 8; + static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; } +#endif + +#define AVGB(a, b) (((a) + (b) + 1) >> 1) +// LIBYUV_RGBTOU_TRUNCATE mimics x86 code that does not round. +#ifdef LIBYUV_RGBTOU_TRUNCATE +static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { + return (112 * b - 74 * g - 38 * r + 0x8000) >> 8; +} +static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { + return (112 * r - 94 * g - 18 * b + 0x8000) >> 8; +} +#else +// TODO(fbarchard): Add rounding to x86 SIMD and use this static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { return (112 * b - 74 * g - 38 * r + 0x8080) >> 8; } static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { return (112 * r - 94 * g - 18 * b + 0x8080) >> 8; } +#endif + +// LIBYUV_ARGBTOUV_PAVGB mimics x86 code that subsamples with 2 pavgb.
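A quick numeric check of the 8 bit RGBToY above: the 0x1080 bias is 16.5 in 8.8 fixed point (the +16 studio offset plus a half for rounding), so both BT.601 studio-swing endpoints land exactly.

#include <assert.h>
#include <stdint.h>

static int RGBToY8(uint8_t r, uint8_t g, uint8_t b) {
  return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; /* same as RGBToY above */
}

int main(void) {
  assert(RGBToY8(0, 0, 0) == 16);        /* black: 0x1080 >> 8 = 16 */
  assert(RGBToY8(255, 255, 255) == 235); /* white: (220 * 255 + 0x1080) >> 8 */
  return 0;
}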
+#if !defined(LIBYUV_ARGBTOUV_PAVGB) +static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) { + return ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8; +} +static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) { + return ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8; +} +#endif // ARGBToY_C and ARGBToUV_C -#define MAKEROWY(NAME, R, G, B, BPP) \ - void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP]) >> \ - 2; \ - uint8_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP]) >> \ - 2; \ - uint8_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + BPP]) >> \ - 2; \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8_t ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ - uint8_t ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ - uint8_t ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - } \ +// Intel version mimic SSE/AVX which does 2 pavgb +#if LIBYUV_ARGBTOUV_PAVGB +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ + AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ + AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ + AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ + uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + } \ } +#else +// ARM version does sum / 2 then multiply by 2x smaller coefficients +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP] + 1) >> \ + 1; \ + uint16_t ag = (src_rgb[G] + 
src_rgb[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP] + 1) >> \ + 1; \ + uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP] + 1) >> \ + 1; \ + dst_u[0] = RGB2xToU(ar, ag, ab); \ + dst_v[0] = RGB2xToV(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint16_t ab = src_rgb[B] + src_rgb1[B]; \ + uint16_t ag = src_rgb[G] + src_rgb1[G]; \ + uint16_t ar = src_rgb[R] + src_rgb1[R]; \ + dst_u[0] = RGB2xToU(ar, ag, ab); \ + dst_v[0] = RGB2xToV(ar, ag, ab); \ + } \ + } +#endif MAKEROWY(ARGB, 2, 1, 0, 4) MAKEROWY(BGRA, 1, 2, 3, 4) @@ -448,14 +655,14 @@ MAKEROWY(RAW, 0, 1, 2, 3) // b 0.1016 * 255 = 25.908 = 25 // g 0.5078 * 255 = 129.489 = 129 // r 0.2578 * 255 = 65.739 = 66 -// JPeg 8 bit Y (not used): -// b 0.11400 * 256 = 29.184 = 29 -// g 0.58700 * 256 = 150.272 = 150 -// r 0.29900 * 256 = 76.544 = 77 -// JPeg 7 bit Y: +// JPeg 7 bit Y (deprecated) // b 0.11400 * 128 = 14.592 = 15 // g 0.58700 * 128 = 75.136 = 75 // r 0.29900 * 128 = 38.272 = 38 +// JPeg 8 bit Y: +// b 0.11400 * 256 = 29.184 = 29 +// g 0.58700 * 256 = 150.272 = 150 +// r 0.29900 * 256 = 76.544 = 77 // JPeg 8 bit U: // b 0.50000 * 255 = 127.5 = 127 // g -0.33126 * 255 = -84.4713 = -84 @@ -465,57 +672,119 @@ MAKEROWY(RAW, 0, 1, 2, 3) // g -0.41869 * 255 = -106.76595 = -107 // r 0.50000 * 255 = 127.5 = 127 +#ifdef LIBYUV_RGB7 +// Old 7 bit math for compatibility on unsupported platforms. static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { return (38 * r + 75 * g + 15 * b + 64) >> 7; } +#else +// 8 bit +static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { + return (77 * r + 150 * g + 29 * b + 128) >> 8; +} +#endif +#if defined(LIBYUV_ARGBTOUV_PAVGB) static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; } static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; } - -#define AVGB(a, b) (((a) + (b) + 1) >> 1) +#else +static __inline int RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) { + return ((127 / 2) * b - (84 / 2) * g - (43 / 2) * r + 0x8080) >> 8; +} +static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { + return ((127 / 2) * r - (107 / 2) * g - (20 / 2) * b + 0x8080) >> 8; +} +#endif // ARGBToYJ_C and ARGBToUVJ_C -#define MAKEROWYJ(NAME, R, G, B, BPP) \ - void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ - AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ - uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ - AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ - uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ - AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \ - uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \ - uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - 
dst_v[0] = RGBToVJ(ar, ag, ab); \ - } \ +// Intel version mimic SSE/AVX which does 2 pavgb +#if LIBYUV_ARGBTOUV_PAVGB +#define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ + AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ + AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ + AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ + uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + } \ } +#else +// ARM version does sum / 2 then multiply by 2x smaller coefficients +#define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP] + 1) >> \ + 1; \ + uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP] + 1) >> \ + 1; \ + uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP] + 1) >> \ + 1; \ + dst_u[0] = RGB2xToUJ(ar, ag, ab); \ + dst_v[0] = RGB2xToVJ(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint16_t ab = (src_rgb[B] + src_rgb1[B]); \ + uint16_t ag = (src_rgb[G] + src_rgb1[G]); \ + uint16_t ar = (src_rgb[R] + src_rgb1[R]); \ + dst_u[0] = RGB2xToUJ(ar, ag, ab); \ + dst_v[0] = RGB2xToVJ(ar, ag, ab); \ + } \ + } + +#endif MAKEROWYJ(ARGB, 2, 1, 0, 4) +MAKEROWYJ(RGBA, 3, 2, 1, 4) +MAKEROWYJ(RGB24, 2, 1, 0, 3) +MAKEROWYJ(RAW, 0, 1, 2, 3) #undef MAKEROWYJ void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { @@ -583,13 +852,34 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565, uint8_t b3 = next_rgb565[2] & 0x1f; uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); uint8_t r3 = next_rgb565[3] >> 3; - uint8_t b = (b0 + b1 + b2 + b3); // 565 * 4 = 787. - uint8_t g = (g0 + g1 + g2 + g3); - uint8_t r = (r0 + r1 + r2 + r3); - b = (b << 1) | (b >> 6); // 787 -> 888. 
- r = (r << 1) | (r >> 6); - dst_u[0] = RGBToU(r, g, b); - dst_v[0] = RGBToV(r, g, b); + + b0 = (b0 << 3) | (b0 >> 2); + g0 = (g0 << 2) | (g0 >> 4); + r0 = (r0 << 3) | (r0 >> 2); + b1 = (b1 << 3) | (b1 >> 2); + g1 = (g1 << 2) | (g1 >> 4); + r1 = (r1 << 3) | (r1 >> 2); + b2 = (b2 << 3) | (b2 >> 2); + g2 = (g2 << 2) | (g2 >> 4); + r2 = (r2 << 3) | (r2 >> 2); + b3 = (b3 << 3) | (b3 >> 2); + g3 = (g3 << 2) | (g3 >> 4); + r3 = (r3 << 3) | (r3 >> 2); + +#if LIBYUV_ARGBTOUV_PAVGB + uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); + uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); + uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); +#else + uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; + uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; + uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; + dst_u[0] = RGB2xToU(r, g, b); + dst_v[0] = RGB2xToV(r, g, b); +#endif + src_rgb565 += 4; next_rgb565 += 4; dst_u += 1; @@ -602,14 +892,27 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565, uint8_t b2 = next_rgb565[0] & 0x1f; uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); uint8_t r2 = next_rgb565[1] >> 3; - uint8_t b = (b0 + b2); // 565 * 2 = 676. - uint8_t g = (g0 + g2); - uint8_t r = (r0 + r2); - b = (b << 2) | (b >> 4); // 676 -> 888 - g = (g << 1) | (g >> 6); - r = (r << 2) | (r >> 4); - dst_u[0] = RGBToU(r, g, b); - dst_v[0] = RGBToV(r, g, b); + + b0 = (b0 << 3) | (b0 >> 2); + g0 = (g0 << 2) | (g0 >> 4); + r0 = (r0 << 3) | (r0 >> 2); + b2 = (b2 << 3) | (b2 >> 2); + g2 = (g2 << 2) | (g2 >> 4); + r2 = (r2 << 3) | (r2 >> 2); + +#if LIBYUV_ARGBTOUV_PAVGB + uint8_t ab = AVGB(b0, b2); + uint8_t ag = AVGB(g0, g2); + uint8_t ar = AVGB(r0, r2); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); +#else + uint16_t b = b0 + b2; + uint16_t g = g0 + g2; + uint16_t r = r0 + r2; + dst_u[0] = RGB2xToU(r, g, b); + dst_v[0] = RGB2xToV(r, g, b); +#endif } } @@ -633,14 +936,34 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, uint8_t b3 = next_argb1555[2] & 0x1f; uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3); uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2; - uint8_t b = (b0 + b1 + b2 + b3); // 555 * 4 = 777. - uint8_t g = (g0 + g1 + g2 + g3); - uint8_t r = (r0 + r1 + r2 + r3); - b = (b << 1) | (b >> 6); // 777 -> 888. 
- g = (g << 1) | (g >> 6); - r = (r << 1) | (r >> 6); - dst_u[0] = RGBToU(r, g, b); - dst_v[0] = RGBToV(r, g, b); + + b0 = (b0 << 3) | (b0 >> 2); + g0 = (g0 << 3) | (g0 >> 2); + r0 = (r0 << 3) | (r0 >> 2); + b1 = (b1 << 3) | (b1 >> 2); + g1 = (g1 << 3) | (g1 >> 2); + r1 = (r1 << 3) | (r1 >> 2); + b2 = (b2 << 3) | (b2 >> 2); + g2 = (g2 << 3) | (g2 >> 2); + r2 = (r2 << 3) | (r2 >> 2); + b3 = (b3 << 3) | (b3 >> 2); + g3 = (g3 << 3) | (g3 >> 2); + r3 = (r3 << 3) | (r3 >> 2); + +#if LIBYUV_ARGBTOUV_PAVGB + uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); + uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); + uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); +#else + uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; + uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; + uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; + dst_u[0] = RGB2xToU(r, g, b); + dst_v[0] = RGB2xToV(r, g, b); +#endif + src_argb1555 += 4; next_argb1555 += 4; dst_u += 1; @@ -652,15 +975,28 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; uint8_t b2 = next_argb1555[0] & 0x1f; uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); - uint8_t r2 = next_argb1555[1] >> 3; - uint8_t b = (b0 + b2); // 555 * 2 = 666. - uint8_t g = (g0 + g2); - uint8_t r = (r0 + r2); - b = (b << 2) | (b >> 4); // 666 -> 888. - g = (g << 2) | (g >> 4); - r = (r << 2) | (r >> 4); - dst_u[0] = RGBToU(r, g, b); - dst_v[0] = RGBToV(r, g, b); + uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2; + + b0 = (b0 << 3) | (b0 >> 2); + g0 = (g0 << 3) | (g0 >> 2); + r0 = (r0 << 3) | (r0 >> 2); + b2 = (b2 << 3) | (b2 >> 2); + g2 = (g2 << 3) | (g2 >> 2); + r2 = (r2 << 3) | (r2 >> 2); + +#if LIBYUV_ARGBTOUV_PAVGB + uint8_t ab = AVGB(b0, b2); + uint8_t ag = AVGB(g0, g2); + uint8_t ar = AVGB(r0, r2); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); +#else + uint16_t b = b0 + b2; + uint16_t g = g0 + g2; + uint16_t r = r0 + r2; + dst_u[0] = RGB2xToU(r, g, b); + dst_v[0] = RGB2xToV(r, g, b); +#endif } } @@ -684,14 +1020,34 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, uint8_t b3 = next_argb4444[2] & 0x0f; uint8_t g3 = next_argb4444[2] >> 4; uint8_t r3 = next_argb4444[3] & 0x0f; - uint8_t b = (b0 + b1 + b2 + b3); // 444 * 4 = 666. - uint8_t g = (g0 + g1 + g2 + g3); - uint8_t r = (r0 + r1 + r2 + r3); - b = (b << 2) | (b >> 4); // 666 -> 888. - g = (g << 2) | (g >> 4); - r = (r << 2) | (r >> 4); - dst_u[0] = RGBToU(r, g, b); - dst_v[0] = RGBToV(r, g, b); + + b0 = (b0 << 4) | b0; + g0 = (g0 << 4) | g0; + r0 = (r0 << 4) | r0; + b1 = (b1 << 4) | b1; + g1 = (g1 << 4) | g1; + r1 = (r1 << 4) | r1; + b2 = (b2 << 4) | b2; + g2 = (g2 << 4) | g2; + r2 = (r2 << 4) | r2; + b3 = (b3 << 4) | b3; + g3 = (g3 << 4) | g3; + r3 = (r3 << 4) | r3; + +#if LIBYUV_ARGBTOUV_PAVGB + uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); + uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); + uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); +#else + uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; + uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; + uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; + dst_u[0] = RGB2xToU(r, g, b); + dst_v[0] = RGB2xToV(r, g, b); +#endif + src_argb4444 += 4; next_argb4444 += 4; dst_u += 1; @@ -704,14 +1060,27 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, uint8_t b2 = next_argb4444[0] & 0x0f; uint8_t g2 = next_argb4444[0] >> 4; uint8_t r2 = next_argb4444[1] & 0x0f; - uint8_t b = (b0 + b2); // 444 * 2 = 555. 
- uint8_t g = (g0 + g2); - uint8_t r = (r0 + r2); - b = (b << 3) | (b >> 2); // 555 -> 888. - g = (g << 3) | (g >> 2); - r = (r << 3) | (r >> 2); - dst_u[0] = RGBToU(r, g, b); - dst_v[0] = RGBToV(r, g, b); + + b0 = (b0 << 4) | b0; + g0 = (g0 << 4) | g0; + r0 = (r0 << 4) | r0; + b2 = (b2 << 4) | b2; + g2 = (g2 << 4) | g2; + r2 = (r2 << 4) | r2; + +#if LIBYUV_ARGBTOUV_PAVGB + uint8_t ab = AVGB(b0, b2); + uint8_t ag = AVGB(g0, g2); + uint8_t ar = AVGB(r0, r2); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); +#else + uint16_t b = b0 + b2; + uint16_t g = g0 + g2; + uint16_t r = r0 + r2; + dst_u[0] = RGB2xToU(r, g, b); + dst_v[0] = RGB2xToV(r, g, b); +#endif } } @@ -877,16 +1246,16 @@ void ARGBShadeRow_C(const uint8_t* src_argb, #define REPEAT8(v) (v) | ((v) << 8) #define SHADE(f, v) v* f >> 16 -void ARGBMultiplyRow_C(const uint8_t* src_argb0, +void ARGBMultiplyRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int i; for (i = 0; i < width; ++i) { - const uint32_t b = REPEAT8(src_argb0[0]); - const uint32_t g = REPEAT8(src_argb0[1]); - const uint32_t r = REPEAT8(src_argb0[2]); - const uint32_t a = REPEAT8(src_argb0[3]); + const uint32_t b = REPEAT8(src_argb[0]); + const uint32_t g = REPEAT8(src_argb[1]); + const uint32_t r = REPEAT8(src_argb[2]); + const uint32_t a = REPEAT8(src_argb[3]); const uint32_t b_scale = src_argb1[0]; const uint32_t g_scale = src_argb1[1]; const uint32_t r_scale = src_argb1[2]; @@ -895,7 +1264,7 @@ void ARGBMultiplyRow_C(const uint8_t* src_argb0, dst_argb[1] = SHADE(g, g_scale); dst_argb[2] = SHADE(r, r_scale); dst_argb[3] = SHADE(a, a_scale); - src_argb0 += 4; + src_argb += 4; src_argb1 += 4; dst_argb += 4; } @@ -905,16 +1274,16 @@ void ARGBMultiplyRow_C(const uint8_t* src_argb0, #define SHADE(f, v) clamp255(v + f) -void ARGBAddRow_C(const uint8_t* src_argb0, +void ARGBAddRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int i; for (i = 0; i < width; ++i) { - const int b = src_argb0[0]; - const int g = src_argb0[1]; - const int r = src_argb0[2]; - const int a = src_argb0[3]; + const int b = src_argb[0]; + const int g = src_argb[1]; + const int r = src_argb[2]; + const int a = src_argb[3]; const int b_add = src_argb1[0]; const int g_add = src_argb1[1]; const int r_add = src_argb1[2]; @@ -923,7 +1292,7 @@ void ARGBAddRow_C(const uint8_t* src_argb0, dst_argb[1] = SHADE(g, g_add); dst_argb[2] = SHADE(r, r_add); dst_argb[3] = SHADE(a, a_add); - src_argb0 += 4; + src_argb += 4; src_argb1 += 4; dst_argb += 4; } @@ -932,16 +1301,16 @@ void ARGBAddRow_C(const uint8_t* src_argb0, #define SHADE(f, v) clamp0(f - v) -void ARGBSubtractRow_C(const uint8_t* src_argb0, +void ARGBSubtractRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int i; for (i = 0; i < width; ++i) { - const int b = src_argb0[0]; - const int g = src_argb0[1]; - const int r = src_argb0[2]; - const int a = src_argb0[3]; + const int b = src_argb[0]; + const int g = src_argb[1]; + const int r = src_argb[2]; + const int a = src_argb[3]; const int b_sub = src_argb1[0]; const int g_sub = src_argb1[1]; const int r_sub = src_argb1[2]; @@ -950,7 +1319,7 @@ void ARGBSubtractRow_C(const uint8_t* src_argb0, dst_argb[1] = SHADE(g, g_sub); dst_argb[2] = SHADE(r, r_sub); dst_argb[3] = SHADE(a, a_sub); - src_argb0 += 4; + src_argb += 4; src_argb1 += 4; dst_argb += 4; } @@ -1058,257 +1427,244 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { } } -// TODO(fbarchard): Unify 
these structures to be platform independent. -// TODO(fbarchard): Generate SIMD structures from float matrix. +// Macros to create SIMD specific yuv to rgb conversion constants. -// BT.601 YUV to RGB reference -// R = (Y - 16) * 1.164 - V * -1.596 -// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 -// B = (Y - 16) * 1.164 - U * -2.018 +// clang-format off -// Y contribution to R,G,B. Scale and bias. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ -#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ +#if defined(__aarch64__) || defined(__arm__) +// Bias values include subtract 128 from U and V, bias from Y and rounding. +// For B and R bias is negative. For G bias is positive. +#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ + {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \ + {YG, (UB * 128 - YB), (UG * 128 + VG * 128 + YB), (VR * 128 - YB), YB, 0, \ + 0, 0}} +#else +#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ + {{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, \ + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, \ + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \ + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \ + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, \ + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, \ + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \ + {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}} +#endif + +// clang-format on + +#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR) \ + const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \ + YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR); \ + const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \ + YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB); + +// TODO(fbarchard): Generate SIMD structures from float matrix. + +// BT.601 limited range YUV to RGB reference +// R = (Y - 16) * 1.164 + V * 1.596 +// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 +// B = (Y - 16) * 1.164 + U * 2.018 +// KR = 0.299; KB = 0.114 // U and V contributions to R,G,B. -#define UB -128 /* max(-128, round(-2.018 * 64)) */ -#define UG 25 /* round(0.391 * 64) */ -#define VG 52 /* round(0.813 * 64) */ -#define VR -102 /* round(-1.596 * 64) */ - -// Bias values to subtract 16 from Y and 128 from U and V. 
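
// Editorial aside: the per-platform BB/BG/BR biases removed below are now
// folded into YUVCONSTANTSBODY above, with the U/V coefficients made
// positive. The algebra is unchanged; for the blue channel on the Arm path:
//   b16 = y1 + u * UB - (UB * 128 - YB) = y1 + UB * (u - 128) + YB
// A minimal standalone check under these assumptions (the helper name and
// test values are hypothetical; the constants are the BT.601 ones nearby):

#include <assert.h>
static void check_bias_folding(void) {
  const int UB_new = 128, YB = -1160;    // new style, positive coefficient
  const int UB_old = -128, YGB = -1160;  // old style, removed below
  const int u = 200, y1 = 5000;          // arbitrary sample values
  int b_new = y1 + u * UB_new - (UB_new * 128 - YB);
  int b_old = -(u * UB_old) + y1 + (UB_old * 128 + YGB);
  assert(b_new == b_old);  // both reduce to y1 + 128 * (u - 128) + YB
}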
-#define BB (UB * 128 + YGB) -#define BG (UG * 128 + VG * 128 + YGB) -#define BR (VR * 128 + YGB) - -#if defined(__aarch64__) // 64 bit arm -const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { - {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, - {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, - {UG, VG, UG, VG, UG, VG, UG, VG}, - {UG, VG, UG, VG, UG, VG, UG, VG}, - {BB, BG, BR, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; -const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { - {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, - {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, - {VG, UG, VG, UG, VG, UG, VG, UG}, - {VG, UG, VG, UG, VG, UG, VG, UG}, - {BR, BG, BB, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; -#elif defined(__arm__) // 32 bit arm -const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { - {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, - {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, - {BB, BG, BR, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; -const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { - {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, - {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, - {BR, BG, BB, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; +#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT601) +#define UB 129 /* round(2.018 * 64) */ #else -const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { - {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, - UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, - {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, - {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, - 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, - {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, - {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, - {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; -const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { - {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, - VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, - {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, - {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, - 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, - {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, - {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, - {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; +#define UB 128 /* max(128, round(2.018 * 64)) */ #endif +#define UG 25 /* round(0.391 * 64) */ +#define VG 52 /* round(0.813 * 64) */ +#define VR 102 /* round(1.596 * 64) */ -#undef BB -#undef BG -#undef BR -#undef YGB +// Y contribution to R,G,B. Scale and bias. 
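
// Editorial note: the 257 divisor below pairs with the Y replication in
// CALC_RGB16 further down (y32 = y * 0x0101, and 0x0101 = 257), so that
//   (y * 257) * round(1.164 * 64 * 65536 / 257) >> 16  ~=  y * 1.164 * 64
// keeping luma in the same 10.6 fixed point as the U/V products.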
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ + +MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB #undef UB #undef UG #undef VG #undef VR -#undef YG -// JPEG YUV to RGB reference -// * R = Y - V * -1.40200 -// * G = Y - U * 0.34414 - V * 0.71414 -// * B = Y - U * -1.77200 +// BT.601 full range YUV to RGB reference (aka JPEG) +// * R = Y + V * 1.40200 +// * G = Y - U * 0.34414 - V * 0.71414 +// * B = Y + U * 1.77200 +// KR = 0.299; KB = 0.114 + +// U and V contributions to R,G,B. +#define UB 113 /* round(1.77200 * 64) */ +#define UG 22 /* round(0.34414 * 64) */ +#define VG 46 /* round(0.71414 * 64) */ +#define VR 90 /* round(1.40200 * 64) */ // Y contribution to R,G,B. Scale and bias. #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ -#define YGB 32 /* 64 / 2 */ +#define YB 32 /* 64 / 2 */ + +MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB +#undef UB +#undef UG +#undef VG +#undef VR + +// BT.709 limited range YUV to RGB reference +// R = (Y - 16) * 1.164 + V * 1.793 +// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533 +// B = (Y - 16) * 1.164 + U * 2.112 +// KR = 0.2126, KB = 0.0722 // U and V contributions to R,G,B. -#define UB -113 /* round(-1.77200 * 64) */ -#define UG 22 /* round(0.34414 * 64) */ -#define VG 46 /* round(0.71414 * 64) */ -#define VR -90 /* round(-1.40200 * 64) */ - -// Bias values to round, and subtract 128 from U and V. -#define BB (UB * 128 + YGB) -#define BG (UG * 128 + VG * 128 + YGB) -#define BR (VR * 128 + YGB) - -#if defined(__aarch64__) -const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { - {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, - {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, - {UG, VG, UG, VG, UG, VG, UG, VG}, - {UG, VG, UG, VG, UG, VG, UG, VG}, - {BB, BG, BR, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; -const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { - {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, - {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, - {VG, UG, VG, UG, VG, UG, VG, UG}, - {VG, UG, VG, UG, VG, UG, VG, UG}, - {BR, BG, BB, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; -#elif defined(__arm__) -const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { - {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, - {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, - {BB, BG, BR, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; -const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { - {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, - {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, - {BR, BG, BB, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; +#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT709) +#define UB 135 /* round(2.112 * 64) */ #else -const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { - {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, - UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, - {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, - {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, - 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, - {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, - {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, - {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; -const struct YuvConstants 
SIMD_ALIGNED(kYvuJPEGConstants) = { - {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, - VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, - {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, - {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, - 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, - {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, - {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, - {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; +#define UB 128 /* max(128, round(2.112 * 64)) */ #endif +#define UG 14 /* round(0.213 * 64) */ +#define VG 34 /* round(0.533 * 64) */ +#define VR 115 /* round(1.793 * 64) */ -#undef BB -#undef BG -#undef BR -#undef YGB +// Y contribution to R,G,B. Scale and bias. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ + +MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB #undef UB #undef UG #undef VG #undef VR -#undef YG -// BT.709 YUV to RGB reference -// R = (Y - 16) * 1.164 - V * -1.793 -// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533 -// B = (Y - 16) * 1.164 - U * -2.112 -// See also http://www.equasys.de/colorconversion.html +// BT.709 full range YUV to RGB reference +// R = Y + V * 1.5748 +// G = Y - U * 0.18732 - V * 0.46812 +// B = Y + U * 1.8556 +// KR = 0.2126, KB = 0.0722 -// Y contribution to R,G,B. Scale and bias. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ -#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ +// U and V contributions to R,G,B. +#define UB 119 /* round(1.8556 * 64) */ +#define UG 12 /* round(0.18732 * 64) */ +#define VG 30 /* round(0.46812 * 64) */ +#define VR 101 /* round(1.5748 * 64) */ + +// Y contribution to R,G,B. Scale and bias. (same as jpeg) +#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */ +#define YB 32 /* 64 / 2 */ + +MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB +#undef UB +#undef UG +#undef VG +#undef VR + +// BT.2020 limited range YUV to RGB reference +// R = (Y - 16) * 1.164384 + V * 1.67867 +// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042 +// B = (Y - 16) * 1.164384 + U * 2.14177 +// KR = 0.2627; KB = 0.0593 -// TODO(fbarchard): Find way to express 2.112 instead of 2.0. // U and V contributions to R,G,B. -#define UB -128 /* max(-128, round(-2.112 * 64)) */ -#define UG 14 /* round(0.213 * 64) */ -#define VG 34 /* round(0.533 * 64) */ -#define VR -115 /* round(-1.793 * 64) */ - -// Bias values to round, and subtract 128 from U and V. 
-#define BB (UB * 128 + YGB) -#define BG (UG * 128 + VG * 128 + YGB) -#define BR (VR * 128 + YGB) - -#if defined(__aarch64__) -const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { - {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, - {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, - {UG, VG, UG, VG, UG, VG, UG, VG}, - {UG, VG, UG, VG, UG, VG, UG, VG}, - {BB, BG, BR, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; -const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { - {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, - {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, - {VG, UG, VG, UG, VG, UG, VG, UG}, - {VG, UG, VG, UG, VG, UG, VG, UG}, - {BR, BG, BB, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; -#elif defined(__arm__) -const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { - {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, - {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, - {BB, BG, BR, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; -const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { - {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, - {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, - {BR, BG, BB, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; +#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT2020) +#define UB 137 /* round(2.142 * 64) */ #else -const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { - {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, - UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, - {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, - {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, - 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, - {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, - {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, - {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; -const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { - {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, - VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, - {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, - {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, - 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, - {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, - {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, - {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; +#define UB 128 /* max(128, round(2.142 * 64)) */ #endif +#define UG 12 /* round(0.187326 * 64) */ +#define VG 42 /* round(0.65042 * 64) */ +#define VR 107 /* round(1.67867 * 64) */ -#undef BB -#undef BG -#undef BR -#undef YGB +// Y contribution to R,G,B. Scale and bias. 
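
// Editorial note: 1.164384 is the exact limited range luma scale
// 255 / 219, where the BT.601/709 tables above round it to 1.164; that is
// why YG is 19003 here rather than 18997.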
+#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */ +#define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */ + +MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB #undef UB #undef UG #undef VG #undef VR + +// BT.2020 full range YUV to RGB reference +// R = Y + V * 1.474600 +// G = Y - U * 0.164553 - V * 0.571353 +// B = Y + U * 1.881400 +// KR = 0.2627; KB = 0.0593 + +#define UB 120 /* round(1.881400 * 64) */ +#define UG 11 /* round(0.164553 * 64) */ +#define VG 37 /* round(0.571353 * 64) */ +#define VR 94 /* round(1.474600 * 64) */ + +// Y contribution to R,G,B. Scale and bias. (same as jpeg) +#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */ +#define YB 32 /* 64 / 2 */ + +MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR) + #undef YG +#undef YB +#undef UB +#undef UG +#undef VG +#undef VR + +#undef BB +#undef BG +#undef BR + +#undef MAKEYUVCONSTANTS + +#if defined(__aarch64__) || defined(__arm__) +#define LOAD_YUV_CONSTANTS \ + int ub = yuvconstants->kUVCoeff[0]; \ + int vr = yuvconstants->kUVCoeff[1]; \ + int ug = yuvconstants->kUVCoeff[2]; \ + int vg = yuvconstants->kUVCoeff[3]; \ + int yg = yuvconstants->kRGBCoeffBias[0]; \ + int bb = yuvconstants->kRGBCoeffBias[1]; \ + int bg = yuvconstants->kRGBCoeffBias[2]; \ + int br = yuvconstants->kRGBCoeffBias[3] + +#define CALC_RGB16 \ + int32_t y1 = (uint32_t)(y32 * yg) >> 16; \ + int b16 = y1 + (u * ub) - bb; \ + int g16 = y1 + bg - (u * ug + v * vg); \ + int r16 = y1 + (v * vr) - br +#else +#define LOAD_YUV_CONSTANTS \ + int ub = yuvconstants->kUVToB[0]; \ + int ug = yuvconstants->kUVToG[0]; \ + int vg = yuvconstants->kUVToG[1]; \ + int vr = yuvconstants->kUVToR[1]; \ + int yg = yuvconstants->kYToRgb[0]; \ + int yb = yuvconstants->kYBiasToRgb[0] + +#define CALC_RGB16 \ + int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \ + int8_t ui = (int8_t)u; \ + int8_t vi = (int8_t)v; \ + ui -= 0x80; \ + vi -= 0x80; \ + int b16 = y1 + (ui * ub); \ + int g16 = y1 - (ui * ug + vi * vg); \ + int r16 = y1 + (vi * vr) +#endif // C reference code that mimics the YUV assembly. // Reads 8 bit YUV and leaves result as 16 bit. 
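
// Editorial sketch: the x86 branch of LOAD_YUV_CONSTANTS / CALC_RGB16
// written out for one pixel, without the macros. Illustrative only; the
// function name is hypothetical and the constants are the BT.601 values
// defined above:

#include <stdint.h>
static void yuv_to_rgb16_sketch(uint8_t y, uint8_t u, uint8_t v,
                                int* b16, int* g16, int* r16) {
  const int ub = 128, ug = 25, vg = 52, vr = 102;  // kUVToB/G/R lanes
  const int yg = 18997, yb = -1160;                // kYToRgb, kYBiasToRgb
  uint32_t y32 = (uint32_t)y * 0x0101;             // replicate to 16 bits
  int32_t y1 = (int32_t)((y32 * yg) >> 16) + yb;   // 10.6 fixed point luma
  int8_t ui = (int8_t)(u - 0x80);                  // center chroma on zero
  int8_t vi = (int8_t)(v - 0x80);
  *b16 = y1 + ui * ub;  // caller clamps and shifts >> 6
  *g16 = y1 - (ui * ug + vi * vg);
  *r16 = y1 + vi * vr;
}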
- static __inline void YuvPixel(uint8_t y, uint8_t u, uint8_t v, @@ -1316,39 +1672,12 @@ static __inline void YuvPixel(uint8_t y, uint8_t* g, uint8_t* r, const struct YuvConstants* yuvconstants) { -#if defined(__aarch64__) - int ub = -yuvconstants->kUVToRB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[1]; - int vr = -yuvconstants->kUVToRB[1]; - int bb = yuvconstants->kUVBiasBGR[0]; - int bg = yuvconstants->kUVBiasBGR[1]; - int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[0] / 0x0101; -#elif defined(__arm__) - int ub = -yuvconstants->kUVToRB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[4]; - int vr = -yuvconstants->kUVToRB[4]; - int bb = yuvconstants->kUVBiasBGR[0]; - int bg = yuvconstants->kUVBiasBGR[1]; - int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[0] / 0x0101; -#else - int ub = yuvconstants->kUVToB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[1]; - int vr = yuvconstants->kUVToR[1]; - int bb = yuvconstants->kUVBiasB[0]; - int bg = yuvconstants->kUVBiasG[0]; - int br = yuvconstants->kUVBiasR[0]; - int yg = yuvconstants->kYToRgb[0]; -#endif - - uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; - *b = Clamp((int32_t)(-(u * ub) + y1 + bb) >> 6); - *g = Clamp((int32_t)(-(u * ug + v * vg) + y1 + bg) >> 6); - *r = Clamp((int32_t)(-(v * vr) + y1 + br) >> 6); + LOAD_YUV_CONSTANTS; + uint32_t y32 = y * 0x0101; + CALC_RGB16; + *b = Clamp((int32_t)(b16) >> 6); + *g = Clamp((int32_t)(g16) >> 6); + *r = Clamp((int32_t)(r16) >> 6); } // Reads 8 bit YUV and leaves result as 16 bit. @@ -1359,85 +1688,50 @@ static __inline void YuvPixel8_16(uint8_t y, int* g, int* r, const struct YuvConstants* yuvconstants) { -#if defined(__aarch64__) - int ub = -yuvconstants->kUVToRB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[1]; - int vr = -yuvconstants->kUVToRB[1]; - int bb = yuvconstants->kUVBiasBGR[0]; - int bg = yuvconstants->kUVBiasBGR[1]; - int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[0] / 0x0101; -#elif defined(__arm__) - int ub = -yuvconstants->kUVToRB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[4]; - int vr = -yuvconstants->kUVToRB[4]; - int bb = yuvconstants->kUVBiasBGR[0]; - int bg = yuvconstants->kUVBiasBGR[1]; - int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[0] / 0x0101; -#else - int ub = yuvconstants->kUVToB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[1]; - int vr = yuvconstants->kUVToR[1]; - int bb = yuvconstants->kUVBiasB[0]; - int bg = yuvconstants->kUVBiasG[0]; - int br = yuvconstants->kUVBiasR[0]; - int yg = yuvconstants->kYToRgb[0]; -#endif - - uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; - *b = (int)(-(u * ub) + y1 + bb); - *g = (int)(-(u * ug + v * vg) + y1 + bg); - *r = (int)(-(v * vr) + y1 + br); + LOAD_YUV_CONSTANTS; + uint32_t y32 = y * 0x0101; + CALC_RGB16; + *b = b16; + *g = g16; + *r = r16; } // C reference code that mimics the YUV 16 bit assembly. // Reads 10 bit YUV and leaves result as 16 bit. 
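
// Editorial note: the 10 and 12 bit readers below normalize to the same
// intermediate form before CALC_RGB16: Y is scaled up to 16 bits and UV
// down to 8 bits.
//   10 bit: y32 = y << 6, u = clamp255(u >> 2)
//   12 bit: y32 = y << 4, u = clamp255(u >> 4)
// The clamp255 guards out of range input (e.g. a 10 bit sample above 1023
// stored in a 16 bit lane).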
-static __inline void YuvPixel16(int16_t y, - int16_t u, - int16_t v, - int* b, - int* g, - int* r, - const struct YuvConstants* yuvconstants) { -#if defined(__aarch64__) - int ub = -yuvconstants->kUVToRB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[1]; - int vr = -yuvconstants->kUVToRB[1]; - int bb = yuvconstants->kUVBiasBGR[0]; - int bg = yuvconstants->kUVBiasBGR[1]; - int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[0] / 0x0101; -#elif defined(__arm__) - int ub = -yuvconstants->kUVToRB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[4]; - int vr = -yuvconstants->kUVToRB[4]; - int bb = yuvconstants->kUVBiasBGR[0]; - int bg = yuvconstants->kUVBiasBGR[1]; - int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[0] / 0x0101; -#else - int ub = yuvconstants->kUVToB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[1]; - int vr = yuvconstants->kUVToR[1]; - int bb = yuvconstants->kUVBiasB[0]; - int bg = yuvconstants->kUVBiasG[0]; - int br = yuvconstants->kUVBiasR[0]; - int yg = yuvconstants->kYToRgb[0]; -#endif - - uint32_t y1 = (uint32_t)((y << 6) * yg) >> 16; +static __inline void YuvPixel10_16(uint16_t y, + uint16_t u, + uint16_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { + LOAD_YUV_CONSTANTS; + uint32_t y32 = y << 6; u = clamp255(u >> 2); v = clamp255(v >> 2); - *b = (int)(-(u * ub) + y1 + bb); - *g = (int)(-(u * ug + v * vg) + y1 + bg); - *r = (int)(-(v * vr) + y1 + br); + CALC_RGB16; + *b = b16; + *g = g16; + *r = r16; +} + +// C reference code that mimics the YUV 16 bit assembly. +// Reads 12 bit YUV and leaves result as 16 bit. +static __inline void YuvPixel12_16(int16_t y, + int16_t u, + int16_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { + LOAD_YUV_CONSTANTS; + uint32_t y32 = y << 4; + u = clamp255(u >> 4); + v = clamp255(v >> 4); + CALC_RGB16; + *b = b16; + *g = g16; + *r = r16; } // C reference code that mimics the YUV 10 bit assembly. @@ -1452,59 +1746,88 @@ static __inline void YuvPixel10(uint16_t y, int b16; int g16; int r16; - YuvPixel16(y, u, v, &b16, &g16, &r16, yuvconstants); + YuvPixel10_16(y, u, v, &b16, &g16, &r16, yuvconstants); *b = Clamp(b16 >> 6); *g = Clamp(g16 >> 6); *r = Clamp(r16 >> 6); } -// Y contribution to R,G,B. Scale and bias. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ -#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ - -// C reference code that mimics the YUV assembly. -static __inline void YPixel(uint8_t y, uint8_t* b, uint8_t* g, uint8_t* r) { - uint32_t y1 = (uint32_t)(y * 0x0101 * YG) >> 16; - *b = Clamp((int32_t)(y1 + YGB) >> 6); - *g = Clamp((int32_t)(y1 + YGB) >> 6); - *r = Clamp((int32_t)(y1 + YGB) >> 6); +// C reference code that mimics the YUV 12 bit assembly. +// Reads 12 bit YUV and clamps down to 8 bit RGB. +static __inline void YuvPixel12(uint16_t y, + uint16_t u, + uint16_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { + int b16; + int g16; + int r16; + YuvPixel12_16(y, u, v, &b16, &g16, &r16, yuvconstants); + *b = Clamp(b16 >> 6); + *g = Clamp(g16 >> 6); + *r = Clamp(r16 >> 6); } -#undef YG -#undef YGB +// C reference code that mimics the YUV 16 bit assembly. +// Reads 16 bit YUV and leaves result as 8 bit. 
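
// Editorial note: the 16 bit readers below serve the P210/P410 style
// buffers described later, which keep their 10 significant bits in the
// msbs of each 16 bit sample; u >> 8 therefore takes the top 8 bits
// directly, and Y is already at full 16 bit scale (y32 = y).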
+static __inline void YuvPixel16_8(uint16_t y, + uint16_t u, + uint16_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { + LOAD_YUV_CONSTANTS; + uint32_t y32 = y; + u = clamp255(u >> 8); + v = clamp255(v >> 8); + CALC_RGB16; + *b = Clamp((int32_t)(b16) >> 6); + *g = Clamp((int32_t)(g16) >> 6); + *r = Clamp((int32_t)(r16) >> 6); +} -#if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON)) -// C mimic assembly. -// TODO(fbarchard): Remove subsampling from Neon. -void I444ToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - uint8_t u = (src_u[0] + src_u[1] + 1) >> 1; - uint8_t v = (src_v[0] + src_v[1] + 1) >> 1; - YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, - yuvconstants); - rgb_buf[3] = 255; - YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, - yuvconstants); - rgb_buf[7] = 255; - src_y += 2; - src_u += 2; - src_v += 2; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - } +// C reference code that mimics the YUV 16 bit assembly. +// Reads 16 bit YUV and leaves result as 16 bit. +static __inline void YuvPixel16_16(uint16_t y, + uint16_t u, + uint16_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { + LOAD_YUV_CONSTANTS; + uint32_t y32 = y; + u = clamp255(u >> 8); + v = clamp255(v >> 8); + CALC_RGB16; + *b = b16; + *g = g16; + *r = r16; } + +// C reference code that mimics the YUV assembly. +// Reads 8 bit YUV and leaves result as 8 bit. +static __inline void YPixel(uint8_t y, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) || defined(__arm__) + int yg = yuvconstants->kRGBCoeffBias[0]; + int ygb = yuvconstants->kRGBCoeffBias[4]; #else + int ygb = yuvconstants->kYBiasToRgb[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; + *b = Clamp(((int32_t)(y1) + ygb) >> 6); + *g = Clamp(((int32_t)(y1) + ygb) >> 6); + *r = Clamp(((int32_t)(y1) + ygb) >> 6); +} + void I444ToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1522,7 +1845,6 @@ void I444ToARGBRow_C(const uint8_t* src_y, rgb_buf += 4; // Advance 1 pixel. } } -#endif // Also used for 420 void I422ToARGBRow_C(const uint8_t* src_y, @@ -1578,9 +1900,102 @@ void I210ToARGBRow_C(const uint16_t* src_y, } } +void I410ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + src_y += 1; + src_u += 1; + src_v += 1; + rgb_buf += 4; // Advance 1 pixels. 
+ } +} + +void I210AlphaToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = clamp255(src_a[0] >> 2); + YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = clamp255(src_a[1] >> 2); + src_y += 2; + src_u += 1; + src_v += 1; + src_a += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = clamp255(src_a[0] >> 2); + } +} + +void I410AlphaToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = clamp255(src_a[0] >> 2); + src_y += 1; + src_u += 1; + src_v += 1; + src_a += 1; + rgb_buf += 4; // Advance 1 pixels. + } +} + +// 12 bit YUV to ARGB +void I212ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel12(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) { uint32_t ar30; - b = b >> 4; // convert 10.6 to 10 bit. + b = b >> 4; // convert 8 bit 10.6 to 10 bit. g = g >> 4; r = r >> 4; b = Clamp10(b); @@ -1602,9 +2017,9 @@ void I210ToAR30Row_C(const uint16_t* src_y, int g; int r; for (x = 0; x < width - 1; x += 2) { - YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf, b, g, r); - YuvPixel16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + YuvPixel10_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf + 4, b, g, r); src_y += 2; src_u += 1; @@ -1612,11 +2027,141 @@ void I210ToAR30Row_C(const uint16_t* src_y, rgb_buf += 8; // Advance 2 pixels. 
} if (width & 1) { - YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf, b, g, r); } } +// 12 bit YUV to 10 bit AR30 +void I212ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + YuvPixel12_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf + 4, b, g, r); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + } +} + +void I410ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width; ++x) { + YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + src_y += 1; + src_u += 1; + src_v += 1; + rgb_buf += 4; // Advance 1 pixel. + } +} + +// P210 has 10 bits in msb of 16 bit NV12 style layout. +void P210ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1, + dst_argb + 2, yuvconstants); + dst_argb[3] = 255; + YuvPixel16_8(src_y[1], src_uv[0], src_uv[1], dst_argb + 4, dst_argb + 5, + dst_argb + 6, yuvconstants); + dst_argb[7] = 255; + src_y += 2; + src_uv += 2; + dst_argb += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1, + dst_argb + 2, yuvconstants); + dst_argb[3] = 255; + } +} + +void P410ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1, + dst_argb + 2, yuvconstants); + dst_argb[3] = 255; + src_y += 1; + src_uv += 2; + dst_argb += 4; // Advance 1 pixels. + } +} + +void P210ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); + StoreAR30(dst_ar30, b, g, r); + YuvPixel16_16(src_y[1], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); + StoreAR30(dst_ar30 + 4, b, g, r); + src_y += 2; + src_uv += 2; + dst_ar30 += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); + StoreAR30(dst_ar30, b, g, r); + } +} + +void P410ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width; ++x) { + YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); + StoreAR30(dst_ar30, b, g, r); + src_y += 1; + src_uv += 2; + dst_ar30 += 4; // Advance 1 pixel. 
+ } +} + // 8 bit YUV to 10 bit AR30 // Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits. void I422ToAR30Row_C(const uint8_t* src_y, @@ -1645,6 +2190,26 @@ void I422ToAR30Row_C(const uint8_t* src_y, } } +void I444AlphaToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = src_a[0]; + src_y += 1; + src_u += 1; + src_v += 1; + src_a += 1; + rgb_buf += 4; // Advance 1 pixel. + } +} + void I422AlphaToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1718,8 +2283,8 @@ void I422ToARGB4444Row_C(const uint8_t* src_y, b1 = b1 >> 4; g1 = g1 >> 4; r1 = r1 >> 4; - *(uint32_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) | - (g1 << 20) | (r1 << 24) | 0xf000f000; + *(uint16_t*)(dst_argb4444 + 0) = b0 | (g0 << 4) | (r0 << 8) | 0xf000; + *(uint16_t*)(dst_argb4444 + 2) = b1 | (g1 << 4) | (r1 << 8) | 0xf000; src_y += 2; src_u += 1; src_v += 1; @@ -1756,8 +2321,8 @@ void I422ToARGB1555Row_C(const uint8_t* src_y, b1 = b1 >> 3; g1 = g1 >> 3; r1 = r1 >> 3; - *(uint32_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) | - (g1 << 21) | (r1 << 26) | 0x80008000; + *(uint16_t*)(dst_argb1555 + 0) = b0 | (g0 << 5) | (r0 << 10) | 0x8000; + *(uint16_t*)(dst_argb1555 + 2) = b1 | (g1 << 5) | (r1 << 10) | 0x8000; src_y += 2; src_u += 1; src_v += 1; @@ -1794,8 +2359,8 @@ void I422ToRGB565Row_C(const uint8_t* src_y, b1 = b1 >> 3; g1 = g1 >> 2; r1 = r1 >> 3; - *(uint32_t*)(dst_rgb565) = - b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27); + *(uint16_t*)(dst_rgb565 + 0) = b0 | (g0 << 5) | (r0 << 11); // for ubsan + *(uint16_t*)(dst_rgb565 + 2) = b1 | (g1 << 5) | (r1 << 11); src_y += 2; src_u += 1; src_v += 1; @@ -1921,8 +2486,8 @@ void NV12ToRGB565Row_C(const uint8_t* src_y, b1 = b1 >> 3; g1 = g1 >> 2; r1 = r1 >> 3; - *(uint32_t*)(dst_rgb565) = - b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27); + *(uint16_t*)(dst_rgb565 + 0) = b0 | (g0 << 5) | (r0 << 11); + *(uint16_t*)(dst_rgb565 + 2) = b1 | (g1 << 5) | (r1 << 11); src_y += 2; src_uv += 2; dst_rgb565 += 4; // Advance 2 pixels. @@ -2006,18 +2571,21 @@ void I422ToRGBARow_C(const uint8_t* src_y, } } -void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width) { +void I400ToARGBRow_C(const uint8_t* src_y, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { int x; for (x = 0; x < width - 1; x += 2) { - YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; rgb_buf += 8; // Advance 2 pixels. 
} if (width & 1) { - YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -2035,10 +2603,21 @@ void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { } } -void MirrorUVRow_C(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { +void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + int x; + src_uv += (width - 1) << 1; + for (x = 0; x < width; ++x) { + dst_uv[0] = src_uv[0]; + dst_uv[1] = src_uv[1]; + src_uv -= 2; + dst_uv += 2; + } +} + +void MirrorSplitUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { int x; src_uv += (width - 1) << 1; for (x = 0; x < width - 1; x += 2) { @@ -2069,6 +2648,21 @@ void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { } } +void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) { + int x; + src_rgb24 += width * 3 - 3; + for (x = 0; x < width; ++x) { + uint8_t b = src_rgb24[0]; + uint8_t g = src_rgb24[1]; + uint8_t r = src_rgb24[2]; + dst_rgb24[0] = b; + dst_rgb24[1] = g; + dst_rgb24[2] = r; + src_rgb24 -= 3; + dst_rgb24 += 3; + } +} + void SplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, @@ -2105,6 +2699,38 @@ void MergeUVRow_C(const uint8_t* src_u, } } +void DetileRow_C(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width) { + int x; + for (x = 0; x < width - 15; x += 16) { + memcpy(dst, src, 16); + dst += 16; + src += src_tile_stride; + } + if (width & 15) { + memcpy(dst, src, width & 15); + } +} + +void DetileSplitUVRow_C(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + for (x = 0; x < width - 15; x += 16) { + SplitUVRow_C(src_uv, dst_u, dst_v, 8); + dst_u += 8; + dst_v += 8; + src_uv += src_tile_stride; + } + if (width & 15) { + SplitUVRow_C(src_uv, dst_u, dst_v, ((width & 15) + 1) / 2); + } +} + void SplitRGBRow_C(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, @@ -2133,27 +2759,197 @@ void MergeRGBRow_C(const uint8_t* src_r, } } -// Use scale to convert lsb formats to msb, depending how many bits there are: -// 128 = 9 bits -// 64 = 10 bits -// 16 = 12 bits -// 1 = 16 bits +void SplitARGBRow_C(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_b[x] = src_argb[0]; + dst_g[x] = src_argb[1]; + dst_r[x] = src_argb[2]; + dst_a[x] = src_argb[3]; + src_argb += 4; + } +} + +void MergeARGBRow_C(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_argb[0] = src_b[x]; + dst_argb[1] = src_g[x]; + dst_argb[2] = src_r[x]; + dst_argb[3] = src_a[x]; + dst_argb += 4; + } +} + +void MergeXR30Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width) { + assert(depth >= 10); + assert(depth <= 16); + int x; + int shift = depth - 10; + uint32_t* dst_ar30_32 = (uint32_t*)dst_ar30; + for (x = 0; x < width; ++x) { + uint32_t r = clamp1023(src_r[x] >> shift); + uint32_t g = clamp1023(src_g[x] >> shift); + uint32_t b = clamp1023(src_b[x] >> shift); + dst_ar30_32[x] = b | (g << 10) | (r << 20) | 0xc0000000; + } +} + +void MergeAR64Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + 
int depth, + int width) { + assert(depth >= 1); + assert(depth <= 16); + int x; + int shift = 16 - depth; + int max = (1 << depth) - 1; + for (x = 0; x < width; ++x) { + dst_ar64[0] = ClampMax(src_b[x], max) << shift; + dst_ar64[1] = ClampMax(src_g[x], max) << shift; + dst_ar64[2] = ClampMax(src_r[x], max) << shift; + dst_ar64[3] = ClampMax(src_a[x], max) << shift; + dst_ar64 += 4; + } +} + +void MergeARGB16To8Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width) { + assert(depth >= 8); + assert(depth <= 16); + int x; + int shift = depth - 8; + for (x = 0; x < width; ++x) { + dst_argb[0] = clamp255(src_b[x] >> shift); + dst_argb[1] = clamp255(src_g[x] >> shift); + dst_argb[2] = clamp255(src_r[x] >> shift); + dst_argb[3] = clamp255(src_a[x] >> shift); + dst_argb += 4; + } +} + +void MergeXR64Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width) { + assert(depth >= 1); + assert(depth <= 16); + int x; + int shift = 16 - depth; + int max = (1 << depth) - 1; + for (x = 0; x < width; ++x) { + dst_ar64[0] = ClampMax(src_b[x], max) << shift; + dst_ar64[1] = ClampMax(src_g[x], max) << shift; + dst_ar64[2] = ClampMax(src_r[x], max) << shift; + dst_ar64[3] = 0xffff; + dst_ar64 += 4; + } +} + +void MergeXRGB16To8Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width) { + assert(depth >= 8); + assert(depth <= 16); + int x; + int shift = depth - 8; + for (x = 0; x < width; ++x) { + dst_argb[0] = clamp255(src_b[x] >> shift); + dst_argb[1] = clamp255(src_g[x] >> shift); + dst_argb[2] = clamp255(src_r[x] >> shift); + dst_argb[3] = 0xff; + dst_argb += 4; + } +} + +void SplitXRGBRow_C(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_b[x] = src_argb[0]; + dst_g[x] = src_argb[1]; + dst_r[x] = src_argb[2]; + src_argb += 4; + } +} + +void MergeXRGBRow_C(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_argb[0] = src_b[x]; + dst_argb[1] = src_g[x]; + dst_argb[2] = src_r[x]; + dst_argb[3] = 255; + dst_argb += 4; + } +} + +// Convert lsb formats to msb, depending on sample depth. void MergeUVRow_16_C(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, - int scale, + int depth, int width) { + int shift = 16 - depth; + assert(depth >= 8); + assert(depth <= 16); int x; - for (x = 0; x < width - 1; x += 2) { - dst_uv[0] = src_u[x] * scale; - dst_uv[1] = src_v[x] * scale; - dst_uv[2] = src_u[x + 1] * scale; - dst_uv[3] = src_v[x + 1] * scale; - dst_uv += 4; + for (x = 0; x < width; ++x) { + dst_uv[0] = src_u[x] << shift; + dst_uv[1] = src_v[x] << shift; + dst_uv += 2; } - if (width & 1) { - dst_uv[0] = src_u[width - 1] * scale; - dst_uv[1] = src_v[width - 1] * scale; +} + +// Convert msb formats to lsb, depending on sample depth. 
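
// Editorial sketch: round tripping a short row of 10 bit UV through
// MergeUVRow_16_C above and SplitUVRow_16_C below. Merge left aligns the
// samples into the top bits of each 16 bit lane, P010 style; Split shifts
// them back down. The helper name and test values are hypothetical; the
// extern declarations simply mirror the definitions nearby:

#include <stdint.h>
extern void MergeUVRow_16_C(const uint16_t* src_u, const uint16_t* src_v,
                            uint16_t* dst_uv, int depth, int width);
extern void SplitUVRow_16_C(const uint16_t* src_uv, uint16_t* dst_u,
                            uint16_t* dst_v, int depth, int width);
void uv10_round_trip_sketch(void) {
  uint16_t u[4] = {0, 256, 512, 1023};  // 10 bit samples, lsb aligned
  uint16_t v[4] = {1023, 512, 256, 0};
  uint16_t uv[8], u2[4], v2[4];
  MergeUVRow_16_C(u, v, uv, /*depth=*/10, /*width=*/4);    // lsb -> msb
  SplitUVRow_16_C(uv, u2, v2, /*depth=*/10, /*width=*/4);  // msb -> lsb
  // u2/v2 now equal u/v, since << 6 then >> 6 is lossless for 10 bits.
}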
+void SplitUVRow_16_C(const uint16_t* src_uv, + uint16_t* dst_u, + uint16_t* dst_v, + int depth, + int width) { + int shift = 16 - depth; + int x; + assert(depth >= 8); + assert(depth <= 16); + for (x = 0; x < width; ++x) { + dst_u[x] = src_uv[0] >> shift; + dst_v[x] = src_uv[1] >> shift; + src_uv += 2; } } @@ -2167,18 +2963,34 @@ void MultiplyRow_16_C(const uint16_t* src_y, } } +void DivideRow_16_C(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_y[x] = (src_y[x] * scale) >> 16; + } +} + // Use scale to convert lsb formats to msb, depending how many bits there are: // 32768 = 9 bits // 16384 = 10 bits // 4096 = 12 bits // 256 = 16 bits +// TODO(fbarchard): change scale to bits +#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16) + void Convert16To8Row_C(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) { int x; + assert(scale >= 256); + assert(scale <= 32768); + for (x = 0; x < width; ++x) { - dst_y[x] = clamp255((src_y[x] * scale) >> 16); + dst_y[x] = C16TO8(src_y[x], scale); } } @@ -2208,10 +3020,9 @@ void SetRow_C(uint8_t* dst, uint8_t v8, int width) { } void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) { - uint32_t* d = (uint32_t*)(dst_argb); int x; for (x = 0; x < width; ++x) { - d[x] = v32; + memcpy(dst_argb + x * sizeof v32, &v32, sizeof v32); } } @@ -2309,21 +3120,21 @@ void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { } } -#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f +#define BLEND(f, b, a) clamp255((((256 - a) * b) >> 8) + f) -// Blend src_argb0 over src_argb1 and store to dst_argb. -// dst_argb may be src_argb0 or src_argb1. +// Blend src_argb over src_argb1 and store to dst_argb. +// dst_argb may be src_argb or src_argb1. // This code mimics the SSSE3 version for better testability. -void ARGBBlendRow_C(const uint8_t* src_argb0, +void ARGBBlendRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint32_t fb = src_argb0[0]; - uint32_t fg = src_argb0[1]; - uint32_t fr = src_argb0[2]; - uint32_t a = src_argb0[3]; + uint32_t fb = src_argb[0]; + uint32_t fg = src_argb[1]; + uint32_t fr = src_argb[2]; + uint32_t a = src_argb[3]; uint32_t bb = src_argb1[0]; uint32_t bg = src_argb1[1]; uint32_t br = src_argb1[2]; @@ -2332,10 +3143,10 @@ void ARGBBlendRow_C(const uint8_t* src_argb0, dst_argb[2] = BLEND(fr, br, a); dst_argb[3] = 255u; - fb = src_argb0[4 + 0]; - fg = src_argb0[4 + 1]; - fr = src_argb0[4 + 2]; - a = src_argb0[4 + 3]; + fb = src_argb[4 + 0]; + fg = src_argb[4 + 1]; + fr = src_argb[4 + 2]; + a = src_argb[4 + 3]; bb = src_argb1[4 + 0]; bg = src_argb1[4 + 1]; br = src_argb1[4 + 2]; @@ -2343,16 +3154,16 @@ void ARGBBlendRow_C(const uint8_t* src_argb0, dst_argb[4 + 1] = BLEND(fg, bg, a); dst_argb[4 + 2] = BLEND(fr, br, a); dst_argb[4 + 3] = 255u; - src_argb0 += 8; + src_argb += 8; src_argb1 += 8; dst_argb += 8; } if (width & 1) { - uint32_t fb = src_argb0[0]; - uint32_t fg = src_argb0[1]; - uint32_t fr = src_argb0[2]; - uint32_t a = src_argb0[3]; + uint32_t fb = src_argb[0]; + uint32_t fg = src_argb[1]; + uint32_t fr = src_argb[2]; + uint32_t a = src_argb[3]; uint32_t bb = src_argb1[0]; uint32_t bg = src_argb1[1]; uint32_t br = src_argb1[2]; @@ -2385,10 +3196,14 @@ void BlendPlaneRow_C(const uint8_t* src0, } #undef UBLEND +#if LIBYUV_ATTENUATE_DUP +// This code mimics the SSSE3 version for better testability. 
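
// Editorial note: duplicating both operands, (a | a << 8) * (f | f << 8),
// multiplies f * a by 257 * 257 = 66049, and >> 24 then divides by about
// 254, a close approximation of the exact f * a / 255; the non DUP branch
// is the cheaper f * a / 256 with rounding. For f = a = 255:
//   DUP branch:     (0xffff * 0xffff) >> 24 = 255
//   non DUP branch: (255 * 255 + 128) >> 8  = 254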
#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24 +#else +#define ATTENUATE(f, a) (f * a + 128) >> 8 +#endif // Multiply source RGB by alpha and store to destination. -// This code mimics the SSSE3 version for better testability. void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { int i; for (i = 0; i < width - 1; i += 2) { @@ -2472,6 +3287,14 @@ const uint32_t fixed_invtbl8[256] = { T(0xfc), T(0xfd), T(0xfe), 0x01000100}; #undef T +#if LIBYUV_UNATTENUATE_DUP +// This code mimics the Intel SIMD version for better testability. +#define UNATTENUATE(f, ia) clamp255(((f | (f << 8)) * ia) >> 16) +#else +#define UNATTENUATE(f, ia) clamp255((f * ia) >> 8) +#endif + +// mimics the Intel SIMD code for exactness. void ARGBUnattenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { @@ -2482,13 +3305,11 @@ void ARGBUnattenuateRow_C(const uint8_t* src_argb, uint32_t r = src_argb[2]; const uint32_t a = src_argb[3]; const uint32_t ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point - b = (b * ia) >> 8; - g = (g * ia) >> 8; - r = (r * ia) >> 8; + // Clamping should not be necessary but is free in assembly. - dst_argb[0] = clamp255(b); - dst_argb[1] = clamp255(g); - dst_argb[2] = clamp255(r); + dst_argb[0] = UNATTENUATE(b, ia); + dst_argb[1] = UNATTENUATE(g, ia); + dst_argb[2] = UNATTENUATE(r, ia); dst_argb[3] = a; src_argb += 4; dst_argb += 4; @@ -2519,8 +3340,11 @@ void CumulativeSumToAverageRow_C(const int32_t* tl, int area, uint8_t* dst, int count) { - float ooa = 1.0f / area; + float ooa; int i; + assert(area != 0); + + ooa = 1.0f / area; for (i = 0; i < count; ++i) { dst[0] = (uint8_t)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); dst[1] = (uint8_t)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); @@ -2576,6 +3400,17 @@ static void HalfRow_16_C(const uint16_t* src_uv, } } +static void HalfRow_16To8_C(const uint16_t* src_uv, + ptrdiff_t src_uv_stride, + uint8_t* dst_uv, + int scale, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_uv[x] = C16TO8((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1, scale); + } +} + // C version 2x2 -> 2x1. void InterpolateRow_C(uint8_t* dst_ptr, const uint8_t* src_ptr, @@ -2586,6 +3421,9 @@ void InterpolateRow_C(uint8_t* dst_ptr, int y0_fraction = 256 - y1_fraction; const uint8_t* src_ptr1 = src_ptr + src_stride; int x; + assert(source_y_fraction >= 0); + assert(source_y_fraction < 256); + if (y1_fraction == 0) { memcpy(dst_ptr, src_ptr, width); return; @@ -2594,21 +3432,16 @@ void InterpolateRow_C(uint8_t* dst_ptr, HalfRow_C(src_ptr, src_stride, dst_ptr, width); return; } - for (x = 0; x < width - 1; x += 2) { - dst_ptr[0] = - (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8; - dst_ptr[1] = - (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8; - src_ptr += 2; - src_ptr1 += 2; - dst_ptr += 2; - } - if (width & 1) { + for (x = 0; x < width; ++x) { dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8; + ++src_ptr; + ++src_ptr1; + ++dst_ptr; } } +// C version 2x2 -> 2x1. 
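
// Editorial note: source_y_fraction is the weight of the second row in
// units of 1/256, so 0 copies the first row, 128 averages, and e.g. 192
// blends three quarters of the way toward the second row:
//   dst = (s0 * (256 - 192) + s1 * 192 + 128) >> 8
// For s0 = 100, s1 = 200 this gives (6400 + 38400 + 128) >> 8 = 175.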
void InterpolateRow_16_C(uint16_t* dst_ptr, const uint16_t* src_ptr, ptrdiff_t src_stride, @@ -2618,23 +3451,62 @@ void InterpolateRow_16_C(uint16_t* dst_ptr, int y0_fraction = 256 - y1_fraction; const uint16_t* src_ptr1 = src_ptr + src_stride; int x; - if (source_y_fraction == 0) { + assert(source_y_fraction >= 0); + assert(source_y_fraction < 256); + + if (y1_fraction == 0) { memcpy(dst_ptr, src_ptr, width * 2); return; } - if (source_y_fraction == 128) { + if (y1_fraction == 128) { HalfRow_16_C(src_ptr, src_stride, dst_ptr, width); return; } - for (x = 0; x < width - 1; x += 2) { - dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; - dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; - src_ptr += 2; - src_ptr1 += 2; - dst_ptr += 2; + for (x = 0; x < width; ++x) { + dst_ptr[0] = + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8; + ++src_ptr; + ++src_ptr1; + ++dst_ptr; } - if (width & 1) { - dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; +} + +// C version 2x2 16 bit-> 2x1 8 bit. +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +// TODO(fbarchard): change scale to bits + +void InterpolateRow_16To8_C(uint8_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int scale, + int width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint16_t* src_ptr1 = src_ptr + src_stride; + int x; + assert(source_y_fraction >= 0); + assert(source_y_fraction < 256); + + if (source_y_fraction == 0) { + Convert16To8Row_C(src_ptr, dst_ptr, scale, width); + return; + } + if (source_y_fraction == 128) { + HalfRow_16To8_C(src_ptr, src_stride, dst_ptr, scale, width); + return; + } + for (x = 0; x < width; ++x) { + dst_ptr[0] = C16TO8( + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8, + scale); + src_ptr += 1; + src_ptr1 += 1; + dst_ptr += 1; } } @@ -2873,7 +3745,7 @@ void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { // Maximum temporary width for wrappers to process at a time, in pixels. #define MAXTWIDTH 2048 -#if !(defined(_MSC_VER) && defined(_M_IX86)) && \ +#if !(defined(_MSC_VER) && !defined(__clang__) && defined(_M_IX86)) && \ defined(HAS_I422TORGB565ROW_SSSE3) // row_win.cc has asm version, but GCC uses 2 step wrapper. void I422ToRGB565Row_SSSE3(const uint8_t* src_y, @@ -3175,12 +4047,93 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y, } #endif +#ifdef HAS_RGB24TOYJROW_AVX2 +// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values. +void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth); + ARGBToYJRow_AVX2(row, dst_yj, twidth); + src_rgb24 += twidth * 3; + dst_yj += twidth; + width -= twidth; + } +} +#endif // HAS_RGB24TOYJROW_AVX2 + +#ifdef HAS_RAWTOYJROW_AVX2 +// Convert 16 RAW pixels (64 bytes) to 16 YJ values. +void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; + RAWToARGBRow_SSSE3(src_raw, row, twidth); + ARGBToYJRow_AVX2(row, dst_yj, twidth); + src_raw += twidth * 3; + dst_yj += twidth; + width -= twidth; + } +} +#endif // HAS_RAWTOYJROW_AVX2 + +#ifdef HAS_RGB24TOYJROW_SSSE3 +// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values. +void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth); + ARGBToYJRow_SSSE3(row, dst_yj, twidth); + src_rgb24 += twidth * 3; + dst_yj += twidth; + width -= twidth; + } +} +#endif // HAS_RGB24TOYJROW_SSSE3 + +#ifdef HAS_RAWTOYJROW_SSSE3 +// Convert 16 RAW pixels (64 bytes) to 16 YJ values. +void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + RAWToARGBRow_SSSE3(src_raw, row, twidth); + ARGBToYJRow_SSSE3(row, dst_yj, twidth); + src_raw += twidth * 3; + dst_yj += twidth; + width -= twidth; + } +} +#endif // HAS_RAWTOYJROW_SSSE3 + +#ifdef HAS_INTERPOLATEROW_16TO8_AVX2 +void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int scale, + int width, + int source_y_fraction) { + // Row buffer for intermediate 16 bit pixels. + SIMD_ALIGNED(uint16_t row[MAXTWIDTH]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + InterpolateRow_16_C(row, src_ptr, src_stride, twidth, source_y_fraction); + Convert16To8Row_AVX2(row, dst_ptr, scale, twidth); + src_ptr += twidth; + dst_ptr += twidth; + width -= twidth; + } +} +#endif // HAS_INTERPOLATEROW_16TO8_AVX2 + float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) { float fsum = 0.f; int i; -#if defined(__clang__) -#pragma clang loop vectorize_width(4) -#endif for (i = 0; i < width; ++i) { float v = *src++; fsum += v * v; @@ -3231,6 +4184,29 @@ void GaussCol_C(const uint16_t* src0, } } +void GaussRow_F32_C(const float* src, float* dst, int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4]) * + (1.0f / 256.0f); + ++src; + } +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussCol_F32_C(const float* src0, + const float* src1, + const float* src2, + const float* src3, + const float* src4, + float* dst, + int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++; + } +} + // Convert biplanar NV21 to packed YUV24 void NV21ToYUV24Row_C(const uint8_t* src_y, const uint8_t* src_vu, @@ -3256,13 +4232,14 @@ void NV21ToYUV24Row_C(const uint8_t* src_y, } // Filter 2 rows of AYUV UV's (444) into UV (420). +// AYUV is VUYA in memory. UV for NV12 is UV order in memory. void AYUVToUVRow_C(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_uv, int width) { // Output a row of UV values, filtering 2x2 rows of AYUV. 
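// Each output U (and V) value is a 2x2 box filter with round-to-nearest:
//   u = (u00 + u01 + u10 + u11 + 2) >> 2
// For an odd width the last column has no horizontal neighbor, so the
// tail below averages the two rows only: (u0 + u1 + 1) >> 1.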
int x; - for (x = 0; x < width; x += 2) { + for (x = 0; x < width - 1; x += 2) { dst_uv[0] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + src_ayuv[src_stride_ayuv + 5] + 2) >> 2; @@ -3273,12 +4250,8 @@ void AYUVToUVRow_C(const uint8_t* src_ayuv, dst_uv += 2; } if (width & 1) { - dst_uv[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + - src_ayuv[src_stride_ayuv + 0] + 2) >> - 2; - dst_uv[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + - src_ayuv[src_stride_ayuv + 1] + 2) >> - 2; + dst_uv[0] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1; + dst_uv[1] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1; } } @@ -3289,7 +4262,7 @@ void AYUVToVURow_C(const uint8_t* src_ayuv, int width) { // Output a row of VU values, filtering 2x2 rows of AYUV. int x; - for (x = 0; x < width; x += 2) { + for (x = 0; x < width - 1; x += 2) { dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + src_ayuv[src_stride_ayuv + 4] + 2) >> 2; @@ -3300,12 +4273,8 @@ void AYUVToVURow_C(const uint8_t* src_ayuv, dst_vu += 2; } if (width & 1) { - dst_vu[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + - src_ayuv[src_stride_ayuv + 0] + 2) >> - 2; - dst_vu[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + - src_ayuv[src_stride_ayuv + 1] + 2) >> - 2; + dst_vu[0] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1; + dst_vu[1] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1; } } @@ -3319,7 +4288,8 @@ void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { } } -void UVToVURow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) { +// Convert UV plane of NV12 to VU of NV21. +void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) { int x; for (x = 0; x < width; ++x) { uint8_t u = src_uv[0]; @@ -3331,16 +4301,27 @@ void UVToVURow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) { } } -// divide values by weights and provide mask to indicate weight of 0. -void FloatDivToByteRow_C(const float* src_weights, - const float* src_values, - uint8_t* dst_out, - uint8_t* dst_mask, - int width) { +void HalfMergeUVRow_C(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int width) { int x; - for (x = 0; x < width; ++x) { - dst_out[x] = Clamp(src_values[x] / src_weights[x]); - dst_mask[x] = src_weights[x] > 0 ? 0 : 0xff; + for (x = 0; x < width - 1; x += 2) { + dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] + + src_u[src_stride_u + 1] + 2) >> + 2; + dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] + + src_v[src_stride_v + 1] + 2) >> + 2; + src_u += 2; + src_v += 2; + dst_uv += 2; + } + if (width & 1) { + dst_uv[0] = (src_u[0] + src_u[src_stride_u] + 1) >> 1; + dst_uv[1] = (src_v[0] + src_v[src_stride_v] + 1) >> 1; } } diff --git a/files/source/row_dspr2.cc b/files/source/row_dspr2.cc deleted file mode 100644 index 11f78e0d..00000000 --- a/files/source/row_dspr2.cc +++ /dev/null @@ -1,1721 +0,0 @@ -/* - * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// The following are available on Mips platforms: -#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips__) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) - -#ifdef HAS_COPYROW_MIPS -void CopyRow_MIPS(const uint8* src, uint8* dst, int count) { - __asm__ __volatile__( - ".set noreorder \n" - ".set noat \n" - "slti $at, %[count], 8 \n" - "bne $at ,$zero, $last8 \n" - "xor $t8, %[src], %[dst] \n" - "andi $t8, $t8, 0x3 \n" - - "bne $t8, $zero, unaligned \n" - "negu $a3, %[dst] \n" - // make dst/src aligned - "andi $a3, $a3, 0x3 \n" - "beq $a3, $zero, $chk16w \n" - // word-aligned now count is the remining bytes count - "subu %[count], %[count], $a3 \n" - - "lwr $t8, 0(%[src]) \n" - "addu %[src], %[src], $a3 \n" - "swr $t8, 0(%[dst]) \n" - "addu %[dst], %[dst], $a3 \n" - - // Now the dst/src are mutually word-aligned with word-aligned addresses - "$chk16w: \n" - "andi $t8, %[count], 0x3f \n" // whole 64-B chunks? - // t8 is the byte count after 64-byte chunks - "beq %[count], $t8, chk8w \n" - // There will be at most 1 32-byte chunk after it - "subu $a3, %[count], $t8 \n" // the reminder - // Here a3 counts bytes in 16w chunks - "addu $a3, %[dst], $a3 \n" - // Now a3 is the final dst after 64-byte chunks - "addu $t0, %[dst], %[count] \n" - // t0 is the "past the end" address - - // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be - // past - // the "t0-32" address - // This means: for x=128 the last "safe" a1 address is "t0-160" - // Alternatively, for x=64 the last "safe" a1 address is "t0-96" - // we will use "pref 30,128(a1)", so "t0-160" is the limit - "subu $t9, $t0, 160 \n" - // t9 is the "last safe pref 30,128(a1)" address - "pref 0, 0(%[src]) \n" // first line of src - "pref 0, 32(%[src]) \n" // second line of src - "pref 0, 64(%[src]) \n" - "pref 30, 32(%[dst]) \n" - // In case the a1 > t9 don't use "pref 30" at all - "sltu $v1, $t9, %[dst] \n" - "bgtz $v1, $loop16w \n" - "nop \n" - // otherwise, start with using pref30 - "pref 30, 64(%[dst]) \n" - "$loop16w: \n" - "pref 0, 96(%[src]) \n" - "lw $t0, 0(%[src]) \n" - "bgtz $v1, $skip_pref30_96 \n" // skip - "lw $t1, 4(%[src]) \n" - "pref 30, 96(%[dst]) \n" // continue - "$skip_pref30_96: \n" - "lw $t2, 8(%[src]) \n" - "lw $t3, 12(%[src]) \n" - "lw $t4, 16(%[src]) \n" - "lw $t5, 20(%[src]) \n" - "lw $t6, 24(%[src]) \n" - "lw $t7, 28(%[src]) \n" - "pref 0, 128(%[src]) \n" - // bring the next lines of src, addr 128 - "sw $t0, 0(%[dst]) \n" - "sw $t1, 4(%[dst]) \n" - "sw $t2, 8(%[dst]) \n" - "sw $t3, 12(%[dst]) \n" - "sw $t4, 16(%[dst]) \n" - "sw $t5, 20(%[dst]) \n" - "sw $t6, 24(%[dst]) \n" - "sw $t7, 28(%[dst]) \n" - "lw $t0, 32(%[src]) \n" - "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1) - "lw $t1, 36(%[src]) \n" - "pref 30, 128(%[dst]) \n" // set dest, addr 128 - "$skip_pref30_128: \n" - "lw $t2, 40(%[src]) \n" - "lw $t3, 44(%[src]) \n" - "lw $t4, 48(%[src]) \n" - "lw $t5, 52(%[src]) \n" - "lw $t6, 56(%[src]) \n" - "lw $t7, 60(%[src]) \n" - "pref 0, 160(%[src]) \n" - // bring the next lines of src, addr 160 - "sw $t0, 32(%[dst]) \n" - "sw $t1, 36(%[dst]) \n" - "sw $t2, 40(%[dst]) \n" - "sw $t3, 44(%[dst]) \n" - "sw $t4, 48(%[dst]) \n" - "sw $t5, 52(%[dst]) \n" - "sw $t6, 56(%[dst]) \n" - "sw $t7, 60(%[dst]) \n" - - "addiu %[dst], %[dst], 64 \n" // adding 64 to dest - "sltu $v1, $t9, %[dst] \n" - "bne %[dst], $a3, $loop16w \n" - " addiu %[src], %[src], 64 \n" // adding 64 to src - "move %[count], $t8 \n" - - // Here we 
have src and dest word-aligned but less than 64-bytes to go - - "chk8w: \n" - "pref 0, 0x0(%[src]) \n" - "andi $t8, %[count], 0x1f \n" // 32-byte chunk? - // the t8 is the reminder count past 32-bytes - "beq %[count], $t8, chk1w \n" - // count=t8,no 32-byte chunk - " nop \n" - - "lw $t0, 0(%[src]) \n" - "lw $t1, 4(%[src]) \n" - "lw $t2, 8(%[src]) \n" - "lw $t3, 12(%[src]) \n" - "lw $t4, 16(%[src]) \n" - "lw $t5, 20(%[src]) \n" - "lw $t6, 24(%[src]) \n" - "lw $t7, 28(%[src]) \n" - "addiu %[src], %[src], 32 \n" - - "sw $t0, 0(%[dst]) \n" - "sw $t1, 4(%[dst]) \n" - "sw $t2, 8(%[dst]) \n" - "sw $t3, 12(%[dst]) \n" - "sw $t4, 16(%[dst]) \n" - "sw $t5, 20(%[dst]) \n" - "sw $t6, 24(%[dst]) \n" - "sw $t7, 28(%[dst]) \n" - "addiu %[dst], %[dst], 32 \n" - - "chk1w: \n" - "andi %[count], $t8, 0x3 \n" - // now count is the reminder past 1w chunks - "beq %[count], $t8, $last8 \n" - " subu $a3, $t8, %[count] \n" - // a3 is count of bytes in 1w chunks - "addu $a3, %[dst], $a3 \n" - // now a3 is the dst address past the 1w chunks - // copying in words (4-byte chunks) - "$wordCopy_loop: \n" - "lw $t3, 0(%[src]) \n" - // the first t3 may be equal t0 ... optimize? - "addiu %[src], %[src],4 \n" - "addiu %[dst], %[dst],4 \n" - "bne %[dst], $a3,$wordCopy_loop \n" - " sw $t3, -4(%[dst]) \n" - - // For the last (<8) bytes - "$last8: \n" - "blez %[count], leave \n" - " addu $a3, %[dst], %[count] \n" // a3 -last dst address - "$last8loop: \n" - "lb $v1, 0(%[src]) \n" - "addiu %[src], %[src], 1 \n" - "addiu %[dst], %[dst], 1 \n" - "bne %[dst], $a3, $last8loop \n" - " sb $v1, -1(%[dst]) \n" - - "leave: \n" - " j $ra \n" - " nop \n" - - // - // UNALIGNED case - // - - "unaligned: \n" - // got here with a3="negu a1" - "andi $a3, $a3, 0x3 \n" // a1 is word aligned? - "beqz $a3, $ua_chk16w \n" - " subu %[count], %[count], $a3 \n" - // bytes left after initial a3 bytes - "lwr $v1, 0(%[src]) \n" - "lwl $v1, 3(%[src]) \n" - "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3 - "swr $v1, 0(%[dst]) \n" - "addu %[dst], %[dst], $a3 \n" - // below the dst will be word aligned (NOTE1) - "$ua_chk16w: \n" - "andi $t8, %[count], 0x3f \n" // whole 64-B chunks? 
- // t8 is the byte count after 64-byte chunks - "beq %[count], $t8, ua_chk8w \n" - // if a2==t8, no 64-byte chunks - // There will be at most 1 32-byte chunk after it - "subu $a3, %[count], $t8 \n" // the reminder - // Here a3 counts bytes in 16w chunks - "addu $a3, %[dst], $a3 \n" - // Now a3 is the final dst after 64-byte chunks - "addu $t0, %[dst], %[count] \n" // t0 "past the end" - "subu $t9, $t0, 160 \n" - // t9 is the "last safe pref 30,128(a1)" address - "pref 0, 0(%[src]) \n" // first line of src - "pref 0, 32(%[src]) \n" // second line addr 32 - "pref 0, 64(%[src]) \n" - "pref 30, 32(%[dst]) \n" - // safe, as we have at least 64 bytes ahead - // In case the a1 > t9 don't use "pref 30" at all - "sltu $v1, $t9, %[dst] \n" - "bgtz $v1, $ua_loop16w \n" - // skip "pref 30,64(a1)" for too short arrays - " nop \n" - // otherwise, start with using pref30 - "pref 30, 64(%[dst]) \n" - "$ua_loop16w: \n" - "pref 0, 96(%[src]) \n" - "lwr $t0, 0(%[src]) \n" - "lwl $t0, 3(%[src]) \n" - "lwr $t1, 4(%[src]) \n" - "bgtz $v1, $ua_skip_pref30_96 \n" - " lwl $t1, 7(%[src]) \n" - "pref 30, 96(%[dst]) \n" - // continue setting up the dest, addr 96 - "$ua_skip_pref30_96: \n" - "lwr $t2, 8(%[src]) \n" - "lwl $t2, 11(%[src]) \n" - "lwr $t3, 12(%[src]) \n" - "lwl $t3, 15(%[src]) \n" - "lwr $t4, 16(%[src]) \n" - "lwl $t4, 19(%[src]) \n" - "lwr $t5, 20(%[src]) \n" - "lwl $t5, 23(%[src]) \n" - "lwr $t6, 24(%[src]) \n" - "lwl $t6, 27(%[src]) \n" - "lwr $t7, 28(%[src]) \n" - "lwl $t7, 31(%[src]) \n" - "pref 0, 128(%[src]) \n" - // bring the next lines of src, addr 128 - "sw $t0, 0(%[dst]) \n" - "sw $t1, 4(%[dst]) \n" - "sw $t2, 8(%[dst]) \n" - "sw $t3, 12(%[dst]) \n" - "sw $t4, 16(%[dst]) \n" - "sw $t5, 20(%[dst]) \n" - "sw $t6, 24(%[dst]) \n" - "sw $t7, 28(%[dst]) \n" - "lwr $t0, 32(%[src]) \n" - "lwl $t0, 35(%[src]) \n" - "lwr $t1, 36(%[src]) \n" - "bgtz $v1, ua_skip_pref30_128 \n" - " lwl $t1, 39(%[src]) \n" - "pref 30, 128(%[dst]) \n" - // continue setting up the dest, addr 128 - "ua_skip_pref30_128: \n" - - "lwr $t2, 40(%[src]) \n" - "lwl $t2, 43(%[src]) \n" - "lwr $t3, 44(%[src]) \n" - "lwl $t3, 47(%[src]) \n" - "lwr $t4, 48(%[src]) \n" - "lwl $t4, 51(%[src]) \n" - "lwr $t5, 52(%[src]) \n" - "lwl $t5, 55(%[src]) \n" - "lwr $t6, 56(%[src]) \n" - "lwl $t6, 59(%[src]) \n" - "lwr $t7, 60(%[src]) \n" - "lwl $t7, 63(%[src]) \n" - "pref 0, 160(%[src]) \n" - // bring the next lines of src, addr 160 - "sw $t0, 32(%[dst]) \n" - "sw $t1, 36(%[dst]) \n" - "sw $t2, 40(%[dst]) \n" - "sw $t3, 44(%[dst]) \n" - "sw $t4, 48(%[dst]) \n" - "sw $t5, 52(%[dst]) \n" - "sw $t6, 56(%[dst]) \n" - "sw $t7, 60(%[dst]) \n" - - "addiu %[dst],%[dst],64 \n" // adding 64 to dest - "sltu $v1,$t9,%[dst] \n" - "bne %[dst],$a3,$ua_loop16w \n" - " addiu %[src],%[src],64 \n" // adding 64 to src - "move %[count],$t8 \n" - - // Here we have src and dest word-aligned but less than 64-bytes to go - - "ua_chk8w: \n" - "pref 0, 0x0(%[src]) \n" - "andi $t8, %[count], 0x1f \n" // 32-byte chunk? 
- // the t8 is the reminder count - "beq %[count], $t8, $ua_chk1w \n" - // when count==t8, no 32-byte chunk - - "lwr $t0, 0(%[src]) \n" - "lwl $t0, 3(%[src]) \n" - "lwr $t1, 4(%[src]) \n" - "lwl $t1, 7(%[src]) \n" - "lwr $t2, 8(%[src]) \n" - "lwl $t2, 11(%[src]) \n" - "lwr $t3, 12(%[src]) \n" - "lwl $t3, 15(%[src]) \n" - "lwr $t4, 16(%[src]) \n" - "lwl $t4, 19(%[src]) \n" - "lwr $t5, 20(%[src]) \n" - "lwl $t5, 23(%[src]) \n" - "lwr $t6, 24(%[src]) \n" - "lwl $t6, 27(%[src]) \n" - "lwr $t7, 28(%[src]) \n" - "lwl $t7, 31(%[src]) \n" - "addiu %[src], %[src], 32 \n" - - "sw $t0, 0(%[dst]) \n" - "sw $t1, 4(%[dst]) \n" - "sw $t2, 8(%[dst]) \n" - "sw $t3, 12(%[dst]) \n" - "sw $t4, 16(%[dst]) \n" - "sw $t5, 20(%[dst]) \n" - "sw $t6, 24(%[dst]) \n" - "sw $t7, 28(%[dst]) \n" - "addiu %[dst], %[dst], 32 \n" - - "$ua_chk1w: \n" - "andi %[count], $t8, 0x3 \n" - // now count is the reminder past 1w chunks - "beq %[count], $t8, ua_smallCopy \n" - "subu $a3, $t8, %[count] \n" - // a3 is count of bytes in 1w chunks - "addu $a3, %[dst], $a3 \n" - // now a3 is the dst address past the 1w chunks - - // copying in words (4-byte chunks) - "$ua_wordCopy_loop: \n" - "lwr $v1, 0(%[src]) \n" - "lwl $v1, 3(%[src]) \n" - "addiu %[src], %[src], 4 \n" - "addiu %[dst], %[dst], 4 \n" - // note: dst=a1 is word aligned here, see NOTE1 - "bne %[dst], $a3, $ua_wordCopy_loop \n" - " sw $v1,-4(%[dst]) \n" - - // Now less than 4 bytes (value in count) left to copy - "ua_smallCopy: \n" - "beqz %[count], leave \n" - " addu $a3, %[dst], %[count] \n" // a3 = last dst address - "$ua_smallCopy_loop: \n" - "lb $v1, 0(%[src]) \n" - "addiu %[src], %[src], 1 \n" - "addiu %[dst], %[dst], 1 \n" - "bne %[dst],$a3,$ua_smallCopy_loop \n" - " sb $v1, -1(%[dst]) \n" - - "j $ra \n" - " nop \n" - ".set at \n" - ".set reorder \n" - : [dst] "+r"(dst), [src] "+r"(src) - : [count] "r"(count) - : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "a3", "v1", - "at"); -} -#endif // HAS_COPYROW_MIPS - -// DSPR2 functions -#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \ - (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32) && \ - (__mips_isa_rev < 6) - -void SplitUVRow_DSPR2(const uint8* src_uv, - uint8* dst_u, - uint8* dst_v, - int width) { - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - "srl $t4, %[width], 4 \n" // multiplies of 16 - "blez $t4, 2f \n" - " andi %[width], %[width], 0xf \n" // residual - - "1: \n" - "addiu $t4, $t4, -1 \n" - "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0 - "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2 - "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4 - "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6 - "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8 - "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 | - // U10 - "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 | - // U12 - "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 | - // U14 - "addiu %[src_uv], %[src_uv], 32 \n" - "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0 - "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0 - "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4 - "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4 - "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8 - "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8 - "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | - // V12 - "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | - // U12 - "sw $t9, 0(%[dst_v]) \n" - "sw $t0, 0(%[dst_u]) \n" - "sw $t1, 4(%[dst_v]) \n" - "sw $t2, 4(%[dst_u]) \n" - "sw $t3, 8(%[dst_v]) \n" - "sw $t5, 8(%[dst_u]) \n" - "sw $t6, 
12(%[dst_v]) \n" - "sw $t7, 12(%[dst_u]) \n" - "addiu %[dst_v], %[dst_v], 16 \n" - "bgtz $t4, 1b \n" - " addiu %[dst_u], %[dst_u], 16 \n" - - "beqz %[width], 3f \n" - " nop \n" - - "2: \n" - "lbu $t0, 0(%[src_uv]) \n" - "lbu $t1, 1(%[src_uv]) \n" - "addiu %[src_uv], %[src_uv], 2 \n" - "addiu %[width], %[width], -1 \n" - "sb $t0, 0(%[dst_u]) \n" - "sb $t1, 0(%[dst_v]) \n" - "addiu %[dst_u], %[dst_u], 1 \n" - "bgtz %[width], 2b \n" - " addiu %[dst_v], %[dst_v], 1 \n" - - "3: \n" - ".set pop \n" - : [src_uv] "+r"(src_uv), [width] "+r"(width), [dst_u] "+r"(dst_u), - [dst_v] "+r"(dst_v) - : - : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9"); -} - -void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) { - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - - "srl $t4, %[width], 4 \n" // multiplies of 16 - "andi $t5, %[width], 0xf \n" - "blez $t4, 2f \n" - " addu %[src], %[src], %[width] \n" // src += width - - "1: \n" - "lw $t0, -16(%[src]) \n" // |3|2|1|0| - "lw $t1, -12(%[src]) \n" // |7|6|5|4| - "lw $t2, -8(%[src]) \n" // |11|10|9|8| - "lw $t3, -4(%[src]) \n" // |15|14|13|12| - "wsbh $t0, $t0 \n" // |2|3|0|1| - "wsbh $t1, $t1 \n" // |6|7|4|5| - "wsbh $t2, $t2 \n" // |10|11|8|9| - "wsbh $t3, $t3 \n" // |14|15|12|13| - "rotr $t0, $t0, 16 \n" // |0|1|2|3| - "rotr $t1, $t1, 16 \n" // |4|5|6|7| - "rotr $t2, $t2, 16 \n" // |8|9|10|11| - "rotr $t3, $t3, 16 \n" // |12|13|14|15| - "addiu %[src], %[src], -16 \n" - "addiu $t4, $t4, -1 \n" - "sw $t3, 0(%[dst]) \n" // |15|14|13|12| - "sw $t2, 4(%[dst]) \n" // |11|10|9|8| - "sw $t1, 8(%[dst]) \n" // |7|6|5|4| - "sw $t0, 12(%[dst]) \n" // |3|2|1|0| - "bgtz $t4, 1b \n" - " addiu %[dst], %[dst], 16 \n" - "beqz $t5, 3f \n" - " nop \n" - - "2: \n" - "lbu $t0, -1(%[src]) \n" - "addiu $t5, $t5, -1 \n" - "addiu %[src], %[src], -1 \n" - "sb $t0, 0(%[dst]) \n" - "bgez $t5, 2b \n" - " addiu %[dst], %[dst], 1 \n" - - "3: \n" - ".set pop \n" - : [src] "+r"(src), [dst] "+r"(dst) - : [width] "r"(width) - : "t0", "t1", "t2", "t3", "t4", "t5"); -} - -void MirrorUVRow_DSPR2(const uint8* src_uv, - uint8* dst_u, - uint8* dst_v, - int width) { - int x; - int y; - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - - "addu $t4, %[width], %[width] \n" - "srl %[x], %[width], 4 \n" - "andi %[y], %[width], 0xf \n" - "blez %[x], 2f \n" - " addu %[src_uv], %[src_uv], $t4 \n" - - "1: \n" - "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0| - "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4| - "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8| - "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12| - "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16| - "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20| - "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24| - "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28| - - "rotr $t0, $t0, 16 \n" // |1|0|3|2| - "rotr $t1, $t1, 16 \n" // |5|4|7|6| - "rotr $t2, $t2, 16 \n" // |9|8|11|10| - "rotr $t3, $t3, 16 \n" // |13|12|15|14| - "rotr $t4, $t4, 16 \n" // |17|16|19|18| - "rotr $t6, $t6, 16 \n" // |21|20|23|22| - "rotr $t7, $t7, 16 \n" // |25|24|27|26| - "rotr $t8, $t8, 16 \n" // |29|28|31|30| - "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6| - "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7| - "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14| - "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15| - "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22| - "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23| - "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30| - "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31| - "addiu %[src_uv], %[src_uv], -32 \n" - "addiu %[x], %[x], -1 \n" - "swr $t4, 0(%[dst_u]) 
\n" - "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24| - "swr $t6, 0(%[dst_v]) \n" - "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25| - "swr $t2, 4(%[dst_u]) \n" - "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16| - "swr $t3, 4(%[dst_v]) \n" - "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17| - "swr $t0, 8(%[dst_u]) \n" - "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8| - "swr $t1, 8(%[dst_v]) \n" - "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9| - "swr $t9, 12(%[dst_u]) \n" - "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0| - "swr $t5, 12(%[dst_v]) \n" - "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1| - "addiu %[dst_v], %[dst_v], 16 \n" - "bgtz %[x], 1b \n" - " addiu %[dst_u], %[dst_u], 16 \n" - "beqz %[y], 3f \n" - " nop \n" - "b 2f \n" - " nop \n" - - "2: \n" - "lbu $t0, -2(%[src_uv]) \n" - "lbu $t1, -1(%[src_uv]) \n" - "addiu %[src_uv], %[src_uv], -2 \n" - "addiu %[y], %[y], -1 \n" - "sb $t0, 0(%[dst_u]) \n" - "sb $t1, 0(%[dst_v]) \n" - "addiu %[dst_u], %[dst_u], 1 \n" - "bgtz %[y], 2b \n" - " addiu %[dst_v], %[dst_v], 1 \n" - - "3: \n" - ".set pop \n" - : [src_uv] "+r"(src_uv), [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v), - [x] "=&r"(x), [y] "=&r"(y) - : [width] "r"(width) - : "t0", "t1", "t2", "t3", "t4", "t5", "t7", "t8", "t9"); -} - -void I422ToARGBRow_DSPR2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - uint32 tmp_ub = yuvconstants->kUVToB[0]; - uint32 tmp_ug = yuvconstants->kUVToG[0]; - uint32 tmp_vg = yuvconstants->kUVToG[1]; - uint32 tmp_vr = yuvconstants->kUVToR[1]; - uint32 tmp_bb = yuvconstants->kUVBiasB[0]; - uint32 tmp_bg = yuvconstants->kUVBiasG[0]; - uint32 tmp_br = yuvconstants->kUVBiasR[0]; - uint32 yg = yuvconstants->kYToRgb[0]; - uint32 tmp_yg; - uint32 tmp_mask = 0x7fff7fff; - tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff); - tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff); - tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff); - tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff); - tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001; - tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff); - tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff); - tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001; - yg = yg * 0x0101; - - for (x = 0; x < width - 1; x += 2) { - uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5; - uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9; - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - "lbu %[tmp_t7], 0(%[src_y]) \n" - "lbu %[tmp_t1], 1(%[src_y]) \n" - "mul %[tmp_t7], %[tmp_t7], %[yg] \n" - "mul %[tmp_t1], %[tmp_t1], %[yg] \n" - "lbu %[tmp_t2], 0(%[src_u]) \n" - "lbu %[tmp_t3], 0(%[src_v]) \n" - "replv.ph %[tmp_t2], %[tmp_t2] \n" - "replv.ph %[tmp_t3], %[tmp_t3] \n" - "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n" - "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n" - "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n" - "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n" - "srl %[tmp_t7], %[tmp_t7], 16 \n" - "ins %[tmp_t1], %[tmp_t7], 0, 16 \n" - "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n" - "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n" - "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n" - "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n" - "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n" - "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n" - "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n" - "shra.ph %[tmp_t7], %[tmp_t7], 6 \n" - "shra.ph %[tmp_t8], %[tmp_t8], 6 \n" - "shra.ph %[tmp_t9], %[tmp_t9], 6 \n" - "shll_s.ph 
%[tmp_t7], %[tmp_t7], 7 \n" - "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n" - "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n" - "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n" - "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n" - "precrq.ph.w %[tmp_t9], %[tmp_t8], %[tmp_t7] \n" - "ins %[tmp_t7], %[tmp_t8], 16, 16 \n" - "precr.qb.ph %[tmp_t8], %[tmp_t9], %[tmp_t7] \n" - "precrq.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n" - "sw %[tmp_t8], 0(%[rgb_buf]) \n" - "sw %[tmp_t7], 4(%[rgb_buf]) \n" - ".set pop \n" - : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), - [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4), - [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6), - [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9) - : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v), - [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), [yg] "r"(yg), - [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb), - [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg), - [rgb_buf] "r"(rgb_buf), [tmp_mask] "r"(tmp_mask)); - src_y += 2; - src_u += 1; - src_v += 1; - rgb_buf += 8; // Advance 4 pixels. - } -} - -// Bilinear filter 8x2 -> 8x1 -void InterpolateRow_DSPR2(uint8* dst_ptr, - const uint8* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - int y0_fraction = 256 - source_y_fraction; - const uint8* src_ptr1 = src_ptr + src_stride; - - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - - "replv.ph $t0, %[y0_fraction] \n" - "replv.ph $t1, %[source_y_fraction] \n" - - "1: \n" - "lw $t2, 0(%[src_ptr]) \n" - "lw $t3, 0(%[src_ptr1]) \n" - "lw $t4, 4(%[src_ptr]) \n" - "lw $t5, 4(%[src_ptr1]) \n" - "muleu_s.ph.qbl $t6, $t2, $t0 \n" - "muleu_s.ph.qbr $t7, $t2, $t0 \n" - "muleu_s.ph.qbl $t8, $t3, $t1 \n" - "muleu_s.ph.qbr $t9, $t3, $t1 \n" - "muleu_s.ph.qbl $t2, $t4, $t0 \n" - "muleu_s.ph.qbr $t3, $t4, $t0 \n" - "muleu_s.ph.qbl $t4, $t5, $t1 \n" - "muleu_s.ph.qbr $t5, $t5, $t1 \n" - "addq.ph $t6, $t6, $t8 \n" - "addq.ph $t7, $t7, $t9 \n" - "addq.ph $t2, $t2, $t4 \n" - "addq.ph $t3, $t3, $t5 \n" - "shra_r.ph $t6, $t6, 8 \n" - "shra_r.ph $t7, $t7, 8 \n" - "shra_r.ph $t2, $t2, 8 \n" - "shra_r.ph $t3, $t3, 8 \n" - "precr.qb.ph $t6, $t6, $t7 \n" - "precr.qb.ph $t2, $t2, $t3 \n" - "addiu %[src_ptr], %[src_ptr], 8 \n" - "addiu %[src_ptr1], %[src_ptr1], 8 \n" - "addiu %[dst_width], %[dst_width], -8 \n" - "sw $t6, 0(%[dst_ptr]) \n" - "sw $t2, 4(%[dst_ptr]) \n" - "bgtz %[dst_width], 1b \n" - " addiu %[dst_ptr], %[dst_ptr], 8 \n" - - ".set pop \n" - : [dst_ptr] "+r"(dst_ptr), [src_ptr1] "+r"(src_ptr1), - [src_ptr] "+r"(src_ptr), [dst_width] "+r"(dst_width) - : [source_y_fraction] "r"(source_y_fraction), - [y0_fraction] "r"(y0_fraction), [src_stride] "r"(src_stride) - : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9"); -} -#include <stdio.h> -void RGB24ToARGBRow_DSPR2(const uint8* src_rgb24, uint8* dst_argb, int width) { - int x; - uint32 tmp_mask = 0xff; - uint32 tmp_t1; - for (x = 0; x < (width - 1); ++x) { - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - "ulw %[tmp_t1], 0(%[src_rgb24]) \n" - "addiu %[dst_argb], %[dst_argb], 4 \n" - "addiu %[src_rgb24], %[src_rgb24], 3 \n" - "ins %[tmp_t1], %[tmp_mask], 24, 8 \n" - "sw %[tmp_t1], -4(%[dst_argb]) \n" - ".set pop \n" - : [src_rgb24] "+r"(src_rgb24), [dst_argb] "+r"(dst_argb), - [tmp_t1] "=&r"(tmp_t1) - : [tmp_mask] "r"(tmp_mask) - : "memory"); - } - uint8 b = src_rgb24[0]; - uint8 g = src_rgb24[1]; - uint8 r = src_rgb24[2]; - dst_argb[0] = b; - dst_argb[1] = g; - dst_argb[2] = r; - 
dst_argb[3] = 255u; -} - -void RAWToARGBRow_DSPR2(const uint8* src_raw, uint8* dst_argb, int width) { - int x; - uint32 tmp_mask = 0xff; - uint32 tmp_t1, tmp_t2; - for (x = 0; x < (width - 1); ++x) { - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - "ulw %[tmp_t1], 0(%[src_raw]) \n" - "addiu %[dst_argb], %[dst_argb], 4 \n" - "addiu %[src_raw], %[src_raw], 3 \n" - "srl %[tmp_t2], %[tmp_t1], 16 \n" - "ins %[tmp_t1], %[tmp_mask], 24, 8 \n" - "ins %[tmp_t1], %[tmp_t1], 16, 8 \n" - "ins %[tmp_t1], %[tmp_t2], 0, 8 \n" - "sw %[tmp_t1], -4(%[dst_argb]) \n" - ".set pop \n" - : [src_raw] "+r"(src_raw), [dst_argb] "+r"(dst_argb), - [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2) - : [tmp_mask] "r"(tmp_mask) - : "memory"); - } - uint8 r = src_raw[0]; - uint8 g = src_raw[1]; - uint8 b = src_raw[2]; - dst_argb[0] = b; - dst_argb[1] = g; - dst_argb[2] = r; - dst_argb[3] = 255u; -} - -void RGB565ToARGBRow_DSPR2(const uint8* src_rgb565, - uint8* dst_argb, - int width) { - int x; - uint32 tmp_mask = 0xff; - uint32 tmp_t1, tmp_t2, tmp_t3; - for (x = 0; x < width; ++x) { - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - "lhu %[tmp_t1], 0(%[src_rgb565]) \n" - "addiu %[dst_argb], %[dst_argb], 4 \n" - "addiu %[src_rgb565], %[src_rgb565], 2 \n" - "sll %[tmp_t2], %[tmp_t1], 8 \n" - "ins %[tmp_t2], %[tmp_mask], 24,8 \n" - "ins %[tmp_t2], %[tmp_t1], 3, 16 \n" - "ins %[tmp_t2], %[tmp_t1], 5, 11 \n" - "srl %[tmp_t3], %[tmp_t1], 9 \n" - "ins %[tmp_t2], %[tmp_t3], 8, 2 \n" - "ins %[tmp_t2], %[tmp_t1], 3, 5 \n" - "srl %[tmp_t3], %[tmp_t1], 2 \n" - "ins %[tmp_t2], %[tmp_t3], 0, 3 \n" - "sw %[tmp_t2], -4(%[dst_argb]) \n" - ".set pop \n" - : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), - [tmp_t3] "=&r"(tmp_t3), [src_rgb565] "+r"(src_rgb565), - [dst_argb] "+r"(dst_argb) - : [tmp_mask] "r"(tmp_mask)); - } -} - -void ARGB1555ToARGBRow_DSPR2(const uint8* src_argb1555, - uint8* dst_argb, - int width) { - int x; - uint32 tmp_t1, tmp_t2, tmp_t3; - for (x = 0; x < width; ++x) { - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - "lh %[tmp_t1], 0(%[src_argb1555]) \n" - "addiu %[dst_argb], %[dst_argb], 4 \n" - "addiu %[src_argb1555], %[src_argb1555], 2 \n" - "sll %[tmp_t2], %[tmp_t1], 9 \n" - "ins %[tmp_t2], %[tmp_t1], 4, 15 \n" - "ins %[tmp_t2], %[tmp_t1], 6, 10 \n" - "srl %[tmp_t3], %[tmp_t1], 7 \n" - "ins %[tmp_t2], %[tmp_t3], 8, 3 \n" - "ins %[tmp_t2], %[tmp_t1], 3, 5 \n" - "srl %[tmp_t3], %[tmp_t1], 2 \n" - "ins %[tmp_t2], %[tmp_t3], 0, 3 \n" - "sw %[tmp_t2], -4(%[dst_argb]) \n" - ".set pop \n" - : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), - [tmp_t3] "=&r"(tmp_t3), [src_argb1555] "+r"(src_argb1555), - [dst_argb] "+r"(dst_argb) - :); - } -} - -void ARGB4444ToARGBRow_DSPR2(const uint8* src_argb4444, - uint8* dst_argb, - int width) { - int x; - uint32 tmp_t1; - for (x = 0; x < width; ++x) { - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - "lh %[tmp_t1], 0(%[src_argb4444]) \n" - "addiu %[dst_argb], %[dst_argb], 4 \n" - "addiu %[src_argb4444], %[src_argb4444], 2 \n" - "ins %[tmp_t1], %[tmp_t1], 16, 16 \n" - "ins %[tmp_t1], %[tmp_t1], 12, 16 \n" - "ins %[tmp_t1], %[tmp_t1], 8, 12 \n" - "ins %[tmp_t1], %[tmp_t1], 4, 8 \n" - "sw %[tmp_t1], -4(%[dst_argb]) \n" - ".set pop \n" - : [src_argb4444] "+r"(src_argb4444), [dst_argb] "+r"(dst_argb), - [tmp_t1] "=&r"(tmp_t1)); - } -} - -void I444ToARGBRow_DSPR2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - uint32 tmp_ub = 
yuvconstants->kUVToB[0]; - uint32 tmp_ug = yuvconstants->kUVToG[0]; - uint32 tmp_vg = yuvconstants->kUVToG[1]; - uint32 tmp_vr = yuvconstants->kUVToR[1]; - uint32 tmp_bb = yuvconstants->kUVBiasB[0]; - uint32 tmp_bg = yuvconstants->kUVBiasG[0]; - uint32 tmp_br = yuvconstants->kUVBiasR[0]; - uint32 yg = yuvconstants->kYToRgb[0]; - uint32 tmp_mask = 0x7fff7fff; - uint32 tmp_yg; - - tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff); - tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff); - tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff); - tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff); - tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001; - tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff); - tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff); - tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001; - yg = yg * 0x0101; - - for (x = 0; x < width - 1; x += 2) { - uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5; - uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9; - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - "lbu %[tmp_t7], 0(%[y_buf]) \n" - "lbu %[tmp_t1], 1(%[y_buf]) \n" - "mul %[tmp_t7], %[tmp_t7], %[yg] \n" - "mul %[tmp_t1], %[tmp_t1], %[yg] \n" - "lh %[tmp_t2], 0(%[u_buf]) \n" - "lh %[tmp_t3], 0(%[v_buf]) \n" - "preceu.ph.qbr %[tmp_t2], %[tmp_t2] \n" - "preceu.ph.qbr %[tmp_t3], %[tmp_t3] \n" - "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n" - "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n" - "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n" - "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n" - "srl %[tmp_t7], %[tmp_t7], 16 \n" - "ins %[tmp_t1], %[tmp_t7], 0, 16 \n" - "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n" - "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n" - "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n" - "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n" - "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n" - "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n" - "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n" - "shra.ph %[tmp_t7], %[tmp_t7], 6 \n" - "shra.ph %[tmp_t8], %[tmp_t8], 6 \n" - "shra.ph %[tmp_t9], %[tmp_t9], 6 \n" - "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n" - "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n" - "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n" - "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n" - "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n" - "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n" - "ins %[tmp_t7], %[tmp_t8], 16, 16 \n" - "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n" - "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n" - "sw %[tmp_t8], 0(%[rgb_buf]) \n" - "sw %[tmp_t7], 4(%[rgb_buf]) \n" - ".set pop \n" - : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), - [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4), - [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6), - [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9) - : [y_buf] "r"(y_buf), [yg] "r"(yg), [u_buf] "r"(u_buf), - [v_buf] "r"(v_buf), [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), - [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb), - [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg), - [rgb_buf] "r"(rgb_buf), [tmp_mask] "r"(tmp_mask)); - y_buf += 2; - u_buf += 2; - v_buf += 2; - rgb_buf += 8; // Advance 1 pixel. 
- } -} - -void I422ToARGB4444Row_DSPR2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - int x; - uint32 tmp_ub = yuvconstants->kUVToB[0]; - uint32 tmp_ug = yuvconstants->kUVToG[0]; - uint32 tmp_vg = yuvconstants->kUVToG[1]; - uint32 tmp_vr = yuvconstants->kUVToR[1]; - uint32 tmp_bb = yuvconstants->kUVBiasB[0]; - uint32 tmp_bg = yuvconstants->kUVBiasG[0]; - uint32 tmp_br = yuvconstants->kUVBiasR[0]; - uint32 yg = yuvconstants->kYToRgb[0]; - uint32 tmp_yg; - uint32 tmp_mask = 0x7fff7fff; - tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff); - tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff); - tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff); - tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff); - tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001; - tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff); - tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff); - tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001; - yg = yg * 0x0101; - - for (x = 0; x < width - 1; x += 2) { - uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5; - uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9; - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - "lbu %[tmp_t7], 0(%[src_y]) \n" - "lbu %[tmp_t1], 1(%[src_y]) \n" - "mul %[tmp_t7], %[tmp_t7], %[yg] \n" - "mul %[tmp_t1], %[tmp_t1], %[yg] \n" - "lbu %[tmp_t2], 0(%[src_u]) \n" - "lbu %[tmp_t3], 0(%[src_v]) \n" - "replv.ph %[tmp_t2], %[tmp_t2] \n" - "replv.ph %[tmp_t3], %[tmp_t3] \n" - "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n" - "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n" - "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n" - "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n" - "srl %[tmp_t7], %[tmp_t7], 16 \n" - "ins %[tmp_t1], %[tmp_t7], 0, 16 \n" - "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n" - "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n" - "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n" - "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n" - "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n" - "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n" - "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n" - "shra.ph %[tmp_t7], %[tmp_t7], 6 \n" - "shra.ph %[tmp_t8], %[tmp_t8], 6 \n" - "shra.ph %[tmp_t9], %[tmp_t9], 6 \n" - "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n" - "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n" - "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n" - "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n" - "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n" - "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n" - "ins %[tmp_t7], %[tmp_t8], 16, 16 \n" - "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n" - "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n" - "shrl.qb %[tmp_t1], %[tmp_t8], 4 \n" - "shrl.qb %[tmp_t2], %[tmp_t7], 4 \n" - "shrl.ph %[tmp_t8], %[tmp_t1], 4 \n" - "shrl.ph %[tmp_t7], %[tmp_t2], 4 \n" - "or %[tmp_t8], %[tmp_t8], %[tmp_t1] \n" - "or %[tmp_t7], %[tmp_t7], %[tmp_t2] \n" - "precr.qb.ph %[tmp_t8], %[tmp_t7], %[tmp_t8] \n" - "sw %[tmp_t8], 0(%[dst_argb4444]) \n" - ".set pop \n" - : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), - [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4), - [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6), - [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9) - : [dst_argb4444] "r"(dst_argb4444), [yg] "r"(yg), [src_u] "r"(src_u), - [src_v] "r"(src_v), [src_y] "r"(src_y), [tmp_ub] "r"(tmp_ub), - [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), - 
[tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), - [tmp_yg] "r"(tmp_yg), [tmp_mask] "r"(tmp_mask)); - src_y += 2; - src_u += 1; - src_v += 1; - dst_argb4444 += 4; // Advance 2 pixels. - } -} - -void I422ToARGB1555Row_DSPR2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - int x; - uint32 tmp_ub = yuvconstants->kUVToB[0]; - uint32 tmp_ug = yuvconstants->kUVToG[0]; - uint32 tmp_vg = yuvconstants->kUVToG[1]; - uint32 tmp_vr = yuvconstants->kUVToR[1]; - uint32 tmp_bb = yuvconstants->kUVBiasB[0]; - uint32 tmp_bg = yuvconstants->kUVBiasG[0]; - uint32 tmp_br = yuvconstants->kUVBiasR[0]; - uint32 yg = yuvconstants->kYToRgb[0]; - uint32 tmp_yg; - uint32 tmp_mask = 0x80008000; - tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff); - tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff); - tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff); - tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff); - tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001; - tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff); - tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff); - tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001; - yg = yg * 0x0101; - - for (x = 0; x < width - 1; x += 2) { - uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5; - uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9; - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - "lbu %[tmp_t7], 0(%[src_y]) \n" - "lbu %[tmp_t1], 1(%[src_y]) \n" - "mul %[tmp_t7], %[tmp_t7], %[yg] \n" - "mul %[tmp_t1], %[tmp_t1], %[yg] \n" - "lbu %[tmp_t2], 0(%[src_u]) \n" - "lbu %[tmp_t3], 0(%[src_v]) \n" - "replv.ph %[tmp_t2], %[tmp_t2] \n" - "replv.ph %[tmp_t3], %[tmp_t3] \n" - "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n" - "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n" - "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n" - "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n" - "srl %[tmp_t7], %[tmp_t7], 16 \n" - "ins %[tmp_t1], %[tmp_t7], 0, 16 \n" - "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n" - "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n" - "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n" - "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n" - "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n" - "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n" - "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n" - "shra.ph %[tmp_t7], %[tmp_t7], 6 \n" - "shra.ph %[tmp_t8], %[tmp_t8], 6 \n" - "shra.ph %[tmp_t9], %[tmp_t9], 6 \n" - "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n" - "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n" - "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n" - "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n" - "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n" - "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n" - "ins %[tmp_t7], %[tmp_t8], 16, 16 \n" - "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n" - "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n" - "ins %[tmp_t3], %[tmp_t8], 7, 24 \n" - "ins %[tmp_t3], %[tmp_t8], 10, 16 \n" - "ins %[tmp_t3], %[tmp_t8], 13, 8 \n" - "ins %[tmp_t4], %[tmp_t7], 7, 24 \n" - "ins %[tmp_t4], %[tmp_t7], 10, 16 \n" - "ins %[tmp_t4], %[tmp_t7], 13, 8 \n" - "precrq.ph.w %[tmp_t8], %[tmp_t4], %[tmp_t3] \n" - "or %[tmp_t8], %[tmp_t8], %[tmp_mask]\n" - "sw %[tmp_t8], 0(%[dst_argb1555]) \n" - ".set pop \n" - : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), - [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4), - [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6), - [tmp_t7] "=&r"(tmp_t7), [tmp_t8] 
"=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9) - : [dst_argb1555] "r"(dst_argb1555), [yg] "r"(yg), [src_u] "r"(src_u), - [src_v] "r"(src_v), [src_y] "r"(src_y), [tmp_ub] "r"(tmp_ub), - [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), - [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), - [tmp_yg] "r"(tmp_yg), [tmp_mask] "r"(tmp_mask)); - src_y += 2; - src_u += 1; - src_v += 1; - dst_argb1555 += 4; // Advance 2 pixels. - } -} - -void NV12ToARGBRow_DSPR2(const uint8* src_y, - const uint8* src_uv, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - uint32 tmp_ub = yuvconstants->kUVToB[0]; - uint32 tmp_ug = yuvconstants->kUVToG[0]; - uint32 tmp_vg = yuvconstants->kUVToG[1]; - uint32 tmp_vr = yuvconstants->kUVToR[1]; - uint32 tmp_bb = yuvconstants->kUVBiasB[0]; - uint32 tmp_bg = yuvconstants->kUVBiasG[0]; - uint32 tmp_br = yuvconstants->kUVBiasR[0]; - uint32 yg = yuvconstants->kYToRgb[0]; - uint32 tmp_mask = 0x7fff7fff; - uint32 tmp_yg; - tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff); - tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff); - tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff); - tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff); - tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001; - tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff); - tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff); - tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001; - yg = yg * 0x0101; - - for (x = 0; x < width - 1; x += 2) { - uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5; - uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9; - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - "lbu %[tmp_t7], 0(%[src_y]) \n" - "lbu %[tmp_t1], 1(%[src_y]) \n" - "mul %[tmp_t7], %[tmp_t7], %[yg] \n" - "mul %[tmp_t1], %[tmp_t1], %[yg] \n" - "lbu %[tmp_t2], 0(%[src_uv]) \n" - "lbu %[tmp_t3], 1(%[src_uv]) \n" - "replv.ph %[tmp_t2], %[tmp_t2] \n" - "replv.ph %[tmp_t3], %[tmp_t3] \n" - "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n" - "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n" - "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n" - "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n" - "srl %[tmp_t7], %[tmp_t7], 16 \n" - "ins %[tmp_t1], %[tmp_t7], 0, 16 \n" - "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n" - "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n" - "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n" - "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n" - "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n" - "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n" - "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n" - "shra.ph %[tmp_t7], %[tmp_t7], 6 \n" - "shra.ph %[tmp_t8], %[tmp_t8], 6 \n" - "shra.ph %[tmp_t9], %[tmp_t9], 6 \n" - "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n" - "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n" - "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n" - "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n" - "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n" - "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n" - "ins %[tmp_t7], %[tmp_t8], 16, 16 \n" - "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n" - "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n" - "sw %[tmp_t8], 0(%[rgb_buf]) \n" - "sw %[tmp_t7], 4(%[rgb_buf]) \n" - ".set pop \n" - : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), - [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4), - [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6), - [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9) - : [src_y] "r"(src_y), [src_uv] "r"(src_uv), [yg] 
"r"(yg), - [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), - [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), - [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg), [rgb_buf] "r"(rgb_buf), - [tmp_mask] "r"(tmp_mask)); - - src_y += 2; - src_uv += 2; - rgb_buf += 8; // Advance 2 pixels. - } -} - -void BGRAToUVRow_DSPR2(const uint8* src_rgb0, - int src_stride_rgb, - uint8* dst_u, - uint8* dst_v, - int width) { - const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; - int x; - int const1 = 0xffda0000; - int const2 = 0x0070ffb6; - int const3 = 0x00700000; - int const4 = 0xffeeffa2; - int const5 = 0x100; - for (x = 0; x < width - 1; x += 2) { - int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5; - int tmp_t6, tmp_t7, tmp_t8; - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - "lw %[tmp_t1], 0(%[src_rgb0]) \n" - "lw %[tmp_t2], 4(%[src_rgb0]) \n" - "lw %[tmp_t3], 0(%[src_rgb1]) \n" - "lw %[tmp_t4], 4(%[src_rgb1]) \n" - "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n" - "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n" - "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n" - "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n" - "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n" - "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n" - "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n" - "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n" - "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n" - "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n" - "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n" - "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n" - "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n" - "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n" - "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n" - "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n" - "mult $ac0, %[const5], %[const5] \n" - "mult $ac1, %[const5], %[const5] \n" - "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n" - "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n" - "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n" - "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n" - "extr_r.w %[tmp_t7], $ac0, 9 \n" - "extr_r.w %[tmp_t8], $ac1, 9 \n" - "addiu %[dst_u], %[dst_u], 1 \n" - "addiu %[dst_v], %[dst_v], 1 \n" - "addiu %[src_rgb0], %[src_rgb0], 8 \n" - "addiu %[src_rgb1], %[src_rgb1], 8 \n" - "sb %[tmp_t7], -1(%[dst_u]) \n" - "sb %[tmp_t8], -1(%[dst_v]) \n" - ".set pop \n" - : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), - [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4), - [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6), - [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), - [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1), - [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v) - : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3), - [const4] "r"(const4), [const5] "r"(const5) - : "hi", "lo", "$ac1lo", "$ac1hi"); - } -} - -void BGRAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) { - int x; - int const1 = 0x00420000; - int const2 = 0x00190081; - int const5 = 0x40; - for (x = 0; x < width; x += 4) { - int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5; - int tmp_t6, tmp_t7, tmp_t8; - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - "lw %[tmp_t1], 0(%[src_argb0]) \n" - "lw %[tmp_t2], 4(%[src_argb0]) \n" - "lw %[tmp_t3], 8(%[src_argb0]) \n" - "lw %[tmp_t4], 12(%[src_argb0]) \n" - "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n" - "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n" - "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n" - "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n" - "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n" - "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n" - "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n" - "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n" - "mult $ac0, %[const5], %[const5] \n" - "mult $ac1, 
%[const5], %[const5] \n"
-        "mult $ac2, %[const5], %[const5] \n"
-        "mult $ac3, %[const5], %[const5] \n"
-        "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
-        "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
-        "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
-        "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
-        "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
-        "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
-        "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
-        "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
-        "extr_r.w %[tmp_t1], $ac0, 8 \n"
-        "extr_r.w %[tmp_t2], $ac1, 8 \n"
-        "extr_r.w %[tmp_t3], $ac2, 8 \n"
-        "extr_r.w %[tmp_t4], $ac3, 8 \n"
-        "addiu %[src_argb0],%[src_argb0], 16 \n"
-        "addiu %[dst_y], %[dst_y], 4 \n"
-        "sb %[tmp_t1], -4(%[dst_y]) \n"
-        "sb %[tmp_t2], -3(%[dst_y]) \n"
-        "sb %[tmp_t3], -2(%[dst_y]) \n"
-        "sb %[tmp_t4], -1(%[dst_y]) \n"
-        ".set pop \n"
-        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
-          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
-          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
-          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
-          [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
-        : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
-        : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
-          "$ac3hi");
-  }
-}
-
-void ABGRToUVRow_DSPR2(const uint8* src_rgb0,
-                       int src_stride_rgb,
-                       uint8* dst_u,
-                       uint8* dst_v,
-                       int width) {
-  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
-  int x;
-  int const1 = 0xffb6ffda;
-  int const2 = 0x00000070;
-  int const3 = 0xffa20070;
-  int const4 = 0x0000ffee;
-  int const5 = 0x100;
-
-  for (x = 0; x < width - 1; x += 2) {
-    int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
-    int tmp_t6, tmp_t7, tmp_t8;
-    __asm__ __volatile__(
-        ".set push \n"
-        ".set noreorder \n"
-        "lw %[tmp_t1], 0(%[src_rgb0]) \n"
-        "lw %[tmp_t2], 4(%[src_rgb0]) \n"
-        "lw %[tmp_t3], 0(%[src_rgb1]) \n"
-        "lw %[tmp_t4], 4(%[src_rgb1]) \n"
-        "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
-        "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
-        "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
-        "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
-        "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
-        "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
-        "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
-        "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
-        "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
-        "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
-        "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
-        "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
-        "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
-        "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
-        "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
-        "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
-        "mult $ac0, %[const5], %[const5] \n"
-        "mult $ac1, %[const5], %[const5] \n"
-        "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
-        "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
-        "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
-        "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
-        "extr_r.w %[tmp_t7], $ac0, 9 \n"
-        "extr_r.w %[tmp_t8], $ac1, 9 \n"
-        "addiu %[dst_u], %[dst_u], 1 \n"
-        "addiu %[dst_v], %[dst_v], 1 \n"
-        "addiu %[src_rgb0], %[src_rgb0], 8 \n"
-        "addiu %[src_rgb1], %[src_rgb1], 8 \n"
-        "sb %[tmp_t7], -1(%[dst_u]) \n"
-        "sb %[tmp_t8], -1(%[dst_v]) \n"
-        ".set pop \n"
-        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
-          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
-          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
-          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
-          [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
-          [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
-        : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
-          [const4] "r"(const4), [const5] "r"(const5)
-        : "hi", "lo",
-          "$ac1lo", "$ac1hi");
-  }
-}
-
-void ARGBToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
-  int x;
-  int const1 = 0x00810019;
-  int const2 = 0x00000042;
-  int const5 = 0x40;
-  for (x = 0; x < width; x += 4) {
-    int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
-    int tmp_t6, tmp_t7, tmp_t8;
-    __asm__ __volatile__(
-        ".set push \n"
-        ".set noreorder \n"
-        "lw %[tmp_t1], 0(%[src_argb0]) \n"
-        "lw %[tmp_t2], 4(%[src_argb0]) \n"
-        "lw %[tmp_t3], 8(%[src_argb0]) \n"
-        "lw %[tmp_t4], 12(%[src_argb0]) \n"
-        "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
-        "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
-        "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
-        "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
-        "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
-        "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
-        "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
-        "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
-        "mult $ac0, %[const5], %[const5] \n"
-        "mult $ac1, %[const5], %[const5] \n"
-        "mult $ac2, %[const5], %[const5] \n"
-        "mult $ac3, %[const5], %[const5] \n"
-        "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
-        "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
-        "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
-        "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
-        "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
-        "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
-        "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
-        "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
-        "extr_r.w %[tmp_t1], $ac0, 8 \n"
-        "extr_r.w %[tmp_t2], $ac1, 8 \n"
-        "extr_r.w %[tmp_t3], $ac2, 8 \n"
-        "extr_r.w %[tmp_t4], $ac3, 8 \n"
-        "addiu %[dst_y], %[dst_y], 4 \n"
-        "addiu %[src_argb0],%[src_argb0], 16 \n"
-        "sb %[tmp_t1], -4(%[dst_y]) \n"
-        "sb %[tmp_t2], -3(%[dst_y]) \n"
-        "sb %[tmp_t3], -2(%[dst_y]) \n"
-        "sb %[tmp_t4], -1(%[dst_y]) \n"
-        ".set pop \n"
-        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
-          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
-          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
-          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
-          [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
-        : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
-        : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
-          "$ac3hi");
-  }
-}
-
-void ABGRToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
-  int x;
-  int const1 = 0x00810042;
-  int const2 = 0x00000019;
-  int const5 = 0x40;
-  for (x = 0; x < width; x += 4) {
-    int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
-    int tmp_t6, tmp_t7, tmp_t8;
-    __asm__ __volatile__(
-        ".set push \n"
-        ".set noreorder \n"
-        "lw %[tmp_t1], 0(%[src_argb0]) \n"
-        "lw %[tmp_t2], 4(%[src_argb0]) \n"
-        "lw %[tmp_t3], 8(%[src_argb0]) \n"
-        "lw %[tmp_t4], 12(%[src_argb0]) \n"
-        "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
-        "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
-        "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
-        "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
-        "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
-        "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
-        "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
-        "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
-        "mult $ac0, %[const5], %[const5] \n"
-        "mult $ac1, %[const5], %[const5] \n"
-        "mult $ac2, %[const5], %[const5] \n"
-        "mult $ac3, %[const5], %[const5] \n"
-        "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
-        "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
-        "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
-        "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
-        "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
-        "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
-        "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
-        "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
-        "extr_r.w %[tmp_t1], $ac0, 8 \n"
-        "extr_r.w %[tmp_t2], $ac1, 8 \n"
-        "extr_r.w %[tmp_t3], $ac2, 8 \n"
-        "extr_r.w %[tmp_t4], $ac3, 8 \n"
-        "addiu %[src_argb0],%[src_argb0], 16 \n"
-        "addiu %[dst_y], %[dst_y], 4 \n"
-        "sb %[tmp_t1], -4(%[dst_y]) \n"
-        "sb %[tmp_t2], -3(%[dst_y]) \n"
-        "sb %[tmp_t3], -2(%[dst_y]) \n"
-        "sb %[tmp_t4], -1(%[dst_y]) \n"
-        ".set pop \n"
-        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
-          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
-          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
-          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
-          [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
-        : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
-        : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
-          "$ac3hi");
-  }
-}
-
-void RGBAToUVRow_DSPR2(const uint8* src_rgb0,
-                       int src_stride_rgb,
-                       uint8* dst_u,
-                       uint8* dst_v,
-                       int width) {
-  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
-  int x;
-  int const1 = 0xffb60070;
-  int const2 = 0x0000ffda;
-  int const3 = 0xffa2ffee;
-  int const4 = 0x00000070;
-  int const5 = 0x100;
-
-  for (x = 0; x < width - 1; x += 2) {
-    int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
-    int tmp_t6, tmp_t7, tmp_t8;
-    __asm__ __volatile__(
-        ".set push \n"
-        ".set noreorder \n"
-        "ulw %[tmp_t1], 0+1(%[src_rgb0]) \n"
-        "ulw %[tmp_t2], 4+1(%[src_rgb0]) \n"
-        "ulw %[tmp_t3], 0+1(%[src_rgb1]) \n"
-        "ulw %[tmp_t4], 4+1(%[src_rgb1]) \n"
-        "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
-        "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
-        "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
-        "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
-        "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
-        "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
-        "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
-        "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
-        "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
-        "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
-        "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
-        "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
-        "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
-        "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
-        "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
-        "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
-        "mult $ac0, %[const5], %[const5] \n"
-        "mult $ac1, %[const5], %[const5] \n"
-        "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
-        "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
-        "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
-        "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
-        "extr_r.w %[tmp_t7], $ac0, 9 \n"
-        "extr_r.w %[tmp_t8], $ac1, 9 \n"
-        "addiu %[src_rgb0], %[src_rgb0], 8 \n"
-        "addiu %[src_rgb1], %[src_rgb1], 8 \n"
-        "addiu %[dst_u], %[dst_u], 1 \n"
-        "addiu %[dst_v], %[dst_v], 1 \n"
-        "sb %[tmp_t7], -1(%[dst_u]) \n"
-        "sb %[tmp_t8], -1(%[dst_v]) \n"
-        ".set pop \n"
-        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
-          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
-          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
-          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
-          [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
-          [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
-        : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
-          [const4] "r"(const4), [const5] "r"(const5)
-        : "hi", "lo", "$ac1lo", "$ac1hi");
-  }
-}
-
-void RGBAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
-  int x;
-  int const1 = 0x00420081;
-  int const2 = 0x00190000;
-  int const5 = 0x40;
-  for (x = 0; x < width; x += 4) {
-    int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
-    int tmp_t6, tmp_t7, tmp_t8;
-    __asm__ __volatile__(
-        ".set push \n"
-        ".set noreorder \n"
-        "lw %[tmp_t1], 0(%[src_argb0]) \n"
-        "lw %[tmp_t2], 4(%[src_argb0]) \n"
-        "lw %[tmp_t3], 8(%[src_argb0]) \n"
-        "lw %[tmp_t4], 12(%[src_argb0]) \n"
-        "preceu.ph.qbl %[tmp_t5], %[tmp_t1] \n"
-        "preceu.ph.qbr %[tmp_t1], %[tmp_t1] \n"
-        "preceu.ph.qbl %[tmp_t6], %[tmp_t2] \n"
-        "preceu.ph.qbr %[tmp_t2], %[tmp_t2] \n"
-        "preceu.ph.qbl %[tmp_t7], %[tmp_t3] \n"
-        "preceu.ph.qbr %[tmp_t3], %[tmp_t3] \n"
-        "preceu.ph.qbl %[tmp_t8], %[tmp_t4] \n"
-        "preceu.ph.qbr %[tmp_t4], %[tmp_t4] \n"
-        "mult $ac0, %[const5], %[const5] \n"
-        "mult $ac1, %[const5], %[const5] \n"
-        "mult $ac2, %[const5], %[const5] \n"
-        "mult $ac3, %[const5], %[const5] \n"
-        "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
-        "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
-        "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
-        "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
-        "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
-        "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
-        "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
-        "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
-        "extr_r.w %[tmp_t1], $ac0, 8 \n"
-        "extr_r.w %[tmp_t2], $ac1, 8 \n"
-        "extr_r.w %[tmp_t3], $ac2, 8 \n"
-        "extr_r.w %[tmp_t4], $ac3, 8 \n"
-        "addiu %[dst_y], %[dst_y], 4 \n"
-        "addiu %[src_argb0],%[src_argb0], 16 \n"
-        "sb %[tmp_t1], -4(%[dst_y]) \n"
-        "sb %[tmp_t2], -3(%[dst_y]) \n"
-        "sb %[tmp_t3], -2(%[dst_y]) \n"
-        "sb %[tmp_t4], -1(%[dst_y]) \n"
-        ".set pop \n"
-        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
-          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
-          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
-          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
-          [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
-        : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
-        : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
-          "$ac3hi");
-  }
-}
-
-void ARGBToUVRow_DSPR2(const uint8* src_rgb0,
-                       int src_stride_rgb,
-                       uint8* dst_u,
-                       uint8* dst_v,
-                       int width) {
-  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
-  int x;
-  int const1 = 0xffb60070;
-  int const2 = 0x0000ffda;
-  int const3 = 0xffa2ffee;
-  int const4 = 0x00000070;
-  int const5 = 0x100;
-
-  for (x = 0; x < width - 1; x += 2) {
-    int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
-    int tmp_t6, tmp_t7, tmp_t8;
-    __asm__ __volatile__(
-        ".set push \n"
-        ".set noreorder \n"
-        "lw %[tmp_t1], 0(%[src_rgb0]) \n"
-        "lw %[tmp_t2], 4(%[src_rgb0]) \n"
-        "lw %[tmp_t3], 0(%[src_rgb1]) \n"
-        "lw %[tmp_t4], 4(%[src_rgb1]) \n"
-        "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
-        "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
-        "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
-        "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
-        "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
-        "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
-        "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
-        "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
-        "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
-        "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
-        "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
-        "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
-        "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
-        "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
-        "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
-        "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
-        "mult $ac0, %[const5], %[const5] \n"
-        "mult $ac1, %[const5], %[const5] \n"
-        "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
-        "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
-        "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
-        "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
-        "extr_r.w %[tmp_t7], $ac0, 9 \n"
-        "extr_r.w %[tmp_t8], $ac1, 9 \n"
-        "addiu %[src_rgb0], %[src_rgb0], 8 \n"
-        "addiu %[src_rgb1], %[src_rgb1], 8 \n"
-        "addiu %[dst_u], %[dst_u], 1 \n"
-        "addiu %[dst_v], %[dst_v], 1 \n"
-        "sb %[tmp_t7], -1(%[dst_u]) \n"
-        "sb %[tmp_t8], -1(%[dst_v]) \n"
-        ".set pop \n"
-        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
-          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
-          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
-          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
-          [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
-          [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
-        : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
-          [const4] "r"(const4), [const5] "r"(const5)
-        : "hi", "lo", "$ac1lo", "$ac1hi");
-  }
-}
-
-#endif  // __mips_dsp_rev >= 2
-
-#endif  // defined(__mips__)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/files/source/row_gcc.cc b/files/source/row_gcc.cc
index decd3d2e..dce8c439 100644
--- a/files/source/row_gcc.cc
+++ b/files/source/row_gcc.cc
@@ -9,25 +9,26 @@
  */
 
 #include "libyuv/row.h"
-
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
 // This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
 
 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
 
 // Constants for ARGB
-static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
-                              13, 65, 33, 0, 13, 65, 33, 0};
+static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
+                               25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};
 
 // JPeg full range.
-static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
-                               15, 75, 38, 0, 15, 75, 38, 0};
+static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
+                                29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};
+
+static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
+                                0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
 
 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
@@ -45,8 +46,8 @@ static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                                -20, -107, 127, 0, -20, -107, 127, 0};
 
 // Constants for BGRA
-static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
-                              0, 33, 65, 13, 0, 33, 65, 13};
+static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
+                               0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};
 
 static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                               0, -38, -74, 112, 0, -38, -74, 112};
@@ -55,8 +56,8 @@ static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                               0, 112, -94, -18, 0, 112, -94, -18};
 
 // Constants for ABGR
-static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
-                              33, 65, 13, 0, 33, 65, 13, 0};
+static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
+                               66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};
 
 static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                               -38, -74, 112, 0, -38, -74, 112, 0};
@@ -65,8 +66,8 @@ static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                               112, -94, -18, 0, 112, -94, -18, 0};
 
 // Constants for RGBA.
-static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
-                              0, 13, 65, 33, 0, 13, 65, 33};
+static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
+                               0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};
 
 static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                               0, 112, -74, -38, 0, 112, -74, -38};
@@ -74,17 +75,15 @@ static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
 static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                               0, -18, -94, 112, 0, -18, -94, 112};
 
-static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
-                              16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
-
-// 7 bit fixed point 0.5.
-static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
+static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
+                               0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};
 
 static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                                 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
-static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
-                                  0x8080u, 0x8080u, 0x8080u, 0x8080u};
+static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+                               0x8080u, 0x8080u, 0x8080u, 0x8080u};
+
 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
 
 #ifdef HAS_RGB24TOARGBROW_SSSE3
@@ -97,6 +96,10 @@ static const uvec8 kShuffleMaskRGB24ToARGB = {
 static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
                                             8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
 
+// Shuffle table for converting RAW to RGBA.
+static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u, 4u, 3u,
+                                            14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u};
+
 // Shuffle table for converting RAW to RGB24. First 8.
 static const uvec8 kShuffleMaskRAWToRGB24_0 = {
     2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
@@ -154,24 +157,24 @@ static const lvec8 kShuffleNV21 = {
 #ifdef HAS_J400TOARGBROW_SSE2
 void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
   asm volatile(
-      "pcmpeqb %%xmm5,%%xmm5 \n"
-      "pslld $0x18,%%xmm5 \n"
+      "pcmpeqb %%xmm5,%%xmm5 \n"
+      "pslld $0x18,%%xmm5 \n"
 
     LABELALIGN
       "1: \n"
-      "movq (%0),%%xmm0 \n"
-      "lea 0x8(%0),%0 \n"
-      "punpcklbw %%xmm0,%%xmm0 \n"
-      "movdqa %%xmm0,%%xmm1 \n"
-      "punpcklwd %%xmm0,%%xmm0 \n"
-      "punpckhwd %%xmm1,%%xmm1 \n"
-      "por %%xmm5,%%xmm0 \n"
-      "por %%xmm5,%%xmm1 \n"
-      "movdqu %%xmm0,(%1) \n"
-      "movdqu %%xmm1,0x10(%1) \n"
-      "lea 0x20(%1),%1 \n"
-      "sub $0x8,%2 \n"
-      "jg 1b \n"
+      "movq (%0),%%xmm0 \n"
+      "lea 0x8(%0),%0 \n"
+      "punpcklbw %%xmm0,%%xmm0 \n"
+      "movdqa %%xmm0,%%xmm1 \n"
+      "punpcklwd %%xmm0,%%xmm0 \n"
+      "punpckhwd %%xmm1,%%xmm1 \n"
+      "por %%xmm5,%%xmm0 \n"
+      "por %%xmm5,%%xmm1 \n"
+      "movdqu %%xmm0,(%1) \n"
+      "movdqu %%xmm1,0x10(%1) \n"
+      "lea 0x20(%1),%1 \n"
+      "sub $0x8,%2 \n"
+      "jg 1b \n"
     : "+r"(src_y),     // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
@@ -185,35 +188,35 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
                           uint8_t* dst_argb,
                           int width) {
   asm volatile(
-      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0xff000000
-      "pslld $0x18,%%xmm5 \n"
-      "movdqa %3,%%xmm4 \n"
-
-    LABELALIGN
-      "1: \n"
-      "movdqu (%0),%%xmm0 \n"
-      "movdqu 0x10(%0),%%xmm1 \n"
-      "movdqu 0x20(%0),%%xmm3 \n"
-      "lea 0x30(%0),%0 \n"
-      "movdqa %%xmm3,%%xmm2 \n"
-      "palignr $0x8,%%xmm1,%%xmm2 \n"
-      "pshufb %%xmm4,%%xmm2 \n"
-      "por %%xmm5,%%xmm2 \n"
-      "palignr $0xc,%%xmm0,%%xmm1 \n"
-      "pshufb %%xmm4,%%xmm0 \n"
-      "movdqu %%xmm2,0x20(%1) \n"
-      "por %%xmm5,%%xmm0 \n"
-      "pshufb %%xmm4,%%xmm1 \n"
-      "movdqu %%xmm0,(%1) \n"
-      "por %%xmm5,%%xmm1 \n"
-      "palignr $0x4,%%xmm3,%%xmm3 \n"
-      "pshufb %%xmm4,%%xmm3 \n"
-      "movdqu %%xmm1,0x10(%1) \n"
-      "por %%xmm5,%%xmm3 \n"
-      "movdqu %%xmm3,0x30(%1) \n"
-      "lea 0x40(%1),%1 \n"
-      "sub $0x10,%2 \n"
-      "jg 1b \n"
+      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0xff000000
+      "pslld $0x18,%%xmm5 \n"
+      "movdqa %3,%%xmm4 \n"
+
+    LABELALIGN
+      "1: \n"
+      "movdqu (%0),%%xmm0 \n"
+      "movdqu 0x10(%0),%%xmm1 \n"
+      "movdqu 0x20(%0),%%xmm3 \n"
+      "lea 0x30(%0),%0 \n"
+      "movdqa %%xmm3,%%xmm2 \n"
+      "palignr $0x8,%%xmm1,%%xmm2 \n"
+      "pshufb %%xmm4,%%xmm2 \n"
+      "por %%xmm5,%%xmm2 \n"
+      "palignr $0xc,%%xmm0,%%xmm1 \n"
+      "pshufb %%xmm4,%%xmm0 \n"
+      "movdqu %%xmm2,0x20(%1) \n"
+      "por %%xmm5,%%xmm0 \n"
+      "pshufb %%xmm4,%%xmm1 \n"
+      "movdqu %%xmm0,(%1) \n"
+      "por %%xmm5,%%xmm1 \n"
+      "palignr $0x4,%%xmm3,%%xmm3 \n"
+      "pshufb %%xmm4,%%xmm3 \n"
+      "movdqu %%xmm1,0x10(%1) \n"
+      "por %%xmm5,%%xmm3 \n"
+      "movdqu %%xmm3,0x30(%1) \n"
+      "lea 0x40(%1),%1 \n"
+      "sub $0x10,%2 \n"
+      "jg 1b \n"
    : "+r"(src_rgb24),  // %0
      "+r"(dst_argb),   // %1
      "+r"(width)       // %2
@@ -223,35 +226,35 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
 
 void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
   asm volatile(
-      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0xff000000
-      "pslld $0x18,%%xmm5 \n"
-      "movdqa %3,%%xmm4 \n"
-
-    LABELALIGN
-      "1: \n"
-      "movdqu (%0),%%xmm0 \n"
-      "movdqu 0x10(%0),%%xmm1 \n"
-      "movdqu 0x20(%0),%%xmm3 \n"
-      "lea 0x30(%0),%0 \n"
-      "movdqa %%xmm3,%%xmm2 \n"
-      "palignr $0x8,%%xmm1,%%xmm2 \n"
-      "pshufb %%xmm4,%%xmm2 \n"
-      "por %%xmm5,%%xmm2 \n"
-      "palignr $0xc,%%xmm0,%%xmm1 \n"
-      "pshufb %%xmm4,%%xmm0 \n"
-      "movdqu %%xmm2,0x20(%1) \n"
-      "por %%xmm5,%%xmm0 \n"
-      "pshufb %%xmm4,%%xmm1 \n"
-      "movdqu %%xmm0,(%1) \n"
-      "por %%xmm5,%%xmm1 \n"
-      "palignr $0x4,%%xmm3,%%xmm3 \n"
-      "pshufb %%xmm4,%%xmm3 \n"
-      "movdqu %%xmm1,0x10(%1) \n"
-      "por %%xmm5,%%xmm3 \n"
-      "movdqu %%xmm3,0x30(%1) \n"
-      "lea 0x40(%1),%1 \n"
-      "sub $0x10,%2 \n"
-      "jg 1b \n"
+      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0xff000000
+      "pslld $0x18,%%xmm5 \n"
+      "movdqa %3,%%xmm4 \n"
+
+    LABELALIGN
+      "1: \n"
+      "movdqu (%0),%%xmm0 \n"
+      "movdqu 0x10(%0),%%xmm1 \n"
+      "movdqu 0x20(%0),%%xmm3 \n"
+      "lea 0x30(%0),%0 \n"
+      "movdqa %%xmm3,%%xmm2 \n"
+      "palignr $0x8,%%xmm1,%%xmm2 \n"
+      "pshufb %%xmm4,%%xmm2 \n"
+      "por %%xmm5,%%xmm2 \n"
+      "palignr $0xc,%%xmm0,%%xmm1 \n"
+      "pshufb %%xmm4,%%xmm0 \n"
+      "movdqu %%xmm2,0x20(%1) \n"
+      "por %%xmm5,%%xmm0 \n"
+      "pshufb %%xmm4,%%xmm1 \n"
+      "movdqu %%xmm0,(%1) \n"
+      "por %%xmm5,%%xmm1 \n"
+      "palignr $0x4,%%xmm3,%%xmm3 \n"
+      "pshufb %%xmm4,%%xmm3 \n"
+      "movdqu %%xmm1,0x10(%1) \n"
+      "por %%xmm5,%%xmm3 \n"
+      "movdqu %%xmm3,0x30(%1) \n"
+      "lea 0x40(%1),%1 \n"
+      "sub $0x10,%2 \n"
+      "jg 1b \n"
    : "+r"(src_raw),   // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
@@ -259,29 +262,68 @@ void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
    : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
+// Same code as RAWToARGB with a different shuffler and A in the low bits.
+void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+  asm volatile(
+      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0x000000ff
+      "psrld $0x18,%%xmm5 \n"
+      "movdqa %3,%%xmm4 \n"
+
+    LABELALIGN
+      "1: \n"
+      "movdqu (%0),%%xmm0 \n"
+      "movdqu 0x10(%0),%%xmm1 \n"
+      "movdqu 0x20(%0),%%xmm3 \n"
+      "lea 0x30(%0),%0 \n"
+      "movdqa %%xmm3,%%xmm2 \n"
+      "palignr $0x8,%%xmm1,%%xmm2 \n"
+      "pshufb %%xmm4,%%xmm2 \n"
+      "por %%xmm5,%%xmm2 \n"
+      "palignr $0xc,%%xmm0,%%xmm1 \n"
+      "pshufb %%xmm4,%%xmm0 \n"
+      "movdqu %%xmm2,0x20(%1) \n"
+      "por %%xmm5,%%xmm0 \n"
+      "pshufb %%xmm4,%%xmm1 \n"
+      "movdqu %%xmm0,(%1) \n"
+      "por %%xmm5,%%xmm1 \n"
+      "palignr $0x4,%%xmm3,%%xmm3 \n"
+      "pshufb %%xmm4,%%xmm3 \n"
+      "movdqu %%xmm1,0x10(%1) \n"
+      "por %%xmm5,%%xmm3 \n"
+      "movdqu %%xmm3,0x30(%1) \n"
+      "lea 0x40(%1),%1 \n"
+      "sub $0x10,%2 \n"
+      "jg 1b \n"
+    : "+r"(src_raw),   // %0
+      "+r"(dst_rgba),  // %1
+      "+r"(width)      // %2
+    : "m"(kShuffleMaskRAWToRGBA)  // %3
+    : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
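A scalar sketch of what the new RAWToRGBARow produces per pixel, assuming libyuv's memory orders (RAW = r,g,b and RGBA = a,b,g,r in byte order); the SSSE3 path gets the same layout from kShuffleMaskRAWToRGBA plus the 0x000000ff OR for alpha:

    #include <stdint.h>
    static void RAWToRGBARow_Sketch(const uint8_t* src_raw, uint8_t* dst_rgba,
                                    int width) {
      int x;
      for (x = 0; x < width; ++x) {
        dst_rgba[0] = 255u;        // A: OR'd in from xmm5 in the SIMD path.
        dst_rgba[1] = src_raw[2];  // B
        dst_rgba[2] = src_raw[1];  // G
        dst_rgba[3] = src_raw[0];  // R
        src_raw += 3;
        dst_rgba += 4;
      }
    }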
 void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
                          uint8_t* dst_rgb24,
                          int width) {
   asm volatile(
-      "movdqa %3,%%xmm3 \n"
-      "movdqa %4,%%xmm4 \n"
-      "movdqa %5,%%xmm5 \n"
+      "movdqa %3,%%xmm3 \n"
+      "movdqa %4,%%xmm4 \n"
+      "movdqa %5,%%xmm5 \n"
 
     LABELALIGN
       "1: \n"
-      "movdqu (%0),%%xmm0 \n"
-      "movdqu 0x4(%0),%%xmm1 \n"
-      "movdqu 0x8(%0),%%xmm2 \n"
-      "lea 0x18(%0),%0 \n"
-      "pshufb %%xmm3,%%xmm0 \n"
-      "pshufb %%xmm4,%%xmm1 \n"
-      "pshufb %%xmm5,%%xmm2 \n"
-      "movq %%xmm0,(%1) \n"
-      "movq %%xmm1,0x8(%1) \n"
-      "movq %%xmm2,0x10(%1) \n"
-      "lea 0x18(%1),%1 \n"
-      "sub $0x8,%2 \n"
-      "jg 1b \n"
+      "movdqu (%0),%%xmm0 \n"
+      "movdqu 0x4(%0),%%xmm1 \n"
+      "movdqu 0x8(%0),%%xmm2 \n"
+      "lea 0x18(%0),%0 \n"
+      "pshufb %%xmm3,%%xmm0 \n"
+      "pshufb %%xmm4,%%xmm1 \n"
+      "pshufb %%xmm5,%%xmm2 \n"
+      "movq %%xmm0,(%1) \n"
+      "movq %%xmm1,0x8(%1) \n"
+      "movq %%xmm2,0x10(%1) \n"
+      "lea 0x18(%1),%1 \n"
+      "sub $0x8,%2 \n"
+      "jg 1b \n"
    : "+r"(src_raw),    // %0
      "+r"(dst_rgb24),  // %1
      "+r"(width)       // %2
@@ -293,44 +335,44 @@ void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
 
 void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
   asm volatile(
-      "mov $0x1080108,%%eax \n"
-      "movd %%eax,%%xmm5 \n"
-      "pshufd $0x0,%%xmm5,%%xmm5 \n"
-      "mov $0x20802080,%%eax \n"
-      "movd %%eax,%%xmm6 \n"
-      "pshufd $0x0,%%xmm6,%%xmm6 \n"
-      "pcmpeqb %%xmm3,%%xmm3 \n"
-      "psllw $0xb,%%xmm3 \n"
-      "pcmpeqb %%xmm4,%%xmm4 \n"
-      "psllw $0xa,%%xmm4 \n"
-      "psrlw $0x5,%%xmm4 \n"
-      "pcmpeqb %%xmm7,%%xmm7 \n"
-      "psllw $0x8,%%xmm7 \n"
-      "sub %0,%1 \n"
-      "sub %0,%1 \n"
-
-    LABELALIGN
-      "1: \n"
-      "movdqu (%0),%%xmm0 \n"
-      "movdqa %%xmm0,%%xmm1 \n"
-      "movdqa %%xmm0,%%xmm2 \n"
-      "pand %%xmm3,%%xmm1 \n"
-      "psllw $0xb,%%xmm2 \n"
-      "pmulhuw %%xmm5,%%xmm1 \n"
-      "pmulhuw %%xmm5,%%xmm2 \n"
-      "psllw $0x8,%%xmm1 \n"
-      "por %%xmm2,%%xmm1 \n"
-      "pand %%xmm4,%%xmm0 \n"
-      "pmulhuw %%xmm6,%%xmm0 \n"
-      "por %%xmm7,%%xmm0 \n"
-      "movdqa %%xmm1,%%xmm2 \n"
-      "punpcklbw %%xmm0,%%xmm1 \n"
-      "punpckhbw %%xmm0,%%xmm2 \n"
-      "movdqu %%xmm1,0x00(%1,%0,2) \n"
-      "movdqu %%xmm2,0x10(%1,%0,2) \n"
-      "lea 0x10(%0),%0 \n"
-      "sub $0x8,%2 \n"
-      "jg 1b \n"
+      "mov $0x1080108,%%eax \n"
+      "movd %%eax,%%xmm5 \n"
+      "pshufd $0x0,%%xmm5,%%xmm5 \n"
+      "mov $0x20802080,%%eax \n"
+      "movd %%eax,%%xmm6 \n"
+      "pshufd $0x0,%%xmm6,%%xmm6 \n"
+      "pcmpeqb %%xmm3,%%xmm3 \n"
+      "psllw $0xb,%%xmm3 \n"
+      "pcmpeqb %%xmm4,%%xmm4 \n"
+      "psllw $0xa,%%xmm4 \n"
+      "psrlw $0x5,%%xmm4 \n"
+      "pcmpeqb %%xmm7,%%xmm7 \n"
+      "psllw $0x8,%%xmm7 \n"
+      "sub %0,%1 \n"
+      "sub %0,%1 \n"
+
+    LABELALIGN
+      "1: \n"
+      "movdqu (%0),%%xmm0 \n"
+      "movdqa %%xmm0,%%xmm1 \n"
+      "movdqa %%xmm0,%%xmm2 \n"
+      "pand %%xmm3,%%xmm1 \n"
+      "psllw $0xb,%%xmm2 \n"
+      "pmulhuw %%xmm5,%%xmm1 \n"
+      "pmulhuw %%xmm5,%%xmm2 \n"
+      "psllw $0x8,%%xmm1 \n"
+      "por %%xmm2,%%xmm1 \n"
+      "pand %%xmm4,%%xmm0 \n"
+      "pmulhuw %%xmm6,%%xmm0 \n"
+      "por %%xmm7,%%xmm0 \n"
+      "movdqa %%xmm1,%%xmm2 \n"
+      "punpcklbw %%xmm0,%%xmm1 \n"
+      "punpckhbw %%xmm0,%%xmm2 \n"
+      "movdqu %%xmm1,0x00(%1,%0,2) \n"
+      "movdqu %%xmm2,0x10(%1,%0,2) \n"
+      "lea 0x10(%0),%0 \n"
+      "sub $0x8,%2 \n"
+      "jg 1b \n"
    : "+r"(src),    // %0
      "+r"(dst),    // %1
      "+r"(width)   // %2
@@ -341,47 +383,47 @@ void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
 
 void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
   asm volatile(
-      "mov $0x1080108,%%eax \n"
-      "movd %%eax,%%xmm5 \n"
-      "pshufd $0x0,%%xmm5,%%xmm5 \n"
-      "mov $0x42004200,%%eax \n"
-      "movd %%eax,%%xmm6 \n"
-      "pshufd $0x0,%%xmm6,%%xmm6 \n"
-      "pcmpeqb %%xmm3,%%xmm3 \n"
-      "psllw $0xb,%%xmm3 \n"
-      "movdqa %%xmm3,%%xmm4 \n"
-      "psrlw $0x6,%%xmm4 \n"
-      "pcmpeqb %%xmm7,%%xmm7 \n"
-      "psllw $0x8,%%xmm7 \n"
-      "sub %0,%1 \n"
-      "sub %0,%1 \n"
+      "mov $0x1080108,%%eax \n"
+      "movd %%eax,%%xmm5 \n"
+      "pshufd $0x0,%%xmm5,%%xmm5 \n"
+      "mov $0x42004200,%%eax \n"
+      "movd %%eax,%%xmm6 \n"
+      "pshufd $0x0,%%xmm6,%%xmm6 \n"
+      "pcmpeqb %%xmm3,%%xmm3 \n"
+      "psllw $0xb,%%xmm3 \n"
+      "movdqa %%xmm3,%%xmm4 \n"
+      "psrlw $0x6,%%xmm4 \n"
+      "pcmpeqb %%xmm7,%%xmm7 \n"
+      "psllw $0x8,%%xmm7 \n"
+      "sub %0,%1 \n"
+      "sub %0,%1 \n"
 
     LABELALIGN
       "1: \n"
-      "movdqu (%0),%%xmm0 \n"
-      "movdqa %%xmm0,%%xmm1 \n"
-      "movdqa %%xmm0,%%xmm2 \n"
-      "psllw $0x1,%%xmm1 \n"
-      "psllw $0xb,%%xmm2 \n"
-      "pand %%xmm3,%%xmm1 \n"
-      "pmulhuw %%xmm5,%%xmm2 \n"
-      "pmulhuw %%xmm5,%%xmm1 \n"
-      "psllw $0x8,%%xmm1 \n"
-      "por %%xmm2,%%xmm1 \n"
-      "movdqa %%xmm0,%%xmm2 \n"
-      "pand %%xmm4,%%xmm0 \n"
-      "psraw $0x8,%%xmm2 \n"
-      "pmulhuw %%xmm6,%%xmm0 \n"
-      "pand %%xmm7,%%xmm2 \n"
-      "por %%xmm2,%%xmm0 \n"
-      "movdqa %%xmm1,%%xmm2 \n"
-      "punpcklbw %%xmm0,%%xmm1 \n"
-      "punpckhbw %%xmm0,%%xmm2 \n"
-      "movdqu %%xmm1,0x00(%1,%0,2) \n"
-      "movdqu %%xmm2,0x10(%1,%0,2) \n"
-      "lea 0x10(%0),%0 \n"
-      "sub $0x8,%2 \n"
-      "jg 1b \n"
+      "movdqu (%0),%%xmm0 \n"
+      "movdqa %%xmm0,%%xmm1 \n"
+      "movdqa %%xmm0,%%xmm2 \n"
+      "psllw $0x1,%%xmm1 \n"
+      "psllw $0xb,%%xmm2 \n"
+      "pand %%xmm3,%%xmm1 \n"
+      "pmulhuw %%xmm5,%%xmm2 \n"
+      "pmulhuw %%xmm5,%%xmm1 \n"
+      "psllw $0x8,%%xmm1 \n"
+      "por %%xmm2,%%xmm1 \n"
+      "movdqa %%xmm0,%%xmm2 \n"
+      "pand %%xmm4,%%xmm0 \n"
+      "psraw $0x8,%%xmm2 \n"
+      "pmulhuw %%xmm6,%%xmm0 \n"
+      "pand %%xmm7,%%xmm2 \n"
+      "por %%xmm2,%%xmm0 \n"
+      "movdqa %%xmm1,%%xmm2 \n"
+      "punpcklbw %%xmm0,%%xmm1 \n"
+      "punpckhbw %%xmm0,%%xmm2 \n"
+      "movdqu %%xmm1,0x00(%1,%0,2) \n"
+      "movdqu %%xmm2,0x10(%1,%0,2) \n"
+      "lea 0x10(%0),%0 \n"
+      "sub $0x8,%2 \n"
+      "jg 1b \n"
    : "+r"(src),    // %0
      "+r"(dst),    // %1
      "+r"(width)   // %2
@@ -392,34 +434,34 @@ void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
 
 void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
   asm volatile(
-      "mov $0xf0f0f0f,%%eax \n"
-      "movd %%eax,%%xmm4 \n"
-      "pshufd $0x0,%%xmm4,%%xmm4 \n"
-      "movdqa %%xmm4,%%xmm5 \n"
-      "pslld $0x4,%%xmm5 \n"
-      "sub %0,%1 \n"
-      "sub %0,%1 \n"
+      "mov $0xf0f0f0f,%%eax \n"
+      "movd %%eax,%%xmm4 \n"
+      "pshufd $0x0,%%xmm4,%%xmm4 \n"
+      "movdqa %%xmm4,%%xmm5 \n"
+      "pslld $0x4,%%xmm5 \n"
+      "sub %0,%1 \n"
+      "sub %0,%1 \n"
 
     LABELALIGN
       "1: \n"
-      "movdqu (%0),%%xmm0 \n"
-      "movdqa %%xmm0,%%xmm2 \n"
-      "pand %%xmm4,%%xmm0 \n"
-      "pand %%xmm5,%%xmm2 \n"
-      "movdqa %%xmm0,%%xmm1 \n"
-      "movdqa %%xmm2,%%xmm3 \n"
-      "psllw $0x4,%%xmm1 \n"
-      "psrlw $0x4,%%xmm3 \n"
-      "por %%xmm1,%%xmm0 \n"
-      "por %%xmm3,%%xmm2 \n"
-      "movdqa %%xmm0,%%xmm1 \n"
-      "punpcklbw %%xmm2,%%xmm0 \n"
-      "punpckhbw %%xmm2,%%xmm1 \n"
-      "movdqu %%xmm0,0x00(%1,%0,2) \n"
-      "movdqu %%xmm1,0x10(%1,%0,2) \n"
-      "lea 0x10(%0),%0 \n"
-      "sub $0x8,%2 \n"
-      "jg 1b \n"
+      "movdqu (%0),%%xmm0 \n"
+      "movdqa %%xmm0,%%xmm2 \n"
+      "pand %%xmm4,%%xmm0 \n"
+      "pand %%xmm5,%%xmm2 \n"
+      "movdqa %%xmm0,%%xmm1 \n"
+      "movdqa %%xmm2,%%xmm3 \n"
+      "psllw $0x4,%%xmm1 \n"
+      "psrlw $0x4,%%xmm3 \n"
+      "por %%xmm1,%%xmm0 \n"
+      "por %%xmm3,%%xmm2 \n"
+      "movdqa %%xmm0,%%xmm1 \n"
+      "punpcklbw %%xmm2,%%xmm0 \n"
+      "punpckhbw %%xmm2,%%xmm1 \n"
+      "movdqu %%xmm0,0x00(%1,%0,2) \n"
+      "movdqu %%xmm1,0x10(%1,%0,2) \n"
+      "lea 0x10(%0),%0 \n"
+      "sub $0x8,%2 \n"
+      "jg 1b \n"
    : "+r"(src),    // %0
      "+r"(dst),    // %1
      "+r"(width)   // %2
"movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1,0x10(%1) \n" - "movdqu %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -469,35 +511,35 @@ void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "movdqa %3,%%xmm6 \n" + "movdqa %3,%%xmm6 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1,0x10(%1) \n" - "movdqu %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -512,37 +554,37 @@ static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7}; void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "vbroadcastf128 %3,%%ymm6 \n" - "vmovdqa %4,%%ymm7 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 - "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" - "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" - "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes - "vpermd %%ymm1,%%ymm7,%%ymm1 \n" - "vpermd %%ymm2,%%ymm7,%%ymm2 \n" - "vpermd %%ymm3,%%ymm7,%%ymm3 \n" - "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 - "vpor %%ymm4,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "vpermq 
$0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 - "vpermq $0x4f,%%ymm2,%%ymm4 \n" - "vpor %%ymm4,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 - "vpermq $0x93,%%ymm3,%%ymm3 \n" - "vpor %%ymm3,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm2,0x40(%1) \n" - "lea 0x60(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqa %4,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 + "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" + "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes + "vpermd %%ymm1,%%ymm7,%%ymm1 \n" + "vpermd %%ymm2,%%ymm7,%%ymm2 \n" + "vpermd %%ymm3,%%ymm7,%%ymm3 \n" + "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 + "vpermq $0x4f,%%ymm2,%%ymm4 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 + "vpermq $0x93,%%ymm3,%%ymm3 \n" + "vpor %%ymm3,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -571,26 +613,26 @@ static const ulvec8 kPermARGBToRGB24_2 = { void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "vmovdqa %3,%%ymm5 \n" - "vmovdqa %4,%%ymm6 \n" - "vmovdqa %5,%%ymm7 \n" + "vmovdqa %3,%%ymm5 \n" + "vmovdqa %4,%%ymm6 \n" + "vmovdqa %5,%%ymm7 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n" - "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n" - "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "vmovdqu %%ymm2,0x40(%1) \n" - "lea 0x60(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n" + "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n" + "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -606,37 +648,37 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "vbroadcastf128 %3,%%ymm6 \n" - "vmovdqa %4,%%ymm7 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 - "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" - "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" - "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes - "vpermd %%ymm1,%%ymm7,%%ymm1 \n" - "vpermd %%ymm2,%%ymm7,%%ymm2 \n" - "vpermd %%ymm3,%%ymm7,%%ymm3 \n" - "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 - "vpor %%ymm4,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 - "vpermq $0x4f,%%ymm2,%%ymm4 \n" - "vpor %%ymm4,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "vpermq 
$0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 - "vpermq $0x93,%%ymm3,%%ymm3 \n" - "vpor %%ymm3,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm2,0x40(%1) \n" - "lea 0x60(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqa %4,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 + "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" + "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes + "vpermd %%ymm1,%%ymm7,%%ymm1 \n" + "vpermd %%ymm2,%%ymm7,%%ymm2 \n" + "vpermd %%ymm3,%%ymm7,%%ymm3 \n" + "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 + "vpermq $0x4f,%%ymm2,%%ymm4 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 + "vpermq $0x93,%%ymm3,%%ymm3 \n" + "vpor %%ymm3,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -650,34 +692,34 @@ void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -690,40 +732,40 @@ void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, const uint32_t dither4, int width) { asm volatile( - "movd %3,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm6 \n" - "movdqa %%xmm6,%%xmm7 \n" - "punpcklwd %%xmm6,%%xmm6 \n" - "punpckhwd %%xmm7,%%xmm7 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "paddusb %%xmm6,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por 
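For reference, the 565 pack these rows perform reduces, per pixel, to keeping the top 5/6/5 bits of B/G/R; a scalar sketch (illustrative name, not libyuv API):

    #include <stdint.h>
    static uint16_t PackRGB565(uint8_t b, uint8_t g, uint8_t r) {
      // B in bits 0-4, G in bits 5-10, R in bits 11-15.
      return (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
    }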
@@ -690,40 +732,40 @@ void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
                                 const uint32_t dither4,
                                 int width) {
   asm volatile(
-      "movd %3,%%xmm6 \n"
-      "punpcklbw %%xmm6,%%xmm6 \n"
-      "movdqa %%xmm6,%%xmm7 \n"
-      "punpcklwd %%xmm6,%%xmm6 \n"
-      "punpckhwd %%xmm7,%%xmm7 \n"
-      "pcmpeqb %%xmm3,%%xmm3 \n"
-      "psrld $0x1b,%%xmm3 \n"
-      "pcmpeqb %%xmm4,%%xmm4 \n"
-      "psrld $0x1a,%%xmm4 \n"
-      "pslld $0x5,%%xmm4 \n"
-      "pcmpeqb %%xmm5,%%xmm5 \n"
-      "pslld $0xb,%%xmm5 \n"
-
-    LABELALIGN
-      "1: \n"
-      "movdqu (%0),%%xmm0 \n"
-      "paddusb %%xmm6,%%xmm0 \n"
-      "movdqa %%xmm0,%%xmm1 \n"
-      "movdqa %%xmm0,%%xmm2 \n"
-      "pslld $0x8,%%xmm0 \n"
-      "psrld $0x3,%%xmm1 \n"
-      "psrld $0x5,%%xmm2 \n"
-      "psrad $0x10,%%xmm0 \n"
-      "pand %%xmm3,%%xmm1 \n"
-      "pand %%xmm4,%%xmm2 \n"
-      "pand %%xmm5,%%xmm0 \n"
-      "por %%xmm2,%%xmm1 \n"
-      "por %%xmm1,%%xmm0 \n"
-      "packssdw %%xmm0,%%xmm0 \n"
-      "lea 0x10(%0),%0 \n"
-      "movq %%xmm0,(%1) \n"
-      "lea 0x8(%1),%1 \n"
-      "sub $0x4,%2 \n"
-      "jg 1b \n"
+      "movd %3,%%xmm6 \n"
+      "punpcklbw %%xmm6,%%xmm6 \n"
+      "movdqa %%xmm6,%%xmm7 \n"
+      "punpcklwd %%xmm6,%%xmm6 \n"
+      "punpckhwd %%xmm7,%%xmm7 \n"
+      "pcmpeqb %%xmm3,%%xmm3 \n"
+      "psrld $0x1b,%%xmm3 \n"
+      "pcmpeqb %%xmm4,%%xmm4 \n"
+      "psrld $0x1a,%%xmm4 \n"
+      "pslld $0x5,%%xmm4 \n"
+      "pcmpeqb %%xmm5,%%xmm5 \n"
+      "pslld $0xb,%%xmm5 \n"
+
+    LABELALIGN
+      "1: \n"
+      "movdqu (%0),%%xmm0 \n"
+      "paddusb %%xmm6,%%xmm0 \n"
+      "movdqa %%xmm0,%%xmm1 \n"
+      "movdqa %%xmm0,%%xmm2 \n"
+      "pslld $0x8,%%xmm0 \n"
+      "psrld $0x3,%%xmm1 \n"
+      "psrld $0x5,%%xmm2 \n"
+      "psrad $0x10,%%xmm0 \n"
+      "pand %%xmm3,%%xmm1 \n"
+      "pand %%xmm4,%%xmm2 \n"
+      "pand %%xmm5,%%xmm0 \n"
+      "por %%xmm2,%%xmm1 \n"
+      "por %%xmm1,%%xmm0 \n"
+      "packssdw %%xmm0,%%xmm0 \n"
+      "lea 0x10(%0),%0 \n"
+      "movq %%xmm0,(%1) \n"
+      "lea 0x8(%1),%1 \n"
+      "sub $0x4,%2 \n"
+      "jg 1b \n"
    : "+r"(src),    // %0
      "+r"(dst),    // %1
      "+r"(width)   // %2
@@ -739,35 +781,35 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
                                 int width) {
   asm volatile(
      "vbroadcastss %3,%%xmm6 \n"
-      "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
-      "vpermq $0xd8,%%ymm6,%%ymm6 \n"
-      "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
-      "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
-      "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
-      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
-      "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
-      "vpslld $0x5,%%ymm4,%%ymm4 \n"
-      "vpslld $0xb,%%ymm3,%%ymm5 \n"
-
-    LABELALIGN
-      "1: \n"
-      "vmovdqu (%0),%%ymm0 \n"
-      "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
-      "vpsrld $0x5,%%ymm0,%%ymm2 \n"
-      "vpsrld $0x3,%%ymm0,%%ymm1 \n"
-      "vpsrld $0x8,%%ymm0,%%ymm0 \n"
-      "vpand %%ymm4,%%ymm2,%%ymm2 \n"
-      "vpand %%ymm3,%%ymm1,%%ymm1 \n"
-      "vpand %%ymm5,%%ymm0,%%ymm0 \n"
-      "vpor %%ymm2,%%ymm1,%%ymm1 \n"
-      "vpor %%ymm1,%%ymm0,%%ymm0 \n"
-      "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
-      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
-      "lea 0x20(%0),%0 \n"
-      "vmovdqu %%xmm0,(%1) \n"
-      "lea 0x10(%1),%1 \n"
-      "sub $0x8,%2 \n"
-      "jg 1b \n"
+      "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
+      "vpermq $0xd8,%%ymm6,%%ymm6 \n"
+      "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
+      "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
+      "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
+      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+      "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
+      "vpslld $0x5,%%ymm4,%%ymm4 \n"
+      "vpslld $0xb,%%ymm3,%%ymm5 \n"
+
+    LABELALIGN
+      "1: \n"
+      "vmovdqu (%0),%%ymm0 \n"
+      "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
+      "vpsrld $0x5,%%ymm0,%%ymm2 \n"
+      "vpsrld $0x3,%%ymm0,%%ymm1 \n"
+      "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+      "vpand %%ymm4,%%ymm2,%%ymm2 \n"
+      "vpand %%ymm3,%%ymm1,%%ymm1 \n"
+      "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+      "vpor %%ymm2,%%ymm1,%%ymm1 \n"
+      "vpor %%ymm1,%%ymm0,%%ymm0 \n"
+      "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
+      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+      "lea 0x20(%0),%0 \n"
+      "vmovdqu %%xmm0,(%1) \n"
+      "lea 0x10(%1),%1 \n"
+      "sub $0x8,%2 \n"
+      "jg 1b \n"
      "vzeroupper \n"
    : "+r"(src),    // %0
      "+r"(dst),    // %1
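The dither rows add a per-column byte from dither4 with unsigned saturation (paddusb) before truncating to 565. A scalar sketch, assuming dither4 carries one offset byte per pixel column:

    #include <stdint.h>
    static uint16_t DitherPackRGB565(uint8_t b, uint8_t g, uint8_t r,
                                     uint8_t d) {
      int bd = b + d, gd = g + d, rd = r + d;  // saturate like paddusb
      if (bd > 255) bd = 255;
      if (gd > 255) gd = 255;
      if (rd > 255) rd = 255;
      return (uint16_t)((bd >> 3) | ((gd >> 2) << 5) | ((rd >> 3) << 11));
    }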
\n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "psrad $0x10,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x6,%%xmm2 \n" - "psrld $0x9,%%xmm3 \n" - "pand %%xmm7,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm6,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "psrad $0x10,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x6,%%xmm2 \n" + "psrld $0x9,%%xmm3 \n" + "pand %%xmm7,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm6,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -821,26 +863,26 @@ void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0xc,%%xmm4 \n" - "movdqa %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xc,%%xmm4 \n" + "movdqa %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm3 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm3,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "psrlq $0x4,%%xmm0 \n" - "psrlq $0x8,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm3,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "psrlq $0x4,%%xmm0 \n" + "psrlq $0x8,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -884,31 +926,31 @@ static const uint32_t kMulAG10 = 64 * 65536 + 1028; void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "movdqa %3,%%xmm2 \n" // shuffler for RB - "movd %4,%%xmm3 \n" // multipler for RB - "movd %5,%%xmm4 \n" // mask for R10 B10 - "movd %6,%%xmm5 \n" // mask for AG - "movd %7,%%xmm6 \n" // multipler for AG - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "sub %0,%1 \n" - - "1: \n" - "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" // R0B0 - "pand %%xmm5,%%xmm0 \n" // A0G0 - "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 - "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 - "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 - "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 - "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 - "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels - "add $0x10,%0 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqa %3,%%xmm2 \n" // shuffler for RB + "movd %4,%%xmm3 \n" // multipler for RB + "movd %5,%%xmm4 \n" // mask for R10 B10 + "movd %6,%%xmm5 \n" // mask for AG + "movd %7,%%xmm6 \n" // multipler for AG + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "sub %0,%1 \n" + + "1: \n" + "movdqu 
(%0),%%xmm0 \n" // fetch 4 ARGB pixels + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" // R0B0 + "pand %%xmm5,%%xmm0 \n" // A0G0 + "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 + "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 + "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 + "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 + "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 + "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels + "add $0x10,%0 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -923,31 +965,31 @@ void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "movdqa %3,%%xmm2 \n" // shuffler for RB - "movd %4,%%xmm3 \n" // multipler for RB - "movd %5,%%xmm4 \n" // mask for R10 B10 - "movd %6,%%xmm5 \n" // mask for AG - "movd %7,%%xmm6 \n" // multipler for AG - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "sub %0,%1 \n" - - "1: \n" - "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" // R0B0 - "pand %%xmm5,%%xmm0 \n" // A0G0 - "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 - "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 - "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 - "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 - "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 - "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels - "add $0x10,%0 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqa %3,%%xmm2 \n" // shuffler for RB + "movd %4,%%xmm3 \n" // multipler for RB + "movd %5,%%xmm4 \n" // mask for R10 B10 + "movd %6,%%xmm5 \n" // mask for AG + "movd %7,%%xmm6 \n" // multipler for AG + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "sub %0,%1 \n" + + "1: \n" + "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" // R0B0 + "pand %%xmm5,%%xmm0 \n" // A0G0 + "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 + "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 + "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 + "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 + "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 + "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels + "add $0x10,%0 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -964,25 +1006,25 @@ void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB - "vbroadcastss %4,%%ymm3 \n" // multipler for RB - "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 - "vbroadcastss %6,%%ymm5 \n" // mask for AG - "vbroadcastss %7,%%ymm6 \n" // multipler for AG - "sub %0,%1 \n" - - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels - "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 - "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 - "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 - "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 - "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 - "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 - "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels - "add $0x20,%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "vbroadcastss %4,%%ymm3 \n" // multipler for RB + "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 + "vbroadcastss %6,%%ymm5 \n" // mask for AG + "vbroadcastss %7,%%ymm6 \n" // multipler for AG + "sub 
%0,%1 \n" + + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels + "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 + "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 + "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 + "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 + "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 + "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels + "add $0x20,%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 @@ -1001,25 +1043,25 @@ void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB - "vbroadcastss %4,%%ymm3 \n" // multipler for RB - "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 - "vbroadcastss %6,%%ymm5 \n" // mask for AG - "vbroadcastss %7,%%ymm6 \n" // multipler for AG - "sub %0,%1 \n" - - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels - "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 - "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 - "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 - "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 - "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 - "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 - "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels - "add $0x20,%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "vbroadcastss %4,%%ymm3 \n" // multipler for RB + "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 + "vbroadcastss %6,%%ymm5 \n" // mask for AG + "vbroadcastss %7,%%ymm6 \n" // multipler for AG + "sub %0,%1 \n" + + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels + "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 + "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 + "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 + "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 + "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 + "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels + "add $0x20,%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 @@ -1034,222 +1076,490 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { } #endif +static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, + 10, 9, 8, 11, 14, 13, 12, 15}; + +static const uvec8 kShuffleARGBToAB64Lo = {2, 2, 1, 1, 0, 0, 3, 3, + 6, 6, 5, 5, 4, 4, 7, 7}; +static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11, + 14, 14, 13, 13, 12, 12, 15, 15}; + +void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} + +void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width) { + asm volatile( + + "movdqa %3,%%xmm2 \n" + "movdqa %4,%%xmm3 \n" LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm0 \n" + "pshufb %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x10(%0),%0 \n" + "lea 
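A scalar sketch of the AR30 pack these rows implement, assuming libyuv's AR30 layout (2:10:10:10, B in the low 10 bits): each 8-bit channel widens to 10 bits by bit replication, which the SIMD paths approximate with the pmulhuw multipliers; the name is illustrative, not libyuv API.

    #include <stdint.h>
    static uint32_t PackAR30(uint8_t b, uint8_t g, uint8_t r, uint8_t a) {
      uint32_t b10 = ((uint32_t)b << 2) | (b >> 6);  // 8 -> 10 bits
      uint32_t g10 = ((uint32_t)g << 2) | (g >> 6);
      uint32_t r10 = ((uint32_t)r << 2) | (r >> 6);
      return b10 | (g10 << 10) | (r10 << 20) | ((uint32_t)(a >> 6) << 30);
    }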
+static const uvec8 kShuffleARGBToABGR = {2,  1, 0, 3,  6,  5,  4,  7,
+                                         10, 9, 8, 11, 14, 13, 12, 15};
+
+static const uvec8 kShuffleARGBToAB64Lo = {2, 2, 1, 1, 0, 0, 3, 3,
+                                           6, 6, 5, 5, 4, 4, 7, 7};
+static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9,  9,  8,  8,  11, 11,
+                                           14, 14, 13, 13, 12, 12, 15, 15};
+
+void ARGBToAR64Row_SSSE3(const uint8_t* src_argb,
+                         uint16_t* dst_ar64,
+                         int width) {
+  asm volatile(
+
+    LABELALIGN
+      "1: \n"
+      "movdqu (%0),%%xmm0 \n"
+      "movdqa %%xmm0,%%xmm1 \n"
+      "punpcklbw %%xmm0,%%xmm0 \n"
+      "punpckhbw %%xmm1,%%xmm1 \n"
+      "movdqu %%xmm0,(%1) \n"
+      "movdqu %%xmm1,0x10(%1) \n"
+      "lea 0x10(%0),%0 \n"
+      "lea 0x20(%1),%1 \n"
+      "sub $0x4,%2 \n"
+      "jg 1b \n"
+    : "+r"(src_argb),  // %0
+      "+r"(dst_ar64),  // %1
+      "+r"(width)      // %2
+    :
+    : "memory", "cc", "xmm0", "xmm1");
+}
+
+void ARGBToAB64Row_SSSE3(const uint8_t* src_argb,
+                         uint16_t* dst_ab64,
+                         int width) {
+  asm volatile(
+
+      "movdqa %3,%%xmm2 \n"
+      "movdqa %4,%%xmm3 \n" LABELALIGN
+      "1: \n"
+      "movdqu (%0),%%xmm0 \n"
+      "movdqa %%xmm0,%%xmm1 \n"
+      "pshufb %%xmm2,%%xmm0 \n"
+      "pshufb %%xmm3,%%xmm1 \n"
+      "movdqu %%xmm0,(%1) \n"
+      "movdqu %%xmm1,0x10(%1) \n"
+      "lea 0x10(%0),%0 \n"
+      "lea 0x20(%1),%1 \n"
+      "sub $0x4,%2 \n"
+      "jg 1b \n"
+    : "+r"(src_argb),  // %0
+      "+r"(dst_ab64),  // %1
+      "+r"(width)      // %2
+    : "m"(kShuffleARGBToAB64Lo),  // %3
+      "m"(kShuffleARGBToAB64Hi)   // %4
+    : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+
+void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64,
+                         uint8_t* dst_argb,
+                         int width) {
+  asm volatile(
+
+    LABELALIGN
+      "1: \n"
+      "movdqu (%0),%%xmm0 \n"
+      "movdqu 0x10(%0),%%xmm1 \n"
+      "psrlw $8,%%xmm0 \n"
+      "psrlw $8,%%xmm1 \n"
+      "packuswb %%xmm1,%%xmm0 \n"
+      "movdqu %%xmm0,(%1) \n"
+      "lea 0x20(%0),%0 \n"
+      "lea 0x10(%1),%1 \n"
+      "sub $0x4,%2 \n"
+      "jg 1b \n"
+    : "+r"(src_ar64),  // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    :
+    : "memory", "cc", "xmm0", "xmm1");
+}
+
+void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64,
+                         uint8_t* dst_argb,
+                         int width) {
+  asm volatile(
+
+      "movdqa %3,%%xmm2 \n" LABELALIGN
+      "1: \n"
+      "movdqu (%0),%%xmm0 \n"
+      "movdqu 0x10(%0),%%xmm1 \n"
+      "psrlw $8,%%xmm0 \n"
+      "psrlw $8,%%xmm1 \n"
+      "packuswb %%xmm1,%%xmm0 \n"
+      "pshufb %%xmm2,%%xmm0 \n"
+      "movdqu %%xmm0,(%1) \n"
+      "lea 0x20(%0),%0 \n"
+      "lea 0x10(%1),%1 \n"
+      "sub $0x4,%2 \n"
+      "jg 1b \n"
+    : "+r"(src_ab64),  // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    : "m"(kShuffleARGBToABGR)  // %3
+    : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+
+#ifdef HAS_ARGBTOAR64ROW_AVX2
+void ARGBToAR64Row_AVX2(const uint8_t* src_argb,
+                        uint16_t* dst_ar64,
+                        int width) {
+  asm volatile(
+
+    LABELALIGN
+      "1: \n"
+      "vmovdqu (%0),%%ymm0 \n"
+      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+      "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
+      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+      "vmovdqu %%ymm0,(%1) \n"
+      "vmovdqu %%ymm1,0x20(%1) \n"
+      "lea 0x20(%0),%0 \n"
+      "lea 0x40(%1),%1 \n"
+      "sub $0x8,%2 \n"
+      "jg 1b \n"
+    : "+r"(src_argb),  // %0
+      "+r"(dst_ar64),  // %1
+      "+r"(width)      // %2
+    :
+    : "memory", "cc", "xmm0", "xmm1");
+}
+#endif
+
+#ifdef HAS_ARGBTOAB64ROW_AVX2
+void ARGBToAB64Row_AVX2(const uint8_t* src_argb,
+                        uint16_t* dst_ab64,
+                        int width) {
+  asm volatile(
+
+      "vbroadcastf128 %3,%%ymm2 \n"
+      "vbroadcastf128 %4,%%ymm3 \n" LABELALIGN
+      "1: \n"
+      "vmovdqu (%0),%%ymm0 \n"
+      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+      "vpshufb %%ymm3,%%ymm0,%%ymm1 \n"
+      "vpshufb %%ymm2,%%ymm0,%%ymm0 \n"
+      "vmovdqu %%ymm0,(%1) \n"
+      "vmovdqu %%ymm1,0x20(%1) \n"
+      "lea 0x20(%0),%0 \n"
+      "lea 0x40(%1),%1 \n"
+      "sub $0x8,%2 \n"
+      "jg 1b \n"
+    : "+r"(src_argb),  // %0
+      "+r"(dst_ab64),  // %1
+      "+r"(width)      // %2
+    : "m"(kShuffleARGBToAB64Lo),  // %3
+      "m"(kShuffleARGBToAB64Hi)   // %4
+    : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+#endif
+
+#ifdef HAS_AR64TOARGBROW_AVX2
+void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
+                        uint8_t* dst_argb,
+                        int width) {
+  asm volatile(
+
+    LABELALIGN
+      "1: \n"
+      "vmovdqu (%0),%%ymm0 \n"
+      "vmovdqu 0x20(%0),%%ymm1 \n"
+      "vpsrlw $8,%%ymm0,%%ymm0 \n"
+      "vpsrlw $8,%%ymm1,%%ymm1 \n"
+      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+      "vmovdqu %%ymm0,(%1) \n"
+      "lea 0x40(%0),%0 \n"
+      "lea 0x20(%1),%1 \n"
+      "sub $0x8,%2 \n"
+      "jg 1b \n"
+    : "+r"(src_ar64),  // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    :
+    : "memory", "cc", "xmm0", "xmm1");
+}
+#endif
+
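The AR64/AB64 rows convert between 8- and 16-bit channels. Scalar equivalents of the punpcklbw x,x widening and the psrlw $8 + packuswb narrowing, for reference:

    #include <stdint.h>
    static uint16_t Widen8To16(uint8_t v) {
      return (uint16_t)(v * 0x0101);  // replicate the byte: 0xFF -> 0xFFFF
    }
    static uint8_t Narrow16To8(uint16_t v) {
      return (uint8_t)(v >> 8);       // keep the high byte
    }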
+#ifdef HAS_AB64TOARGBROW_AVX2
+void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
+                        uint8_t* dst_argb,
+                        int width) {
+  asm volatile(
+
+      "vbroadcastf128 %3,%%ymm2 \n" LABELALIGN
+      "1: \n"
+      "vmovdqu (%0),%%ymm0 \n"
+      "vmovdqu 0x20(%0),%%ymm1 \n"
+      "vpsrlw $8,%%ymm0,%%ymm0 \n"
+      "vpsrlw $8,%%ymm1,%%ymm1 \n"
+      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+      "vpshufb %%ymm2,%%ymm0,%%ymm0 \n"
+      "vmovdqu %%ymm0,(%1) \n"
+      "lea 0x40(%0),%0 \n"
+      "lea 0x20(%1),%1 \n"
+      "sub $0x8,%2 \n"
+      "jg 1b \n"
+    : "+r"(src_ab64),  // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    : "m"(kShuffleARGBToABGR)  // %3
+    : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif
+
+// clang-format off
+
+// TODO(mraptis): Consider passing R, G, B multipliers as parameter.
+// The round parameter names the register containing the value to add before
+// the shift.
+#define RGBTOY(round) \
+  "1: \n" \
+  "movdqu (%0),%%xmm0 \n" \
+  "movdqu 0x10(%0),%%xmm1 \n" \
+  "movdqu 0x20(%0),%%xmm2 \n" \
+  "movdqu 0x30(%0),%%xmm3 \n" \
+  "psubb %%xmm5,%%xmm0 \n" \
+  "psubb %%xmm5,%%xmm1 \n" \
+  "psubb %%xmm5,%%xmm2 \n" \
+  "psubb %%xmm5,%%xmm3 \n" \
+  "movdqu %%xmm4,%%xmm6 \n" \
+  "pmaddubsw %%xmm0,%%xmm6 \n" \
+  "movdqu %%xmm4,%%xmm0 \n" \
+  "pmaddubsw %%xmm1,%%xmm0 \n" \
+  "movdqu %%xmm4,%%xmm1 \n" \
+  "pmaddubsw %%xmm2,%%xmm1 \n" \
+  "movdqu %%xmm4,%%xmm2 \n" \
+  "pmaddubsw %%xmm3,%%xmm2 \n" \
+  "lea 0x40(%0),%0 \n" \
+  "phaddw %%xmm0,%%xmm6 \n" \
+  "phaddw %%xmm2,%%xmm1 \n" \
+  "prefetcht0 1280(%0) \n" \
+  "paddw %%" #round ",%%xmm6 \n" \
+  "paddw %%" #round ",%%xmm1 \n" \
+  "psrlw $0x8,%%xmm6 \n" \
+  "psrlw $0x8,%%xmm1 \n" \
+  "packuswb %%xmm1,%%xmm6 \n" \
+  "movdqu %%xmm6,(%1) \n" \
+  "lea 0x10(%1),%1 \n" \
+  "sub $0x10,%2 \n" \
+  "jg 1b \n"
+
+#define RGBTOY_AVX2(round) \
+  "1: \n" \
+  "vmovdqu (%0),%%ymm0 \n" \
+  "vmovdqu 0x20(%0),%%ymm1 \n" \
+  "vmovdqu 0x40(%0),%%ymm2 \n" \
+  "vmovdqu 0x60(%0),%%ymm3 \n" \
+  "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \
+  "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \
+  "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \
+  "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \
+  "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
+  "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
+  "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
+  "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
+  "lea 0x80(%0),%0 \n" \
+  "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
+  "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
+  "prefetcht0 1280(%0) \n" \
+  "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
+  "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
+  "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
+  "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
+  "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
+  "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
+  "vmovdqu %%ymm0,(%1) \n" \
+  "lea 0x20(%1),%1 \n" \
+  "sub $0x20,%2 \n" \
+  "jg 1b \n" \
+  "vzeroupper \n"
+
+// clang-format on
+
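The RGBTOY rewrite folds the +16 offset and the rounding term into one add. pmaddubsw multiplies an unsigned operand by a signed one, so 128 is first subtracted from every pixel byte (kSub128) to make the data signed, and kAddY16 = 0x7e80 restores the bias afterwards. A scalar check of one pixel, assuming the BT.601 coefficients {25, 129, 66} (kARGBToY); the function name is illustrative:

    #include <stdint.h>
    static uint8_t RGBToYStep(uint8_t b, uint8_t g, uint8_t r) {
      // 0x7e80 = 128 * (25 + 129 + 66) + (16 << 8) + 0x80:
      // the restored bias, the +16 luma offset, and round-to-nearest.
      int y = 25 * (b - 128) + 129 * (g - 128) + 66 * (r - 128);
      return (uint8_t)((y + 0x7e80) >> 8);
    }

Spot check: 255,255,255 -> 235 and 0,0,0 -> 16, matching the pre-rewrite rows.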
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + "movdqa %5,%%xmm7 \n" + + LABELALIGN RGBTOY(xmm7) : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "m"(kARGBToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); + "m"(kSub128), // %4 + "m"(kAddY16) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBTOYROW_SSSE3 #ifdef HAS_ARGBTOYJROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. -// Same as ARGBToYRow but different coefficients, no add 16, but do rounding. +// Same as ARGBToYRow but different coefficients, no add 16. void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN RGBTOY(xmm5) : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); + "m"(kSub128) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBTOYJROW_SSSE3 -#ifdef HAS_ARGBTOYROW_AVX2 +#ifdef HAS_RGBATOYJROW_SSSE3 +// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. +// Same as ARGBToYRow but different coefficients, no add 16. +void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN RGBTOY(xmm5) + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kRGBAToYJ), // %3 + "m"(kSub128) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_RGBATOYJROW_SSSE3 + +#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ARGBEXTRACTALPHAROW_AVX2) // vpermd for vphaddw + vpackuswb vpermd. static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; +#endif + +#ifdef HAS_ARGBTOYROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. - "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" - "vpsrlw $0x7,%%ymm0,%%ymm0 \n" - "vpsrlw $0x7,%%ymm2,%%ymm2 \n" - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. - "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "vbroadcastf128 %5,%%ymm7 \n" + "vmovdqu %6,%%ymm6 \n" + + LABELALIGN RGBTOY_AVX2(ymm7) : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "m"(kARGBToY), // %3 - "m"(kAddY16), // %4 - "m"(kPermdARGBToY_AVX) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + "m"(kSub128), // %4 + "m"(kAddY16), // %5 + "m"(kPermdARGBToY_AVX) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBTOYROW_AVX2 +#ifdef HAS_ABGRTOYROW_AVX2 +// Convert 32 ABGR pixels (128 bytes) to 32 Y values. +void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vbroadcastf128 %5,%%ymm7 \n" + "vmovdqu %6,%%ymm6 \n" + + LABELALIGN RGBTOY_AVX2(ymm7) + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToY), // %3 + "m"(kSub128), // %4 + "m"(kAddY16), // %5 + "m"(kPermdARGBToY_AVX) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ABGRTOYROW_AVX2 + #ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. - "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding. - "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" - "vpsrlw $0x7,%%ymm0,%%ymm0 \n" - "vpsrlw $0x7,%%ymm2,%%ymm2 \n" - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. - "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "vmovdqu %5,%%ymm6 \n" + + LABELALIGN RGBTOY_AVX2(ymm5) : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64), // %4 + "m"(kSub128), // %4 "m"(kPermdARGBToY_AVX) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBTOYJROW_AVX2 +#ifdef HAS_RGBATOYJROW_AVX2 +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
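The ARGBToUVRow/ABGRToUVRow kernels that follow first average each 2x2 block (pavgb against the second row, then shufps/pavgb across columns) and only then apply the chroma matrix. A hedged scalar model of the per-sample math, using libyuv's BT.601 constants with illustrative helper names; note the vector path truncates with psraw $8 and re-biases with paddb, so it can differ from this rounded form by one:

#include <stdint.h>

// Scalar sketch; r/g/b here are the 2x2-averaged subsampled values.
static uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) {
  // 0x8080 = (128 << 8) + 128: centers U on 128 and rounds.
  return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
}

static uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}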
+void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" + + LABELALIGN RGBTOY_AVX2( + ymm5) "vzeroupper \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kRGBAToYJ), // %3 + "m"(kSub128), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_RGBATOYJROW_AVX2 + #ifdef HAS_ARGBTOUVROW_SSSE3 -void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, +void ARGBToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1266,7 +1576,7 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, static const lvec8 kShufARGBToUV_AVX = { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; -void ARGBToUVRow_AVX2(const uint8_t* src_argb0, +void ARGBToUVRow_AVX2(const uint8_t* src_argb, int 
src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1275,46 +1585,46 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0, "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm0,(%1) \n" "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1328,8 +1638,71 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0, } #endif // HAS_ARGBTOUVROW_AVX2 +#ifdef HAS_ABGRTOUVROW_AVX2 +void ABGRToUVRow_AVX2(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw 
%%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_abgr), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kAddUV128), // %5 + "m"(kABGRToV), // %6 + "m"(kABGRToU), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ABGRTOUVROW_AVX2 + #ifdef HAS_ARGBTOUVJROW_AVX2 -void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, +void ARGBToUVJRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1338,52 +1711,52 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm0,(%1) \n" "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea 0x10(%1),%1 
\n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kAddUVJ128), // %5 + "m"(kSub128), // %5 "m"(kARGBToVJ), // %6 "m"(kARGBToUJ), // %7 "m"(kShufARGBToUV_AVX) // %8 @@ -1393,67 +1766,67 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, #endif // HAS_ARGBTOUVJROW_AVX2 #ifdef HAS_ARGBTOUVJROW_SSSE3 -void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, +void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 : "r"((intptr_t)(src_stride_argb)), // %4 "m"(kARGBToVJ), // %5 "m"(kARGBToUJ), // %6 - "m"(kAddUVJ128) // %7 + "m"(kSub128) // %7 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } #endif // HAS_ARGBTOUVJROW_SSSE3 @@ -1464,47 +1837,47 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_v, int width) { asm 
volatile( - "movdqa %4,%%xmm3 \n" - "movdqa %5,%%xmm4 \n" - "movdqa %6,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "lea 0x40(%0),%0 \n" - "movdqu %%xmm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqa %4,%%xmm3 \n" + "movdqa %5,%%xmm4 \n" + "movdqa %6,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "lea 0x40(%0),%0 \n" + "movdqu %%xmm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1518,91 +1891,74 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { asm volatile( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + "movdqa %5,%%xmm7 \n" + + LABELALIGN RGBTOY(xmm7) : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "m"(kBGRAToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); + "m"(kSub128), // %4 + "m"(kAddY16) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } -void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, +void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( - "movdqa %5,%%xmm3 \n" - 
"movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_bgra0), // %0 + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_bgra), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1615,125 +1971,91 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { asm volatile( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + "movdqa %5,%%xmm7 \n" + + LABELALIGN RGBTOY(xmm7) : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "m"(kABGRToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); + "m"(kSub128), // %4 + 
"m"(kAddY16) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + "movdqa %5,%%xmm7 \n" + + LABELALIGN RGBTOY(xmm7) : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "m"(kRGBAToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); + "m"(kSub128), // %4 + "m"(kAddY16) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } -void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, +void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_abgr0), // %0 + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 
\n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_abgr), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1744,59 +2066,59 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } -void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, +void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_rgba0), // %0 + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_rgba), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1811,21 +2133,21 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, // Read 8 UV from 444 #define READYUV444 \ - "movq (%[u_buf]),%%xmm0 \n" \ + "movq (%[u_buf]),%%xmm3 \n" \ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklbw %%xmm1,%%xmm3 \n" \ "movq (%[y_buf]),%%xmm4 \n" \ "punpcklbw 
%%xmm4,%%xmm4 \n" \ "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV #define READYUV422 \ - "movd (%[u_buf]),%%xmm0 \n" \ + "movd (%[u_buf]),%%xmm3 \n" \ "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x4(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ + "punpcklbw %%xmm1,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ "movq (%[y_buf]),%%xmm4 \n" \ "punpcklbw %%xmm4,%%xmm4 \n" \ "lea 0x8(%[y_buf]),%[y_buf] \n" @@ -1835,24 +2157,99 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, // TODO(fbarchard): Consider pmulhuw to replace psraw // TODO(fbarchard): Consider pmullw to replace psllw and allow different bits. #define READYUV210 \ - "movq (%[u_buf]),%%xmm0 \n" \ + "movq (%[u_buf]),%%xmm3 \n" \ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "punpcklwd %%xmm1,%%xmm0 \n" \ - "psraw $0x2,%%xmm0 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ + "punpcklwd %%xmm1,%%xmm3 \n" \ + "psraw $2,%%xmm3 \n" \ + "packuswb %%xmm3,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $6,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +#define READYUVA210 \ + "movq (%[u_buf]),%%xmm3 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklwd %%xmm1,%%xmm3 \n" \ + "psraw $2,%%xmm3 \n" \ + "packuswb %%xmm3,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $6,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" \ + "movdqu (%[a_buf]),%%xmm5 \n" \ + "psraw $2,%%xmm5 \n" \ + "packuswb %%xmm5,%%xmm5 \n" \ + "lea 0x10(%[a_buf]),%[a_buf] \n" + +// Read 8 UV from 444 10 bit +#define READYUV410 \ + "movdqu (%[u_buf]),%%xmm3 \n" \ + "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "psraw $2,%%xmm3 \n" \ + "psraw $2,%%xmm2 \n" \ + "movdqa %%xmm3,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm3 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "packuswb %%xmm1,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $6,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 444 10 bit. With 8 Alpha. +#define READYUVA410 \ + "movdqu (%[u_buf]),%%xmm3 \n" \ + "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "psraw $2,%%xmm3 \n" \ + "psraw $2,%%xmm2 \n" \ + "movdqa %%xmm3,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm3 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "packuswb %%xmm1,%%xmm3 \n" \ "movdqu (%[y_buf]),%%xmm4 \n" \ "psllw $0x6,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" \ + "movdqu (%[a_buf]),%%xmm5 \n" \ + "psraw $2,%%xmm5 \n" \ + "packuswb %%xmm5,%%xmm5 \n" \ + "lea 0x10(%[a_buf]),%[a_buf] \n" + +// Read 4 UV from 422 12 bit, upsample to 8 UV +#define READYUV212 \ + "movq (%[u_buf]),%%xmm3 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklwd %%xmm1,%%xmm3 \n" \ + "psraw $0x4,%%xmm3 \n" \ + "packuswb %%xmm3,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $0x4,%%xmm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. 
#define READYUVA422 \ - "movd (%[u_buf]),%%xmm0 \n" \ + "movd (%[u_buf]),%%xmm3 \n" \ "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x4(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ + "punpcklbw %%xmm1,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" \ + "movq (%[a_buf]),%%xmm5 \n" \ + "lea 0x8(%[a_buf]),%[a_buf] \n" + +// Read 8 UV from 444. With 8 Alpha. +#define READYUVA444 \ + "movq (%[u_buf]),%%xmm3 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm3 \n" \ "movq (%[y_buf]),%%xmm4 \n" \ "punpcklbw %%xmm4,%%xmm4 \n" \ "lea 0x8(%[y_buf]),%[y_buf] \n" \ @@ -1861,18 +2258,18 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, // Read 4 UV from NV12, upsample to 8 UV #define READNV12 \ - "movq (%[uv_buf]),%%xmm0 \n" \ + "movq (%[uv_buf]),%%xmm3 \n" \ "lea 0x8(%[uv_buf]),%[uv_buf] \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ "movq (%[y_buf]),%%xmm4 \n" \ "punpcklbw %%xmm4,%%xmm4 \n" \ "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 VU from NV21, upsample to 8 UV #define READNV21 \ - "movq (%[vu_buf]),%%xmm0 \n" \ + "movq (%[vu_buf]),%%xmm3 \n" \ "lea 0x8(%[vu_buf]),%[vu_buf] \n" \ - "pshufb %[kShuffleNV21], %%xmm0 \n" \ + "pshufb %[kShuffleNV21], %%xmm3 \n" \ "movq (%[y_buf]),%%xmm4 \n" \ "punpcklbw %%xmm4,%%xmm4 \n" \ "lea 0x8(%[y_buf]),%[y_buf] \n" @@ -1881,68 +2278,92 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, #define READYUY2 \ "movdqu (%[yuy2_buf]),%%xmm4 \n" \ "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ - "movdqu (%[yuy2_buf]),%%xmm0 \n" \ - "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ + "movdqu (%[yuy2_buf]),%%xmm3 \n" \ + "pshufb %[kShuffleYUY2UV], %%xmm3 \n" \ "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n" // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. 
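The 10- and 12-bit READ macros above (READYUV210, READYUVA210, READYUV410, READYUV212 and friends) normalize every plane before the shared conversion: UV words are arithmetic-shifted down to 8 significant bits and repacked with packuswb, while Y words are shifted up so the later pmulhuw against the luma gain sees the same 16-bit fixed-point layout the 8-bit path builds with punpcklbw. A scalar sketch of just that bit-depth handling, assuming lower-aligned in-range samples as in I210/I212 (names illustrative):

#include <stdint.h>

static uint8_t Uv10To8(uint16_t uv10) {
  return (uint8_t)(uv10 >> 2);   // psraw $2 + packuswb: top 8 of 10 bits.
}
static uint16_t Y10To16(uint16_t y10) {
  return (uint16_t)(y10 << 6);   // psllw $6: 10-bit Y -> 16-bit fixed point.
}
static uint8_t Uv12To8(uint16_t uv12) {
  return (uint8_t)(uv12 >> 4);   // READYUV212: psraw $4.
}
static uint16_t Y12To16(uint16_t y12) {
  return (uint16_t)(y12 << 4);   // READYUV212: psllw $4.
}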
#define READUYVY \ "movdqu (%[uyvy_buf]),%%xmm4 \n" \ "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ - "movdqu (%[uyvy_buf]),%%xmm0 \n" \ - "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ + "movdqu (%[uyvy_buf]),%%xmm3 \n" \ + "pshufb %[kShuffleUYVYUV], %%xmm3 \n" \ "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n" +// Read 4 UV from P210, upsample to 8 UV +#define READP210 \ + "movdqu (%[uv_buf]),%%xmm3 \n" \ + "lea 0x10(%[uv_buf]),%[uv_buf] \n" \ + "psrlw $0x8,%%xmm3 \n" \ + "packuswb %%xmm3,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from P410 +#define READP410 \ + "movdqu (%[uv_buf]),%%xmm3 \n" \ + "movdqu 0x10(%[uv_buf]),%%xmm1 \n" \ + "lea 0x20(%[uv_buf]),%[uv_buf] \n" \ + "psrlw $0x8,%%xmm3 \n" \ + "psrlw $0x8,%%xmm1 \n" \ + "packuswb %%xmm1,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + #if defined(__x86_64__) #define YUVTORGB_SETUP(yuvconstants) \ + "pcmpeqb %%xmm13,%%xmm13 \n" \ "movdqa (%[yuvconstants]),%%xmm8 \n" \ + "pxor %%xmm12,%%xmm12 \n" \ "movdqa 32(%[yuvconstants]),%%xmm9 \n" \ + "psllw $7,%%xmm13 \n" \ "movdqa 64(%[yuvconstants]),%%xmm10 \n" \ + "pshufb %%xmm12,%%xmm13 \n" \ "movdqa 96(%[yuvconstants]),%%xmm11 \n" \ - "movdqa 128(%[yuvconstants]),%%xmm12 \n" \ - "movdqa 160(%[yuvconstants]),%%xmm13 \n" \ - "movdqa 192(%[yuvconstants]),%%xmm14 \n" + "movdqa 128(%[yuvconstants]),%%xmm12 \n" + // Convert 8 pixels: 8 UV and 8 Y #define YUVTORGB16(yuvconstants) \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm3 \n" \ - "movdqa %%xmm11,%%xmm0 \n" \ - "pmaddubsw %%xmm8,%%xmm1 \n" \ - "psubw %%xmm1,%%xmm0 \n" \ - "movdqa %%xmm12,%%xmm1 \n" \ - "pmaddubsw %%xmm9,%%xmm2 \n" \ - "psubw %%xmm2,%%xmm1 \n" \ - "movdqa %%xmm13,%%xmm2 \n" \ - "pmaddubsw %%xmm10,%%xmm3 \n" \ - "psubw %%xmm3,%%xmm2 \n" \ - "pmulhuw %%xmm14,%%xmm4 \n" \ + "psubb %%xmm13,%%xmm3 \n" \ + "pmulhuw %%xmm11,%%xmm4 \n" \ + "movdqa %%xmm8,%%xmm0 \n" \ + "movdqa %%xmm9,%%xmm1 \n" \ + "movdqa %%xmm10,%%xmm2 \n" \ + "paddw %%xmm12,%%xmm4 \n" \ + "pmaddubsw %%xmm3,%%xmm0 \n" \ + "pmaddubsw %%xmm3,%%xmm1 \n" \ + "pmaddubsw %%xmm3,%%xmm2 \n" \ "paddsw %%xmm4,%%xmm0 \n" \ - "paddsw %%xmm4,%%xmm1 \n" \ - "paddsw %%xmm4,%%xmm2 \n" -#define YUVTORGB_REGS \ - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + "paddsw %%xmm4,%%xmm2 \n" \ + "psubsw %%xmm1,%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm1 \n" + +#define YUVTORGB_REGS "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", #else #define YUVTORGB_SETUP(yuvconstants) // Convert 8 pixels: 8 UV and 8 Y #define YUVTORGB16(yuvconstants) \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm3 \n" \ - "movdqa 96(%[yuvconstants]),%%xmm0 \n" \ - "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \ - "psubw %%xmm1,%%xmm0 \n" \ - "movdqa 128(%[yuvconstants]),%%xmm1 \n" \ - "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \ - "psubw %%xmm2,%%xmm1 \n" \ - "movdqa 160(%[yuvconstants]),%%xmm2 \n" \ - "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \ - "psubw %%xmm3,%%xmm2 \n" \ - "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \ + "pcmpeqb %%xmm0,%%xmm0 \n" \ + "pxor %%xmm1,%%xmm1 \n" \ + "psllw $7,%%xmm0 \n" \ + "pshufb %%xmm1,%%xmm0 \n" \ + "psubb %%xmm0,%%xmm3 \n" \ + "pmulhuw 96(%[yuvconstants]),%%xmm4 \n" \ + "movdqa (%[yuvconstants]),%%xmm0 \n" \ + "movdqa 32(%[yuvconstants]),%%xmm1 \n" \ + "movdqa 64(%[yuvconstants]),%%xmm2 \n" \ + "pmaddubsw %%xmm3,%%xmm0 \n" \ + "pmaddubsw %%xmm3,%%xmm1 \n" \ + "pmaddubsw %%xmm3,%%xmm2 \n" \ + "movdqa 
128(%[yuvconstants]),%%xmm3 \n" \ + "paddw %%xmm3,%%xmm4 \n" \ "paddsw %%xmm4,%%xmm0 \n" \ - "paddsw %%xmm4,%%xmm1 \n" \ - "paddsw %%xmm4,%%xmm2 \n" + "paddsw %%xmm4,%%xmm2 \n" \ + "psubsw %%xmm1,%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm1 \n" + #define YUVTORGB_REGS #endif @@ -2012,16 +2433,16 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV444 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2033,6 +2454,44 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf, ); } +#ifdef HAS_I444ALPHATOARGBROW_SSSE3 +void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUVA444 + YUVTORGB(yuvconstants) + STOREARGB + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [a_buf]"+r"(a_buf), // %[a_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_I444ALPHATOARGBROW_SSSE3 + void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2041,27 +2500,27 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) - "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" - "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" - "sub %[u_buf],%[v_buf] \n" + "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" + "sub %[u_buf],%[v_buf] \n" LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB(yuvconstants) - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "movq %%xmm0,(%[dst_rgb24]) \n" - "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" - "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" - "subl $0x8,%[width] \n" - "jg 1b \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "movq %%xmm0,(%[dst_rgb24]) \n" + "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" + "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" + "subl $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2087,16 +2546,16 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV422 
YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2116,21 +2575,21 @@ void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min - "psrlw $6,%%xmm7 \n" // 1023 for max + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $6,%%xmm7 \n" // 1023 for max LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB16(yuvconstants) STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2151,16 +2610,46 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV210 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +// 12 bit YUV to ARGB +void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV212 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2181,21 +2670,21 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min - "psrlw $6,%%xmm7 \n" // 1023 for max + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $6,%%xmm7 \n" // 1023 for max LABELALIGN - "1: \n" + "1: \n" READYUV210 YUVTORGB16(yuvconstants) STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2207,6 +2696,176 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, ); } +// 12 bit YUV to AR30 +void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" 
+ "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READYUV212 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +// 10 bit YUV to ARGB +void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV410 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +#ifdef HAS_I210ALPHATOARGBROW_SSSE3 +// 10 bit YUVA to ARGB +void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP( + yuvconstants) "sub %[u_buf],%[v_buf] \n" + + LABELALIGN "1: \n" READYUVA210 + YUVTORGB(yuvconstants) STOREARGB + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), + [dst_argb] "+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width] "+m"(width) // %[width] +#else + [width] "+rm"(width) // %[width] +#endif + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); +} +#endif + +#ifdef HAS_I410ALPHATOARGBROW_SSSE3 +// 10 bit YUVA to ARGB +void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUVA410 + YUVTORGB(yuvconstants) + STOREARGB + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), + [dst_argb] "+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width] "+m"(width) // %[width] +#else + [width] "+rm"(width) // %[width] +#endif + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); + // clang-format on +} +#endif + +// 10 bit YUV to AR30 +void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb 
%%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READYUV410 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + #ifdef HAS_I422ALPHATOARGBROW_SSSE3 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, @@ -2218,15 +2877,15 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" + "sub %[u_buf],%[v_buf] \n" LABELALIGN - "1: \n" + "1: \n" READYUVA422 YUVTORGB(yuvconstants) STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" + "subl $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2253,15 +2912,15 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf, // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READNV12 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] @@ -2281,15 +2940,15 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READNV21 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [vu_buf]"+r"(vu_buf), // %[vu_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] @@ -2309,15 +2968,15 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUY2 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] @@ -2337,15 +2996,15 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READUYVY YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] @@ -2358,6 +3017,112 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, // clang-format on } +void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP( + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN "1: \n" READP210 + YUVTORGB(yuvconstants) STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // 
%[y_buf] + [uv_buf] "+r"(uv_buf), // %[u_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); +} + +void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP( + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN "1: \n" READP410 + YUVTORGB(yuvconstants) STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [uv_buf] "+r"(uv_buf), // %[u_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); +} + +void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READP210 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READP410 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2366,16 +3131,16 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB(yuvconstants) STORERGBA - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2391,12 +3156,12 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, // Read 16 UV from 444 #define READYUV444_AVX2 \ - "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu (%[u_buf]),%%xmm3 \n" \ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ "vpermq 
$0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%xmm4 \n" \ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ @@ -2404,42 +3169,139 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, // Read 8 UV from 422, upsample to 16 UV. #define READYUV422_AVX2 \ - "vmovq (%[u_buf]),%%xmm0 \n" \ + "vmovq (%[u_buf]),%%xmm3 \n" \ "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%xmm4 \n" \ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" -// Read 8 UV from 210 10 bit, upsample to 16 UV +#define READYUV422_AVX512BW \ + "vmovdqu (%[u_buf]),%%xmm3 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "vpermq %%zmm3,%%zmm16,%%zmm3 \n" \ + "vpermq %%zmm1,%%zmm16,%%zmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpunpcklbw %%zmm1,%%zmm3,%%zmm3 \n" \ + "vpermq $0xd8,%%zmm3,%%zmm3 \n" \ + "vpunpcklwd %%zmm3,%%zmm3,%%zmm3 \n" \ + "vmovdqu8 (%[y_buf]),%%ymm4 \n" \ + "vpermq %%zmm4,%%zmm17,%%zmm4 \n" \ + "vpermq $0xd8,%%zmm4,%%zmm4 \n" \ + "vpunpcklbw %%zmm4,%%zmm4,%%zmm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 210, upsample to 16 UV // TODO(fbarchard): Consider vshufb to replace pack/unpack // TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1. #define READYUV210_AVX2 \ - "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu (%[u_buf]),%%xmm3 \n" \ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpsraw $0x2,%%ymm0,%%ymm0 \n" \ - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpsraw $2,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $0x6,%%ymm4,%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm4 \n" \ "lea 0x20(%[y_buf]),%[y_buf] \n" +// Read 8 UV from 210, upsample to 16 UV. With 16 Alpha. 
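+// The 10-bit alpha plane is reduced to 8 bits below: vpsraw $2 drops the two
+// low bits and vpackuswb narrows the result to the bytes STOREARGB_AVX2 uses.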
+#define READYUVA210_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm3 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpsraw $2,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" \ + "vmovdqu (%[a_buf]),%%ymm5 \n" \ + "vpsraw $2,%%ymm5,%%ymm5 \n" \ + "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \ + "lea 0x20(%[a_buf]),%[a_buf] \n" + +// Read 16 UV from 410 +#define READYUV410_AVX2 \ + "vmovdqu (%[u_buf]),%%ymm3 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \ + "lea 0x20(%[u_buf]),%[u_buf] \n" \ + "vpsraw $2,%%ymm3,%%ymm3 \n" \ + "vpsraw $2,%%ymm2,%%ymm2 \n" \ + "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \ + "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 212 12 bit, upsample to 16 UV +#define READYUV212_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm3 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpsraw $0x4,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $0x4,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + +// Read 16 UV from 410. With 16 Alpha. +#define READYUVA410_AVX2 \ + "vmovdqu (%[u_buf]),%%ymm3 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \ + "lea 0x20(%[u_buf]),%[u_buf] \n" \ + "vpsraw $2,%%ymm3,%%ymm3 \n" \ + "vpsraw $2,%%ymm2,%%ymm2 \n" \ + "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \ + "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" \ + "vmovdqu (%[a_buf]),%%ymm5 \n" \ + "vpsraw $2,%%ymm5,%%ymm5 \n" \ + "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \ + "lea 0x20(%[a_buf]),%[a_buf] \n" + +// Read 16 UV from 444. With 16 Alpha. +#define READYUVA444_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm3 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" \ + "vmovdqu (%[a_buf]),%%xmm5 \n" \ + "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ + "lea 0x10(%[a_buf]),%[a_buf] \n" + // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. #define READYUVA422_AVX2 \ - "vmovq (%[u_buf]),%%xmm0 \n" \ + "vmovq (%[u_buf]),%%xmm3 \n" \ "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%xmm4 \n" \ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ @@ -2450,10 +3312,10 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, // Read 8 UV from NV12, upsample to 16 UV. 
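+// NV12 UV is byte-interleaved U,V; vpunpcklwd duplicates each 16-bit UV pair
+// so the 8 pairs read become 16, one per Y sample.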
 #define READNV12_AVX2 \
-  "vmovdqu    (%[uv_buf]),%%xmm0 \n" \
+  "vmovdqu    (%[uv_buf]),%%xmm3 \n" \
   "lea        0x10(%[uv_buf]),%[uv_buf] \n" \
-  "vpermq     $0xd8,%%ymm0,%%ymm0 \n" \
-  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
+  "vpermq     $0xd8,%%ymm3,%%ymm3 \n" \
+  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
   "vmovdqu    (%[y_buf]),%%xmm4 \n" \
   "vpermq     $0xd8,%%ymm4,%%ymm4 \n" \
   "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
@@ -2461,73 +3323,130 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
 // Read 8 VU from NV21, upsample to 16 UV.
 #define READNV21_AVX2 \
-  "vmovdqu    (%[vu_buf]),%%xmm0 \n" \
+  "vmovdqu    (%[vu_buf]),%%xmm3 \n" \
   "lea        0x10(%[vu_buf]),%[vu_buf] \n" \
-  "vpermq     $0xd8,%%ymm0,%%ymm0 \n" \
-  "vpshufb    %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
+  "vpermq     $0xd8,%%ymm3,%%ymm3 \n" \
+  "vpshufb    %[kShuffleNV21], %%ymm3, %%ymm3 \n" \
   "vmovdqu    (%[y_buf]),%%xmm4 \n" \
   "vpermq     $0xd8,%%ymm4,%%ymm4 \n" \
   "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
   "lea        0x10(%[y_buf]),%[y_buf] \n"
 
+// Read 8 UV from P210, upsample to 16 UV
+#define READP210_AVX2 \
+  "vmovdqu    (%[uv_buf]),%%ymm3 \n" \
+  "lea        0x20(%[uv_buf]),%[uv_buf] \n" \
+  "vpsrlw     $0x8,%%ymm3,%%ymm3 \n" \
+  "vpackuswb  %%ymm3,%%ymm3,%%ymm3 \n" \
+  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
+  "vmovdqu    (%[y_buf]),%%ymm4 \n" \
+  "lea        0x20(%[y_buf]),%[y_buf] \n"
+
+// Read 16 UV from P410
+#define READP410_AVX2 \
+  "vmovdqu    (%[uv_buf]),%%ymm3 \n" \
+  "vmovdqu    0x20(%[uv_buf]),%%ymm1 \n" \
+  "lea        0x40(%[uv_buf]),%[uv_buf] \n" \
+  "vpsrlw     $0x8,%%ymm3,%%ymm3 \n" \
+  "vpsrlw     $0x8,%%ymm1,%%ymm1 \n" \
+  "vpackuswb  %%ymm1,%%ymm3,%%ymm3 \n" \
+  "vpermq     $0xd8,%%ymm3,%%ymm3 \n" \
+  "vmovdqu    (%[y_buf]),%%ymm4 \n" \
+  "lea        0x20(%[y_buf]),%[y_buf] \n"
+
 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
 #define READYUY2_AVX2 \
   "vmovdqu    (%[yuy2_buf]),%%ymm4 \n" \
   "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
-  "vmovdqu    (%[yuy2_buf]),%%ymm0 \n" \
-  "vpshufb    %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
+  "vmovdqu    (%[yuy2_buf]),%%ymm3 \n" \
+  "vpshufb    %[kShuffleYUY2UV], %%ymm3, %%ymm3 \n" \
   "lea        0x20(%[yuy2_buf]),%[yuy2_buf] \n"
 
 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
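+// UYVY byte order is U0,Y0,V0,Y1; the same 32 bytes are loaded twice and
+// shuffled into 16 Y and 16 interleaved UV by the two pshufb tables.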
#define READUYVY_AVX2 \ "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \ "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ - "vmovdqu (%[uyvy_buf]),%%ymm0 \n" \ - "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ + "vmovdqu (%[uyvy_buf]),%%ymm3 \n" \ + "vpshufb %[kShuffleUYVYUV], %%ymm3, %%ymm3 \n" \ "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n" +// TODO(fbarchard): Remove broadcastb #if defined(__x86_64__) -#define YUVTORGB_SETUP_AVX2(yuvconstants) \ - "vmovdqa (%[yuvconstants]),%%ymm8 \n" \ - "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \ - "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \ - "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \ - "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \ - "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \ - "vmovdqa 192(%[yuvconstants]),%%ymm14 \n" +#define YUVTORGB_SETUP_AVX2(yuvconstants) \ + "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \ + "vmovdqa (%[yuvconstants]),%%ymm8 \n" \ + "vpsllw $7,%%xmm13,%%xmm13 \n" \ + "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \ + "vpbroadcastb %%xmm13,%%ymm13 \n" \ + "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \ + "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \ + "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" + +#define YUVTORGB_SETUP_AVX512BW(yuvconstants) \ + "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \ + "movdqa (%[yuvconstants]),%%xmm8 \n" \ + "vpbroadcastq %%xmm8, %%zmm8 \n" \ + "vpsllw $7,%%xmm13,%%xmm13 \n" \ + "vpbroadcastb %%xmm13,%%zmm13 \n" \ + "movq 32(%[yuvconstants]),%%xmm9 \n" \ + "vpbroadcastq %%xmm9,%%zmm9 \n" \ + "movq 64(%[yuvconstants]),%%xmm10 \n" \ + "vpbroadcastq %%xmm10,%%zmm10 \n" \ + "movq 96(%[yuvconstants]),%%xmm11 \n" \ + "vpbroadcastq %%xmm11,%%zmm11 \n" \ + "movq 128(%[yuvconstants]),%%xmm12 \n" \ + "vpbroadcastq %%xmm12,%%zmm12 \n" \ + "vmovdqu8 (%[quadsplitperm]),%%zmm16 \n" \ + "vmovdqu8 (%[dquadsplitperm]),%%zmm17 \n" \ + "vmovdqu8 (%[unperm]),%%zmm18 \n" #define YUVTORGB16_AVX2(yuvconstants) \ - "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ - "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \ - "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \ - "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \ - "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \ - "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \ - "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \ + "vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \ + "vpmulhuw %%ymm11,%%ymm4,%%ymm4 \n" \ + "vpmaddubsw %%ymm3,%%ymm8,%%ymm0 \n" \ + "vpmaddubsw %%ymm3,%%ymm9,%%ymm1 \n" \ + "vpmaddubsw %%ymm3,%%ymm10,%%ymm2 \n" \ + "vpaddw %%ymm4,%%ymm12,%%ymm4 \n" \ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ - "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ + "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \ "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" -#define YUVTORGB_REGS_AVX2 \ - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", +#define YUVTORGB16_AVX512BW(yuvconstants) \ + "vpsubb %%zmm13,%%zmm3,%%zmm3 \n" \ + "vpmulhuw %%zmm11,%%zmm4,%%zmm4 \n" \ + "vpmaddubsw %%zmm3,%%zmm8,%%zmm0 \n" \ + "vpmaddubsw %%zmm3,%%zmm9,%%zmm1 \n" \ + "vpmaddubsw %%zmm3,%%zmm10,%%zmm2 \n" \ + "vpaddw %%zmm4,%%zmm12,%%zmm4 \n" \ + "vpaddsw %%zmm4,%%zmm0,%%zmm0 \n" \ + "vpsubsw %%zmm1,%%zmm4,%%zmm1 \n" \ + "vpaddsw %%zmm4,%%zmm2,%%zmm2 \n" + +#define YUVTORGB_REGS_AVX2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", +#define YUVTORGB_REGS_AVX512BW \ + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm16", "xmm17", "xmm18", #else // Convert 16 pixels: 16 UV and 16 Y. 
#define YUVTORGB_SETUP_AVX2(yuvconstants) #define YUVTORGB16_AVX2(yuvconstants) \ - "vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \ - "vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \ - "vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \ - "vmovdqu 160(%[yuvconstants]),%%ymm3 \n" \ - "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ - "vmovdqu 128(%[yuvconstants]),%%ymm3 \n" \ - "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ - "vmovdqu 96(%[yuvconstants]),%%ymm3 \n" \ - "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ - "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \ + "vpcmpeqb %%xmm0,%%xmm0,%%xmm0 \n" \ + "vpsllw $7,%%xmm0,%%xmm0 \n" \ + "vpbroadcastb %%xmm0,%%ymm0 \n" \ + "vpsubb %%ymm0,%%ymm3,%%ymm3 \n" \ + "vpmulhuw 96(%[yuvconstants]),%%ymm4,%%ymm4 \n" \ + "vmovdqa (%[yuvconstants]),%%ymm0 \n" \ + "vmovdqa 32(%[yuvconstants]),%%ymm1 \n" \ + "vmovdqa 64(%[yuvconstants]),%%ymm2 \n" \ + "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" \ + "vpmaddubsw %%ymm3,%%ymm1,%%ymm1 \n" \ + "vpmaddubsw %%ymm3,%%ymm2,%%ymm2 \n" \ + "vmovdqa 128(%[yuvconstants]),%%ymm3 \n" \ + "vpaddw %%ymm4,%%ymm3,%%ymm4 \n" \ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ - "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ + "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \ "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" + #define YUVTORGB_REGS_AVX2 #endif @@ -2540,6 +3459,15 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" +#define YUVTORGB_AVX512BW(yuvconstants) \ + YUVTORGB16_AVX512BW(yuvconstants) \ + "vpsraw $0x6,%%zmm0,%%zmm0 \n" \ + "vpsraw $0x6,%%zmm1,%%zmm1 \n" \ + "vpsraw $0x6,%%zmm2,%%zmm2 \n" \ + "vpackuswb %%zmm0,%%zmm0,%%zmm0 \n" \ + "vpackuswb %%zmm1,%%zmm1,%%zmm1 \n" \ + "vpackuswb %%zmm2,%%zmm2,%%zmm2 \n" + // Store 16 ARGB values. #define STOREARGB_AVX2 \ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ @@ -2550,7 +3478,19 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ "vmovdqu %%ymm1,(%[dst_argb]) \n" \ "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \ - "lea 0x40(%[dst_argb]), %[dst_argb] \n" + "lea 0x40(%[dst_argb]), %[dst_argb] \n" + +// Store 32 ARGB values. +#define STOREARGB_AVX512BW \ + "vpunpcklbw %%zmm1,%%zmm0,%%zmm0 \n" \ + "vpermq %%zmm0,%%zmm18,%%zmm0 \n" \ + "vpunpcklbw %%zmm5,%%zmm2,%%zmm2 \n" \ + "vpermq %%zmm2,%%zmm18,%%zmm2 \n" \ + "vpunpcklwd %%zmm2,%%zmm0,%%zmm1 \n" \ + "vpunpckhwd %%zmm2,%%zmm0,%%zmm0 \n" \ + "vmovdqu8 %%zmm1,(%[dst_argb]) \n" \ + "vmovdqu8 %%zmm0,0x40(%[dst_argb]) \n" \ + "lea 0x80(%[dst_argb]), %[dst_argb] \n" // Store 16 AR30 values. 
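+// AR30 is 2:10:10:10 (2-bit alpha, 10 bits per channel); callers preload
+// ymm5 (alpha bits), ymm6 (0 for min) and ymm7 (1023 for max) before looping.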
#define STOREAR30_AVX2 \ @@ -2590,17 +3530,17 @@ void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV444_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2624,18 +3564,18 @@ void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV422_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" + "sub $0x10,%[width] \n" + "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2648,6 +3588,50 @@ void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf, } #endif // HAS_I422TOARGBROW_AVX2 +#if defined(HAS_I422TOARGBROW_AVX512BW) +static const uint64_t kSplitQuadWords[8] = {0, 2, 2, 2, 1, 2, 2, 2}; +static const uint64_t kSplitDoubleQuadWords[8] = {0, 1, 4, 4, 2, 3, 4, 4}; +static const uint64_t kUnpermuteAVX512[8] = {0, 4, 1, 5, 2, 6, 3, 7}; + +// 32 pixels +// 16 UV values upsampled to 32 UV, mixed with 32 Y producing 32 ARGB (128 +// bytes). +void OMITFP I422ToARGBRow_AVX512BW(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX512BW(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%xmm5,%%xmm5,%%xmm5 \n" + "vpbroadcastq %%xmm5,%%zmm5 \n" + + LABELALIGN + "1: \n" + READYUV422_AVX512BW + YUVTORGB_AVX512BW(yuvconstants) + STOREARGB_AVX512BW + "sub $0x20,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [quadsplitperm]"r"(kSplitQuadWords), // %[quadsplitperm] + [dquadsplitperm]"r"(kSplitDoubleQuadWords), // %[dquadsplitperm] + [unperm]"r"(kUnpermuteAVX512) // %[unperm] + : "memory", "cc", YUVTORGB_REGS_AVX512BW + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I422TOARGBROW_AVX512BW + #if defined(HAS_I422TOAR30ROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). 
@@ -2659,23 +3643,23 @@ void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min - "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max - "vpsrlw $6,%%ymm7,%%ymm7 \n" + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" LABELALIGN - "1: \n" + "1: \n" READYUV422_AVX2 YUVTORGB16_AVX2(yuvconstants) STOREAR30_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" + "sub $0x10,%[width] \n" + "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2699,18 +3683,18 @@ void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV210_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" + "sub $0x10,%[width] \n" + "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2723,6 +3707,41 @@ void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf, } #endif // HAS_I210TOARGBROW_AVX2 +#if defined(HAS_I212TOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV212_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I212TOARGBROW_AVX2 + #if defined(HAS_I210TOAR30ROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). 
@@ -2734,23 +3753,23 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min - "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max - "vpsrlw $6,%%ymm7,%%ymm7 \n" + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" LABELALIGN - "1: \n" + "1: \n" READYUV210_AVX2 YUVTORGB16_AVX2(yuvconstants) STOREAR30_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" + "sub $0x10,%[width] \n" + "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2758,11 +3777,239 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); } #endif // HAS_I210TOAR30ROW_AVX2 +#if defined(HAS_I212TOAR30ROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). +void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READYUV212_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_I212TOAR30ROW_AVX2 + +#if defined(HAS_I410TOARGBROW_AVX2) +// 16 pixels +// 16 UV values with 16 Y producing 16 ARGB (64 bytes). 
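+// I410 is 10-bit 4:4:4, so READYUV410_AVX2 reads a full 16 U and 16 V per
+// 16 Y with no 2x upsample step.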
+void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf,
+                               const uint16_t* u_buf,
+                               const uint16_t* v_buf,
+                               uint8_t* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+    "sub        %[u_buf],%[v_buf] \n"
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5 \n"
+
+    LABELALIGN
+    "1: \n"
+    READYUV410_AVX2
+    YUVTORGB_AVX2(yuvconstants)
+    STOREARGB_AVX2
+    "sub        $0x10,%[width] \n"
+    "jg         1b \n"
+    "vzeroupper \n"
+
+  : [y_buf]"+r"(y_buf),               // %[y_buf]
+    [u_buf]"+r"(u_buf),               // %[u_buf]
+    [v_buf]"+r"(v_buf),               // %[v_buf]
+    [dst_argb]"+r"(dst_argb),         // %[dst_argb]
+    [width]"+rm"(width)               // %[width]
+  : [yuvconstants]"r"(yuvconstants)   // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_I410TOARGBROW_AVX2
+
+#if defined(HAS_I210ALPHATOARGBROW_AVX2)
+// 16 pixels
+// 8 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
+void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf,
+                                    const uint16_t* u_buf,
+                                    const uint16_t* v_buf,
+                                    const uint16_t* a_buf,
+                                    uint8_t* dst_argb,
+                                    const struct YuvConstants* yuvconstants,
+                                    int width) {
+  asm volatile(
+      YUVTORGB_SETUP_AVX2(
+      yuvconstants) "sub         %[u_buf],%[v_buf] \n"
+
+      LABELALIGN "1: \n" READYUVA210_AVX2
+      YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
+      "subl        $0x10,%[width] \n"
+      "jg          1b \n"
+      "vzeroupper \n"
+
+      : [y_buf] "+r"(y_buf),              // %[y_buf]
+        [u_buf] "+r"(u_buf),              // %[u_buf]
+        [v_buf] "+r"(v_buf),              // %[v_buf]
+        [a_buf] "+r"(a_buf),              // %[a_buf]
+        [dst_argb] "+r"(dst_argb),        // %[dst_argb]
+#if defined(__i386__)
+        [width] "+m"(width)               // %[width]
+#else
+        [width] "+rm"(width)              // %[width]
+#endif
+      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
+      : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
+        "xmm4", "xmm5");
+}
+#endif  // HAS_I210ALPHATOARGBROW_AVX2
+
+#if defined(HAS_I410ALPHATOARGBROW_AVX2)
+// 16 pixels
+// 16 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
+void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf,
+                                    const uint16_t* u_buf,
+                                    const uint16_t* v_buf,
+                                    const uint16_t* a_buf,
+                                    uint8_t* dst_argb,
+                                    const struct YuvConstants* yuvconstants,
+                                    int width) {
+  asm volatile(
+      YUVTORGB_SETUP_AVX2(
+      yuvconstants) "sub         %[u_buf],%[v_buf] \n"
+
+      LABELALIGN "1: \n" READYUVA410_AVX2
+      YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
+      "subl        $0x10,%[width] \n"
+      "jg          1b \n"
+      "vzeroupper \n"
+
+      : [y_buf] "+r"(y_buf),              // %[y_buf]
+        [u_buf] "+r"(u_buf),              // %[u_buf]
+        [v_buf] "+r"(v_buf),              // %[v_buf]
+        [a_buf] "+r"(a_buf),              // %[a_buf]
+        [dst_argb] "+r"(dst_argb),        // %[dst_argb]
+#if defined(__i386__)
+        [width] "+m"(width)               // %[width]
+#else
+        [width] "+rm"(width)              // %[width]
+#endif
+      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
+      : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
+        "xmm4", "xmm5");
+}
+#endif  // HAS_I410ALPHATOARGBROW_AVX2
+
+#if defined(HAS_I410TOAR30ROW_AVX2)
+// 16 pixels
+// 16 UV values with 16 Y producing 16 AR30 (64 bytes).
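+// The AR30 path keeps the 16-bit YUVTORGB16_AVX2 results rather than packing
+// to 8 bits, so the full 10-bit range reaches the 2:10:10:10 store.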
+void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READYUV410_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_I410TOAR30ROW_AVX2 + +#if defined(HAS_I444ALPHATOARGBROW_AVX2) +// 16 pixels +// 16 UV values with 16 Y and 16 A producing 16 ARGB. +void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUVA444_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [a_buf]"+r"(a_buf), // %[a_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_I444ALPHATOARGBROW_AVX2 + #if defined(HAS_I422ALPHATOARGBROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. 
@@ -2776,16 +4023,16 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" + "sub %[u_buf],%[v_buf] \n" LABELALIGN - "1: \n" + "1: \n" READYUVA422_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2815,11 +4062,11 @@ void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV422_AVX2 YUVTORGB_AVX2(yuvconstants) @@ -2859,16 +4106,16 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf, // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READNV12_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] @@ -2892,16 +4139,16 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf, // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READNV21_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [vu_buf]"+r"(vu_buf), // %[vu_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] @@ -2925,16 +4172,16 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUY2_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] @@ -2958,16 +4205,16 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READUYVY_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] @@ -2981,18 +4228,156 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, } #endif // HAS_UYVYTOARGBROW_AVX2 +#if defined(HAS_P210TOARGBROW_AVX2) +// 16 pixels. +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
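+// P210/P410 samples are msb-aligned in 16 bits (P010 style), which is why
+// READP210_AVX2/READP410_AVX2 take the high byte with vpsrlw $8 rather than
+// vpsraw $2 as in the lsb-aligned I210/I410 readers.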
+void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf,
+                               const uint16_t* uv_buf,
+                               uint8_t* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  // clang-format off
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5 \n"
+
+    LABELALIGN
+    "1: \n"
+    READP210_AVX2
+    YUVTORGB_AVX2(yuvconstants)
+    STOREARGB_AVX2
+    "sub        $0x10,%[width] \n"
+    "jg         1b \n"
+    "vzeroupper \n"
+  : [y_buf]"+r"(y_buf),               // %[y_buf]
+    [uv_buf]"+r"(uv_buf),             // %[uv_buf]
+    [dst_argb]"+r"(dst_argb),         // %[dst_argb]
+    [width]"+rm"(width)               // %[width]
+  : [yuvconstants]"r"(yuvconstants)   // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+  // clang-format on
+}
+#endif  // HAS_P210TOARGBROW_AVX2
+
+#if defined(HAS_P410TOARGBROW_AVX2)
+// 16 pixels.
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf,
+                               const uint16_t* uv_buf,
+                               uint8_t* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  // clang-format off
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5 \n"
+
+    LABELALIGN
+    "1: \n"
+    READP410_AVX2
+    YUVTORGB_AVX2(yuvconstants)
+    STOREARGB_AVX2
+    "sub        $0x10,%[width] \n"
+    "jg         1b \n"
+    "vzeroupper \n"
+  : [y_buf]"+r"(y_buf),               // %[y_buf]
+    [uv_buf]"+r"(uv_buf),             // %[uv_buf]
+    [dst_argb]"+r"(dst_argb),         // %[dst_argb]
+    [width]"+rm"(width)               // %[width]
+  : [yuvconstants]"r"(yuvconstants)   // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+  // clang-format on
+}
+#endif  // HAS_P410TOARGBROW_AVX2
+
+#if defined(HAS_P210TOAR30ROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf,
+                               const uint16_t* uv_buf,
+                               uint8_t* dst_ar30,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
+    "vpsrlw     $14,%%ymm5,%%ymm5 \n"
+    "vpsllw     $4,%%ymm5,%%ymm5 \n"      // 2 alpha bits
+    "vpxor      %%ymm6,%%ymm6,%%ymm6 \n"  // 0 for min
+    "vpcmpeqb   %%ymm7,%%ymm7,%%ymm7 \n"  // 1023 for max
+    "vpsrlw     $6,%%ymm7,%%ymm7 \n"
+
+    LABELALIGN
+    "1: \n"
+    READP210_AVX2
+    YUVTORGB16_AVX2(yuvconstants)
+    STOREAR30_AVX2
+    "sub        $0x10,%[width] \n"
+    "jg         1b \n"
+
+    "vzeroupper \n"
+  : [y_buf]"+r"(y_buf),               // %[y_buf]
+    [uv_buf]"+r"(uv_buf),             // %[uv_buf]
+    [dst_ar30]"+r"(dst_ar30),         // %[dst_ar30]
+    [width]"+rm"(width)               // %[width]
+  : [yuvconstants]"r"(yuvconstants)   // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_P210TOAR30ROW_AVX2
+
+#if defined(HAS_P410TOAR30ROW_AVX2)
+// 16 pixels
+// 16 UV values with 16 Y producing 16 AR30 (64 bytes).
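+// P-format Y is consumed as-is by vpmulhuw in YUVTORGB16_AVX2 since it is
+// already a 16-bit msb-aligned value; the I210/I410 paths need vpsllw first.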
+void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READP410_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_P410TOAR30ROW_AVX2 + #ifdef HAS_I400TOARGBROW_SSE2 -void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { +void I400ToARGBRow_SSE2(const uint8_t* y_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { asm volatile( - "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 - "movd %%eax,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * - // 16 - "movd %%eax,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" + "movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164 + "movdqa 128(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16 + "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000 + "pslld $0x18,%%xmm4 \n" LABELALIGN "1: \n" @@ -3001,8 +4386,8 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { "lea 0x8(%0),%0 \n" "punpcklbw %%xmm0,%%xmm0 \n" "pmulhuw %%xmm2,%%xmm0 \n" - "psubusw %%xmm3,%%xmm0 \n" - "psrlw $6, %%xmm0 \n" + "paddsw %%xmm3,%%xmm0 \n" + "psraw $6, %%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" // Step 2: Weave into ARGB @@ -3018,28 +4403,26 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(dst_argb), // %1 - "+rm"(width) // %2 - : - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : "r"(yuvconstants) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_I400TOARGBROW_SSE2 #ifdef HAS_I400TOARGBROW_AVX2 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). // note: vpunpcklbw mutates and vpackuswb unmutates. 
-void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { +void I400ToARGBRow_AVX2(const uint8_t* y_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { asm volatile( - "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * - // 16 - "vmovd %%eax,%%xmm2 \n" - "vbroadcastss %%xmm2,%%ymm2 \n" - "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164 - "vmovd %%eax,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpslld $0x18,%%ymm4,%%ymm4 \n" + "vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164 + "vmovdqa 128(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16 + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000 + "vpslld $0x18,%%ymm4,%%ymm4 \n" LABELALIGN "1: \n" @@ -3049,8 +4432,8 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n" - "vpsrlw $0x6,%%ymm0,%%ymm0 \n" + "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" + "vpsraw $0x6,%%ymm0,%%ymm0 \n" "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" "vpermq $0xd8,%%ymm1,%%ymm1 \n" @@ -3060,15 +4443,15 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { "vpor %%ymm4,%%ymm1,%%ymm1 \n" "vmovdqu %%ymm0,(%1) \n" "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" + "lea 0x40(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(y_buf), // %0 - "+r"(dst_argb), // %1 - "+rm"(width) // %2 - : - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : "r"(yuvconstants) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_I400TOARGBROW_AVX2 @@ -3081,16 +4464,16 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile( - "movdqa %3,%%xmm5 \n" + "movdqa %3,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu -0x10(%0,%2,1),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu -0x10(%0,%2,1),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 @@ -3108,13 +4491,13 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { LABELALIGN "1: \n" - "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpermq $0x4e,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -3125,37 +4508,136 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { #endif // HAS_MIRRORROW_AVX2 #ifdef HAS_MIRRORUVROW_SSSE3 +// Shuffle table for reversing the UV. 
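+// Reverses the 8 UV pairs in each 16-byte block while keeping every U,V byte
+// pair in order, unlike kShuffleMirrorSplitUV below which also deinterleaves.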
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, + 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; + +void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile( + + "movdqa %3,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu -0x10(%0,%2,2),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirrorUV) // %3 + : "memory", "cc", "xmm0", "xmm5"); +} +#endif // HAS_MIRRORUVROW_SSSE3 + +#ifdef HAS_MIRRORUVROW_AVX2 +void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile( + + "vbroadcastf128 %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu -0x20(%0,%2,2),%%ymm0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirrorUV) // %3 + : "memory", "cc", "xmm0", "xmm5"); +} +#endif // HAS_MIRRORUVROW_AVX2 + +#ifdef HAS_MIRRORSPLITUVROW_SSSE3 // Shuffle table for reversing the bytes of UV channels. -static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, - 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; -void MirrorUVRow_SSSE3(const uint8_t* src, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { +static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, + 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; +void MirrorSplitUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { intptr_t temp_width = (intptr_t)(width); asm volatile( - "movdqa %4,%%xmm1 \n" - "lea -0x10(%0,%3,2),%0 \n" - "sub %1,%2 \n" + "movdqa %4,%%xmm1 \n" + "lea -0x10(%0,%3,2),%0 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "pshufb %%xmm1,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" - "movhpd %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $8,%3 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(temp_width) // %3 - : "m"(kShuffleMirrorUV) // %4 + "movdqu (%0),%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" + "movhpd %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $8,%3 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(temp_width) // %3 + : "m"(kShuffleMirrorSplitUV) // %4 : "memory", "cc", "xmm0", "xmm1"); } -#endif // HAS_MIRRORUVROW_SSSE3 +#endif // HAS_MIRRORSPLITUVROW_SSSE3 + +#ifdef HAS_RGB24MIRRORROW_SSSE3 + +// Shuffle first 5 pixels to last 5 mirrored. first byte zero +static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u, + 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u}; + +// Shuffle last 5 pixels to first 5 mirrored. 
last byte zero +static const uvec8 kShuffleMirrorRGB1 = { + 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u}; + +// Shuffle 5 pixels at a time (15 bytes) +void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_rgb24, + int width) { + intptr_t temp_width = (intptr_t)(width); + src_rgb24 += width * 3 - 48; + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" // first 5 + "movdqu 15(%0),%%xmm1 \n" // next 5 + "movdqu 30(%0),%%xmm2 \n" // next 5 + "movdqu 32(%0),%%xmm3 \n" // last 1 special + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm4,%%xmm2 \n" + "pshufb %%xmm5,%%xmm3 \n" + "lea -0x30(%0),%0 \n" + "movdqu %%xmm0,32(%1) \n" // last 5 + "movdqu %%xmm1,17(%1) \n" // next 5 + "movdqu %%xmm2,2(%1) \n" // next 5 + "movlpd %%xmm3,0(%1) \n" // first 1 + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_rgb24), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirrorRGB0), // %3 + "m"(kShuffleMirrorRGB1) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_RGB24MIRRORROW_SSSE3 #ifdef HAS_ARGBMIRRORROW_SSE2 @@ -3163,17 +4645,17 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile( - "lea -0x10(%0,%2,4),%0 \n" + "lea -0x10(%0,%2,4),%0 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "pshufd $0x1b,%%xmm0,%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "pshufd $0x1b,%%xmm0,%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 @@ -3189,15 +4671,15 @@ void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile( - "vmovdqu %3,%%ymm5 \n" + "vmovdqu %3,%%ymm5 \n" LABELALIGN "1: \n" - "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -3213,28 +4695,28 @@ void SplitUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_v, int width) { asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm2 \n" - "vpsrlw $0x8,%%ymm1,%%ymm3 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm2,0x00(%1,%2,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm2 \n" + "vpsrlw $0x8,%%ymm1,%%ymm3 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm2,0x00(%1,%2,1) \n" 
+ "lea 0x20(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 @@ -3251,28 +4733,28 @@ void SplitUVRow_SSE2(const uint8_t* src_uv, uint8_t* dst_v, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm2,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm2,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -3282,6 +4764,63 @@ void SplitUVRow_SSE2(const uint8_t* src_uv, } #endif // HAS_SPLITUVROW_SSE2 +#ifdef HAS_DETILEROW_SSE2 +void DetileRow_SSE2(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width) { + asm volatile( + "1: \n" + "movdqu (%0),%%xmm0 \n" + "sub $0x10,%2 \n" + "lea (%0,%3),%0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride) // %3 + : "cc", "memory", "xmm0"); +} +#endif // HAS_DETILEROW_SSE2 + +#ifdef HAS_DETILESPLITUVROW_SSSE3 +// TODO(greenjustin): Look into generating these constants instead of loading +// them since this can cause branch mispredicts for fPIC code on 32-bit +// machines. +static const uvec8 kDeinterlaceUV = {0, 2, 4, 6, 8, 10, 12, 14, + 1, 3, 5, 7, 9, 11, 13, 15}; + +// TODO(greenjustin): Research alternatives to pshufb, since pshufb can be very +// slow on older SSE2 processors. 
+void DetileSplitUVRow_SSSE3(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqu %4,%%xmm1 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea (%0, %5),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "movhps %%xmm0,(%2) \n" + "lea 0x8(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "m"(kDeinterlaceUV), // %4 + "r"(src_tile_stride) // %5 + : "cc", "memory", "xmm0", "xmm1"); +} +#endif // HAS_DETILESPLITUVROW_SSSE3 + #ifdef HAS_MERGEUVROW_AVX2 void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* src_v, @@ -3289,22 +4828,22 @@ void MergeUVRow_AVX2(const uint8_t* src_u, int width) { asm volatile( - "sub %0,%1 \n" + "sub %0,%1 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x00(%0,%1,1),%%ymm1 \n" - "lea 0x20(%0),%0 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x00(%0,%1,1),%%ymm1 \n" + "lea 0x20(%0),%0 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm2,(%2) \n" "vextractf128 $0x0,%%ymm0,0x10(%2) \n" "vextractf128 $0x1,%%ymm2,0x20(%2) \n" "vextractf128 $0x1,%%ymm0,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea 0x40(%2),%2 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 @@ -3322,21 +4861,21 @@ void MergeUVRow_SSE2(const uint8_t* src_u, int width) { asm volatile( - "sub %0,%1 \n" + "sub %0,%1 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm2 \n" - "movdqu %%xmm0,(%2) \n" - "movdqu %%xmm2,0x10(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -3346,53 +4885,94 @@ void MergeUVRow_SSE2(const uint8_t* src_u, } #endif // HAS_MERGEUVROW_SSE2 -// Use scale to convert lsb formats to msb, depending how many bits there are: -// 128 = 9 bits -// 64 = 10 bits -// 16 = 12 bits -// 1 = 16 bits #ifdef HAS_MERGEUVROW_16_AVX2 void MergeUVRow_16_AVX2(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, - int scale, + int depth, int width) { + depth = 16 - depth; // clang-format off asm volatile ( - "vmovd %4,%%xmm3 \n" - "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" - "sub %0,%1 \n" + "vmovd %4,%%xmm3 \n" + "sub %0,%1 \n" // 16 pixels per loop. 
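+    // depth was rewritten above as a shift count (16 - depth), e.g. 10-bit
+    // input is shifted left by 6 into msb position by vpsllw.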
LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu (%0,%1,1),%%ymm1 \n" - "add $0x20,%0 \n" - - "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" - "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates - "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm2,(%2) \n" - "vextractf128 $0x0,%%ymm0,0x10(%2) \n" - "vextractf128 $0x1,%%ymm2,0x20(%2) \n" - "vextractf128 $0x1,%%ymm0,0x30(%2) \n" - "add $0x40,%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu (%0,%1,1),%%ymm1 \n" + "add $0x20,%0 \n" + + "vpsllw %%xmm3,%%ymm0,%%ymm0 \n" + "vpsllw %%xmm3,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates + "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm2,(%2) \n" + "vextractf128 $0x0,%%ymm0,0x10(%2) \n" + "vextractf128 $0x1,%%ymm2,0x20(%2) \n" + "vextractf128 $0x1,%%ymm0,0x30(%2) \n" + "add $0x40,%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 "+r"(width) // %3 - : "r"(scale) // %4 + : "r"(depth) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); // clang-format on } #endif // HAS_MERGEUVROW_AVX2 +#ifdef HAS_SPLITUVROW_16_AVX2 +const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15}; +void SplitUVRow_16_AVX2(const uint16_t* src_uv, + uint16_t* dst_u, + uint16_t* dst_v, + int depth, + int width) { + depth = 16 - depth; + // clang-format off + asm volatile ( + "vmovd %4,%%xmm3 \n" + "vbroadcastf128 %5,%%ymm4 \n" + "sub %1,%2 \n" + + // 16 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "add $0x40,%0 \n" + + "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x0,%%ymm1,0x10(%1) \n" + "vextractf128 $0x1,%%ymm0,(%1,%2) \n" + "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n" + "add $0x20,%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(depth), // %4 + "m"(kSplitUVShuffle16) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); + // clang-format on +} +#endif // HAS_SPLITUVROW_16_AVX2 + // Use scale to convert lsb formats to msb, depending how many bits there are: // 128 = 9 bits // 64 = 10 bits @@ -3405,24 +4985,24 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y, int width) { // clang-format off asm volatile ( - "vmovd %3,%%xmm3 \n" - "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" - "sub %0,%1 \n" + "vmovd %3,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" - // 16 pixels per loop. + // 32 pixels per loop. 
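+    // vpmullw by scale moves lsb-aligned data up, e.g. 10-bit: 1023 * 64 =
+    // 65472 (0xffc0), msb-aligned in 16 bits.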
LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" - "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%0,%1) \n" - "vmovdqu %%ymm1,0x20(%0,%1) \n" - "add $0x40,%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%0,%1) \n" + "vmovdqu %%ymm1,0x20(%0,%1) \n" + "add $0x40,%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -3432,6 +5012,46 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y, } #endif // HAS_MULTIPLYROW_16_AVX2 +// Use scale to convert msb formats to lsb, depending how many bits there are: +// 512 = 9 bits +// 1024 = 10 bits +// 4096 = 12 bits +// 65536 = 16 bits +#ifdef HAS_DIVIDEROW_16_AVX2 +void DivideRow_16_AVX2(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%0,%1) \n" + "vmovdqu %%ymm1,0x20(%0,%1) \n" + "add $0x40,%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width), // %2 + "+r"(scale) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm3"); + // clang-format on +} +#endif // HAS_MULTIPLYROW_16_AVX2 + // Use scale to convert lsb formats to msb, depending how many bits there are: // 32768 = 9 bits // 16384 = 10 bits @@ -3443,23 +5063,23 @@ void Convert16To8Row_SSSE3(const uint16_t* src_y, int width) { // clang-format off asm volatile ( - "movd %3,%%xmm2 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" + "movd %3,%%xmm2 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" // 32 pixels per loop. LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "add $0x20,%0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "add $0x10,%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "add $0x20,%0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "add $0x10,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -3475,25 +5095,25 @@ void Convert16To8Row_AVX2(const uint16_t* src_y, int width) { // clang-format off asm volatile ( - "vmovd %3,%%xmm2 \n" - "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" - "vbroadcastss %%xmm2,%%ymm2 \n" + "vmovd %3,%%xmm2 \n" + "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" // 32 pixels per loop. 
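+      // vpmulhuw computes (v * scale) >> 16, so scale = 1 << (24 - depth):
+      // e.g. 16384 turns 10 bit samples into 8 bit ones (v >> 2).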
LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "add $0x40,%0 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "add $0x20,%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "add $0x40,%0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "add $0x20,%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -3514,25 +5134,25 @@ void Convert8To16Row_SSE2(const uint8_t* src_y, int width) { // clang-format off asm volatile ( - "movd %3,%%xmm2 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" + "movd %3,%%xmm2 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" // 32 pixels per loop. LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "add $0x10,%0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "add $0x20,%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "add $0x10,%0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "add $0x20,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -3548,26 +5168,26 @@ void Convert8To16Row_AVX2(const uint8_t* src_y, int width) { // clang-format off asm volatile ( - "vmovd %3,%%xmm2 \n" - "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" - "vbroadcastss %%xmm2,%%ymm2 \n" + "vmovd %3,%%xmm2 \n" + "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" // 32 pixels per loop. LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "add $0x20,%0 \n" - "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "add $0x40,%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "add $0x40,%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -3578,37 +5198,26 @@ void Convert8To16Row_AVX2(const uint8_t* src_y, #endif // HAS_CONVERT8TO16ROW_AVX2 #ifdef HAS_SPLITRGBROW_SSSE3 - // Shuffle table for converting RGB to Planar. 
-static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u, - 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u}; -static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u, - 2u, 5u, 8u, 11u, 14u, 128u, - 128u, 128u, 128u, 128u}; -static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 1u, - 4u, 7u, 10u, 13u}; - -static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u}; -static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u, - 3u, 6u, 9u, 12u, 15u, 128u, - 128u, 128u, 128u, 128u}; -static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 2u, - 5u, 8u, 11u, 14u}; - -static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u}; -static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u, - 4u, 7u, 10u, 13u, 128u, 128u, - 128u, 128u, 128u, 128u}; -static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 0u, 3u, - 6u, 9u, 12u, 15u}; +static const uvec8 kSplitRGBShuffle[9] = { + {0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u}, + {128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u, + 128u, 128u}, + {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 1u, 4u, + 7u, 10u, 13u}, + {1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u}, + {128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u, + 128u, 128u}, + {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u, + 8u, 11u, 14u}, + {2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u}, + {128u, 128u, 128u, 128u, 128u, 1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u, + 128u, 128u}, + {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u, + 12u, 15u}}; void SplitRGBRow_SSSE3(const uint8_t* src_rgb, uint8_t* dst_r, @@ -3619,91 +5228,72 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb, LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb %5, %%xmm0 \n" - "pshufb %6, %%xmm1 \n" - "pshufb %7, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb %8, %%xmm0 \n" - "pshufb %9, %%xmm1 \n" - "pshufb %10, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb %11, %%xmm0 \n" - "pshufb %12, %%xmm1 \n" - "pshufb %13, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%3) \n" - "lea 0x10(%3),%3 \n" - "lea 0x30(%0),%0 \n" - "sub $0x10,%4 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb 0(%5), %%xmm0 \n" + "pshufb 16(%5), %%xmm1 \n" + "pshufb 32(%5), %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb 48(%5),%%xmm0 \n" + "pshufb 64(%5),%%xmm1 \n" + "pshufb 80(%5), %%xmm2 \n" + "por 
%%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb 96(%5), %%xmm0 \n" + "pshufb 112(%5), %%xmm1 \n" + "pshufb 128(%5), %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%3) \n" + "lea 0x10(%3),%3 \n" + "lea 0x30(%0),%0 \n" + "sub $0x10,%4 \n" + "jg 1b \n" : "+r"(src_rgb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 "+r"(dst_b), // %3 "+r"(width) // %4 - : "m"(kShuffleMaskRGBToR0), // %5 - "m"(kShuffleMaskRGBToR1), // %6 - "m"(kShuffleMaskRGBToR2), // %7 - "m"(kShuffleMaskRGBToG0), // %8 - "m"(kShuffleMaskRGBToG1), // %9 - "m"(kShuffleMaskRGBToG2), // %10 - "m"(kShuffleMaskRGBToB0), // %11 - "m"(kShuffleMaskRGBToB1), // %12 - "m"(kShuffleMaskRGBToB2) // %13 + : "r"(&kSplitRGBShuffle[0]) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_SPLITRGBROW_SSSE3 #ifdef HAS_MERGERGBROW_SSSE3 - -// Shuffle table for converting RGB to Planar. -static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u, - 2u, 128u, 128u, 3u, 128u, 128u, - 4u, 128u, 128u, 5u}; -static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u, - 128u, 2u, 128u, 128u, 3u, 128u, - 128u, 4u, 128u, 128u}; -static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u, - 128u, 128u, 2u, 128u, 128u, 3u, - 128u, 128u, 4u, 128u}; - -static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u, - 7u, 128u, 128u, 8u, 128u, 128u, - 9u, 128u, 128u, 10u}; -static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u, - 128u, 7u, 128u, 128u, 8u, 128u, - 128u, 9u, 128u, 128u}; -static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u, - 128u, 128u, 8u, 128u, 128u, 9u, - 128u, 128u, 10u, 128u}; - -static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u, - 12u, 128u, 128u, 13u, 128u, 128u, - 14u, 128u, 128u, 15u}; -static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u, - 128u, 13u, 128u, 128u, 14u, 128u, - 128u, 15u, 128u, 128u}; -static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u, - 128u, 128u, 13u, 128u, 128u, 14u, - 128u, 128u, 15u, 128u}; +// Shuffle table for converting Planar to RGB. 
+static const uvec8 kMergeRGBShuffle[9] = { + {0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u, 128u, + 128u, 5u}, + {128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u, + 128u, 128u}, + {128u, 128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, + 4u, 128u}, + {128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u, 128u, + 10u, 128u}, + {5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u, + 128u, 10u}, + {128u, 5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, + 128u, 128u}, + {128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u, 128u, + 15u, 128u, 128u}, + {128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u, + 128u, 15u, 128u}, + {10u, 128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, + 128u, 128u, 15u}}; void MergeRGBRow_SSSE3(const uint8_t* src_r, const uint8_t* src_g, @@ -3714,92 +5304,858 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r, LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb %5, %%xmm0 \n" - "pshufb %6, %%xmm1 \n" - "pshufb %7, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%3) \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb %8, %%xmm0 \n" - "pshufb %9, %%xmm1 \n" - "pshufb %10, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,16(%3) \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb %11, %%xmm0 \n" - "pshufb %12, %%xmm1 \n" - "pshufb %13, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,32(%3) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x10(%1),%1 \n" - "lea 0x10(%2),%2 \n" - "lea 0x30(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb (%5), %%xmm0 \n" + "pshufb 16(%5), %%xmm1 \n" + "pshufb 32(%5), %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%3) \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb 48(%5), %%xmm0 \n" + "pshufb 64(%5), %%xmm1 \n" + "pshufb 80(%5), %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,16(%3) \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb 96(%5), %%xmm0 \n" + "pshufb 112(%5), %%xmm1 \n" + "pshufb 128(%5), %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,32(%3) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x10(%1),%1 \n" + "lea 0x10(%2),%2 \n" + "lea 0x30(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 "+r"(dst_rgb), // %3 "+r"(width) // %4 - : "m"(kShuffleMaskRToRGB0), // %5 - "m"(kShuffleMaskGToRGB0), // %6 - "m"(kShuffleMaskBToRGB0), // %7 - "m"(kShuffleMaskRToRGB1), // %8 - "m"(kShuffleMaskGToRGB1), // %9 - "m"(kShuffleMaskBToRGB1), // %10 - "m"(kShuffleMaskRToRGB2), // %11 - "m"(kShuffleMaskGToRGB2), // %12 - "m"(kShuffleMaskBToRGB2) // %13 + : "r"(&kMergeRGBShuffle[0]) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGERGBROW_SSSE3 +#ifdef HAS_MERGEARGBROW_SSE2 +void MergeARGBRow_SSE2(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width) { + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + + LABELALIGN + "1: \n" + + "movq (%0,%2),%%xmm0 \n" 
// B + "movq (%0),%%xmm1 \n" // R + "movq (%0,%1),%%xmm2 \n" // G + "punpcklbw %%xmm1,%%xmm0 \n" // BR + "movq (%0,%3),%%xmm1 \n" // A + "punpcklbw %%xmm1,%%xmm2 \n" // GA + "movdqa %%xmm0,%%xmm1 \n" // BR + "punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi) + "punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo) + "movdqu %%xmm0,(%4) \n" + "movdqu %%xmm1,16(%4) \n" + + "lea 8(%0),%0 \n" + "lea 32(%4),%4 \n" + "sub $0x8,%5 \n" + "jg 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif + +#ifdef HAS_MERGEXRGBROW_SSE2 +void MergeXRGBRow_SSE2(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + + "movq (%2),%%xmm0 \n" // B + "movq (%0),%%xmm1 \n" // R + "movq (%1),%%xmm2 \n" // G + "punpcklbw %%xmm1,%%xmm0 \n" // BR + "pcmpeqd %%xmm1,%%xmm1 \n" // A(255) + "punpcklbw %%xmm1,%%xmm2 \n" // GA + "movdqa %%xmm0,%%xmm1 \n" // BR + "punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi) + "punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo) + "movdqu %%xmm0,(%3) \n" + "movdqu %%xmm1,16(%3) \n" + + "lea 8(%0),%0 \n" + "lea 8(%1),%1 \n" + "lea 8(%2),%2 \n" + "lea 32(%3),%3 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGEARGBROW_SSE2 + +#ifdef HAS_MERGEARGBROW_AVX2 +void MergeARGBRow_AVX2(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width) { + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + + LABELALIGN + "1: \n" + + "vmovdqu (%0,%2),%%xmm0 \n" // B + "vmovdqu (%0,%1),%%xmm1 \n" // R + "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // G + "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n" // A + "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" + "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" + "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" + "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" + "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" + "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%4) \n" // First 8 + "vmovdqu %%ymm1,32(%4) \n" // Next 8 + + "lea 16(%0),%0 \n" + "lea 64(%4),%4 \n" + "sub $0x10,%5 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif + +#ifdef HAS_MERGEXRGBROW_AVX2 +void MergeXRGBRow_AVX2(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + + "vmovdqu (%2),%%xmm0 \n" // B + "vpcmpeqd %%ymm1,%%ymm1,%%ymm1 \n" // A(255) + "vinserti128 $0,(%1),%%ymm1,%%ymm1 \n" // R + "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // G + "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" + "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" + "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" + "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" + "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" + "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%3) \n" // First 8 + "vmovdqu %%ymm1,32(%3) \n" // Next 8 + + "lea 16(%0),%0 \n" + "lea 16(%1),%1 \n" + "lea 16(%2),%2 \n" + "lea 64(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b 
\n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGEARGBROW_AVX2 + +#ifdef HAS_SPLITARGBROW_SSE2 +void SplitARGBRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width) { + asm volatile( + + "sub %1,%2 \n" + "sub %1,%3 \n" + "sub %1,%4 \n" + + LABELALIGN + "1: \n" + + "movdqu (%0),%%xmm0 \n" // 00-0F + "movdqu 16(%0),%%xmm1 \n" // 10-1F + "movdqa %%xmm0,%%xmm2 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17 + "punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo) + "punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi) + "movdqa %%xmm0,%%xmm2 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B + "punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo) + "punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi) + "movdqa %%xmm0,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) + "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) + "movlps %%xmm0,(%1,%3) \n" // B + "movhps %%xmm0,(%1,%2) \n" // G + "movlps %%xmm2,(%1) \n" // R + "movhps %%xmm2,(%1,%4) \n" // A + + "lea 32(%0),%0 \n" + "lea 8(%1),%1 \n" + "sub $0x8,%5 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(dst_a), // %4 + "+rm"(width) // %5 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif + +#ifdef HAS_SPLITXRGBROW_SSE2 +void SplitXRGBRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + + "movdqu (%0),%%xmm0 \n" // 00-0F + "movdqu 16(%0),%%xmm1 \n" // 10-1F + "movdqa %%xmm0,%%xmm2 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17 + "punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo) + "punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi) + "movdqa %%xmm0,%%xmm2 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B + "punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo) + "punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi) + "movdqa %%xmm0,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) + "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) + "movlps %%xmm0,(%3) \n" // B + "movhps %%xmm0,(%2) \n" // G + "movlps %%xmm2,(%1) \n" // R + + "lea 32(%0),%0 \n" + "lea 8(%1),%1 \n" + "lea 8(%2),%2 \n" + "lea 8(%3),%3 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif + +static const uvec8 kShuffleMaskARGBSplit = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; +#ifdef HAS_SPLITARGBROW_SSSE3 +void SplitARGBRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width) { + asm volatile( + + "movdqa %6,%%xmm3 \n" + "sub %1,%2 \n" + "sub %1,%3 \n" + "sub %1,%4 \n" + + LABELALIGN + "1: \n" + + "movdqu (%0),%%xmm0 \n" // 00-0F + "movdqu 16(%0),%%xmm1 \n" // 10-1F + "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo) + "pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi) + "movdqa %%xmm0,%%xmm2 \n" + "punpckldq 
%%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) + "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) + "movlps %%xmm0,(%1,%3) \n" // B + "movhps %%xmm0,(%1,%2) \n" // G + "movlps %%xmm2,(%1) \n" // R + "movhps %%xmm2,(%1,%4) \n" // A + + "lea 32(%0),%0 \n" + "lea 8(%1),%1 \n" + "subl $0x8,%5 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(dst_a), // %4 +#if defined(__i386__) + "+m"(width) // %5 +#else + "+rm"(width) // %5 +#endif + : "m"(kShuffleMaskARGBSplit) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); +} +#endif + +#ifdef HAS_SPLITXRGBROW_SSSE3 +void SplitXRGBRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + + "movdqa %5,%%xmm3 \n" + + LABELALIGN + "1: \n" + + "movdqu (%0),%%xmm0 \n" // 00-0F + "movdqu 16(%0),%%xmm1 \n" // 10-1F + "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo) + "pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi) + "movdqa %%xmm0,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) + "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) + "movlps %%xmm0,(%3) \n" // B + "movhps %%xmm0,(%2) \n" // G + "movlps %%xmm2,(%1) \n" // R + + "lea 32(%0),%0 \n" + "lea 8(%1),%1 \n" + "lea 8(%2),%2 \n" + "lea 8(%3),%3 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "m"(kShuffleMaskARGBSplit) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); +} +#endif + +#ifdef HAS_SPLITARGBROW_AVX2 +static const ulvec32 kShuffleMaskARGBPermute = {0, 4, 1, 5, 2, 6, 3, 7}; +void SplitARGBRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width) { + asm volatile( + + "sub %1,%2 \n" + "sub %1,%3 \n" + "sub %1,%4 \n" + "vmovdqa %7,%%ymm3 \n" + "vbroadcastf128 %6,%%ymm4 \n" + + LABELALIGN + "1: \n" + + "vmovdqu (%0),%%xmm0 \n" // 00-0F + "vmovdqu 16(%0),%%xmm1 \n" // 10-1F + "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F + "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F + "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" + "vpermd %%ymm0,%%ymm3,%%ymm0 \n" + "vpermd %%ymm1,%%ymm3,%%ymm1 \n" + "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA + "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // BR + "vmovdqu %%xmm0,(%1,%3) \n" // B + "vextracti128 $1,%%ymm0,(%1) \n" // R + "vmovdqu %%xmm2,(%1,%2) \n" // G + "vextracti128 $1,%%ymm2,(%1,%4) \n" // A + "lea 64(%0),%0 \n" + "lea 16(%1),%1 \n" + "subl $0x10,%5 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(dst_a), // %4 +#if defined(__i386__) + "+m"(width) // %5 +#else + "+rm"(width) // %5 +#endif + : "m"(kShuffleMaskARGBSplit), // %6 + "m"(kShuffleMaskARGBPermute) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif + +#ifdef HAS_SPLITXRGBROW_AVX2 +void SplitXRGBRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + + "vmovdqa %6,%%ymm3 \n" + "vbroadcastf128 %5,%%ymm4 \n" + + LABELALIGN + "1: \n" + + "vmovdqu (%0),%%xmm0 \n" // 00-0F + "vmovdqu 16(%0),%%xmm1 \n" // 10-1F + "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F + "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F + "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" + "vpermd %%ymm0,%%ymm3,%%ymm0 \n" + "vpermd %%ymm1,%%ymm3,%%ymm1 \n" + "vpunpckhdq 
%%ymm1,%%ymm0,%%ymm2 \n" // GA + "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // BR + "vmovdqu %%xmm0,(%3) \n" // B + "vextracti128 $1,%%ymm0,(%1) \n" // R + "vmovdqu %%xmm2,(%2) \n" // G + + "lea 64(%0),%0 \n" + "lea 16(%1),%1 \n" + "lea 16(%2),%2 \n" + "lea 16(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "m"(kShuffleMaskARGBSplit), // %5 + "m"(kShuffleMaskARGBPermute) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif + +#ifdef HAS_MERGEXR30ROW_AVX2 +void MergeXR30Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width) { + int shift = depth - 10; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrlw $6,%%ymm6,%%ymm6 \n" + "vmovd %5,%%xmm4 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu (%0,%1),%%ymm1 \n" + "vmovdqu (%0,%2),%%ymm2 \n" + "vpsrlw %%xmm4,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm4,%%ymm1,%%ymm1 \n" + "vpsrlw %%xmm4,%%ymm2,%%ymm2 \n" + "vpminuw %%ymm0,%%ymm6,%%ymm0 \n" + "vpminuw %%ymm1,%%ymm6,%%ymm1 \n" + "vpminuw %%ymm2,%%ymm6,%%ymm2 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpsllw $0x4,%%ymm0,%%ymm0 \n" // Shift R to target bit + "vpunpckhwd %%ymm0,%%ymm2,%%ymm3 \n" // RB + "vpunpcklwd %%ymm0,%%ymm2,%%ymm0 \n" + "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" // AG + "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" + "vpslld $0xa,%%ymm1,%%ymm1 \n" // Shift AG to target bit + "vpslld $0xa,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // Combine + "vpor %%ymm2,%%ymm3,%%ymm3 \n" + "vmovdqu %%ymm0,(%3) \n" + "vmovdqu %%ymm3,0x20(%3) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 +#if defined(__i386__) + : "m"(shift) // %5 +#else + : "rm"(shift) // %5 +#endif + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif + +#ifdef HAS_MERGEAR64ROW_AVX2 +static const lvec32 MergeAR64Permute = {0, 4, 2, 6, 1, 5, 3, 7}; +void MergeAR64Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + mask = (mask << 16) + mask; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + "vmovdqa %8,%%ymm5 \n" + "vmovd %6,%%xmm6 \n" + "vbroadcastss %7,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // R + "vmovdqu (%0,%1),%%ymm1 \n" // G + "vmovdqu (%0,%2),%%ymm2 \n" // B + "vmovdqu (%0,%3),%%ymm3 \n" // A + "vpminuw %%ymm0,%%ymm7,%%ymm0 \n" + "vpminuw %%ymm1,%%ymm7,%%ymm1 \n" + "vpminuw %%ymm2,%%ymm7,%%ymm2 \n" + "vpminuw %%ymm3,%%ymm7,%%ymm3 \n" + "vpsllw %%xmm6,%%ymm0,%%ymm0 \n" + "vpsllw %%xmm6,%%ymm1,%%ymm1 \n" + "vpsllw %%xmm6,%%ymm2,%%ymm2 \n" + "vpsllw %%xmm6,%%ymm3,%%ymm3 \n" + "vpermd %%ymm0,%%ymm5,%%ymm0 \n" + "vpermd %%ymm1,%%ymm5,%%ymm1 \n" + "vpermd %%ymm2,%%ymm5,%%ymm2 \n" + "vpermd %%ymm3,%%ymm5,%%ymm3 \n" + "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low) + "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi) + "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low) + "vpunpckhwd 
%%ymm3,%%ymm0,%%ymm0 \n" // RA(hi) + "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1) + "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3) + "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2) + "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4) + "vmovdqu %%ymm3,(%4) \n" + "vmovdqu %%ymm2,0x20(%4) \n" + "vmovdqu %%ymm4,0x40(%4) \n" + "vmovdqu %%ymm1,0x60(%4) \n" + "lea 0x20(%0),%0 \n" + "lea 0x80(%4),%4 \n" + "subl $0x10,%5 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_ar64), // %4 +#if defined(__i386__) + "+m"(width) // %5 +#else + "+rm"(width) // %5 +#endif + : "m"(shift), // %6 + "m"(mask), // %7 + "m"(MergeAR64Permute) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_MERGEXR64ROW_AVX2 +void MergeXR64Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + mask = (mask << 16) + mask; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "vmovdqa %7,%%ymm5 \n" + "vmovd %5,%%xmm6 \n" + "vbroadcastss %6,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // R + "vmovdqu (%0,%1),%%ymm1 \n" // G + "vmovdqu (%0,%2),%%ymm2 \n" // B + "vpminuw %%ymm0,%%ymm7,%%ymm0 \n" + "vpminuw %%ymm1,%%ymm7,%%ymm1 \n" + "vpminuw %%ymm2,%%ymm7,%%ymm2 \n" + "vpsllw %%xmm6,%%ymm0,%%ymm0 \n" + "vpsllw %%xmm6,%%ymm1,%%ymm1 \n" + "vpsllw %%xmm6,%%ymm2,%%ymm2 \n" + "vpermd %%ymm0,%%ymm5,%%ymm0 \n" + "vpermd %%ymm1,%%ymm5,%%ymm1 \n" + "vpermd %%ymm2,%%ymm5,%%ymm2 \n" + "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" // A (0xffff) + "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low) + "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi) + "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low) + "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi) + "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1) + "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3) + "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2) + "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4) + "vmovdqu %%ymm3,(%3) \n" + "vmovdqu %%ymm2,0x20(%3) \n" + "vmovdqu %%ymm4,0x40(%3) \n" + "vmovdqu %%ymm1,0x60(%3) \n" + "lea 0x20(%0),%0 \n" + "lea 0x80(%3),%3 \n" + "subl $0x10,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar64), // %3 + "+r"(width) // %4 + : "m"(shift), // %5 + "m"(mask), // %6 + "m"(MergeAR64Permute) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_MERGEARGB16TO8ROW_AVX2 +static const uvec8 MergeARGB16To8Shuffle = {0, 8, 1, 9, 2, 10, 3, 11, + 4, 12, 5, 13, 6, 14, 7, 15}; +void MergeARGB16To8Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width) { + int shift = depth - 8; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + "vbroadcastf128 %7,%%ymm5 \n" + "vmovd %6,%%xmm6 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // R + "vmovdqu (%0,%1),%%ymm1 \n" // G + "vmovdqu (%0,%2),%%ymm2 \n" // B + "vmovdqu (%0,%3),%%ymm3 \n" // A + "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n" + "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n" + "vpsrlw %%xmm6,%%ymm3,%%ymm3 \n" + "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar) + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar) + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave) + "vpshufb 
%%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave) + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low) + "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi) + "vmovdqu %%ymm2,(%4) \n" + "vmovdqu %%ymm0,0x20(%4) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%4),%4 \n" + "subl $0x10,%5 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 +#if defined(__i386__) + "+m"(width) // %5 +#else + "+rm"(width) // %5 +#endif + : "m"(shift), // %6 + "m"(MergeARGB16To8Shuffle) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#ifdef HAS_MERGEXRGB16TO8ROW_AVX2 +void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width) { + int shift = depth - 8; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "vbroadcastf128 %6,%%ymm5 \n" + "vmovd %5,%%xmm6 \n" + "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" + "vpsrlw $8,%%ymm3,%%ymm3 \n" // A (0xff) + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // R + "vmovdqu (%0,%1),%%ymm1 \n" // G + "vmovdqu (%0,%2),%%ymm2 \n" // B + "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n" + "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar) + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar) + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave) + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave) + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low) + "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi) + "vmovdqu %%ymm2,(%3) \n" + "vmovdqu %%ymm0,0x20(%3) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%3),%3 \n" + "subl $0x10,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "m"(shift), // %5 + "m"(MergeARGB16To8Shuffle) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + #ifdef HAS_COPYROW_SSE2 void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "test $0xf,%0 \n" - "jne 2f \n" - "test $0xf,%1 \n" - "jne 2f \n" + "test $0xf,%0 \n" + "jne 2f \n" + "test $0xf,%1 \n" + "jne 2f \n" LABELALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,(%1) \n" - "movdqa %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "jmp 9f \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 9f \n" LABELALIGN "2: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 2b \n" - - LABELALIGN "9: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 2b \n" + + LABELALIGN "9: \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -3814,14 +6170,14 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" 
- "lea 0x40(%1),%1 \n" - "sub $0x40,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x40,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -3836,7 +6192,7 @@ void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { size_t width_tmp = (size_t)(width); asm volatile( - "rep movsb \n" + "rep movsb \n" : "+S"(src), // %0 "+D"(dst), // %1 "+c"(width_tmp) // %2 @@ -3849,29 +6205,29 @@ void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { // width in pixels void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm2 \n" - "movdqu 0x10(%0),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "movdqu (%1),%%xmm4 \n" - "movdqu 0x10(%1),%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand %%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm2 \n" + "movdqu 0x10(%0),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "movdqu (%1),%%xmm4 \n" + "movdqu 0x10(%1),%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -3884,21 +6240,21 @@ void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { // width in pixels void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm1 \n" - "vmovdqu 0x20(%0),%%ymm2 \n" - "lea 0x40(%0),%0 \n" - "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" - "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm1,(%1) \n" - "vmovdqu %%ymm2,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm1 \n" + "vmovdqu 0x20(%0),%%ymm2 \n" + "lea 0x40(%0),%0 \n" + "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -3917,17 +6273,17 @@ void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, LABELALIGN "1: \n" - "movdqu (%0), %%xmm0 \n" - "movdqu 0x10(%0), %%xmm1 \n" - "lea 0x20(%0), %0 \n" - "psrld $0x18, %%xmm0 \n" - "psrld $0x18, %%xmm1 \n" - "packssdw %%xmm1, %%xmm0 \n" - "packuswb %%xmm0, %%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1), %1 \n" - "sub $0x8, %2 \n" - "jg 1b \n" + "movdqu (%0), %%xmm0 \n" + "movdqu 0x10(%0), %%xmm1 \n" + "lea 0x20(%0), %0 \n" + "psrld $0x18, %%xmm0 \n" + "psrld $0x18, %%xmm1 \n" + "packssdw %%xmm1, %%xmm0 \n" + "packuswb %%xmm0, %%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1), %1 \n" + "sub $0x8, %2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_a), // %1 "+rm"(width) // %2 
@@ -3945,28 +6301,28 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, uint8_t* dst_a, int width) { asm volatile( - "vmovdqa %3,%%ymm4 \n" + "vmovdqa %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" LABELALIGN "1: \n" - "vmovdqu (%0), %%ymm0 \n" - "vmovdqu 0x20(%0), %%ymm1 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu 0x40(%0), %%ymm2 \n" - "vmovdqu 0x60(%0), %%ymm3 \n" - "lea 0x80(%0), %0 \n" - "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates - "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" - "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. - "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate. - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20, %2 \n" - "jg 1b \n" + "vmovdqu (%0), %%ymm0 \n" + "vmovdqu 0x20(%0), %%ymm1 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu 0x40(%0), %%ymm2 \n" + "vmovdqu 0x60(%0), %%ymm3 \n" + "lea 0x80(%0), %0 \n" + "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate. + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20, %2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_a), // %1 @@ -3981,31 +6337,31 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, // width in pixels void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" LABELALIGN "1: \n" - "movq (%0),%%xmm2 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpckhwd %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "movdqu (%1),%%xmm4 \n" - "movdqu 0x10(%1),%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand %%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movq (%0),%%xmm2 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpckhwd %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "movdqu (%1),%%xmm4 \n" + "movdqu 0x10(%1),%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -4018,23 +6374,23 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { // width in pixels void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" LABELALIGN "1: \n" - "vpmovzxbd (%0),%%ymm1 \n" - "vpmovzxbd 0x8(%0),%%ymm2 \n" - "lea 0x10(%0),%0 \n" - "vpslld $0x18,%%ymm1,%%ymm1 \n" - "vpslld $0x18,%%ymm2,%%ymm2 \n" - "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" - "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm1,(%1) \n" - "vmovdqu %%ymm2,0x20(%1) \n" - 
"lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "vpmovzxbd (%0),%%ymm1 \n" + "vpmovzxbd 0x8(%0),%%ymm2 \n" + "lea 0x10(%0),%0 \n" + "vpslld $0x18,%%ymm1,%%ymm1 \n" + "vpslld $0x18,%%ymm2,%%ymm2 \n" + "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -4050,7 +6406,7 @@ void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. asm volatile( - "rep stosl \n" + "rep stosl \n" : "+D"(dst), // %0 "+c"(width_tmp) // %1 : "a"(v32) // %2 @@ -4061,7 +6417,7 @@ void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width); asm volatile( - "rep stosb \n" + "rep stosb \n" : "+D"(dst), // %0 "+c"(width_tmp) // %1 : "a"(v8) // %2 @@ -4072,7 +6428,7 @@ void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { size_t width_tmp = (size_t)(width); asm volatile( - "rep stosl \n" + "rep stosl \n" : "+D"(dst_argb), // %0 "+c"(width_tmp) // %1 : "a"(v32) // %2 @@ -4083,21 +6439,21 @@ void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { #ifdef HAS_YUY2TOYROW_SSE2 void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -4111,32 +6467,32 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_v, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -4150,28 +6506,28 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, uint8_t* dst_v, int 
width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -4185,16 +6541,16 @@ void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -4208,32 +6564,32 @@ void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_v, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -4247,28 +6603,28 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, uint8_t* dst_v, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - 
"packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -4281,22 +6637,22 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, #ifdef HAS_YUY2TOYROW_AVX2 void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 @@ -4311,32 +6667,32 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_v, int width) { asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm1,(%1) \n" "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 @@ -4351,30 +6707,30 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, uint8_t* dst_v, int width) { asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw 
$0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm1,(%1) \n" "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 @@ -4389,17 +6745,17 @@ void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 @@ -4413,32 +6769,32 @@ void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_v, int width) { asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm1,(%1) \n" "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub 
$0x20,%3 \n" - "jg 1b \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 @@ -4453,30 +6809,30 @@ void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, uint8_t* dst_v, int width) { asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm1,(%1) \n" "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 @@ -4493,78 +6849,78 @@ static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; // Blend 8 pixels at a time -void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, +void ARGBBlendRow_SSSE3(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { asm volatile( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0xf,%%xmm7 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0xf,%%xmm7 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" // 4 pixel loop. 
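// In scalar terms, the 4 pixel loop below is an "over" composite with a
// premultiplied foreground (reference sketch only, per B/G/R byte):
//   uint32_t a = src_argb[3];
//   uint32_t b = src_argb[0] + ((src_argb1[0] * (256 - a)) >> 8);
//   dst_argb[0] = (uint8_t)(b > 255 ? 255 : b);
//   dst_argb[3] = 255;  // alpha saturates to fully opaque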
LABELALIGN "40: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movdqu (%1),%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqu (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" "49: \n" - "add $0x3,%3 \n" - "jl 99f \n" + "add $0x3,%3 \n" + "jl 99f \n" // 1 pixel loop. "91: \n" - "movd (%0),%%xmm3 \n" - "lea 0x4(%0),%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movd (%1),%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movd (%1),%%xmm1 \n" - "lea 0x4(%1),%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" - "sub $0x1,%3 \n" - "jge 91b \n" + "movd (%0),%%xmm3 \n" + "lea 0x4(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd (%1),%%xmm1 \n" + "lea 0x4(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 91b \n" "99: \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -4586,36 +6942,36 @@ void BlendPlaneRow_SSSE3(const uint8_t* src0, uint8_t* dst, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "mov $0x807f807f,%%eax \n" - "movd %%eax,%%xmm7 \n" - "pshufd $0x0,%%xmm7,%%xmm7 \n" - "sub %2,%0 \n" - "sub %2,%1 \n" - "sub %2,%3 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "mov $0x807f807f,%%eax \n" + "movd %%eax,%%xmm7 \n" + "pshufd $0x0,%%xmm7,%%xmm7 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" // 8 pixel loop. 
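// Scalar equivalent of the 8 pixel loop below; the +255 term matches
// the rounding that the 0x807f bias gives the pmaddubsw path:
//   dst[x] = (uint8_t)((src0[x] * alpha[x] +
//                       src1[x] * (255 - alpha[x]) + 255) >> 8);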
LABELALIGN "1: \n" - "movq (%2),%%xmm0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm0 \n" - "movq (%0,%2,1),%%xmm1 \n" - "movq (%1,%2,1),%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm1 \n" - "psubb %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "paddw %%xmm7,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%3,%2,1) \n" - "lea 0x8(%2),%2 \n" - "sub $0x8,%4 \n" - "jg 1b \n" + "movq (%2),%%xmm0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm0 \n" + "movq (%0,%2,1),%%xmm1 \n" + "movq (%1,%2,1),%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm1 \n" + "psubb %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "paddw %%xmm7,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%3,%2,1) \n" + "lea 0x8(%2),%2 \n" + "sub $0x8,%4 \n" + "jg 1b \n" : "+r"(src0), // %0 "+r"(src1), // %1 "+r"(alpha), // %2 @@ -4638,43 +6994,43 @@ void BlendPlaneRow_AVX2(const uint8_t* src0, uint8_t* dst, int width) { asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsllw $0x8,%%ymm5,%%ymm5 \n" - "mov $0x80808080,%%eax \n" - "vmovd %%eax,%%xmm6 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsllw $0x8,%%ymm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm6 \n" "vbroadcastss %%xmm6,%%ymm6 \n" - "mov $0x807f807f,%%eax \n" - "vmovd %%eax,%%xmm7 \n" + "mov $0x807f807f,%%eax \n" + "vmovd %%eax,%%xmm7 \n" "vbroadcastss %%xmm7,%%ymm7 \n" - "sub %2,%0 \n" - "sub %2,%1 \n" - "sub %2,%3 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" // 32 pixel loop. LABELALIGN "1: \n" - "vmovdqu (%2),%%ymm0 \n" - "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpxor %%ymm5,%%ymm3,%%ymm3 \n" - "vpxor %%ymm5,%%ymm0,%%ymm0 \n" - "vmovdqu (%0,%2,1),%%ymm1 \n" - "vmovdqu (%1,%2,1),%%ymm2 \n" - "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" - "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" - "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" - "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm3,%%ymm3 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%3,%2,1) \n" - "lea 0x20(%2),%2 \n" - "sub $0x20,%4 \n" - "jg 1b \n" + "vmovdqu (%2),%%ymm0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm5,%%ymm3,%%ymm3 \n" + "vpxor %%ymm5,%%ymm0,%%ymm0 \n" + "vmovdqu (%0,%2,1),%%ymm1 \n" + "vmovdqu (%1,%2,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" + "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm3,%%ymm3 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%3,%2,1) \n" + "lea 0x20(%2),%2 \n" + "sub $0x20,%4 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src0), // %0 "+r"(src1), // %1 @@ -4688,7 +7044,7 @@ void BlendPlaneRow_AVX2(const uint8_t* src0, #endif // HAS_BLENDPLANEROW_AVX2 #ifdef HAS_ARGBATTENUATEROW_SSSE3 -// Shuffle table duplicating alpha +// Shuffle table duplicating alpha. 
static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u}; static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, @@ -4698,35 +7054,35 @@ void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "pcmpeqb %%xmm3,%%xmm3 \n" - "pslld $0x18,%%xmm3 \n" - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "pslld $0x18,%%xmm3 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" // 4 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "punpcklbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm1,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "punpckhbw %%xmm2,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "pand %%xmm3,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpcklbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm1,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "punpckhbw %%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "pand %%xmm3,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -4747,29 +7103,29 @@ void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, int width) { asm volatile( "vbroadcastf128 %3,%%ymm4 \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpslld $0x18,%%ymm5,%%ymm5 \n" - "sub %0,%1 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpslld $0x18,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" // 8 pixel loop. LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" - "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" - "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpand %%ymm5,%%ymm6,%%ymm6 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpor %%ymm6,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%0,%1,1) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm6,%%ymm6 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpor %%ymm6,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -4789,32 +7145,32 @@ void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, // 4 pixel loop. 
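// The 4 pixel loop below undoes premultiplication, roughly
//   dst = min(255, src * 255 / a)  for B, G and R, alpha unchanged,
// trading the divide for a lookup in the reciprocal table passed as %4
// (fixed_invtbl8 in row_common.cc) plus 16-bit multiplies.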
LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movzb 0x03(%0),%3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x07(%0),%3 \n" - "movd 0x00(%4,%3,4),%%xmm3 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "movzb 0x0b(%0),%3 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x0f(%0),%3 \n" - "movd 0x00(%4,%3,4),%%xmm3 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movzb 0x03(%0),%3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x07(%0),%3 \n" + "movd 0x00(%4,%3,4),%%xmm3 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "movzb 0x0b(%0),%3 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x0f(%0),%3 \n" + "movd 0x00(%4,%3,4),%%xmm3 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width), // %2 @@ -4834,52 +7190,52 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, int width) { uintptr_t alpha; asm volatile( - "sub %0,%1 \n" + "sub %0,%1 \n" "vbroadcastf128 %5,%%ymm5 \n" // 8 pixel loop. LABELALIGN "1: \n" // replace VPGATHER - "movzb 0x03(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm0 \n" - "movzb 0x07(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm1 \n" - "movzb 0x0b(%0),%3 \n" - "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" - "vmovd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x0f(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm3 \n" - "movzb 0x13(%0),%3 \n" - "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" - "vmovd 0x00(%4,%3,4),%%xmm0 \n" - "movzb 0x17(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm1 \n" - "movzb 0x1b(%0),%3 \n" - "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" - "vmovd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x1f(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm3 \n" - "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" + "movzb 0x03(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm0 \n" + "movzb 0x07(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm1 \n" + "movzb 0x0b(%0),%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" + "vmovd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x0f(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm3 \n" + "movzb 0x13(%0),%3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" + "vmovd 0x00(%4,%3,4),%%xmm0 \n" + "movzb 0x17(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm1 \n" + "movzb 0x1b(%0),%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" + "vmovd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x1f(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" // end of VPGATHER - "vmovdqu (%0),%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" - "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" - "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" - "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 
\n" - "vmovdqu %%ymm0,0x00(%0,%1,1) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" + "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -4896,44 +7252,48 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" // 8 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "phaddw %%xmm1,%%xmm0 \n" - "paddw %%xmm5,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu 0x10(%0),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "psrld $0x18,%%xmm2 \n" - "psrld $0x18,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpcklbw %%xmm2,%%xmm3 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm3,%%xmm0 \n" - "punpckhwd %%xmm3,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psubb %%xmm5,%%xmm0 \n" + "psubb %%xmm5,%%xmm1 \n" + "movdqu %%xmm4,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "movdqu %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "phaddw %%xmm0,%%xmm6 \n" + "paddw %%xmm5,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu 0x10(%0),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "psrld $0x18,%%xmm2 \n" + "psrld $0x18,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movdqa %%xmm6,%%xmm3 \n" + "punpcklbw %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm2,%%xmm3 \n" + "movdqa %%xmm6,%%xmm1 \n" + "punpcklwd %%xmm3,%%xmm6 \n" + "punpckhwd %%xmm3,%%xmm1 \n" + "movdqu %%xmm6,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); + "m"(kSub128) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBGRAYROW_SSSE3 @@ -4954,50 +7314,50 @@ static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { asm volatile( - "movdqa %2,%%xmm2 \n" - "movdqa %3,%%xmm3 \n" - "movdqa %4,%%xmm4 \n" + "movdqa %2,%%xmm2 \n" + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" // 8 pixel loop. 
LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm6 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm6 \n" - "phaddw %%xmm6,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movdqu (%0),%%xmm5 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm5 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "movdqu (%0),%%xmm5 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm5 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psrld $0x18,%%xmm6 \n" - "psrld $0x18,%%xmm1 \n" - "packuswb %%xmm1,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm5 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "punpckhwd %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%1 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm6 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm6 \n" + "phaddw %%xmm6,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqu (%0),%%xmm5 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm5 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqu (%0),%%xmm5 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm5 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psrld $0x18,%%xmm6 \n" + "psrld $0x18,%%xmm1 \n" + "packuswb %%xmm1,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm5 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "punpckhwd %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%1 \n" + "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : "m"(kARGBToSepiaB), // %2 @@ -5015,54 +7375,54 @@ void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, const int8_t* matrix_argb, int width) { asm volatile( - "movdqu (%3),%%xmm5 \n" - "pshufd $0x00,%%xmm5,%%xmm2 \n" - "pshufd $0x55,%%xmm5,%%xmm3 \n" - "pshufd $0xaa,%%xmm5,%%xmm4 \n" - "pshufd $0xff,%%xmm5,%%xmm5 \n" + "movdqu (%3),%%xmm5 \n" + "pshufd $0x00,%%xmm5,%%xmm2 \n" + "pshufd $0x55,%%xmm5,%%xmm3 \n" + "pshufd $0xaa,%%xmm5,%%xmm4 \n" + "pshufd $0xff,%%xmm5,%%xmm5 \n" // 8 pixel loop. 
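// Per output channel c, with m = matrix_argb viewed as four rows of
// four signed taps, the loop below computes
//   out[c] = clamp8((B * m[c*4+0] + G * m[c*4+1] +
//                    R * m[c*4+2] + A * m[c*4+3]) >> 6);
// where clamp8 is shorthand for saturation to [0, 255], not a libyuv
// helper.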
LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm7 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddsw %%xmm7,%%xmm0 \n" - "phaddsw %%xmm1,%%xmm6 \n" - "psraw $0x6,%%xmm0 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm1 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm6 \n" - "psraw $0x6,%%xmm1 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm1,%%xmm1 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "punpcklwd %%xmm1,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm6 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm6,0x10(%1) \n" - "lea 0x20(%0),%0 \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm7 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddsw %%xmm7,%%xmm0 \n" + "phaddsw %%xmm1,%%xmm6 \n" + "psraw $0x6,%%xmm0 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm1 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm6 \n" + "psraw $0x6,%%xmm1 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm1,%%xmm1 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "punpcklwd %%xmm1,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm6 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm6,0x10(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -5080,40 +7440,40 @@ void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, int interval_offset, int width) { asm volatile( - "movd %2,%%xmm2 \n" - "movd %3,%%xmm3 \n" - "movd %4,%%xmm4 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshufd $0x44,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "pshufd $0x44,%%xmm3,%%xmm3 \n" - "pshuflw $0x40,%%xmm4,%%xmm4 \n" - "pshufd $0x44,%%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "pslld $0x18,%%xmm6 \n" + "movd %2,%%xmm2 \n" + "movd %3,%%xmm3 \n" + "movd %4,%%xmm4 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshufd $0x44,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "pshufd $0x44,%%xmm3,%%xmm3 \n" + "pshuflw $0x40,%%xmm4,%%xmm4 \n" + "pshufd $0x44,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "pslld $0x18,%%xmm6 \n" // 4 pixel loop. 
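// Scalar form of the quantize loop below, applied to B, G and R while
// alpha is preserved through the 0xff000000 mask in %%xmm6:
//   dst = (uint8_t)(((src * scale) >> 16) * interval_size + interval_offset);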
LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "pmullw %%xmm3,%%xmm0 \n" - "movdqu (%0),%%xmm7 \n" - "pmullw %%xmm3,%%xmm1 \n" - "pand %%xmm6,%%xmm7 \n" - "paddw %%xmm4,%%xmm0 \n" - "paddw %%xmm4,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqu %%xmm0,(%0) \n" - "lea 0x10(%0),%0 \n" - "sub $0x4,%1 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "pmullw %%xmm3,%%xmm0 \n" + "movdqu (%0),%%xmm7 \n" + "pmullw %%xmm3,%%xmm1 \n" + "pand %%xmm6,%%xmm7 \n" + "paddw %%xmm4,%%xmm0 \n" + "paddw %%xmm4,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqu %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x4,%1 \n" + "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : "r"(scale), // %2 @@ -5131,27 +7491,27 @@ void ARGBShadeRow_SSE2(const uint8_t* src_argb, int width, uint32_t value) { asm volatile( - "movd %3,%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm2 \n" + "movd %3,%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm2 \n" // 4 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -5162,35 +7522,35 @@ void ARGBShadeRow_SSE2(const uint8_t* src_argb, #ifdef HAS_ARGBMULTIPLYROW_SSE2 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, +void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { asm volatile( - "pxor %%xmm5,%%xmm5 \n" + "pxor %%xmm5,%%xmm5 \n" // 4 pixel loop. 
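// The multiply loop below approximates dst = src * src1 / 255 per
// byte: widening a byte s to the word s * 257 and keeping pmulhuw's
// high half yields (s * 257 * t) >> 16, within one step of s * t / 255.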
LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm2 \n" - "lea 0x10(%1),%1 \n" - "movdqu %%xmm0,%%xmm1 \n" - "movdqu %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqu %%xmm0,%%xmm1 \n" + "movdqu %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -5201,50 +7561,45 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, +void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { asm volatile( - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // 4 pixel loop. LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm1 \n" - "lea 0x20(%0),%0 \n" - "vmovdqu (%1),%%ymm3 \n" - "lea 0x20(%1),%1 \n" - "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" - "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm1 \n" + "lea 0x20(%0),%0 \n" + "vmovdqu (%1),%%ymm3 \n" + "lea 0x20(%1),%1 \n" + "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" + "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : - : "memory", "cc" -#if defined(__AVX2__) - , - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif - ); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_ARGBMULTIPLYROW_AVX2 #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBAddRow_SSE2(const uint8_t* src_argb0, +void ARGBAddRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -5252,16 +7607,16 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb0, // 4 pixel loop. 
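// paddusb below is a per-byte saturating add, for every channel
// including alpha:  dst = min(255, src + src1).  The AVX2 variant and
// the ARGBSubtract rows that follow are the same pattern with 32-byte
// registers and psubusb, i.e. dst = max(0, src - src1).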
LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -5272,7 +7627,7 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBAddRow_AVX2(const uint8_t* src_argb0, +void ARGBAddRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -5280,16 +7635,16 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb0, // 4 pixel loop. LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "lea 0x20(%0),%0 \n" - "vpaddusb (%1),%%ymm0,%%ymm0 \n" - "lea 0x20(%1),%1 \n" - "vmovdqu %%ymm0,(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vpaddusb (%1),%%ymm0,%%ymm0 \n" + "lea 0x20(%1),%1 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -5300,7 +7655,7 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb0, #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels, 4 pixels at a time. -void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, +void ARGBSubtractRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -5308,16 +7663,16 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, // 4 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "psubusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psubusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -5328,7 +7683,7 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, +void ARGBSubtractRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -5336,16 +7691,16 @@ void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, // 4 pixel loop. 
LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "lea 0x20(%0),%0 \n" - "vpsubusb (%1),%%ymm0,%%ymm0 \n" - "lea 0x20(%1),%1 \n" - "vmovdqu %%ymm0,(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vpsubusb (%1),%%ymm0,%%ymm0 \n" + "lea 0x20(%1),%1 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -5365,40 +7720,40 @@ void SobelXRow_SSE2(const uint8_t* src_y0, uint8_t* dst_sobelx, int width) { asm volatile( - "sub %0,%1 \n" - "sub %0,%2 \n" - "sub %0,%3 \n" - "pxor %%xmm5,%%xmm5 \n" + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + "pxor %%xmm5,%%xmm5 \n" // 8 pixel loop. LABELALIGN "1: \n" - "movq (%0),%%xmm0 \n" - "movq 0x2(%0),%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - "movq 0x00(%0,%1,1),%%xmm1 \n" - "movq 0x02(%0,%1,1),%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - "movq 0x00(%0,%2,1),%%xmm2 \n" - "movq 0x02(%0,%2,1),%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,0x00(%0,%3,1) \n" - "lea 0x8(%0),%0 \n" - "sub $0x8,%4 \n" - "jg 1b \n" + "movq (%0),%%xmm0 \n" + "movq 0x2(%0),%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + "movq 0x00(%0,%1,1),%%xmm1 \n" + "movq 0x02(%0,%1,1),%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + "movq 0x00(%0,%2,1),%%xmm2 \n" + "movq 0x02(%0,%2,1),%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x00(%0,%3,1) \n" + "lea 0x8(%0),%0 \n" + "sub $0x8,%4 \n" + "jg 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(src_y2), // %2 @@ -5419,39 +7774,39 @@ void SobelYRow_SSE2(const uint8_t* src_y0, uint8_t* dst_sobely, int width) { asm volatile( - "sub %0,%1 \n" - "sub %0,%2 \n" - "pxor %%xmm5,%%xmm5 \n" + "sub %0,%1 \n" + "sub %0,%2 \n" + "pxor %%xmm5,%%xmm5 \n" // 8 pixel loop. 
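// Scalar form of the SobelY kernel below, with y0/y1 the rows above
// and below:
//   int g = (y0[i] - y1[i]) + 2 * (y0[i+1] - y1[i+1]) + (y0[i+2] - y1[i+2]);
//   int m = g < 0 ? -g : g;
//   dst_sobely[i] = (uint8_t)(m > 255 ? 255 : m);
// SobelXRow above is the transposed kernel, differencing columns
// across three source rows.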
LABELALIGN "1: \n" - "movq (%0),%%xmm0 \n" - "movq 0x00(%0,%1,1),%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - "movq 0x1(%0),%%xmm1 \n" - "movq 0x01(%0,%1,1),%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - "movq 0x2(%0),%%xmm2 \n" - "movq 0x02(%0,%1,1),%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,0x00(%0,%2,1) \n" - "lea 0x8(%0),%0 \n" - "sub $0x8,%3 \n" - "jg 1b \n" + "movq (%0),%%xmm0 \n" + "movq 0x00(%0,%1,1),%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + "movq 0x1(%0),%%xmm1 \n" + "movq 0x01(%0,%1,1),%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + "movq 0x2(%0),%%xmm2 \n" + "movq 0x02(%0,%1,1),%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x00(%0,%2,1) \n" + "lea 0x8(%0),%0 \n" + "sub $0x8,%3 \n" + "jg 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(dst_sobely), // %2 @@ -5472,37 +7827,37 @@ void SobelRow_SSE2(const uint8_t* src_sobelx, uint8_t* dst_argb, int width) { asm volatile( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" // 8 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm2 \n" - "punpckhbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm1 \n" - "punpckhwd %%xmm2,%%xmm2 \n" - "por %%xmm5,%%xmm1 \n" - "por %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklwd %%xmm0,%%xmm3 \n" - "punpckhwd %%xmm0,%%xmm0 \n" - "por %%xmm5,%%xmm3 \n" - "por %%xmm5,%%xmm0 \n" - "movdqu %%xmm1,(%2) \n" - "movdqu %%xmm2,0x10(%2) \n" - "movdqu %%xmm3,0x20(%2) \n" - "movdqu %%xmm0,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm2 \n" + "punpckhbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm1 \n" + "punpckhwd %%xmm2,%%xmm2 \n" + "por %%xmm5,%%xmm1 \n" + "por %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklwd %%xmm0,%%xmm3 \n" + "punpckhwd %%xmm0,%%xmm0 \n" + "por %%xmm5,%%xmm3 \n" + "por %%xmm5,%%xmm0 \n" + "movdqu %%xmm1,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "movdqu %%xmm3,0x20(%2) \n" + "movdqu %%xmm0,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 @@ -5519,21 +7874,21 @@ void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, uint8_t* dst_y, int width) { asm volatile( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" // 8 pixel loop. 
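// SobelToPlane below is simply the saturating sum of the two gradient
// planes, 16 bytes per iteration:
//   dst_y[i] = min(255, sobelx[i] + sobely[i]);
// SobelRow above forms the same sum, then replicates each byte into a
// grey ARGB pixel with alpha = 255.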
LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_y), // %2 @@ -5554,36 +7909,36 @@ void SobelXYRow_SSE2(const uint8_t* src_sobelx, uint8_t* dst_argb, int width) { asm volatile( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // 8 pixel loop. LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "paddusb %%xmm1,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "punpckhbw %%xmm5,%%xmm0 \n" - "movdqa %%xmm1,%%xmm4 \n" - "punpcklbw %%xmm2,%%xmm4 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqa %%xmm4,%%xmm6 \n" - "punpcklwd %%xmm3,%%xmm6 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "movdqa %%xmm1,%%xmm7 \n" - "punpcklwd %%xmm0,%%xmm7 \n" - "punpckhwd %%xmm0,%%xmm1 \n" - "movdqu %%xmm6,(%2) \n" - "movdqu %%xmm4,0x10(%2) \n" - "movdqu %%xmm7,0x20(%2) \n" - "movdqu %%xmm1,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "paddusb %%xmm1,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "punpckhbw %%xmm5,%%xmm0 \n" + "movdqa %%xmm1,%%xmm4 \n" + "punpcklbw %%xmm2,%%xmm4 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqa %%xmm4,%%xmm6 \n" + "punpcklwd %%xmm3,%%xmm6 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "movdqa %%xmm1,%%xmm7 \n" + "punpcklwd %%xmm0,%%xmm7 \n" + "punpckhwd %%xmm0,%%xmm1 \n" + "movdqu %%xmm6,(%2) \n" + "movdqu %%xmm4,0x10(%2) \n" + "movdqu %%xmm7,0x20(%2) \n" + "movdqu %%xmm1,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 @@ -5602,67 +7957,67 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, const int32_t* previous_cumsum, int width) { asm volatile( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "test $0xf,%1 \n" - "jne 49f \n" + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "test $0xf,%1 \n" + "jne 49f \n" // 4 pixel loop. 
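// Reference version of the cumulative sum below, one int32 accumulator
// per channel (the "test $0xf" above diverts unaligned output to the
// 1 pixel loop):
//   sum[c] += row[x * 4 + c];
//   cumsum[x * 4 + c] = previous_cumsum[x * 4 + c] + sum[c];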
LABELALIGN "40: \n" - "movdqu (%0),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm3 \n" - "punpckhbw %%xmm1,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "punpcklwd %%xmm1,%%xmm4 \n" - "punpckhwd %%xmm1,%%xmm5 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu (%2),%%xmm2 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm0 \n" - "movdqu 0x10(%2),%%xmm3 \n" - "paddd %%xmm0,%%xmm3 \n" - "paddd %%xmm4,%%xmm0 \n" - "movdqu 0x20(%2),%%xmm4 \n" - "paddd %%xmm0,%%xmm4 \n" - "paddd %%xmm5,%%xmm0 \n" - "movdqu 0x30(%2),%%xmm5 \n" - "lea 0x40(%2),%2 \n" - "paddd %%xmm0,%%xmm5 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "movdqu %%xmm4,0x20(%1) \n" - "movdqu %%xmm5,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x4,%3 \n" - "jge 40b \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm3 \n" + "punpckhbw %%xmm1,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "punpcklwd %%xmm1,%%xmm4 \n" + "punpckhwd %%xmm1,%%xmm5 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%2),%%xmm2 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm0 \n" + "movdqu 0x10(%2),%%xmm3 \n" + "paddd %%xmm0,%%xmm3 \n" + "paddd %%xmm4,%%xmm0 \n" + "movdqu 0x20(%2),%%xmm4 \n" + "paddd %%xmm0,%%xmm4 \n" + "paddd %%xmm5,%%xmm0 \n" + "movdqu 0x30(%2),%%xmm5 \n" + "lea 0x40(%2),%2 \n" + "paddd %%xmm0,%%xmm5 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "movdqu %%xmm4,0x20(%1) \n" + "movdqu %%xmm5,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x4,%3 \n" + "jge 40b \n" "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" + "add $0x3,%3 \n" + "jl 19f \n" // 1 pixel loop. LABELALIGN "10: \n" - "movd (%0),%%xmm2 \n" - "lea 0x4(%0),%0 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu (%2),%%xmm2 \n" - "lea 0x10(%2),%2 \n" - "paddd %%xmm0,%%xmm2 \n" - "movdqu %%xmm2,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x1,%3 \n" - "jge 10b \n" + "movd (%0),%%xmm2 \n" + "lea 0x4(%0),%0 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%2),%%xmm2 \n" + "lea 0x10(%2),%2 \n" + "paddd %%xmm0,%%xmm2 \n" + "movdqu %%xmm2,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x1,%3 \n" + "jge 10b \n" "19: \n" : "+r"(row), // %0 @@ -5682,119 +8037,119 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, uint8_t* dst, int count) { asm volatile( - "movd %5,%%xmm5 \n" - "cvtdq2ps %%xmm5,%%xmm5 \n" - "rcpss %%xmm5,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "cmpl $0x80,%5 \n" - "ja 40f \n" - - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrld $0x10,%%xmm6 \n" - "cvtdq2ps %%xmm6,%%xmm6 \n" - "addps %%xmm6,%%xmm5 \n" - "mulps %%xmm4,%%xmm5 \n" - "cvtps2dq %%xmm5,%%xmm5 \n" - "packssdw %%xmm5,%%xmm5 \n" + "movd %5,%%xmm5 \n" + "cvtdq2ps %%xmm5,%%xmm5 \n" + "rcpss %%xmm5,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "cmpl $0x80,%5 \n" + "ja 40f \n" + + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrld $0x10,%%xmm6 \n" + "cvtdq2ps %%xmm6,%%xmm6 \n" + "addps %%xmm6,%%xmm5 \n" + "mulps %%xmm4,%%xmm5 \n" + "cvtps2dq %%xmm5,%%xmm5 \n" + "packssdw %%xmm5,%%xmm5 \n" // 4 pixel small loop. 
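// Box-filter average from the integral image, in scalar form (off is
// the precomputed offset of the box's far edge, area its pixel count):
//   int32_t s = topleft[i] - topleft[i + off] - botleft[i] + botleft[i + off];
//   dst[i] = (uint8_t)(s / area);
// The small loop below serves area <= 128, where a 16-bit reciprocal
// multiply is accurate; "ja 40f" above routes larger areas to floats.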
LABELALIGN "4: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "psubd 0x10(%0,%4,4),%%xmm1 \n" - "psubd 0x20(%0,%4,4),%%xmm2 \n" - "psubd 0x30(%0,%4,4),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "psubd 0x10(%1),%%xmm1 \n" - "psubd 0x20(%1),%%xmm2 \n" - "psubd 0x30(%1),%%xmm3 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "paddd 0x10(%1,%4,4),%%xmm1 \n" - "paddd 0x20(%1,%4,4),%%xmm2 \n" - "paddd 0x30(%1,%4,4),%%xmm3 \n" - "lea 0x40(%1),%1 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 4b \n" - "jmp 49f \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 4b \n" + "jmp 49f \n" // 4 pixel loop LABELALIGN "40: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "psubd 0x10(%0,%4,4),%%xmm1 \n" - "psubd 0x20(%0,%4,4),%%xmm2 \n" - "psubd 0x30(%0,%4,4),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "psubd 0x10(%1),%%xmm1 \n" - "psubd 0x20(%1),%%xmm2 \n" - "psubd 0x30(%1),%%xmm3 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "paddd 0x10(%1,%4,4),%%xmm1 \n" - "paddd 0x20(%1,%4,4),%%xmm2 \n" - "paddd 0x30(%1,%4,4),%%xmm3 \n" - "lea 0x40(%1),%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm1,%%xmm1 \n" - "mulps %%xmm4,%%xmm0 \n" - "mulps %%xmm4,%%xmm1 \n" - "cvtdq2ps %%xmm2,%%xmm2 \n" - "cvtdq2ps %%xmm3,%%xmm3 \n" - "mulps %%xmm4,%%xmm2 \n" - "mulps %%xmm4,%%xmm3 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "cvtps2dq %%xmm1,%%xmm1 \n" - "cvtps2dq %%xmm2,%%xmm2 \n" - "cvtps2dq %%xmm3,%%xmm3 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm1,%%xmm1 \n" + "mulps %%xmm4,%%xmm0 \n" + "mulps %%xmm4,%%xmm1 \n" + "cvtdq2ps %%xmm2,%%xmm2 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "cvtps2dq %%xmm1,%%xmm1 \n" + "cvtps2dq 
%%xmm2,%%xmm2 \n" + "cvtps2dq %%xmm3,%%xmm3 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" + "add $0x3,%3 \n" + "jl 19f \n" // 1 pixel loop LABELALIGN "10: \n" - "movdqu (%0),%%xmm0 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "mulps %%xmm4,%%xmm0 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" - "sub $0x1,%3 \n" - "jge 10b \n" + "movdqu (%0),%%xmm0 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "mulps %%xmm4,%%xmm0 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 10b \n" "19: \n" : "+r"(topleft), // %0 "+r"(botleft), // %1 @@ -5817,70 +8172,70 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb, intptr_t src_argb_stride_temp = src_argb_stride; intptr_t temp; asm volatile( - "movq (%3),%%xmm2 \n" - "movq 0x08(%3),%%xmm7 \n" - "shl $0x10,%1 \n" - "add $0x4,%1 \n" - "movd %1,%%xmm5 \n" - "sub $0x4,%4 \n" - "jl 49f \n" - - "pshufd $0x44,%%xmm7,%%xmm7 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "movdqa %%xmm2,%%xmm0 \n" - "addps %%xmm7,%%xmm0 \n" - "movlhps %%xmm0,%%xmm2 \n" - "movdqa %%xmm7,%%xmm4 \n" - "addps %%xmm4,%%xmm4 \n" - "movdqa %%xmm2,%%xmm3 \n" - "addps %%xmm4,%%xmm3 \n" - "addps %%xmm4,%%xmm4 \n" + "movq (%3),%%xmm2 \n" + "movq 0x08(%3),%%xmm7 \n" + "shl $0x10,%1 \n" + "add $0x4,%1 \n" + "movd %1,%%xmm5 \n" + "sub $0x4,%4 \n" + "jl 49f \n" + + "pshufd $0x44,%%xmm7,%%xmm7 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "movdqa %%xmm2,%%xmm0 \n" + "addps %%xmm7,%%xmm0 \n" + "movlhps %%xmm0,%%xmm2 \n" + "movdqa %%xmm7,%%xmm4 \n" + "addps %%xmm4,%%xmm4 \n" + "movdqa %%xmm2,%%xmm3 \n" + "addps %%xmm4,%%xmm3 \n" + "addps %%xmm4,%%xmm4 \n" // 4 pixel loop LABELALIGN "40: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2 - "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2 - "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts - "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd 0x00(%0,%1,1),%%xmm1 \n" - "movd 0x00(%0,%5,1),%%xmm6 \n" - "punpckldq %%xmm6,%%xmm1 \n" - "addps %%xmm4,%%xmm2 \n" - "movq %%xmm1,(%2) \n" - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - "movd 0x00(%0,%1,1),%%xmm0 \n" - "movd 0x00(%0,%5,1),%%xmm6 \n" - "punpckldq %%xmm6,%%xmm0 \n" - "addps %%xmm4,%%xmm3 \n" - "movq %%xmm0,0x08(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" + "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2 + "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2 + "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts + "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd 0x00(%0,%1,1),%%xmm1 \n" + "movd 0x00(%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm1 \n" + "addps %%xmm4,%%xmm2 \n" + "movq %%xmm1,(%2) \n" + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + 
"movd 0x00(%0,%1,1),%%xmm0 \n" + "movd 0x00(%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm0 \n" + "addps %%xmm4,%%xmm3 \n" + "movq %%xmm0,0x08(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" "49: \n" - "add $0x3,%4 \n" - "jl 19f \n" + "add $0x3,%4 \n" + "jl 19f \n" // 1 pixel loop LABELALIGN "10: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "pmaddwd %%xmm5,%%xmm0 \n" - "addps %%xmm7,%%xmm2 \n" - "movd %%xmm0,%k1 \n" - "movd 0x00(%0,%1,1),%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x04(%2),%2 \n" - "sub $0x1,%4 \n" - "jge 10b \n" + "cvttps2dq %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "pmaddwd %%xmm5,%%xmm0 \n" + "addps %%xmm7,%%xmm2 \n" + "movd %%xmm0,%k1 \n" + "movd 0x00(%0,%1,1),%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x04(%2),%2 \n" + "sub $0x1,%4 \n" + "jge 10b \n" "19: \n" : "+r"(src_argb), // %0 "+r"(src_argb_stride_temp), // %1 @@ -5899,76 +8254,76 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb, void InterpolateRow_SSSE3(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, - int dst_width, + int width, int source_y_fraction) { asm volatile( - "sub %1,%0 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x80,%3 \n" - "je 50f \n" - - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" + "sub %1,%0 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x80,%3 \n" + "je 50f \n" + + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" // General purpose row blend. LABELALIGN "1: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu 0x00(%1,%4,1),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "psubb %%xmm4,%%xmm0 \n" - "psubb %%xmm4,%%xmm1 \n" - "movdqa %%xmm5,%%xmm2 \n" - "movdqa %%xmm5,%%xmm3 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "pmaddubsw %%xmm1,%%xmm3 \n" - "paddw %%xmm4,%%xmm2 \n" - "paddw %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm2,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "jmp 99f \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "psubb %%xmm4,%%xmm0 \n" + "psubb %%xmm4,%%xmm1 \n" + "movdqa %%xmm5,%%xmm2 \n" + "movdqa %%xmm5,%%xmm3 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "pmaddubsw %%xmm1,%%xmm3 \n" + "paddw %%xmm4,%%xmm2 \n" + "paddw %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm2,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "jmp 99f \n" // Blend 50 / 50. LABELALIGN "50: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu 0x00(%1,%4,1),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 50b \n" - "jmp 99f \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 50b \n" + "jmp 99f \n" // Blend 100 / 0 - Copy row unchanged. 
LABELALIGN "100: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu %%xmm0,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 100b \n" + "movdqu (%1),%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 100b \n" "99: \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 - "+rm"(dst_width), // %2 + "+rm"(width), // %2 "+r"(source_y_fraction) // %3 : "r"((intptr_t)(src_stride)) // %4 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); @@ -5980,71 +8335,73 @@ void InterpolateRow_SSSE3(uint8_t* dst_ptr, void InterpolateRow_AVX2(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, - int dst_width, + int width, int source_y_fraction) { asm volatile( - "cmp $0x0,%3 \n" - "je 100f \n" - "sub %1,%0 \n" - "cmp $0x80,%3 \n" - "je 50f \n" - - "vmovd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "vmovd %3,%%xmm5 \n" - "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" - "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" + "sub %1,%0 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x80,%3 \n" + "je 50f \n" + + "vmovd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "vmovd %3,%%xmm5 \n" + "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" + "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" "vbroadcastss %%xmm5,%%ymm5 \n" - "mov $0x80808080,%%eax \n" - "vmovd %%eax,%%xmm4 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm4 \n" "vbroadcastss %%xmm4,%%ymm4 \n" // General purpose row blend. LABELALIGN "1: \n" - "vmovdqu (%1),%%ymm0 \n" - "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" - "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" - "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" - "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" - "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" - "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%1,%0,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "jmp 99f \n" + "vmovdqu (%1),%%ymm0 \n" + "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" + "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 99f \n" // Blend 50 / 50. LABELALIGN "50: \n" - "vmovdqu (%1),%%ymm0 \n" - "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%1,%0,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 50b \n" - "jmp 99f \n" + "vmovdqu (%1),%%ymm0 \n" + "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 50b \n" + "jmp 99f \n" // Blend 100 / 0 - Copy row unchanged. 
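// The copy label below carries the behavioural change in this hunk:
// "rep movsb" required rsi/rdi/rcx, hence the old "+D"/"+S"/"+cm"
// operand constraints; the vmovdqu loop frees them to plain "+r".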
LABELALIGN "100: \n" - "rep movsb \n" - "jmp 999f \n" + "vmovdqu (%1),%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 100b \n" "99: \n" "vzeroupper \n" - "999: \n" - : "+D"(dst_ptr), // %0 - "+S"(src_ptr), // %1 - "+cm"(dst_width), // %2 + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(width), // %2 "+r"(source_y_fraction) // %3 : "r"((intptr_t)(src_stride)) // %4 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"); @@ -6059,20 +8416,20 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, int width) { asm volatile( - "movdqu (%3),%%xmm5 \n" + "movdqu (%3),%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -6093,16 +8450,16 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -6120,24 +8477,24 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y, int width) { asm volatile( - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movq (%1),%%xmm2 \n" - "movq 0x00(%1,%2,1),%%xmm1 \n" - "add $0x8,%1 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "add $0x10,%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%3) \n" - "movdqu %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "add $0x10,%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%3) \n" + "movdqu %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -6156,24 +8513,24 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y, int width) { asm volatile( - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movq (%1),%%xmm2 \n" - "movq 0x00(%1,%2,1),%%xmm1 \n" - "add $0x8,%1 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "add $0x10,%0 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,(%3) \n" - "movdqu %%xmm2,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + 
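+      // xmm2 holds interleaved chroma U0 V0 .. U7 V7; copying it to xmm1
+      // lets the two unpacks below emit UYVY order (chroma byte first):
+      // punpcklbw with the Y register yields U0 Y0 V0 Y1 .., and the high
+      // unpack covers the remaining eight pixels.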
"add $0x10,%0 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,(%3) \n" + "movdqu %%xmm2,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -6192,27 +8549,27 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y, int width) { asm volatile( - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vpmovzxbw (%1),%%ymm1 \n" - "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" - "add $0x10,%1 \n" - "vpsllw $0x8,%%ymm2,%%ymm2 \n" - "vpor %%ymm1,%%ymm2,%%ymm2 \n" - "vmovdqu (%0),%%ymm0 \n" - "add $0x20,%0 \n" - "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n" - "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n" - "vextractf128 $0x0,%%ymm1,(%3) \n" - "vextractf128 $0x0,%%ymm2,0x10(%3) \n" - "vextractf128 $0x1,%%ymm1,0x20(%3) \n" - "vextractf128 $0x1,%%ymm2,0x30(%3) \n" - "lea 0x40(%3),%3 \n" - "sub $0x20,%4 \n" - "jg 1b \n" - "vzeroupper \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n" + "vextractf128 $0x0,%%ymm1,(%3) \n" + "vextractf128 $0x0,%%ymm2,0x10(%3) \n" + "vextractf128 $0x1,%%ymm1,0x20(%3) \n" + "vextractf128 $0x1,%%ymm2,0x30(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -6231,27 +8588,27 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y, int width) { asm volatile( - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vpmovzxbw (%1),%%ymm1 \n" - "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" - "add $0x10,%1 \n" - "vpsllw $0x8,%%ymm2,%%ymm2 \n" - "vpor %%ymm1,%%ymm2,%%ymm2 \n" - "vmovdqu (%0),%%ymm0 \n" - "add $0x20,%0 \n" - "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" - "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" - "vextractf128 $0x0,%%ymm1,(%3) \n" - "vextractf128 $0x0,%%ymm2,0x10(%3) \n" - "vextractf128 $0x1,%%ymm1,0x20(%3) \n" - "vextractf128 $0x1,%%ymm2,0x30(%3) \n" - "lea 0x40(%3),%3 \n" - "sub $0x20,%4 \n" - "jg 1b \n" - "vzeroupper \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" + "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" + "vextractf128 $0x0,%%ymm1,(%3) \n" + "vextractf128 $0x0,%%ymm2,0x10(%3) \n" + "vextractf128 $0x1,%%ymm1,0x20(%3) \n" + "vextractf128 $0x1,%%ymm2,0x30(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -6269,47 +8626,47 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, int width) { asm volatile( - "pxor %%xmm3,%%xmm3 \n" + "pxor %%xmm3,%%xmm3 \n" // 2 pixel loop. 
LABELALIGN "1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm3,%%xmm0 \n" - "movdqa %%xmm0,%%xmm4 \n" - "punpcklwd %%xmm3,%%xmm0 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm4,%%xmm4 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm4,%%xmm5 \n" - "mulps 0x10(%3),%%xmm0 \n" - "mulps 0x10(%3),%%xmm4 \n" - "addps (%3),%%xmm0 \n" - "addps (%3),%%xmm4 \n" - "movdqa %%xmm1,%%xmm2 \n" - "movdqa %%xmm5,%%xmm6 \n" - "mulps %%xmm1,%%xmm2 \n" - "mulps %%xmm5,%%xmm6 \n" - "mulps %%xmm2,%%xmm1 \n" - "mulps %%xmm6,%%xmm5 \n" - "mulps 0x20(%3),%%xmm2 \n" - "mulps 0x20(%3),%%xmm6 \n" - "mulps 0x30(%3),%%xmm1 \n" - "mulps 0x30(%3),%%xmm5 \n" - "addps %%xmm2,%%xmm0 \n" - "addps %%xmm6,%%xmm4 \n" - "addps %%xmm1,%%xmm0 \n" - "addps %%xmm5,%%xmm4 \n" - "cvttps2dq %%xmm0,%%xmm0 \n" - "cvttps2dq %%xmm4,%%xmm4 \n" - "packuswb %%xmm4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x2,%2 \n" - "jg 1b \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm3,%%xmm0 \n" + "movdqa %%xmm0,%%xmm4 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm4,%%xmm4 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm4,%%xmm5 \n" + "mulps 0x10(%3),%%xmm0 \n" + "mulps 0x10(%3),%%xmm4 \n" + "addps (%3),%%xmm0 \n" + "addps (%3),%%xmm4 \n" + "movdqa %%xmm1,%%xmm2 \n" + "movdqa %%xmm5,%%xmm6 \n" + "mulps %%xmm1,%%xmm2 \n" + "mulps %%xmm5,%%xmm6 \n" + "mulps %%xmm2,%%xmm1 \n" + "mulps %%xmm6,%%xmm5 \n" + "mulps 0x20(%3),%%xmm2 \n" + "mulps 0x20(%3),%%xmm6 \n" + "mulps 0x30(%3),%%xmm1 \n" + "mulps 0x30(%3),%%xmm5 \n" + "addps %%xmm2,%%xmm0 \n" + "addps %%xmm6,%%xmm4 \n" + "addps %%xmm1,%%xmm0 \n" + "addps %%xmm5,%%xmm4 \n" + "cvttps2dq %%xmm0,%%xmm0 \n" + "cvttps2dq %%xmm4,%%xmm4 \n" + "packuswb %%xmm4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x2,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -6405,27 +8762,27 @@ void HalfFloatRow_AVX2(const uint16_t* src, int width) { scale *= kScaleBias; asm volatile( - "vbroadcastss %3, %%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - "sub %0,%1 \n" + "vbroadcastss %3, %%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" // 16 pixel loop. 
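+      // scale was pre-multiplied by kScaleBias above (2^-112, the fp32 vs
+      // fp16 exponent-bias difference), so after vmulps each value's exponent
+      // already sits in half-float range and vpsrld $0xd extracts the fp16
+      // bit pattern directly, with no vcvtps2ph (cf. the F16C variant below).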
LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm2 \n" // 16 shorts - "add $0x20,%0 \n" - "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates - "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n" - "vcvtdq2ps %%ymm3,%%ymm3 \n" - "vcvtdq2ps %%ymm2,%%ymm2 \n" - "vmulps %%ymm3,%%ymm4,%%ymm3 \n" - "vmulps %%ymm2,%%ymm4,%%ymm2 \n" - "vpsrld $0xd,%%ymm3,%%ymm3 \n" - "vpsrld $0xd,%%ymm2,%%ymm2 \n" - "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates - "vmovdqu %%ymm2,-0x20(%0,%1,1) \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm2 \n" // 16 shorts + "add $0x20,%0 \n" + "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates + "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vmulps %%ymm3,%%ymm4,%%ymm3 \n" + "vmulps %%ymm2,%%ymm4,%%ymm2 \n" + "vpsrld $0xd,%%ymm3,%%ymm3 \n" + "vpsrld $0xd,%%ymm2,%%ymm2 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates + "vmovdqu %%ymm2,-0x20(%0,%1,1) \n" + "sub $0x10,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 @@ -6434,7 +8791,7 @@ void HalfFloatRow_AVX2(const uint16_t* src, #if defined(__x86_64__) : "x"(scale) // %3 #else - : "m"(scale) // %3 + : "m"(scale) // %3 #endif : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); } @@ -6446,8 +8803,8 @@ void HalfFloatRow_F16C(const uint16_t* src, float scale, int width) { asm volatile( - "vbroadcastss %3, %%ymm4 \n" - "sub %0,%1 \n" + "vbroadcastss %3, %%ymm4 \n" + "sub %0,%1 \n" // 16 pixel loop. LABELALIGN @@ -6472,7 +8829,7 @@ void HalfFloatRow_F16C(const uint16_t* src, #if defined(__x86_64__) : "x"(scale) // %3 #else - : "m"(scale) // %3 + : "m"(scale) // %3 #endif : "memory", "cc", "xmm2", "xmm3", "xmm4"); } @@ -6481,7 +8838,7 @@ void HalfFloatRow_F16C(const uint16_t* src, #ifdef HAS_HALFFLOATROW_F16C void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) { asm volatile( - "sub %0,%1 \n" + "sub %0,%1 \n" // 16 pixel loop. LABELALIGN "1: \n" @@ -6515,21 +8872,21 @@ void ARGBColorTableRow_X86(uint8_t* dst_argb, // 1 pixel loop. LABELALIGN "1: \n" - "movzb (%0),%1 \n" - "lea 0x4(%0),%0 \n" - "movzb 0x00(%3,%1,4),%1 \n" - "mov %b1,-0x4(%0) \n" - "movzb -0x3(%0),%1 \n" - "movzb 0x01(%3,%1,4),%1 \n" - "mov %b1,-0x3(%0) \n" - "movzb -0x2(%0),%1 \n" - "movzb 0x02(%3,%1,4),%1 \n" - "mov %b1,-0x2(%0) \n" - "movzb -0x1(%0),%1 \n" - "movzb 0x03(%3,%1,4),%1 \n" - "mov %b1,-0x1(%0) \n" - "dec %2 \n" - "jg 1b \n" + "movzb (%0),%1 \n" + "lea 0x4(%0),%0 \n" + "movzb 0x00(%3,%1,4),%1 \n" + "mov %b1,-0x4(%0) \n" + "movzb -0x3(%0),%1 \n" + "movzb 0x01(%3,%1,4),%1 \n" + "mov %b1,-0x3(%0) \n" + "movzb -0x2(%0),%1 \n" + "movzb 0x02(%3,%1,4),%1 \n" + "mov %b1,-0x2(%0) \n" + "movzb -0x1(%0),%1 \n" + "movzb 0x03(%3,%1,4),%1 \n" + "mov %b1,-0x1(%0) \n" + "dec %2 \n" + "jg 1b \n" : "+r"(dst_argb), // %0 "=&d"(pixel_temp), // %1 "+r"(width) // %2 @@ -6548,18 +8905,18 @@ void RGBColorTableRow_X86(uint8_t* dst_argb, // 1 pixel loop. 
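+      // Same movzb gather as ARGBColorTableRow_X86 above, indexed as
+      // table_argb[channel_value * 4 + channel], but only B, G and R
+      // (offsets -0x4..-0x2) are rewritten; the alpha byte is left as-is.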
LABELALIGN "1: \n" - "movzb (%0),%1 \n" - "lea 0x4(%0),%0 \n" - "movzb 0x00(%3,%1,4),%1 \n" - "mov %b1,-0x4(%0) \n" - "movzb -0x3(%0),%1 \n" - "movzb 0x01(%3,%1,4),%1 \n" - "mov %b1,-0x3(%0) \n" - "movzb -0x2(%0),%1 \n" - "movzb 0x02(%3,%1,4),%1 \n" - "mov %b1,-0x2(%0) \n" - "dec %2 \n" - "jg 1b \n" + "movzb (%0),%1 \n" + "lea 0x4(%0),%0 \n" + "movzb 0x00(%3,%1,4),%1 \n" + "mov %b1,-0x4(%0) \n" + "movzb -0x3(%0),%1 \n" + "movzb 0x01(%3,%1,4),%1 \n" + "mov %b1,-0x3(%0) \n" + "movzb -0x2(%0),%1 \n" + "movzb 0x02(%3,%1,4),%1 \n" + "mov %b1,-0x2(%0) \n" + "dec %2 \n" + "jg 1b \n" : "+r"(dst_argb), // %0 "=&d"(pixel_temp), // %1 "+r"(width) // %2 @@ -6578,86 +8935,86 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, uintptr_t pixel_temp; uintptr_t table_temp; asm volatile( - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0x8,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0x8,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" // 4 pixel loop. LABELALIGN "1: \n" - "movdqu (%2),%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "phaddw %%xmm0,%%xmm0 \n" - "pand %%xmm4,%%xmm0 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb (%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,(%3) \n" - "movzb 0x1(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x1(%3) \n" - "movzb 0x2(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x2(%3) \n" - "movzb 0x3(%2),%0 \n" - "mov %b0,0x3(%3) \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb 0x4(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x4(%3) \n" - "movzb 0x5(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x5(%3) \n" - "movzb 0x6(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x6(%3) \n" - "movzb 0x7(%2),%0 \n" - "mov %b0,0x7(%3) \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb 0x8(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x8(%3) \n" - "movzb 0x9(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x9(%3) \n" - "movzb 0xa(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xa(%3) \n" - "movzb 0xb(%2),%0 \n" - "mov %b0,0xb(%3) \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - - "movzb 0xc(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xc(%3) \n" - "movzb 0xd(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xd(%3) \n" - "movzb 0xe(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xe(%3) \n" - "movzb 0xf(%2),%0 \n" - "mov %b0,0xf(%3) \n" - "lea 0x10(%2),%2 \n" - "lea 0x10(%3),%3 \n" - "sub $0x4,%4 \n" - "jg 1b \n" + "movdqu (%2),%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "phaddw %%xmm0,%%xmm0 \n" + "pand %%xmm4,%%xmm0 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb (%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,(%3) \n" + "movzb 0x1(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x1(%3) \n" + "movzb 0x2(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x2(%3) \n" + "movzb 0x3(%2),%0 \n" + "mov %b0,0x3(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb 0x4(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x4(%3) \n" + "movzb 0x5(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x5(%3) \n" + "movzb 0x6(%2),%0 \n" + "movzb 
0x00(%1,%0,1),%0 \n" + "mov %b0,0x6(%3) \n" + "movzb 0x7(%2),%0 \n" + "mov %b0,0x7(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb 0x8(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x8(%3) \n" + "movzb 0x9(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x9(%3) \n" + "movzb 0xa(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xa(%3) \n" + "movzb 0xb(%2),%0 \n" + "mov %b0,0xb(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + + "movzb 0xc(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xc(%3) \n" + "movzb 0xd(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xd(%3) \n" + "movzb 0xe(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xe(%3) \n" + "movzb 0xf(%2),%0 \n" + "mov %b0,0xf(%3) \n" + "lea 0x10(%2),%2 \n" + "lea 0x10(%3),%3 \n" + "sub $0x4,%4 \n" + "jg 1b \n" : "=&d"(pixel_temp), // %0 "=&a"(table_temp), // %1 "+r"(src_argb), // %2 @@ -6669,126 +9026,306 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, } #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 -#ifdef HAS_NV21TOYUV24ROW_AVX2 - -// begin NV21ToYUV24Row_C avx2 constants -static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00, - 0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, - 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, - 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00}; - -static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, - 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, - 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, - 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80}; - -static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, - 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, - 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, - 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00}; - -static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d, - 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05, - 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d, - 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05}; - -static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, - 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, - 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, - 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80}; - -static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, - 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, - 0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, - 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f}; - -static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80, - 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80, - 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80, - 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80}; - -static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, - 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, - 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, - 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a}; - -static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, - 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, - 0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, - 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80}; - -// NV21ToYUV24Row_AVX2 +static const uvec8 kYUV24Shuffle[3] = { + {8, 9, 0, 8, 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12}, + {9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15}, + {2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15, 6, 14, 15, 7}}; + +// Convert biplanar NV21 to packed YUV24 +// NV21 
has VU in memory for chroma. +// YUV24 is VUY in memory +void NV21ToYUV24Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + asm volatile( + "sub %0,%1 \n" + "movdqa (%4),%%xmm4 \n" // 3 shuffler constants + "movdqa 16(%4),%%xmm5 \n" + "movdqa 32(%4),%%xmm6 \n" + "1: \n" + "movdqu (%0),%%xmm2 \n" // load 16 Y values + "movdqu (%0,%1),%%xmm3 \n" // load 8 VU values + "lea 16(%0),%0 \n" + "movdqa %%xmm2,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "shufps $0x44,%%xmm3,%%xmm0 \n" // Y 0..7, UV 0..3 + "shufps $0x99,%%xmm3,%%xmm1 \n" // Y 4..11, UV 2..5 + "shufps $0xee,%%xmm3,%%xmm2 \n" // Y 8..15, UV 4..7 + "pshufb %%xmm4, %%xmm0 \n" // weave into YUV24 + "pshufb %%xmm5, %%xmm1 \n" + "pshufb %%xmm6, %%xmm2 \n" + "movdqu %%xmm0,(%2) \n" + "movdqu %%xmm1,16(%2) \n" + "movdqu %%xmm2,32(%2) \n" + "lea 48(%2),%2 \n" + "sub $16,%3 \n" // 16 pixels per loop + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : "r"(&kYUV24Shuffle[0]) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +// Convert biplanar NV21 to packed YUV24 +// NV21 has VU in memory for chroma. +// YUV24 is VUY in memory void NV21ToYUV24Row_AVX2(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - uint8_t* src_y_ptr; - uint64_t src_offset = 0; - uint64_t width64; - - width64 = width; - src_y_ptr = (uint8_t*)src_y; - - asm volatile( - "vmovdqu %5, %%ymm0 \n" // init blend value - "vmovdqu %6, %%ymm1 \n" // init blend value - "vmovdqu %7, %%ymm2 \n" // init blend value - // "sub $0x20, %3 \n" //sub 32 from width for final loop - - LABELALIGN - "1: \n" // label 1 - "vmovdqu (%0,%4), %%ymm3 \n" // src_y - "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1 - "vmovdqu (%1), %%ymm5 \n" // src_uv - "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf - "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for - // shuf - "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for - // shuf - "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf - "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for - // shuf - "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0 - "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0 - "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2 - "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1 - "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const - "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results - "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h - "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results - "add $0x20, %4 \n" // add to src buffer - // ptr - "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert - "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert - "vmovdqu %%ymm4, (%2) \n" // store dst_yuv - "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h - "add $0x60,%2 \n" // add to dst buffer - // ptr - // "cmp %3, %4 \n" //(width64 - - // 32 bytes) and src_offset - "sub $0x20,%3 \n" // 32 pixels per loop - "jg 1b \n" - "vzeroupper \n" // sse-avx2 - // transistions - - : "+r"(src_y), //%0 - "+r"(src_vu), //%1 - "+r"(dst_yuv24), //%2 - "+r"(width64), //%3 - "+r"(src_offset) //%4 - : "m"(kBLEND0), //%5 - "m"(kBLEND1), //%6 - "m"(kBLEND2), //%7 - "m"(kSHUF0), //%8 - "m"(kSHUF1), //%9 - "m"(kSHUF2), //%10 - "m"(kSHUF3), //%11 - "m"(kSHUF4), //%12 - "m"(kSHUF5) //%13 - : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12", - "xmm13", "xmm14", "xmm15"); -} -#endif // HAS_NV21TOYUV24ROW_AVX2 + asm volatile( + "sub %0,%1 \n" + 
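+      // Same weave as the SSSE3 version above, widened to 32 pixels: each
+      // vshufps pairs 8 Y bytes with 4 VU samples per 128-bit lane, vpshufb
+      // expands them into V,U,Y triples, and the vperm2i128 shuffles below
+      // put the three 32-byte output rows back into linear order.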
"vbroadcastf128 (%4),%%ymm4 \n" // 3 shuffler constants + "vbroadcastf128 16(%4),%%ymm5 \n" + "vbroadcastf128 32(%4),%%ymm6 \n" + + "1: \n" + "vmovdqu (%0),%%ymm2 \n" // load 32 Y values + "vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values + "lea 32(%0),%0 \n" + "vshufps $0x44,%%ymm3,%%ymm2,%%ymm0 \n" // Y 0..7, UV 0..3 + "vshufps $0x99,%%ymm3,%%ymm2,%%ymm1 \n" // Y 4..11, UV 2..5 + "vshufps $0xee,%%ymm3,%%ymm2,%%ymm2 \n" // Y 8..15, UV 4..7 + "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" // weave into YUV24 + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" + "vperm2i128 $0x20,%%ymm1,%%ymm0,%%ymm3 \n" + "vperm2i128 $0x30,%%ymm0,%%ymm2,%%ymm0 \n" + "vperm2i128 $0x31,%%ymm2,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm3,(%2) \n" + "vmovdqu %%ymm0,32(%2) \n" + "vmovdqu %%ymm1,64(%2) \n" + "lea 96(%2),%2 \n" + "sub $32,%3 \n" // 32 pixels per loop + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : "r"(&kYUV24Shuffle[0]) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +#ifdef HAS_NV21ToYUV24ROW_AVX512 +// The following VMBI VEX256 code tests okay with the intelsde emulator. +static const lvec8 kYUV24Perm[3] = { + {32, 33, 0, 32, 33, 1, 34, 35, 2, 34, 35, 3, 36, 37, 4, 36, + 37, 5, 38, 39, 6, 38, 39, 7, 40, 41, 8, 40, 41, 9, 42, 43}, + {10, 42, 43, 11, 44, 45, 12, 44, 45, 13, 46, 47, 14, 46, 47, 15, + 48, 49, 16, 48, 49, 17, 50, 51, 18, 50, 51, 19, 52, 53, 20, 52}, + {53, 21, 54, 55, 22, 54, 55, 23, 56, 57, 24, 56, 57, 25, 58, 59, + 26, 58, 59, 27, 60, 61, 28, 60, 61, 29, 62, 63, 30, 62, 63, 31}}; + +void NV21ToYUV24Row_AVX512(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + asm volatile( + "sub %0,%1 \n" + "vmovdqa (%4),%%ymm4 \n" // 3 shuffler constants + "vmovdqa 32(%4),%%ymm5 \n" + "vmovdqa 64(%4),%%ymm6 \n" LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm2 \n" // load 32 Y values + "vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values + "lea 32(%0),%0 \n" + "vmovdqa %%ymm2, %%ymm0 \n" + "vmovdqa %%ymm2, %%ymm1 \n" + "vpermt2b %%ymm3,%%ymm4,%%ymm0 \n" + "vpermt2b %%ymm3,%%ymm5,%%ymm1 \n" + "vpermt2b %%ymm3,%%ymm6,%%ymm2 \n" + "vmovdqu %%ymm0,(%2) \n" + "vmovdqu %%ymm1,32(%2) \n" + "vmovdqu %%ymm2,64(%2) \n" + "lea 96(%2),%2 \n" + "sub $32,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : "r"(&kYUV24Perm[0]) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +#endif // HAS_NV21ToYUV24ROW_AVX512 + +#ifdef HAS_SWAPUVROW_SSSE3 + +// Shuffle table for reversing the bytes. +static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, + 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u}; + +// Convert UV plane of NV12 to VU of NV21. 
+void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + asm volatile( + + "movdqu %3,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : "m"(kShuffleUVToVU) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_SWAPUVROW_SSSE3 + +#ifdef HAS_SWAPUVROW_AVX2 +void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + asm volatile( + + "vbroadcastf128 %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : "m"(kShuffleUVToVU) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_SWAPUVROW_AVX2 + +void HalfMergeUVRow_SSSE3(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" // load 16 U values + "movdqu (%1),%%xmm1 \n" // load 16 V values + "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row + "movdqu 0(%1,%5,1),%%xmm3 \n" + "lea 0x10(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" // half size + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x10(%1),%1 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "psrlw $0x1,%%xmm0 \n" + "psrlw $0x1,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm1,%%xmm1 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" // store 8 UV pixels + "lea 0x10(%2),%2 \n" + "sub $0x10,%3 \n" // 16 src pixels per loop + "jg 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(src_stride_u)), // %4 + "r"((intptr_t)(src_stride_v)) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void HalfMergeUVRow_AVX2(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // load 32 U values + "vmovdqu (%1),%%ymm1 \n" // load 32 V values + "vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row + "vmovdqu 0(%1,%5,1),%%ymm3 \n" + "lea 0x20(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // half size + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea 0x20(%1),%1 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" + "vpsrlw $0x1,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%2) \n" // 
store 16 UV pixels + "lea 0x20(%2),%2 \n" + "sub $0x20,%3 \n" // 32 src pixels per loop + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(src_stride_u)), // %4 + "r"((intptr_t)(src_stride_v)) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) { + asm volatile( + "pxor %%xmm1,%%xmm1 \n" + + LABELALIGN + "1: \n" + "movd (%0),%%xmm0 \n" // load float + "maxss %%xmm1, %%xmm0 \n" // clamp to zero + "add 4, %0 \n" + "movd %%xmm0, (%1) \n" // store float + "add 4, %1 \n" + "sub $0x4,%2 \n" // 1 float per loop + "jg 1b \n" + : "+r"(src_x), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} #endif // defined(__x86_64__) || defined(__i386__) diff --git a/files/source/row_lasx.cc b/files/source/row_lasx.cc new file mode 100644 index 00000000..7dd18f40 --- /dev/null +++ b/files/source/row_lasx.cc @@ -0,0 +1,2230 @@ +/* + * Copyright 2022 The LibYuv Project Authors. All rights reserved. + * + * Copyright (c) 2022 Loongson Technology Corporation Limited + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#if !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx) +#include "libyuv/loongson_intrinsics.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define ALPHA_VAL (-1) + +// Fill YUV -> RGB conversion constants into vectors +#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \ + { \ + ub = __lasx_xvreplgr2vr_h(yuvconst->kUVToB[0]); \ + vr = __lasx_xvreplgr2vr_h(yuvconst->kUVToR[1]); \ + ug = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[0]); \ + vg = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[1]); \ + yg = __lasx_xvreplgr2vr_h(yuvconst->kYToRgb[0]); \ + yb = __lasx_xvreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \ + } + +// Load 32 YUV422 pixel data +#define READYUV422_D(psrc_y, psrc_u, psrc_v, out_y, uv_l, uv_h) \ + { \ + __m256i temp0, temp1; \ + \ + DUP2_ARG2(__lasx_xvld, psrc_y, 0, psrc_u, 0, out_y, temp0); \ + temp1 = __lasx_xvld(psrc_v, 0); \ + temp0 = __lasx_xvsub_b(temp0, const_0x80); \ + temp1 = __lasx_xvsub_b(temp1, const_0x80); \ + temp0 = __lasx_vext2xv_h_b(temp0); \ + temp1 = __lasx_vext2xv_h_b(temp1); \ + uv_l = __lasx_xvilvl_h(temp0, temp1); \ + uv_h = __lasx_xvilvh_h(temp0, temp1); \ + } + +// Load 16 YUV422 pixel data +#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, uv) \ + { \ + __m256i temp0, temp1; \ + \ + out_y = __lasx_xvld(psrc_y, 0); \ + temp0 = __lasx_xvldrepl_d(psrc_u, 0); \ + temp1 = __lasx_xvldrepl_d(psrc_v, 0); \ + uv = __lasx_xvilvl_b(temp0, temp1); \ + uv = __lasx_xvsub_b(uv, const_0x80); \ + uv = __lasx_vext2xv_h_b(uv); \ + } + +// Convert 16 pixels of YUV420 to RGB. 
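+// (READYUV422_D/READYUV422 above already bias U/V by -128 and sign-extend to
+// 16 bits, so the UV terms in the macros below are signed and zero-centered.)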
+#define YUVTORGB_D(in_y, in_uvl, in_uvh, ubvr, ugvg, yg, yb, b_l, b_h, g_l, \ + g_h, r_l, r_h) \ + { \ + __m256i u_l, u_h, v_l, v_h; \ + __m256i yl_ev, yl_od, yh_ev, yh_od; \ + __m256i temp0, temp1, temp2, temp3; \ + \ + temp0 = __lasx_xvilvl_b(in_y, in_y); \ + temp1 = __lasx_xvilvh_b(in_y, in_y); \ + yl_ev = __lasx_xvmulwev_w_hu_h(temp0, yg); \ + yl_od = __lasx_xvmulwod_w_hu_h(temp0, yg); \ + yh_ev = __lasx_xvmulwev_w_hu_h(temp1, yg); \ + yh_od = __lasx_xvmulwod_w_hu_h(temp1, yg); \ + DUP4_ARG2(__lasx_xvsrai_w, yl_ev, 16, yl_od, 16, yh_ev, 16, yh_od, 16, \ + yl_ev, yl_od, yh_ev, yh_od); \ + yl_ev = __lasx_xvadd_w(yl_ev, yb); \ + yl_od = __lasx_xvadd_w(yl_od, yb); \ + yh_ev = __lasx_xvadd_w(yh_ev, yb); \ + yh_od = __lasx_xvadd_w(yh_od, yb); \ + v_l = __lasx_xvmulwev_w_h(in_uvl, ubvr); \ + u_l = __lasx_xvmulwod_w_h(in_uvl, ubvr); \ + v_h = __lasx_xvmulwev_w_h(in_uvh, ubvr); \ + u_h = __lasx_xvmulwod_w_h(in_uvh, ubvr); \ + temp0 = __lasx_xvadd_w(yl_ev, u_l); \ + temp1 = __lasx_xvadd_w(yl_od, u_l); \ + temp2 = __lasx_xvadd_w(yh_ev, u_h); \ + temp3 = __lasx_xvadd_w(yh_od, u_h); \ + DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ + temp1, temp2, temp3); \ + DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ + temp2, temp3); \ + b_l = __lasx_xvpackev_h(temp1, temp0); \ + b_h = __lasx_xvpackev_h(temp3, temp2); \ + temp0 = __lasx_xvadd_w(yl_ev, v_l); \ + temp1 = __lasx_xvadd_w(yl_od, v_l); \ + temp2 = __lasx_xvadd_w(yh_ev, v_h); \ + temp3 = __lasx_xvadd_w(yh_od, v_h); \ + DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ + temp1, temp2, temp3); \ + DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ + temp2, temp3); \ + r_l = __lasx_xvpackev_h(temp1, temp0); \ + r_h = __lasx_xvpackev_h(temp3, temp2); \ + DUP2_ARG2(__lasx_xvdp2_w_h, in_uvl, ugvg, in_uvh, ugvg, u_l, u_h); \ + temp0 = __lasx_xvsub_w(yl_ev, u_l); \ + temp1 = __lasx_xvsub_w(yl_od, u_l); \ + temp2 = __lasx_xvsub_w(yh_ev, u_h); \ + temp3 = __lasx_xvsub_w(yh_od, u_h); \ + DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ + temp1, temp2, temp3); \ + DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ + temp2, temp3); \ + g_l = __lasx_xvpackev_h(temp1, temp0); \ + g_h = __lasx_xvpackev_h(temp3, temp2); \ + } + +// Convert 8 pixels of YUV420 to RGB. 
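+// (Same fixed-point math as YUVTORGB_D above, for a single vector pair:
+//   B = clip255((((Y * 0x0101 * yg) >> 16) + yb + U * ub) >> 6)
+//   R = clip255((((Y * 0x0101 * yg) >> 16) + yb + V * vr) >> 6)
+//   G = clip255((((Y * 0x0101 * yg) >> 16) + yb - (U * ug + V * vg)) >> 6).)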
+#define YUVTORGB(in_y, in_uv, ubvr, ugvg, yg, yb, out_b, out_g, out_r) \ + { \ + __m256i u_l, v_l, yl_ev, yl_od; \ + __m256i temp0, temp1; \ + \ + in_y = __lasx_xvpermi_d(in_y, 0xD8); \ + temp0 = __lasx_xvilvl_b(in_y, in_y); \ + yl_ev = __lasx_xvmulwev_w_hu_h(temp0, yg); \ + yl_od = __lasx_xvmulwod_w_hu_h(temp0, yg); \ + DUP2_ARG2(__lasx_xvsrai_w, yl_ev, 16, yl_od, 16, yl_ev, yl_od); \ + yl_ev = __lasx_xvadd_w(yl_ev, yb); \ + yl_od = __lasx_xvadd_w(yl_od, yb); \ + v_l = __lasx_xvmulwev_w_h(in_uv, ubvr); \ + u_l = __lasx_xvmulwod_w_h(in_uv, ubvr); \ + temp0 = __lasx_xvadd_w(yl_ev, u_l); \ + temp1 = __lasx_xvadd_w(yl_od, u_l); \ + DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1); \ + DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1); \ + out_b = __lasx_xvpackev_h(temp1, temp0); \ + temp0 = __lasx_xvadd_w(yl_ev, v_l); \ + temp1 = __lasx_xvadd_w(yl_od, v_l); \ + DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1); \ + DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1); \ + out_r = __lasx_xvpackev_h(temp1, temp0); \ + u_l = __lasx_xvdp2_w_h(in_uv, ugvg); \ + temp0 = __lasx_xvsub_w(yl_ev, u_l); \ + temp1 = __lasx_xvsub_w(yl_od, u_l); \ + DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1); \ + DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1); \ + out_g = __lasx_xvpackev_h(temp1, temp0); \ + } + +// Pack and Store 16 ARGB values. +#define STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, pdst_argb) \ + { \ + __m256i temp0, temp1, temp2, temp3; \ + \ + temp0 = __lasx_xvpackev_b(g_l, b_l); \ + temp1 = __lasx_xvpackev_b(a_l, r_l); \ + temp2 = __lasx_xvpackev_b(g_h, b_h); \ + temp3 = __lasx_xvpackev_b(a_h, r_h); \ + r_l = __lasx_xvilvl_h(temp1, temp0); \ + r_h = __lasx_xvilvh_h(temp1, temp0); \ + g_l = __lasx_xvilvl_h(temp3, temp2); \ + g_h = __lasx_xvilvh_h(temp3, temp2); \ + temp0 = __lasx_xvpermi_q(r_h, r_l, 0x20); \ + temp1 = __lasx_xvpermi_q(g_h, g_l, 0x20); \ + temp2 = __lasx_xvpermi_q(r_h, r_l, 0x31); \ + temp3 = __lasx_xvpermi_q(g_h, g_l, 0x31); \ + __lasx_xvst(temp0, pdst_argb, 0); \ + __lasx_xvst(temp1, pdst_argb, 32); \ + __lasx_xvst(temp2, pdst_argb, 64); \ + __lasx_xvst(temp3, pdst_argb, 96); \ + pdst_argb += 128; \ + } + +// Pack and Store 8 ARGB values. 
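+// (As in STOREARGB_D above, pixels are emitted in B,G,R,A byte order, i.e.
+// little-endian ARGB: packev_b pairs B|G and R|A, the ilvl/ilvh_h pair
+// widens those into whole 4-byte pixels, and xvpermi_q restores lane order
+// before the 32-byte stores.)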
+#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \ + { \ + __m256i temp0, temp1, temp2, temp3; \ + \ + temp0 = __lasx_xvpackev_b(in_g, in_b); \ + temp1 = __lasx_xvpackev_b(in_a, in_r); \ + temp2 = __lasx_xvilvl_h(temp1, temp0); \ + temp3 = __lasx_xvilvh_h(temp1, temp0); \ + temp0 = __lasx_xvpermi_q(temp3, temp2, 0x20); \ + temp1 = __lasx_xvpermi_q(temp3, temp2, 0x31); \ + __lasx_xvst(temp0, pdst_argb, 0); \ + __lasx_xvst(temp1, pdst_argb, 32); \ + pdst_argb += 64; \ + } + +#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _reg0, _reg1) \ + { \ + __m256i _tmp0, _tmp1, _tmp2, _tmp3; \ + _tmp0 = __lasx_xvaddwev_h_bu(_tmpb, _nexb); \ + _tmp1 = __lasx_xvaddwod_h_bu(_tmpb, _nexb); \ + _tmp2 = __lasx_xvaddwev_h_bu(_tmpg, _nexg); \ + _tmp3 = __lasx_xvaddwod_h_bu(_tmpg, _nexg); \ + _reg0 = __lasx_xvaddwev_h_bu(_tmpr, _nexr); \ + _reg1 = __lasx_xvaddwod_h_bu(_tmpr, _nexr); \ + _tmpb = __lasx_xvavgr_hu(_tmp0, _tmp1); \ + _tmpg = __lasx_xvavgr_hu(_tmp2, _tmp3); \ + _tmpr = __lasx_xvavgr_hu(_reg0, _reg1); \ + _reg0 = __lasx_xvmadd_h(const_8080, const_112, _tmpb); \ + _reg1 = __lasx_xvmadd_h(const_8080, const_112, _tmpr); \ + _reg0 = __lasx_xvmsub_h(_reg0, const_74, _tmpg); \ + _reg1 = __lasx_xvmsub_h(_reg1, const_94, _tmpg); \ + _reg0 = __lasx_xvmsub_h(_reg0, const_38, _tmpr); \ + _reg1 = __lasx_xvmsub_h(_reg1, const_18, _tmpb); \ + } + +void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) { + int x; + int len = width / 64; + __m256i src0, src1; + __m256i shuffler = {0x08090A0B0C0D0E0F, 0x0001020304050607, + 0x08090A0B0C0D0E0F, 0x0001020304050607}; + src += width - 64; + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src, 0, src, 32, src0, src1); + DUP2_ARG3(__lasx_xvshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0, + src1); + src0 = __lasx_xvpermi_q(src0, src0, 0x01); + src1 = __lasx_xvpermi_q(src1, src1, 0x01); + __lasx_xvst(src1, dst, 0); + __lasx_xvst(src0, dst, 32); + dst += 64; + src -= 64; + } +} + +void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + int x; + int len = width / 16; + __m256i src, dst; + __m256i shuffler = {0x0004000500060007, 0x0000000100020003, + 0x0004000500060007, 0x0000000100020003}; + + src_uv += (width - 16) << 1; + for (x = 0; x < len; x++) { + src = __lasx_xvld(src_uv, 0); + dst = __lasx_xvshuf_h(shuffler, src, src); + dst = __lasx_xvpermi_q(dst, dst, 0x01); + __lasx_xvst(dst, dst_uv, 0); + src_uv -= 32; + dst_uv += 32; + } +} + +void ARGBMirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) { + int x; + int len = width / 16; + __m256i src0, src1; + __m256i dst0, dst1; + __m256i shuffler = {0x0B0A09080F0E0D0C, 0x0302010007060504, + 0x0B0A09080F0E0D0C, 0x0302010007060504}; + src += (width * 4) - 64; + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src, 0, src, 32, src0, src1); + DUP2_ARG3(__lasx_xvshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0, + src1); + dst1 = __lasx_xvpermi_q(src0, src0, 0x01); + dst0 = __lasx_xvpermi_q(src1, src1, 0x01); + __lasx_xvst(dst0, dst, 0); + __lasx_xvst(dst1, dst, 32); + dst += 64; + src -= 64; + } +} + +void I422ToYUY2Row_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + int x; + int len = width / 32; + __m256i src_u0, src_v0, src_y0, vec_uv0; + __m256i vec_yuy2_0, vec_yuy2_1; + __m256i dst_yuy2_0, dst_yuy2_1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_u, 0, src_v, 0, src_u0, src_v0); + src_y0 = __lasx_xvld(src_y, 0); + src_u0 = __lasx_xvpermi_d(src_u0, 0xD8); + src_v0 = 
__lasx_xvpermi_d(src_v0, 0xD8); + vec_uv0 = __lasx_xvilvl_b(src_v0, src_u0); + vec_yuy2_0 = __lasx_xvilvl_b(vec_uv0, src_y0); + vec_yuy2_1 = __lasx_xvilvh_b(vec_uv0, src_y0); + dst_yuy2_0 = __lasx_xvpermi_q(vec_yuy2_1, vec_yuy2_0, 0x20); + dst_yuy2_1 = __lasx_xvpermi_q(vec_yuy2_1, vec_yuy2_0, 0x31); + __lasx_xvst(dst_yuy2_0, dst_yuy2, 0); + __lasx_xvst(dst_yuy2_1, dst_yuy2, 32); + src_u += 16; + src_v += 16; + src_y += 32; + dst_yuy2 += 64; + } +} + +void I422ToUYVYRow_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + int x; + int len = width / 32; + __m256i src_u0, src_v0, src_y0, vec_uv0; + __m256i vec_uyvy0, vec_uyvy1; + __m256i dst_uyvy0, dst_uyvy1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_u, 0, src_v, 0, src_u0, src_v0); + src_y0 = __lasx_xvld(src_y, 0); + src_u0 = __lasx_xvpermi_d(src_u0, 0xD8); + src_v0 = __lasx_xvpermi_d(src_v0, 0xD8); + vec_uv0 = __lasx_xvilvl_b(src_v0, src_u0); + vec_uyvy0 = __lasx_xvilvl_b(src_y0, vec_uv0); + vec_uyvy1 = __lasx_xvilvh_b(src_y0, vec_uv0); + dst_uyvy0 = __lasx_xvpermi_q(vec_uyvy1, vec_uyvy0, 0x20); + dst_uyvy1 = __lasx_xvpermi_q(vec_uyvy1, vec_uyvy0, 0x31); + __lasx_xvst(dst_uyvy0, dst_uyvy, 0); + __lasx_xvst(dst_uyvy1, dst_uyvy, 32); + src_u += 16; + src_v += 16; + src_y += 32; + dst_uyvy += 64; + } +} + +void I422ToARGBRow_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 32; + __m256i vec_yb, vec_yg, vec_ub, vec_ug, vec_vr, vec_vg; + __m256i vec_ubvr, vec_ugvg; + __m256i alpha = __lasx_xvldi(0xFF); + __m256i const_0x80 = __lasx_xvldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + STOREARGB_D(alpha, alpha, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb); + src_y += 32; + src_u += 16; + src_v += 16; + } +} + +void I422ToRGBARow_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 32; + __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m256i vec_ubvr, vec_ugvg; + __m256i alpha = __lasx_xvldi(0xFF); + __m256i const_0x80 = __lasx_xvldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + STOREARGB_D(r_l, r_h, g_l, g_h, b_l, b_h, alpha, alpha, dst_argb); + src_y += 32; + src_u += 16; + src_v += 16; + } +} + +void I422AlphaToARGBRow_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 32; + int res = width & 31; + __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m256i vec_ubvr, vec_ugvg; + __m256i zero = __lasx_xvldi(0); 
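+  // zero is interleaved with the alpha row below to widen A to 16 bits
+  // before it is packed back alongside R/G/B in STOREARGB_D.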
+ __m256i const_0x80 = __lasx_xvldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h, a_l, a_h; + + y = __lasx_xvld(src_a, 0); + a_l = __lasx_xvilvl_b(zero, y); + a_h = __lasx_xvilvh_b(zero, y); + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb); + src_y += 32; + src_u += 16; + src_v += 16; + src_a += 32; + } + if (res) { + __m256i y, uv, r, g, b, a; + a = __lasx_xvld(src_a, 0); + a = __lasx_vext2xv_hu_bu(a); + READYUV422(src_y, src_u, src_v, y, uv); + YUVTORGB(y, uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b, g, r); + STOREARGB(a, r, g, b, dst_argb); + } +} + +void I422ToRGB24Row_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int32_t width) { + int x; + int len = width / 32; + __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m256i vec_ubvr, vec_ugvg; + __m256i const_0x80 = __lasx_xvldi(0x80); + __m256i shuffler0 = {0x0504120302100100, 0x0A18090816070614, + 0x0504120302100100, 0x0A18090816070614}; + __m256i shuffler1 = {0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B, + 0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + __m256i temp0, temp1, temp2, temp3; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + temp0 = __lasx_xvpackev_b(g_l, b_l); + temp1 = __lasx_xvpackev_b(g_h, b_h); + DUP4_ARG3(__lasx_xvshuf_b, r_l, temp0, shuffler1, r_h, temp1, shuffler1, + r_l, temp0, shuffler0, r_h, temp1, shuffler0, temp2, temp3, temp0, + temp1); + + b_l = __lasx_xvilvl_d(temp1, temp2); + b_h = __lasx_xvilvh_d(temp3, temp1); + temp1 = __lasx_xvpermi_q(b_l, temp0, 0x20); + temp2 = __lasx_xvpermi_q(temp0, b_h, 0x30); + temp3 = __lasx_xvpermi_q(b_h, b_l, 0x31); + __lasx_xvst(temp1, dst_argb, 0); + __lasx_xvst(temp2, dst_argb, 32); + __lasx_xvst(temp3, dst_argb, 64); + dst_argb += 96; + src_y += 32; + src_u += 16; + src_v += 16; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. 
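+// RGB565 packs each pixel as ((R >> 3) << 11) | ((G >> 2) << 5) | (B >> 3),
+// which is what the shift/or ladder in the loop below builds per halfword.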
+void I422ToRGB565Row_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 32; + __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m256i vec_ubvr, vec_ugvg; + __m256i const_0x80 = __lasx_xvldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + __m256i dst_l, dst_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + b_l = __lasx_xvsrli_h(b_l, 3); + b_h = __lasx_xvsrli_h(b_h, 3); + g_l = __lasx_xvsrli_h(g_l, 2); + g_h = __lasx_xvsrli_h(g_h, 2); + r_l = __lasx_xvsrli_h(r_l, 3); + r_h = __lasx_xvsrli_h(r_h, 3); + r_l = __lasx_xvslli_h(r_l, 11); + r_h = __lasx_xvslli_h(r_h, 11); + g_l = __lasx_xvslli_h(g_l, 5); + g_h = __lasx_xvslli_h(g_h, 5); + r_l = __lasx_xvor_v(r_l, g_l); + r_l = __lasx_xvor_v(r_l, b_l); + r_h = __lasx_xvor_v(r_h, g_h); + r_h = __lasx_xvor_v(r_h, b_h); + dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20); + dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31); + __lasx_xvst(dst_l, dst_rgb565, 0); + __lasx_xvst(dst_h, dst_rgb565, 32); + dst_rgb565 += 64; + src_y += 32; + src_u += 16; + src_v += 16; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. +void I422ToARGB4444Row_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 32; + __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m256i vec_ubvr, vec_ugvg; + __m256i const_0x80 = __lasx_xvldi(0x80); + __m256i alpha = {0xF000F000F000F000, 0xF000F000F000F000, 0xF000F000F000F000, + 0xF000F000F000F000}; + __m256i mask = {0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, + 0x00F000F000F000F0}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + __m256i dst_l, dst_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + b_l = __lasx_xvsrli_h(b_l, 4); + b_h = __lasx_xvsrli_h(b_h, 4); + r_l = __lasx_xvsrli_h(r_l, 4); + r_h = __lasx_xvsrli_h(r_h, 4); + g_l = __lasx_xvand_v(g_l, mask); + g_h = __lasx_xvand_v(g_h, mask); + r_l = __lasx_xvslli_h(r_l, 8); + r_h = __lasx_xvslli_h(r_h, 8); + r_l = __lasx_xvor_v(r_l, alpha); + r_h = __lasx_xvor_v(r_h, alpha); + r_l = __lasx_xvor_v(r_l, g_l); + r_h = __lasx_xvor_v(r_h, g_h); + r_l = __lasx_xvor_v(r_l, b_l); + r_h = __lasx_xvor_v(r_h, b_h); + dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20); + dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31); + __lasx_xvst(dst_l, dst_argb4444, 0); + __lasx_xvst(dst_h, dst_argb4444, 32); + dst_argb4444 += 64; + src_y += 32; + src_u += 16; + src_v += 16; + } +} + +void I422ToARGB1555Row_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 32; + __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m256i vec_ubvr, 
vec_ugvg; + __m256i const_0x80 = __lasx_xvldi(0x80); + __m256i alpha = {0x8000800080008000, 0x8000800080008000, 0x8000800080008000, + 0x8000800080008000}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + __m256i dst_l, dst_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + b_l = __lasx_xvsrli_h(b_l, 3); + b_h = __lasx_xvsrli_h(b_h, 3); + g_l = __lasx_xvsrli_h(g_l, 3); + g_h = __lasx_xvsrli_h(g_h, 3); + g_l = __lasx_xvslli_h(g_l, 5); + g_h = __lasx_xvslli_h(g_h, 5); + r_l = __lasx_xvsrli_h(r_l, 3); + r_h = __lasx_xvsrli_h(r_h, 3); + r_l = __lasx_xvslli_h(r_l, 10); + r_h = __lasx_xvslli_h(r_h, 10); + r_l = __lasx_xvor_v(r_l, alpha); + r_h = __lasx_xvor_v(r_h, alpha); + r_l = __lasx_xvor_v(r_l, g_l); + r_h = __lasx_xvor_v(r_h, g_h); + r_l = __lasx_xvor_v(r_l, b_l); + r_h = __lasx_xvor_v(r_h, b_h); + dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20); + dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31); + __lasx_xvst(dst_l, dst_argb1555, 0); + __lasx_xvst(dst_h, dst_argb1555, 32); + dst_argb1555 += 64; + src_y += 32; + src_u += 16; + src_v += 16; + } +} + +void YUY2ToYRow_LASX(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + int x; + int len = width / 32; + __m256i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src0, src1); + dst0 = __lasx_xvpickev_b(src1, src0); + dst0 = __lasx_xvpermi_d(dst0, 0xD8); + __lasx_xvst(dst0, dst_y, 0); + src_yuy2 += 64; + dst_y += 32; + } +} + +void YUY2ToUVRow_LASX(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2; + int x; + int len = width / 32; + __m256i src0, src1, src2, src3; + __m256i tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src_yuy2_next, 0, + src_yuy2_next, 32, src0, src1, src2, src3); + src0 = __lasx_xvpickod_b(src1, src0); + src1 = __lasx_xvpickod_b(src3, src2); + tmp0 = __lasx_xvavgr_bu(src1, src0); + tmp0 = __lasx_xvpermi_d(tmp0, 0xD8); + dst0 = __lasx_xvpickev_b(tmp0, tmp0); + dst1 = __lasx_xvpickod_b(tmp0, tmp0); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst1, dst_v, 0, 0); + __lasx_xvstelm_d(dst1, dst_v, 8, 2); + src_yuy2 += 64; + src_yuy2_next += 64; + dst_u += 16; + dst_v += 16; + } +} + +void YUY2ToUV422Row_LASX(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 32; + __m256i src0, src1, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src0, src1); + tmp0 = __lasx_xvpickod_b(src1, src0); + tmp0 = __lasx_xvpermi_d(tmp0, 0xD8); + dst0 = __lasx_xvpickev_b(tmp0, tmp0); + dst1 = __lasx_xvpickod_b(tmp0, tmp0); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst1, dst_v, 0, 0); + __lasx_xvstelm_d(dst1, dst_v, 8, 2); + src_yuy2 += 64; + dst_u += 16; + dst_v += 16; + } +} + +void UYVYToYRow_LASX(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + int x; + int len = width / 32; + __m256i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src0, src1); + dst0 = 
__lasx_xvpickod_b(src1, src0); + dst0 = __lasx_xvpermi_d(dst0, 0xD8); + __lasx_xvst(dst0, dst_y, 0); + src_uyvy += 64; + dst_y += 32; + } +} + +void UYVYToUVRow_LASX(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy; + int x; + int len = width / 32; + __m256i src0, src1, src2, src3, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src_uyvy_next, 0, + src_uyvy_next, 32, src0, src1, src2, src3); + src0 = __lasx_xvpickev_b(src1, src0); + src1 = __lasx_xvpickev_b(src3, src2); + tmp0 = __lasx_xvavgr_bu(src1, src0); + tmp0 = __lasx_xvpermi_d(tmp0, 0xD8); + dst0 = __lasx_xvpickev_b(tmp0, tmp0); + dst1 = __lasx_xvpickod_b(tmp0, tmp0); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst1, dst_v, 0, 0); + __lasx_xvstelm_d(dst1, dst_v, 8, 2); + src_uyvy += 64; + src_uyvy_next += 64; + dst_u += 16; + dst_v += 16; + } +} + +void UYVYToUV422Row_LASX(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 32; + __m256i src0, src1, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src0, src1); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp0 = __lasx_xvpermi_d(tmp0, 0xD8); + dst0 = __lasx_xvpickev_b(tmp0, tmp0); + dst1 = __lasx_xvpickod_b(tmp0, tmp0); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst1, dst_v, 0, 0); + __lasx_xvstelm_d(dst1, dst_v, 8, 2); + src_uyvy += 64; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBToYRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + int len = width / 32; + __m256i src0, src1, src2, src3, vec0, vec1, vec2, vec3; + __m256i tmp0, tmp1, dst0; + __m256i const_19 = __lasx_xvldi(0x19); + __m256i const_42 = __lasx_xvldi(0x42); + __m256i const_81 = __lasx_xvldi(0x81); + __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, + 0x1080108010801080, 0x1080108010801080}; + __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002, + 0x0000000700000003}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_argb0, 0, src_argb0, 32, src_argb0, 64, + src_argb0, 96, src0, src1, src2, src3); + vec0 = __lasx_xvpickev_b(src1, src0); + vec1 = __lasx_xvpickev_b(src3, src2); + vec2 = __lasx_xvpickod_b(src1, src0); + vec3 = __lasx_xvpickod_b(src3, src2); + tmp0 = __lasx_xvmaddwev_h_bu(const_1080, vec0, const_19); + tmp1 = __lasx_xvmaddwev_h_bu(const_1080, vec1, const_19); + tmp0 = __lasx_xvmaddwev_h_bu(tmp0, vec2, const_81); + tmp1 = __lasx_xvmaddwev_h_bu(tmp1, vec3, const_81); + tmp0 = __lasx_xvmaddwod_h_bu(tmp0, vec0, const_42); + tmp1 = __lasx_xvmaddwod_h_bu(tmp1, vec1, const_42); + dst0 = __lasx_xvssrani_b_h(tmp1, tmp0, 8); + dst0 = __lasx_xvperm_w(dst0, control); + __lasx_xvst(dst0, dst_y, 0); + src_argb0 += 128; + dst_y += 32; + } +} + +void ARGBToUVRow_LASX(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 32; + const uint8_t* src_argb1 = src_argb0 + src_stride_argb; + + __m256i src0, src1, src2, src3, src4, src5, src6, src7; + __m256i vec0, vec1, vec2, vec3; + __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1; + __m256i const_0x70 = {0x0038003800380038, 0x0038003800380038, + 0x0038003800380038, 0x0038003800380038}; + __m256i const_0x4A = {0x0025002500250025, 0x0025002500250025, + 
0x0025002500250025, 0x0025002500250025}; + __m256i const_0x26 = {0x0013001300130013, 0x0013001300130013, + 0x0013001300130013, 0x0013001300130013}; + __m256i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f, + 0x002f002f002f002f, 0x002f002f002f002f}; + __m256i const_0x12 = {0x0009000900090009, 0x0009000900090009, + 0x0009000900090009, 0x0009000900090009}; + __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002, + 0x0000000700000003}; + __m256i const_0x8080 = {0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_argb0, 0, src_argb0, 32, src_argb0, 64, + src_argb0, 96, src0, src1, src2, src3); + DUP4_ARG2(__lasx_xvld, src_argb1, 0, src_argb1, 32, src_argb1, 64, + src_argb1, 96, src4, src5, src6, src7); + vec0 = __lasx_xvaddwev_h_bu(src0, src4); + vec1 = __lasx_xvaddwev_h_bu(src1, src5); + vec2 = __lasx_xvaddwev_h_bu(src2, src6); + vec3 = __lasx_xvaddwev_h_bu(src3, src7); + tmp0 = __lasx_xvpickev_h(vec1, vec0); + tmp1 = __lasx_xvpickev_h(vec3, vec2); + tmp2 = __lasx_xvpickod_h(vec1, vec0); + tmp3 = __lasx_xvpickod_h(vec3, vec2); + vec0 = __lasx_xvaddwod_h_bu(src0, src4); + vec1 = __lasx_xvaddwod_h_bu(src1, src5); + vec2 = __lasx_xvaddwod_h_bu(src2, src6); + vec3 = __lasx_xvaddwod_h_bu(src3, src7); + tmp4 = __lasx_xvpickev_h(vec1, vec0); + tmp5 = __lasx_xvpickev_h(vec3, vec2); + vec0 = __lasx_xvpickev_h(tmp1, tmp0); + vec1 = __lasx_xvpickod_h(tmp1, tmp0); + src0 = __lasx_xvavgr_h(vec0, vec1); + vec0 = __lasx_xvpickev_h(tmp3, tmp2); + vec1 = __lasx_xvpickod_h(tmp3, tmp2); + src1 = __lasx_xvavgr_h(vec0, vec1); + vec0 = __lasx_xvpickev_h(tmp5, tmp4); + vec1 = __lasx_xvpickod_h(tmp5, tmp4); + src2 = __lasx_xvavgr_h(vec0, vec1); + dst0 = __lasx_xvmadd_h(const_0x8080, src0, const_0x70); + dst0 = __lasx_xvmsub_h(dst0, src2, const_0x4A); + dst0 = __lasx_xvmsub_h(dst0, src1, const_0x26); + dst1 = __lasx_xvmadd_h(const_0x8080, src1, const_0x70); + dst1 = __lasx_xvmsub_h(dst1, src2, const_0x5E); + dst1 = __lasx_xvmsub_h(dst1, src0, const_0x12); + dst0 = __lasx_xvperm_w(dst0, control); + dst1 = __lasx_xvperm_w(dst1, control); + dst0 = __lasx_xvssrani_b_h(dst0, dst0, 8); + dst1 = __lasx_xvssrani_b_h(dst1, dst1, 8); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst1, dst_v, 0, 0); + __lasx_xvstelm_d(dst1, dst_v, 8, 2); + src_argb0 += 128; + src_argb1 += 128; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBToRGB24Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + int len = (width / 32) - 1; + __m256i src0, src1, src2, src3; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i shuf = {0x0908060504020100, 0x000000000E0D0C0A, 0x0908060504020100, + 0x000000000E0D0C0A}; + __m256i control = {0x0000000100000000, 0x0000000400000002, 0x0000000600000005, + 0x0000000700000003}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, + 96, src0, src1, src2, src3); + tmp0 = __lasx_xvshuf_b(src0, src0, shuf); + tmp1 = __lasx_xvshuf_b(src1, src1, shuf); + tmp2 = __lasx_xvshuf_b(src2, src2, shuf); + tmp3 = __lasx_xvshuf_b(src3, src3, shuf); + tmp0 = __lasx_xvperm_w(tmp0, control); + tmp1 = __lasx_xvperm_w(tmp1, control); + tmp2 = __lasx_xvperm_w(tmp2, control); + tmp3 = __lasx_xvperm_w(tmp3, control); + __lasx_xvst(tmp0, dst_rgb, 0); + __lasx_xvst(tmp1, dst_rgb, 24); + __lasx_xvst(tmp2, dst_rgb, 48); + __lasx_xvst(tmp3, dst_rgb, 72); + dst_rgb += 96; + src_argb += 128; + } + 
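// The loop runs width/32 - 1 times: a full iteration's last 32-byte store (offset 72) would write bytes 72..103 of a 96-byte RGB24 block, so the final 32 pixels are stored below with 8-byte element stores instead. +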
DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, 96, + src0, src1, src2, src3); + tmp0 = __lasx_xvshuf_b(src0, src0, shuf); + tmp1 = __lasx_xvshuf_b(src1, src1, shuf); + tmp2 = __lasx_xvshuf_b(src2, src2, shuf); + tmp3 = __lasx_xvshuf_b(src3, src3, shuf); + tmp0 = __lasx_xvperm_w(tmp0, control); + tmp1 = __lasx_xvperm_w(tmp1, control); + tmp2 = __lasx_xvperm_w(tmp2, control); + tmp3 = __lasx_xvperm_w(tmp3, control); + __lasx_xvst(tmp0, dst_rgb, 0); + __lasx_xvst(tmp1, dst_rgb, 24); + __lasx_xvst(tmp2, dst_rgb, 48); + dst_rgb += 72; + __lasx_xvstelm_d(tmp3, dst_rgb, 0, 0); + __lasx_xvstelm_d(tmp3, dst_rgb, 8, 1); + __lasx_xvstelm_d(tmp3, dst_rgb, 16, 2); +} + +void ARGBToRAWRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + int len = (width / 32) - 1; + __m256i src0, src1, src2, src3; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i shuf = {0x090A040506000102, 0x000000000C0D0E08, 0x090A040506000102, + 0x000000000C0D0E08}; + __m256i control = {0x0000000100000000, 0x0000000400000002, 0x0000000600000005, + 0x0000000700000003}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, + 96, src0, src1, src2, src3); + tmp0 = __lasx_xvshuf_b(src0, src0, shuf); + tmp1 = __lasx_xvshuf_b(src1, src1, shuf); + tmp2 = __lasx_xvshuf_b(src2, src2, shuf); + tmp3 = __lasx_xvshuf_b(src3, src3, shuf); + tmp0 = __lasx_xvperm_w(tmp0, control); + tmp1 = __lasx_xvperm_w(tmp1, control); + tmp2 = __lasx_xvperm_w(tmp2, control); + tmp3 = __lasx_xvperm_w(tmp3, control); + __lasx_xvst(tmp0, dst_rgb, 0); + __lasx_xvst(tmp1, dst_rgb, 24); + __lasx_xvst(tmp2, dst_rgb, 48); + __lasx_xvst(tmp3, dst_rgb, 72); + dst_rgb += 96; + src_argb += 128; + } + DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, 96, + src0, src1, src2, src3); + tmp0 = __lasx_xvshuf_b(src0, src0, shuf); + tmp1 = __lasx_xvshuf_b(src1, src1, shuf); + tmp2 = __lasx_xvshuf_b(src2, src2, shuf); + tmp3 = __lasx_xvshuf_b(src3, src3, shuf); + tmp0 = __lasx_xvperm_w(tmp0, control); + tmp1 = __lasx_xvperm_w(tmp1, control); + tmp2 = __lasx_xvperm_w(tmp2, control); + tmp3 = __lasx_xvperm_w(tmp3, control); + __lasx_xvst(tmp0, dst_rgb, 0); + __lasx_xvst(tmp1, dst_rgb, 24); + __lasx_xvst(tmp2, dst_rgb, 48); + dst_rgb += 72; + __lasx_xvstelm_d(tmp3, dst_rgb, 0, 0); + __lasx_xvstelm_d(tmp3, dst_rgb, 8, 1); + __lasx_xvstelm_d(tmp3, dst_rgb, 16, 2); +} + +void ARGBToRGB565Row_LASX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + int len = width / 16; + __m256i zero = __lasx_xvldi(0); + __m256i src0, src1, tmp0, tmp1, dst0; + __m256i shift = {0x0300030003000300, 0x0300030003000300, 0x0300030003000300, + 0x0300030003000300}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmp0 = __lasx_xvsrli_b(tmp0, 3); + tmp1 = __lasx_xvpackev_b(zero, tmp1); + tmp1 = __lasx_xvsrli_h(tmp1, 2); + tmp0 = __lasx_xvsll_b(tmp0, shift); + tmp1 = __lasx_xvslli_h(tmp1, 5); + dst0 = __lasx_xvor_v(tmp0, tmp1); + dst0 = __lasx_xvpermi_d(dst0, 0xD8); + __lasx_xvst(dst0, dst_rgb, 0); + dst_rgb += 32; + src_argb += 64; + } +} + +void ARGBToARGB1555Row_LASX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + int len = width / 16; + __m256i zero = __lasx_xvldi(0); + __m256i src0, src1, tmp0, tmp1, tmp2, tmp3, dst0; + __m256i shift1 = {0x0703070307030703, 0x0703070307030703, 0x0703070307030703, + 0x0703070307030703}; + 
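// ARGB1555 layout: bit 15 = A, bits 14..10 = R, 9..5 = G, 4..0 = B; the shifts below truncate each channel to 5 bits and place it at its field offset. +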
__m256i shift2 = {0x0200020002000200, 0x0200020002000200, 0x0200020002000200, + 0x0200020002000200}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmp0 = __lasx_xvsrli_b(tmp0, 3); + tmp1 = __lasx_xvsrl_b(tmp1, shift1); + tmp0 = __lasx_xvsll_b(tmp0, shift2); + tmp2 = __lasx_xvpackev_b(zero, tmp1); + tmp3 = __lasx_xvpackod_b(zero, tmp1); + tmp2 = __lasx_xvslli_h(tmp2, 5); + tmp3 = __lasx_xvslli_h(tmp3, 15); + dst0 = __lasx_xvor_v(tmp0, tmp2); + dst0 = __lasx_xvor_v(dst0, tmp3); + dst0 = __lasx_xvpermi_d(dst0, 0xD8); + __lasx_xvst(dst0, dst_rgb, 0); + dst_rgb += 32; + src_argb += 64; + } +} + +void ARGBToARGB4444Row_LASX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + int len = width / 16; + __m256i src0, src1, tmp0, tmp1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmp1 = __lasx_xvandi_b(tmp1, 0xF0); + tmp0 = __lasx_xvsrli_b(tmp0, 4); + dst0 = __lasx_xvor_v(tmp1, tmp0); + dst0 = __lasx_xvpermi_d(dst0, 0xD8); + __lasx_xvst(dst0, dst_rgb, 0); + dst_rgb += 32; + src_argb += 64; + } +} + +void ARGBToUV444Row_LASX(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int32_t width) { + int x; + int len = width / 32; + __m256i src0, src1, src2, src3; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i reg0, reg1, reg2, reg3, dst0, dst1; + __m256i const_112 = __lasx_xvldi(112); + __m256i const_74 = __lasx_xvldi(74); + __m256i const_38 = __lasx_xvldi(38); + __m256i const_94 = __lasx_xvldi(94); + __m256i const_18 = __lasx_xvldi(18); + __m256i const_0x8080 = {0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; + __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002, + 0x0000000700000003}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, + 96, src0, src1, src2, src3); + tmp0 = __lasx_xvpickev_h(src1, src0); + tmp1 = __lasx_xvpickod_h(src1, src0); + tmp2 = __lasx_xvpickev_h(src3, src2); + tmp3 = __lasx_xvpickod_h(src3, src2); + reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp0, const_112); + reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp2, const_112); + reg2 = __lasx_xvmulwod_h_bu(tmp0, const_74); + reg3 = __lasx_xvmulwod_h_bu(tmp2, const_74); + reg2 = __lasx_xvmaddwev_h_bu(reg2, tmp1, const_38); + reg3 = __lasx_xvmaddwev_h_bu(reg3, tmp3, const_38); + reg0 = __lasx_xvsub_h(reg0, reg2); + reg1 = __lasx_xvsub_h(reg1, reg3); + dst0 = __lasx_xvssrani_b_h(reg1, reg0, 8); + dst0 = __lasx_xvperm_w(dst0, control); + reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp1, const_112); + reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp3, const_112); + reg2 = __lasx_xvmulwev_h_bu(tmp0, const_18); + reg3 = __lasx_xvmulwev_h_bu(tmp2, const_18); + reg2 = __lasx_xvmaddwod_h_bu(reg2, tmp0, const_94); + reg3 = __lasx_xvmaddwod_h_bu(reg3, tmp2, const_94); + reg0 = __lasx_xvsub_h(reg0, reg2); + reg1 = __lasx_xvsub_h(reg1, reg3); + dst1 = __lasx_xvssrani_b_h(reg1, reg0, 8); + dst1 = __lasx_xvperm_w(dst1, control); + __lasx_xvst(dst0, dst_u, 0); + __lasx_xvst(dst1, dst_v, 0); + dst_u += 32; + dst_v += 32; + src_argb += 128; + } +} + +void ARGBMultiplyRow_LASX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 8; + __m256i zero = __lasx_xvldi(0); + __m256i src0, src1, 
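/* Byte-wise multiply: interleaving src0 with itself forms a*257 in each 16-bit lane, so the unsigned high-half multiply below returns (a*257*b) >> 16, a close approximation of a*b/255. */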
dst0, dst1; + __m256i tmp0, tmp1, tmp2, tmp3; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1); + tmp0 = __lasx_xvilvl_b(src0, src0); + tmp1 = __lasx_xvilvh_b(src0, src0); + tmp2 = __lasx_xvilvl_b(zero, src1); + tmp3 = __lasx_xvilvh_b(zero, src1); + dst0 = __lasx_xvmuh_hu(tmp0, tmp2); + dst1 = __lasx_xvmuh_hu(tmp1, tmp3); + dst0 = __lasx_xvpickev_b(dst1, dst0); + __lasx_xvst(dst0, dst_argb, 0); + src_argb0 += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBAddRow_LASX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 8; + __m256i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1); + dst0 = __lasx_xvsadd_bu(src0, src1); + __lasx_xvst(dst0, dst_argb, 0); + src_argb0 += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBSubtractRow_LASX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 8; + __m256i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1); + dst0 = __lasx_xvssub_bu(src0, src1); + __lasx_xvst(dst0, dst_argb, 0); + src_argb0 += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBAttenuateRow_LASX(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 16; + __m256i src0, src1, tmp0, tmp1; + __m256i reg0, reg1, reg2, reg3, reg4, reg5; + __m256i b, g, r, a, dst0, dst1; + __m256i control = {0x0005000100040000, 0x0007000300060002, 0x0005000100040000, + 0x0007000300060002}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + b = __lasx_xvpackev_b(tmp0, tmp0); + r = __lasx_xvpackod_b(tmp0, tmp0); + g = __lasx_xvpackev_b(tmp1, tmp1); + a = __lasx_xvpackod_b(tmp1, tmp1); + reg0 = __lasx_xvmulwev_w_hu(b, a); + reg1 = __lasx_xvmulwod_w_hu(b, a); + reg2 = __lasx_xvmulwev_w_hu(r, a); + reg3 = __lasx_xvmulwod_w_hu(r, a); + reg4 = __lasx_xvmulwev_w_hu(g, a); + reg5 = __lasx_xvmulwod_w_hu(g, a); + reg0 = __lasx_xvssrani_h_w(reg1, reg0, 24); + reg2 = __lasx_xvssrani_h_w(reg3, reg2, 24); + reg4 = __lasx_xvssrani_h_w(reg5, reg4, 24); + reg0 = __lasx_xvshuf_h(control, reg0, reg0); + reg2 = __lasx_xvshuf_h(control, reg2, reg2); + reg4 = __lasx_xvshuf_h(control, reg4, reg4); + tmp0 = __lasx_xvpackev_b(reg4, reg0); + tmp1 = __lasx_xvpackev_b(a, reg2); + dst0 = __lasx_xvilvl_h(tmp1, tmp0); + dst1 = __lasx_xvilvh_h(tmp1, tmp0); + __lasx_xvst(dst0, dst_argb, 0); + __lasx_xvst(dst1, dst_argb, 32); + dst_argb += 64; + src_argb += 64; + } +} + +void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { + int x; + int len = width / 16; + __m256i src0, src1, tmp0, tmp1, dst0; + __m256i b, g, r; + __m256i zero = __lasx_xvldi(0); + __m256i vec_dither = __lasx_xvldrepl_w(&dither4, 0); + + vec_dither = __lasx_xvilvl_b(zero, vec_dither); + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + b = __lasx_xvpackev_b(zero, tmp0); + r = __lasx_xvpackod_b(zero, tmp0); + g = __lasx_xvpackev_b(zero, tmp1); + b = __lasx_xvadd_h(b, vec_dither); + g = __lasx_xvadd_h(g, vec_dither); + r = __lasx_xvadd_h(r, vec_dither); + DUP2_ARG1(__lasx_xvclip255_h, b, g, b, 
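/* clamp B and G (and R below) to 0..255 after the dither add */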
g); + r = __lasx_xvclip255_h(r); + b = __lasx_xvsrai_h(b, 3); + g = __lasx_xvsrai_h(g, 2); + r = __lasx_xvsrai_h(r, 3); + g = __lasx_xvslli_h(g, 5); + r = __lasx_xvslli_h(r, 11); + dst0 = __lasx_xvor_v(b, g); + dst0 = __lasx_xvor_v(dst0, r); + dst0 = __lasx_xvpermi_d(dst0, 0xD8); + __lasx_xvst(dst0, dst_rgb, 0); + src_argb += 64; + dst_rgb += 32; + } +} + +void ARGBShuffleRow_LASX(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + int x; + int len = width / 16; + __m256i src0, src1, dst0, dst1; + __m256i shuf = {0x0404040400000000, 0x0C0C0C0C08080808, 0x0404040400000000, + 0x0C0C0C0C08080808}; + __m256i temp = __lasx_xvldrepl_w(shuffler, 0); + + shuf = __lasx_xvadd_b(shuf, temp); + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1); + dst0 = __lasx_xvshuf_b(src0, src0, shuf); + dst1 = __lasx_xvshuf_b(src1, src1, shuf); + __lasx_xvst(dst0, dst_argb, 0); + __lasx_xvst(dst1, dst_argb, 32); + src_argb += 64; + dst_argb += 64; + } +} + +void ARGBShadeRow_LASX(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + int x; + int len = width / 8; + __m256i src0, dst0, tmp0, tmp1; + __m256i vec_value = __lasx_xvreplgr2vr_w(value); + + vec_value = __lasx_xvilvl_b(vec_value, vec_value); + for (x = 0; x < len; x++) { + src0 = __lasx_xvld(src_argb, 0); + tmp0 = __lasx_xvilvl_b(src0, src0); + tmp1 = __lasx_xvilvh_b(src0, src0); + tmp0 = __lasx_xvmuh_hu(tmp0, vec_value); + tmp1 = __lasx_xvmuh_hu(tmp1, vec_value); + dst0 = __lasx_xvpickod_b(tmp1, tmp0); + __lasx_xvst(dst0, dst_argb, 0); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + int x; + int len = width / 16; + __m256i src0, src1, tmp0, tmp1; + __m256i reg0, reg1, reg2, dst0, dst1; + __m256i const_128 = __lasx_xvldi(0x480); + __m256i const_150 = __lasx_xvldi(0x96); + __m256i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D, + 0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + reg0 = __lasx_xvdp2_h_bu(tmp0, const_br); + reg1 = __lasx_xvmaddwev_h_bu(const_128, tmp1, const_150); + reg2 = __lasx_xvadd_h(reg0, reg1); + tmp0 = __lasx_xvpackod_b(reg2, reg2); + tmp1 = __lasx_xvpackod_b(tmp1, reg2); + dst0 = __lasx_xvilvl_h(tmp1, tmp0); + dst1 = __lasx_xvilvh_h(tmp1, tmp0); + __lasx_xvst(dst0, dst_argb, 0); + __lasx_xvst(dst1, dst_argb, 32); + src_argb += 64; + dst_argb += 64; + } +} + +void ARGBSepiaRow_LASX(uint8_t* dst_argb, int width) { + int x; + int len = width / 16; + __m256i src0, src1, tmp0, tmp1; + __m256i reg0, reg1, spb, spg, spr; + __m256i dst0, dst1; + __m256i spb_g = __lasx_xvldi(68); + __m256i spg_g = __lasx_xvldi(88); + __m256i spr_g = __lasx_xvldi(98); + __m256i spb_br = {0x2311231123112311, 0x2311231123112311, 0x2311231123112311, + 0x2311231123112311}; + __m256i spg_br = {0x2D162D162D162D16, 0x2D162D162D162D16, 0x2D162D162D162D16, + 0x2D162D162D162D16}; + __m256i spr_br = {0x3218321832183218, 0x3218321832183218, 0x3218321832183218, + 0x3218321832183218}; + __m256i shuff = {0x1706150413021100, 0x1F0E1D0C1B0A1908, 0x1706150413021100, + 0x1F0E1D0C1B0A1908}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, dst_argb, 0, dst_argb, 32, src0, src1); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + DUP2_ARG2(__lasx_xvdp2_h_bu, tmp0, spb_br, tmp0, spg_br, spb, 
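/* Sepia: B' = (17B + 68G + 35R) >> 7, G' = (22B + 88G + 45R) >> 7, R' = (24B + 98G + 50R) >> 7; alpha passes through. */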
spg); + spr = __lasx_xvdp2_h_bu(tmp0, spr_br); + spb = __lasx_xvmaddwev_h_bu(spb, tmp1, spb_g); + spg = __lasx_xvmaddwev_h_bu(spg, tmp1, spg_g); + spr = __lasx_xvmaddwev_h_bu(spr, tmp1, spr_g); + spb = __lasx_xvsrli_h(spb, 7); + spg = __lasx_xvsrli_h(spg, 7); + spr = __lasx_xvsrli_h(spr, 7); + spg = __lasx_xvsat_hu(spg, 7); + spr = __lasx_xvsat_hu(spr, 7); + reg0 = __lasx_xvpackev_b(spg, spb); + reg1 = __lasx_xvshuf_b(tmp1, spr, shuff); + dst0 = __lasx_xvilvl_h(reg1, reg0); + dst1 = __lasx_xvilvh_h(reg1, reg0); + __lasx_xvst(dst0, dst_argb, 0); + __lasx_xvst(dst1, dst_argb, 32); + dst_argb += 64; + } +} + +void ARGB4444ToARGBRow_LASX(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 32; + __m256i src0, src1; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i reg0, reg1, reg2, reg3; + __m256i dst0, dst1, dst2, dst3; + + for (x = 0; x < len; x++) { + src0 = __lasx_xvld(src_argb4444, 0); + src1 = __lasx_xvld(src_argb4444, 32); + DUP4_ARG2(__lasx_xvandi_b, src0, 0x0F, src0, 0xF0, src1, 0x0F, src1, 0xF0, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lasx_xvslli_b, tmp0, 4, tmp2, 4, reg0, reg2); + DUP2_ARG2(__lasx_xvsrli_b, tmp1, 4, tmp3, 4, reg1, reg3); + DUP4_ARG2(__lasx_xvor_v, tmp0, reg0, tmp1, reg1, tmp2, reg2, tmp3, reg3, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lasx_xvilvl_b, tmp1, tmp0, tmp3, tmp2, reg0, reg2); + DUP2_ARG2(__lasx_xvilvh_b, tmp1, tmp0, tmp3, tmp2, reg1, reg3); + DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg1, reg0, 0x31, reg3, reg2, + 0x20, reg3, reg2, 0x31, dst0, dst1, dst2, dst3); + __lasx_xvst(dst0, dst_argb, 0); + __lasx_xvst(dst1, dst_argb, 32); + __lasx_xvst(dst2, dst_argb, 64); + __lasx_xvst(dst3, dst_argb, 96); + src_argb4444 += 64; + dst_argb += 128; + } +} + +void ARGB1555ToARGBRow_LASX(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 32; + __m256i src0, src1; + __m256i tmp0, tmp1, tmpb, tmpg, tmpr, tmpa; + __m256i reg0, reg1, reg2, reg3; + __m256i dst0, dst1, dst2, dst3; + + for (x = 0; x < len; x++) { + src0 = __lasx_xvld(src_argb1555, 0); + src1 = __lasx_xvld(src_argb1555, 32); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmpb = __lasx_xvandi_b(tmp0, 0x1F); + tmpg = __lasx_xvsrli_b(tmp0, 5); + reg0 = __lasx_xvandi_b(tmp1, 0x03); + reg0 = __lasx_xvslli_b(reg0, 3); + tmpg = __lasx_xvor_v(tmpg, reg0); + reg1 = __lasx_xvandi_b(tmp1, 0x7C); + tmpr = __lasx_xvsrli_b(reg1, 2); + tmpa = __lasx_xvsrli_b(tmp1, 7); + tmpa = __lasx_xvneg_b(tmpa); + reg0 = __lasx_xvslli_b(tmpb, 3); + reg1 = __lasx_xvslli_b(tmpg, 3); + reg2 = __lasx_xvslli_b(tmpr, 3); + tmpb = __lasx_xvsrli_b(tmpb, 2); + tmpg = __lasx_xvsrli_b(tmpg, 2); + tmpr = __lasx_xvsrli_b(tmpr, 2); + tmpb = __lasx_xvor_v(reg0, tmpb); + tmpg = __lasx_xvor_v(reg1, tmpg); + tmpr = __lasx_xvor_v(reg2, tmpr); + DUP2_ARG2(__lasx_xvilvl_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1); + DUP2_ARG2(__lasx_xvilvh_b, tmpg, tmpb, tmpa, tmpr, reg2, reg3); + dst0 = __lasx_xvilvl_h(reg1, reg0); + dst1 = __lasx_xvilvh_h(reg1, reg0); + dst2 = __lasx_xvilvl_h(reg3, reg2); + dst3 = __lasx_xvilvh_h(reg3, reg2); + DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst1, dst0, 0x31, dst3, dst2, + 0x20, dst3, dst2, 0x31, reg0, reg1, reg2, reg3); + __lasx_xvst(reg0, dst_argb, 0); + __lasx_xvst(reg1, dst_argb, 32); + __lasx_xvst(reg2, dst_argb, 64); + __lasx_xvst(reg3, dst_argb, 96); + src_argb1555 += 64; + dst_argb += 128; + } +} + +void RGB565ToARGBRow_LASX(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + int x; + int len 
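/* 565 -> 888 widening replicates each channel's top bits into its low bits, e.g. B8 = (B5 << 3) | (B5 >> 2), so 0x1F maps to 0xFF. */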
= width / 32; + __m256i src0, src1; + __m256i tmp0, tmp1, tmpb, tmpg, tmpr; + __m256i reg0, reg1, reg2, reg3, dst0, dst1, dst2, dst3; + __m256i alpha = __lasx_xvldi(0xFF); + + for (x = 0; x < len; x++) { + src0 = __lasx_xvld(src_rgb565, 0); + src1 = __lasx_xvld(src_rgb565, 32); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmpb = __lasx_xvandi_b(tmp0, 0x1F); + tmpr = __lasx_xvandi_b(tmp1, 0xF8); + reg1 = __lasx_xvandi_b(tmp1, 0x07); + reg0 = __lasx_xvsrli_b(tmp0, 5); + reg1 = __lasx_xvslli_b(reg1, 3); + tmpg = __lasx_xvor_v(reg1, reg0); + reg0 = __lasx_xvslli_b(tmpb, 3); + reg1 = __lasx_xvsrli_b(tmpb, 2); + tmpb = __lasx_xvor_v(reg1, reg0); + reg0 = __lasx_xvslli_b(tmpg, 2); + reg1 = __lasx_xvsrli_b(tmpg, 4); + tmpg = __lasx_xvor_v(reg1, reg0); + reg0 = __lasx_xvsrli_b(tmpr, 5); + tmpr = __lasx_xvor_v(tmpr, reg0); + DUP2_ARG2(__lasx_xvilvl_b, tmpg, tmpb, alpha, tmpr, reg0, reg1); + dst0 = __lasx_xvilvl_h(reg1, reg0); + dst1 = __lasx_xvilvh_h(reg1, reg0); + DUP2_ARG2(__lasx_xvilvh_b, tmpg, tmpb, alpha, tmpr, reg0, reg1); + dst2 = __lasx_xvilvl_h(reg1, reg0); + dst3 = __lasx_xvilvh_h(reg1, reg0); + DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst1, dst0, 0x31, dst3, dst2, + 0x20, dst3, dst2, 0x31, reg0, reg1, reg2, reg3); + __lasx_xvst(reg0, dst_argb, 0); + __lasx_xvst(reg1, dst_argb, 32); + __lasx_xvst(reg2, dst_argb, 64); + __lasx_xvst(reg3, dst_argb, 96); + src_rgb565 += 64; + dst_argb += 128; + } +} + +void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 32; + __m256i src0, src1, src2; + __m256i tmp0, tmp1, tmp2; + __m256i dst0, dst1, dst2, dst3; + __m256i reg0, reg1, reg2, reg3; + __m256i alpha = __lasx_xvldi(0xFF); + __m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, 0x131211100F0E0D0C, + 0x1B1A191817161514}; + __m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, 0x1F1E1D1C1B1A1918, + 0x0706050403020100}; + __m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, 0x0B0A090807060504, + 0x131211100F0E0D0C}; + __m256i shuf3 = {0x1005040310020100, 0x100B0A0910080706, 0x1005040310020100, + 0x100B0A0910080706}; + + for (x = 0; x < len; x++) { + reg0 = __lasx_xvld(src_rgb24, 0); + reg1 = __lasx_xvld(src_rgb24, 32); + reg2 = __lasx_xvld(src_rgb24, 64); + src0 = __lasx_xvpermi_q(reg1, reg0, 0x30); + src1 = __lasx_xvpermi_q(reg2, reg0, 0x21); + src2 = __lasx_xvpermi_q(reg2, reg1, 0x30); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, + tmp1); + tmp2 = __lasx_xvshuf_b(src1, src2, shuf2); + DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha, + tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3); + DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg3, reg2, 0x20, reg1, reg0, + 0x31, reg3, reg2, 0x31, dst0, dst1, dst2, dst3); + __lasx_xvst(dst0, dst_argb, 0); + __lasx_xvst(dst1, dst_argb, 32); + __lasx_xvst(dst2, dst_argb, 64); + __lasx_xvst(dst3, dst_argb, 96); + src_rgb24 += 96; + dst_argb += 128; + } +} + +void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + int x; + int len = width / 32; + __m256i src0, src1, src2; + __m256i tmp0, tmp1, tmp2, reg0, reg1, reg2, reg3; + __m256i dst0, dst1, dst2, dst3; + __m256i alpha = __lasx_xvldi(0xFF); + __m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, 0x131211100F0E0D0C, + 0x1B1A191817161514}; + __m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, 0x1F1E1D1C1B1A1918, + 0x0706050403020100}; + __m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, 
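/* RAW is R,G,B byte order (RGB24 reversed), so shuf3 reverses each triplet while inserting alpha; indices of 0x10 and above select from the alpha operand. */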
0x0B0A090807060504, + 0x131211100F0E0D0C}; + __m256i shuf3 = {0x1003040510000102, 0x10090A0B10060708, 0x1003040510000102, + 0x10090A0B10060708}; + + for (x = 0; x < len; x++) { + reg0 = __lasx_xvld(src_raw, 0); + reg1 = __lasx_xvld(src_raw, 32); + reg2 = __lasx_xvld(src_raw, 64); + src0 = __lasx_xvpermi_q(reg1, reg0, 0x30); + src1 = __lasx_xvpermi_q(reg2, reg0, 0x21); + src2 = __lasx_xvpermi_q(reg2, reg1, 0x30); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, + tmp1); + tmp2 = __lasx_xvshuf_b(src1, src2, shuf2); + DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha, + tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3); + DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg3, reg2, 0x20, reg1, reg0, + 0x31, reg3, reg2, 0x31, dst0, dst1, dst2, dst3); + __lasx_xvst(dst0, dst_argb, 0); + __lasx_xvst(dst1, dst_argb, 32); + __lasx_xvst(dst2, dst_argb, 64); + __lasx_xvst(dst3, dst_argb, 96); + src_raw += 96; + dst_argb += 128; + } +} + +void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + int x; + int len = width / 32; + __m256i src0, src1; + __m256i tmp0, tmp1, tmpb, tmpg, tmpr; + __m256i reg0, reg1, reg2, dst0; + __m256i const_66 = __lasx_xvldi(66); + __m256i const_129 = __lasx_xvldi(129); + __m256i const_25 = __lasx_xvldi(25); + __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, + 0x1080108010801080, 0x1080108010801080}; + + for (x = 0; x < len; x++) { + src0 = __lasx_xvld(src_argb1555, 0); + src1 = __lasx_xvld(src_argb1555, 32); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmpb = __lasx_xvandi_b(tmp0, 0x1F); + tmpg = __lasx_xvsrli_b(tmp0, 5); + reg0 = __lasx_xvandi_b(tmp1, 0x03); + reg0 = __lasx_xvslli_b(reg0, 3); + tmpg = __lasx_xvor_v(tmpg, reg0); + reg1 = __lasx_xvandi_b(tmp1, 0x7C); + tmpr = __lasx_xvsrli_b(reg1, 2); + reg0 = __lasx_xvslli_b(tmpb, 3); + reg1 = __lasx_xvslli_b(tmpg, 3); + reg2 = __lasx_xvslli_b(tmpr, 3); + tmpb = __lasx_xvsrli_b(tmpb, 2); + tmpg = __lasx_xvsrli_b(tmpg, 2); + tmpr = __lasx_xvsrli_b(tmpr, 2); + tmpb = __lasx_xvor_v(reg0, tmpb); + tmpg = __lasx_xvor_v(reg1, tmpg); + tmpr = __lasx_xvor_v(reg2, tmpr); + reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmpb, const_25); + reg1 = __lasx_xvmaddwod_h_bu(const_1080, tmpb, const_25); + reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpg, const_129); + reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpg, const_129); + reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpr, const_66); + reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpr, const_66); + dst0 = __lasx_xvpackod_b(reg1, reg0); + dst0 = __lasx_xvpermi_d(dst0, 0xD8); + __lasx_xvst(dst0, dst_y, 0); + src_argb1555 += 64; + dst_y += 32; + } +} + +void ARGB1555ToUVRow_LASX(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 32; + const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555; + __m256i src0, src1, src2, src3; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m256i reg0, reg1, reg2, reg3, dst0; + __m256i const_112 = __lasx_xvldi(0x438); + __m256i const_74 = __lasx_xvldi(0x425); + __m256i const_38 = __lasx_xvldi(0x413); + __m256i const_94 = __lasx_xvldi(0x42F); + __m256i const_18 = __lasx_xvldi(0x409); + __m256i const_8080 = {0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_argb1555, 0, src_argb1555, 32, next_argb1555, 0, + next_argb1555, 32, src0, src1, src2, 
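/* two rows are loaded so chroma can be averaged over 2x2 blocks */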
src3); + DUP2_ARG2(__lasx_xvpickev_b, src1, src0, src3, src2, tmp0, tmp2); + DUP2_ARG2(__lasx_xvpickod_b, src1, src0, src3, src2, tmp1, tmp3); + tmpb = __lasx_xvandi_b(tmp0, 0x1F); + nexb = __lasx_xvandi_b(tmp2, 0x1F); + tmpg = __lasx_xvsrli_b(tmp0, 5); + nexg = __lasx_xvsrli_b(tmp2, 5); + reg0 = __lasx_xvandi_b(tmp1, 0x03); + reg2 = __lasx_xvandi_b(tmp3, 0x03); + reg0 = __lasx_xvslli_b(reg0, 3); + reg2 = __lasx_xvslli_b(reg2, 3); + tmpg = __lasx_xvor_v(tmpg, reg0); + nexg = __lasx_xvor_v(nexg, reg2); + reg1 = __lasx_xvandi_b(tmp1, 0x7C); + reg3 = __lasx_xvandi_b(tmp3, 0x7C); + tmpr = __lasx_xvsrli_b(reg1, 2); + nexr = __lasx_xvsrli_b(reg3, 2); + reg0 = __lasx_xvslli_b(tmpb, 3); + reg1 = __lasx_xvslli_b(tmpg, 3); + reg2 = __lasx_xvslli_b(tmpr, 3); + tmpb = __lasx_xvsrli_b(tmpb, 2); + tmpg = __lasx_xvsrli_b(tmpg, 2); + tmpr = __lasx_xvsrli_b(tmpr, 2); + tmpb = __lasx_xvor_v(reg0, tmpb); + tmpg = __lasx_xvor_v(reg1, tmpg); + tmpr = __lasx_xvor_v(reg2, tmpr); + reg0 = __lasx_xvslli_b(nexb, 3); + reg1 = __lasx_xvslli_b(nexg, 3); + reg2 = __lasx_xvslli_b(nexr, 3); + nexb = __lasx_xvsrli_b(nexb, 2); + nexg = __lasx_xvsrli_b(nexg, 2); + nexr = __lasx_xvsrli_b(nexr, 2); + nexb = __lasx_xvor_v(reg0, nexb); + nexg = __lasx_xvor_v(reg1, nexg); + nexr = __lasx_xvor_v(reg2, nexr); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1); + reg0 = __lasx_xvpermi_d(reg0, 0xD8); + reg1 = __lasx_xvpermi_d(reg1, 0xD8); + dst0 = __lasx_xvpickod_b(reg1, reg0); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_v, 0, 1); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst0, dst_v, 8, 3); + src_argb1555 += 64; + next_argb1555 += 64; + dst_u += 16; + dst_v += 16; + } +} + +void RGB565ToYRow_LASX(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + int x; + int len = width / 32; + __m256i src0, src1; + __m256i tmp0, tmp1, tmpb, tmpg, tmpr; + __m256i reg0, reg1, dst0; + __m256i const_66 = __lasx_xvldi(66); + __m256i const_129 = __lasx_xvldi(129); + __m256i const_25 = __lasx_xvldi(25); + __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, + 0x1080108010801080, 0x1080108010801080}; + + for (x = 0; x < len; x++) { + src0 = __lasx_xvld(src_rgb565, 0); + src1 = __lasx_xvld(src_rgb565, 32); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmpb = __lasx_xvandi_b(tmp0, 0x1F); + tmpr = __lasx_xvandi_b(tmp1, 0xF8); + reg1 = __lasx_xvandi_b(tmp1, 0x07); + reg0 = __lasx_xvsrli_b(tmp0, 5); + reg1 = __lasx_xvslli_b(reg1, 3); + tmpg = __lasx_xvor_v(reg1, reg0); + reg0 = __lasx_xvslli_b(tmpb, 3); + reg1 = __lasx_xvsrli_b(tmpb, 2); + tmpb = __lasx_xvor_v(reg1, reg0); + reg0 = __lasx_xvslli_b(tmpg, 2); + reg1 = __lasx_xvsrli_b(tmpg, 4); + tmpg = __lasx_xvor_v(reg1, reg0); + reg0 = __lasx_xvsrli_b(tmpr, 5); + tmpr = __lasx_xvor_v(tmpr, reg0); + reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmpb, const_25); + reg1 = __lasx_xvmaddwod_h_bu(const_1080, tmpb, const_25); + reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpg, const_129); + reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpg, const_129); + reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpr, const_66); + reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpr, const_66); + dst0 = __lasx_xvpackod_b(reg1, reg0); + dst0 = __lasx_xvpermi_d(dst0, 0xD8); + __lasx_xvst(dst0, dst_y, 0); + dst_y += 32; + src_rgb565 += 64; + } +} + +void RGB565ToUVRow_LASX(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 32; + const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565; + __m256i 
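/* As in the other *ToUVRow functions, the vldi constants hold half the named coefficients (0x438 -> 56 = 112/2) to compensate for the doubled 2x2 sums. */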
src0, src1, src2, src3; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m256i reg0, reg1, reg2, reg3, dst0; + __m256i const_112 = __lasx_xvldi(0x438); + __m256i const_74 = __lasx_xvldi(0x425); + __m256i const_38 = __lasx_xvldi(0x413); + __m256i const_94 = __lasx_xvldi(0x42F); + __m256i const_18 = __lasx_xvldi(0x409); + __m256i const_8080 = {0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_rgb565, 0, src_rgb565, 32, next_rgb565, 0, + next_rgb565, 32, src0, src1, src2, src3); + DUP2_ARG2(__lasx_xvpickev_b, src1, src0, src3, src2, tmp0, tmp2); + DUP2_ARG2(__lasx_xvpickod_b, src1, src0, src3, src2, tmp1, tmp3); + tmpb = __lasx_xvandi_b(tmp0, 0x1F); + tmpr = __lasx_xvandi_b(tmp1, 0xF8); + nexb = __lasx_xvandi_b(tmp2, 0x1F); + nexr = __lasx_xvandi_b(tmp3, 0xF8); + reg1 = __lasx_xvandi_b(tmp1, 0x07); + reg3 = __lasx_xvandi_b(tmp3, 0x07); + reg0 = __lasx_xvsrli_b(tmp0, 5); + reg1 = __lasx_xvslli_b(reg1, 3); + reg2 = __lasx_xvsrli_b(tmp2, 5); + reg3 = __lasx_xvslli_b(reg3, 3); + tmpg = __lasx_xvor_v(reg1, reg0); + nexg = __lasx_xvor_v(reg2, reg3); + reg0 = __lasx_xvslli_b(tmpb, 3); + reg1 = __lasx_xvsrli_b(tmpb, 2); + reg2 = __lasx_xvslli_b(nexb, 3); + reg3 = __lasx_xvsrli_b(nexb, 2); + tmpb = __lasx_xvor_v(reg1, reg0); + nexb = __lasx_xvor_v(reg2, reg3); + reg0 = __lasx_xvslli_b(tmpg, 2); + reg1 = __lasx_xvsrli_b(tmpg, 4); + reg2 = __lasx_xvslli_b(nexg, 2); + reg3 = __lasx_xvsrli_b(nexg, 4); + tmpg = __lasx_xvor_v(reg1, reg0); + nexg = __lasx_xvor_v(reg2, reg3); + reg0 = __lasx_xvsrli_b(tmpr, 5); + reg2 = __lasx_xvsrli_b(nexr, 5); + tmpr = __lasx_xvor_v(tmpr, reg0); + nexr = __lasx_xvor_v(nexr, reg2); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1); + reg0 = __lasx_xvpermi_d(reg0, 0xD8); + reg1 = __lasx_xvpermi_d(reg1, 0xD8); + dst0 = __lasx_xvpickod_b(reg1, reg0); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_v, 0, 1); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst0, dst_v, 8, 3); + dst_u += 16; + dst_v += 16; + src_rgb565 += 64; + next_rgb565 += 64; + } +} + +void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + int x; + int len = width / 32; + __m256i src0, src1, src2; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i reg0, reg1, reg2, dst0; + __m256i const_129 = __lasx_xvldi(129); + __m256i const_br = {0x4219421942194219, 0x4219421942194219, + 0x4219421942194219, 0x4219421942194219}; + __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, + 0x1080108010801080, 0x1080108010801080}; + __m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C, 0x0B09080605030200, + 0x17151412110F0E0C}; + __m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604, 0x0301001E1D1B1A18, + 0x0F0D0C0A09070604}; + __m256i shuff2 = {0x000A000700040001, 0x001600130010000D, 0x000A000700040001, + 0x001600130010000D}; + __m256i shuff3 = {0x0002001F001C0019, 0x000E000B00080005, 0x0002001F001C0019, + 0x000E000B00080005}; + + for (x = 0; x < len; x++) { + reg0 = __lasx_xvld(src_rgb24, 0); + reg1 = __lasx_xvld(src_rgb24, 32); + reg2 = __lasx_xvld(src_rgb24, 64); + src0 = __lasx_xvpermi_q(reg1, reg0, 0x30); + src1 = __lasx_xvpermi_q(reg2, reg0, 0x21); + src2 = __lasx_xvpermi_q(reg2, reg1, 0x30); + tmp0 = __lasx_xvshuf_b(src1, src0, shuff0); + tmp1 = __lasx_xvshuf_b(src1, src2, shuff1); + tmp2 = __lasx_xvshuf_b(src1, src0, shuff2); + tmp3 = __lasx_xvshuf_b(src1, src2, shuff3); + reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmp2, 
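/* Y = (25B + 129G + 66R + 0x1080) >> 8: G is weighted by 129 here, const_br packs the 25/66 weights for B/R, and 0x1080 is the +16 offset plus rounding. */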
const_129); + reg1 = __lasx_xvmaddwev_h_bu(const_1080, tmp3, const_129); + reg0 = __lasx_xvdp2add_h_bu(reg0, const_br, tmp0); + reg1 = __lasx_xvdp2add_h_bu(reg1, const_br, tmp1); + dst0 = __lasx_xvpickod_b(reg1, reg0); + __lasx_xvst(dst0, dst_y, 0); + dst_y += 32; + src_rgb24 += 96; + } +} + +void RGB24ToUVRow_LASX(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* next_rgb24 = src_rgb24 + src_stride_rgb24; + int len = width / 32; + __m256i src0, src1, src2, reg0, reg1, reg2; + __m256i nex0, nex1, nex2, dst0, tmp0, tmp1, tmp2; + __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m256i const_112 = __lasx_xvldi(0x438); + __m256i const_74 = __lasx_xvldi(0x425); + __m256i const_38 = __lasx_xvldi(0x413); + __m256i const_94 = __lasx_xvldi(0x42F); + __m256i const_18 = __lasx_xvldi(0x409); + __m256i const_8080 = {0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; + __m256i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18, + 0x15120F0C09060300, 0x00000000001E1B18}; + __m256i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908, + 0x0706050403020100, 0x1D1A1714110A0908}; + __m256i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19, + 0x1613100D0A070401, 0x00000000001F1C19}; + __m256i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908, + 0x0706050403020100, 0x1E1B1815120A0908}; + __m256i shuff0_r = {0x1714110E0B080502, 0x0000000000001D1A, + 0x1714110E0B080502, 0x0000000000001D1A}; + __m256i shuff1_r = {0x0706050403020100, 0x1F1C191613100908, + 0x0706050403020100, 0x1F1C191613100908}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_rgb24, 0, src_rgb24, 32, src_rgb24, 64, + next_rgb24, 0, reg0, reg1, reg2, tmp0); + DUP2_ARG2(__lasx_xvld, next_rgb24, 32, next_rgb24, 64, tmp1, tmp2); + DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, reg1, + 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0); + DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb, + nexb); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg, + nexg); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr, + nexr); + DUP2_ARG3(__lasx_xvshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb, + nexb); + DUP2_ARG3(__lasx_xvshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg, + nexg); + DUP2_ARG3(__lasx_xvshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr, + nexr); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1); + dst0 = __lasx_xvpickod_b(reg1, reg0); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_v, 0, 1); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst0, dst_v, 8, 3); + src_rgb24 += 96; + next_rgb24 += 96; + dst_u += 16; + dst_v += 16; + } +} + +void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) { + int x; + int len = width / 32; + __m256i src0, src1, src2; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i reg0, reg1, reg2, dst0; + __m256i const_129 = __lasx_xvldi(129); + __m256i const_br = {0x1942194219421942, 0x1942194219421942, + 0x1942194219421942, 0x1942194219421942}; + __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, + 0x1080108010801080, 0x1080108010801080}; + __m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C, 0x0B09080605030200, + 0x17151412110F0E0C}; + __m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604, 0x0301001E1D1B1A18, + 0x0F0D0C0A09070604}; + 
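// Same dot-product luma as RGB24ToYRow above; const_br swaps the 25/66 weights because RAW stores R,G,B rather than B,G,R. +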
__m256i shuff2 = {0x000A000700040001, 0x001600130010000D, 0x000A000700040001, + 0x001600130010000D}; + __m256i shuff3 = {0x0002001F001C0019, 0x000E000B00080005, 0x0002001F001C0019, + 0x000E000B00080005}; + + for (x = 0; x < len; x++) { + reg0 = __lasx_xvld(src_raw, 0); + reg1 = __lasx_xvld(src_raw, 32); + reg2 = __lasx_xvld(src_raw, 64); + src0 = __lasx_xvpermi_q(reg1, reg0, 0x30); + src1 = __lasx_xvpermi_q(reg2, reg0, 0x21); + src2 = __lasx_xvpermi_q(reg2, reg1, 0x30); + tmp0 = __lasx_xvshuf_b(src1, src0, shuff0); + tmp1 = __lasx_xvshuf_b(src1, src2, shuff1); + tmp2 = __lasx_xvshuf_b(src1, src0, shuff2); + tmp3 = __lasx_xvshuf_b(src1, src2, shuff3); + reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmp2, const_129); + reg1 = __lasx_xvmaddwev_h_bu(const_1080, tmp3, const_129); + reg0 = __lasx_xvdp2add_h_bu(reg0, const_br, tmp0); + reg1 = __lasx_xvdp2add_h_bu(reg1, const_br, tmp1); + dst0 = __lasx_xvpickod_b(reg1, reg0); + __lasx_xvst(dst0, dst_y, 0); + dst_y += 32; + src_raw += 96; + } +} + +void RAWToUVRow_LASX(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* next_raw = src_raw + src_stride_raw; + int len = width / 32; + __m256i src0, src1, src2, reg0, reg1, reg2; + __m256i nex0, nex1, nex2, dst0, tmp0, tmp1, tmp2; + __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m256i const_112 = __lasx_xvldi(0x438); + __m256i const_74 = __lasx_xvldi(0x425); + __m256i const_38 = __lasx_xvldi(0x413); + __m256i const_94 = __lasx_xvldi(0x42F); + __m256i const_18 = __lasx_xvldi(0x409); + __m256i const_8080 = {0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; + __m256i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18, + 0x15120F0C09060300, 0x00000000001E1B18}; + __m256i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908, + 0x0706050403020100, 0x1D1A1714110A0908}; + __m256i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19, + 0x1613100D0A070401, 0x00000000001F1C19}; + __m256i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908, + 0x0706050403020100, 0x1E1B1815120A0908}; + __m256i shuff0_b = {0x1714110E0B080502, 0x0000000000001D1A, + 0x1714110E0B080502, 0x0000000000001D1A}; + __m256i shuff1_b = {0x0706050403020100, 0x1F1C191613100908, + 0x0706050403020100, 0x1F1C191613100908}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_raw, 0, src_raw, 32, src_raw, 64, next_raw, 0, + reg0, reg1, reg2, tmp0); + DUP2_ARG2(__lasx_xvld, next_raw, 32, next_raw, 64, tmp1, tmp2); + DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, reg1, + 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0); + DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb, + nexb); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg, + nexg); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr, + nexr); + DUP2_ARG3(__lasx_xvshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb, + nexb); + DUP2_ARG3(__lasx_xvshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg, + nexg); + DUP2_ARG3(__lasx_xvshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr, + nexr); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1); + dst0 = __lasx_xvpickod_b(reg1, reg0); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_v, 0, 1); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst0, dst_v, 8, 3); + src_raw += 96; + next_raw += 96; + dst_u += 16; + dst_v += 16; 
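+ // 32 pixels in -> 16 U and 16 V bytes out per iteration (2x2 subsampling).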
+ } +} + +void NV12ToARGBRow_LASX(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg; + __m256i vec_vrub, vec_vgug, vec_y, vec_vu; + __m256i out_b, out_g, out_r; + __m256i const_0x80 = __lasx_xvldi(0x80); + __m256i alpha = __lasx_xvldi(0xFF); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_vrub = __lasx_xvilvl_h(vec_vr, vec_ub); + vec_vgug = __lasx_xvilvl_h(vec_vg, vec_ug); + + for (x = 0; x < len; x++) { + vec_y = __lasx_xvld(src_y, 0); + vec_vu = __lasx_xvld(src_uv, 0); + vec_vu = __lasx_xvsub_b(vec_vu, const_0x80); + vec_vu = __lasx_vext2xv_h_b(vec_vu); + YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_r, out_g, + out_b); + STOREARGB(alpha, out_r, out_g, out_b, dst_argb); + src_y += 16; + src_uv += 16; + } +} + +void NV12ToRGB565Row_LASX(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg; + __m256i vec_vrub, vec_vgug, vec_y, vec_vu; + __m256i out_b, out_g, out_r; + __m256i const_0x80 = __lasx_xvldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_vrub = __lasx_xvilvl_h(vec_vr, vec_ub); + vec_vgug = __lasx_xvilvl_h(vec_vg, vec_ug); + + for (x = 0; x < len; x++) { + vec_y = __lasx_xvld(src_y, 0); + vec_vu = __lasx_xvld(src_uv, 0); + vec_vu = __lasx_xvsub_b(vec_vu, const_0x80); + vec_vu = __lasx_vext2xv_h_b(vec_vu); + YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_r, out_g, + out_b); + out_b = __lasx_xvsrli_h(out_b, 3); + out_g = __lasx_xvsrli_h(out_g, 2); + out_r = __lasx_xvsrli_h(out_r, 3); + out_g = __lasx_xvslli_h(out_g, 5); + out_r = __lasx_xvslli_h(out_r, 11); + out_r = __lasx_xvor_v(out_r, out_g); + out_r = __lasx_xvor_v(out_r, out_b); + __lasx_xvst(out_r, dst_rgb565, 0); + src_y += 16; + src_uv += 16; + dst_rgb565 += 32; + } +} + +void NV21ToARGBRow_LASX(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg; + __m256i vec_ubvr, vec_ugvg, vec_y, vec_uv; + __m256i out_b, out_g, out_r; + __m256i const_0x80 = __lasx_xvldi(0x80); + __m256i alpha = __lasx_xvldi(0xFF); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + vec_y = __lasx_xvld(src_y, 0); + vec_uv = __lasx_xvld(src_uv, 0); + vec_uv = __lasx_xvsub_b(vec_uv, const_0x80); + vec_uv = __lasx_vext2xv_h_b(vec_uv); + YUVTORGB(vec_y, vec_uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, out_b, out_g, + out_r); + STOREARGB(alpha, out_r, out_g, out_b, dst_argb); + src_y += 16; + src_uv += 16; + } +} + +void ARGBToYJRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) { + int x; + int len = width / 32; + __m256i src0, src1, src2, src3, dst0; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i reg0, reg1; + __m256i const_128 = __lasx_xvldi(0x480); + __m256i const_150 = __lasx_xvldi(0x96); + __m256i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D, + 0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D}; + __m256i shuff = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002, + 0x0000000700000003}; + + 
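// Full-range (JPEG) luma: Y = (29B + 150G + 77R + 128) >> 8, with no +16 bias; const_br packs the 77/29 weights. +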
for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, + 96, src0, src1, src2, src3); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmp2 = __lasx_xvpickev_b(src3, src2); + tmp3 = __lasx_xvpickod_b(src3, src2); + reg0 = __lasx_xvmaddwev_h_bu(const_128, tmp1, const_150); + reg1 = __lasx_xvmaddwev_h_bu(const_128, tmp3, const_150); + reg0 = __lasx_xvdp2add_h_bu(reg0, const_br, tmp0); + reg1 = __lasx_xvdp2add_h_bu(reg1, const_br, tmp2); + dst0 = __lasx_xvpickod_b(reg1, reg0); + dst0 = __lasx_xvperm_w(dst0, shuff); + __lasx_xvst(dst0, dst_y, 0); + dst_y += 32; + src_argb += 128; + } +} + +void ARGBToUVJRow_LASX(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* next_argb = src_argb + src_stride_argb; + int len = width / 32; + __m256i src0, src1, src2, src3; + __m256i nex0, nex1, nex2, nex3; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i reg0, reg1, dst0; + __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m256i const_63 = __lasx_xvldi(0x43F); + __m256i const_42 = __lasx_xvldi(0x42A); + __m256i const_21 = __lasx_xvldi(0x415); + __m256i const_53 = __lasx_xvldi(0x435); + __m256i const_10 = __lasx_xvldi(0x40A); + __m256i const_8080 = {0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; + __m256i shuff = {0x1614060412100200, 0x1E1C0E0C1A180A08, 0x1715070513110301, + 0x1F1D0F0D1B190B09}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, + 96, src0, src1, src2, src3); + DUP4_ARG2(__lasx_xvld, next_argb, 0, next_argb, 32, next_argb, 64, + next_argb, 96, nex0, nex1, nex2, nex3); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmp2 = __lasx_xvpickev_b(src3, src2); + tmp3 = __lasx_xvpickod_b(src3, src2); + tmpr = __lasx_xvpickod_b(tmp2, tmp0); + tmpb = __lasx_xvpickev_b(tmp2, tmp0); + tmpg = __lasx_xvpickev_b(tmp3, tmp1); + tmp0 = __lasx_xvpickev_b(nex1, nex0); + tmp1 = __lasx_xvpickod_b(nex1, nex0); + tmp2 = __lasx_xvpickev_b(nex3, nex2); + tmp3 = __lasx_xvpickod_b(nex3, nex2); + nexr = __lasx_xvpickod_b(tmp2, tmp0); + nexb = __lasx_xvpickev_b(tmp2, tmp0); + nexg = __lasx_xvpickev_b(tmp3, tmp1); + tmp0 = __lasx_xvaddwev_h_bu(tmpb, nexb); + tmp1 = __lasx_xvaddwod_h_bu(tmpb, nexb); + tmp2 = __lasx_xvaddwev_h_bu(tmpg, nexg); + tmp3 = __lasx_xvaddwod_h_bu(tmpg, nexg); + reg0 = __lasx_xvaddwev_h_bu(tmpr, nexr); + reg1 = __lasx_xvaddwod_h_bu(tmpr, nexr); + tmpb = __lasx_xvavgr_hu(tmp0, tmp1); + tmpg = __lasx_xvavgr_hu(tmp2, tmp3); + tmpr = __lasx_xvavgr_hu(reg0, reg1); + reg0 = __lasx_xvmadd_h(const_8080, const_63, tmpb); + reg1 = __lasx_xvmadd_h(const_8080, const_63, tmpr); + reg0 = __lasx_xvmsub_h(reg0, const_42, tmpg); + reg1 = __lasx_xvmsub_h(reg1, const_53, tmpg); + reg0 = __lasx_xvmsub_h(reg0, const_21, tmpr); + reg1 = __lasx_xvmsub_h(reg1, const_10, tmpb); + dst0 = __lasx_xvpackod_b(reg1, reg0); + tmp0 = __lasx_xvpermi_d(dst0, 0x44); + tmp1 = __lasx_xvpermi_d(dst0, 0xEE); + dst0 = __lasx_xvshuf_b(tmp1, tmp0, shuff); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_v, 0, 2); + __lasx_xvstelm_d(dst0, dst_u, 8, 1); + __lasx_xvstelm_d(dst0, dst_v, 8, 3); + dst_u += 16; + dst_v += 16; + src_argb += 128; + next_argb += 128; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx) diff --git a/files/source/row_lsx.cc 
b/files/source/row_lsx.cc new file mode 100644 index 00000000..3e8b901a --- /dev/null +++ b/files/source/row_lsx.cc @@ -0,0 +1,1829 @@ +/* + * Copyright 2022 The LibYuv Project Authors. All rights reserved. + * + * Copyright (c) 2022 Loongson Technology Corporation Limited + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) +#include "libyuv/loongson_intrinsics.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Fill YUV -> RGB conversion constants into vectors +#define YUVTORGB_SETUP(yuvconst, vr, ub, vg, ug, yg, yb) \ + { \ + ub = __lsx_vreplgr2vr_h(yuvconst->kUVToB[0]); \ + vr = __lsx_vreplgr2vr_h(yuvconst->kUVToR[1]); \ + ug = __lsx_vreplgr2vr_h(yuvconst->kUVToG[0]); \ + vg = __lsx_vreplgr2vr_h(yuvconst->kUVToG[1]); \ + yg = __lsx_vreplgr2vr_h(yuvconst->kYToRgb[0]); \ + yb = __lsx_vreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \ + } + +// Convert 8 pixels of YUV420 to RGB. +#define YUVTORGB(in_y, in_vu, vrub, vgug, yg, yb, out_b, out_g, out_r) \ + { \ + __m128i y_ev, y_od, u_l, v_l; \ + __m128i tmp0, tmp1, tmp2, tmp3; \ + \ + tmp0 = __lsx_vilvl_b(in_y, in_y); \ + y_ev = __lsx_vmulwev_w_hu_h(tmp0, yg); \ + y_od = __lsx_vmulwod_w_hu_h(tmp0, yg); \ + y_ev = __lsx_vsrai_w(y_ev, 16); \ + y_od = __lsx_vsrai_w(y_od, 16); \ + y_ev = __lsx_vadd_w(y_ev, yb); \ + y_od = __lsx_vadd_w(y_od, yb); \ + in_vu = __lsx_vilvl_b(zero, in_vu); \ + in_vu = __lsx_vsub_h(in_vu, const_80); \ + u_l = __lsx_vmulwev_w_h(in_vu, vrub); \ + v_l = __lsx_vmulwod_w_h(in_vu, vrub); \ + tmp0 = __lsx_vadd_w(y_ev, u_l); \ + tmp1 = __lsx_vadd_w(y_od, u_l); \ + tmp2 = __lsx_vadd_w(y_ev, v_l); \ + tmp3 = __lsx_vadd_w(y_od, v_l); \ + tmp0 = __lsx_vsrai_w(tmp0, 6); \ + tmp1 = __lsx_vsrai_w(tmp1, 6); \ + tmp2 = __lsx_vsrai_w(tmp2, 6); \ + tmp3 = __lsx_vsrai_w(tmp3, 6); \ + tmp0 = __lsx_vclip255_w(tmp0); \ + tmp1 = __lsx_vclip255_w(tmp1); \ + tmp2 = __lsx_vclip255_w(tmp2); \ + tmp3 = __lsx_vclip255_w(tmp3); \ + out_b = __lsx_vpackev_h(tmp1, tmp0); \ + out_r = __lsx_vpackev_h(tmp3, tmp2); \ + tmp0 = __lsx_vdp2_w_h(in_vu, vgug); \ + tmp1 = __lsx_vsub_w(y_ev, tmp0); \ + tmp2 = __lsx_vsub_w(y_od, tmp0); \ + tmp1 = __lsx_vsrai_w(tmp1, 6); \ + tmp2 = __lsx_vsrai_w(tmp2, 6); \ + tmp1 = __lsx_vclip255_w(tmp1); \ + tmp2 = __lsx_vclip255_w(tmp2); \ + out_g = __lsx_vpackev_h(tmp2, tmp1); \ + } + +// Convert I444 pixels of YUV420 to RGB. 
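+// (I444 here means 4:4:4 sampling, one U and one V per pixel; unlike the 4:2:0 macro above, no chroma doubling is needed.)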
+#define I444TORGB(in_yy, in_u, in_v, ub, vr, ugvg, yg, yb, out_b, out_g, \ + out_r) \ + { \ + __m128i y_ev, y_od, u_ev, v_ev, u_od, v_od; \ + __m128i tmp0, tmp1, tmp2, tmp3; \ + \ + y_ev = __lsx_vmulwev_w_hu_h(in_yy, yg); \ + y_od = __lsx_vmulwod_w_hu_h(in_yy, yg); \ + y_ev = __lsx_vsrai_w(y_ev, 16); \ + y_od = __lsx_vsrai_w(y_od, 16); \ + y_ev = __lsx_vadd_w(y_ev, yb); \ + y_od = __lsx_vadd_w(y_od, yb); \ + in_u = __lsx_vsub_h(in_u, const_80); \ + in_v = __lsx_vsub_h(in_v, const_80); \ + u_ev = __lsx_vmulwev_w_h(in_u, ub); \ + u_od = __lsx_vmulwod_w_h(in_u, ub); \ + v_ev = __lsx_vmulwev_w_h(in_v, vr); \ + v_od = __lsx_vmulwod_w_h(in_v, vr); \ + tmp0 = __lsx_vadd_w(y_ev, u_ev); \ + tmp1 = __lsx_vadd_w(y_od, u_od); \ + tmp2 = __lsx_vadd_w(y_ev, v_ev); \ + tmp3 = __lsx_vadd_w(y_od, v_od); \ + tmp0 = __lsx_vsrai_w(tmp0, 6); \ + tmp1 = __lsx_vsrai_w(tmp1, 6); \ + tmp2 = __lsx_vsrai_w(tmp2, 6); \ + tmp3 = __lsx_vsrai_w(tmp3, 6); \ + tmp0 = __lsx_vclip255_w(tmp0); \ + tmp1 = __lsx_vclip255_w(tmp1); \ + tmp2 = __lsx_vclip255_w(tmp2); \ + tmp3 = __lsx_vclip255_w(tmp3); \ + out_b = __lsx_vpackev_h(tmp1, tmp0); \ + out_r = __lsx_vpackev_h(tmp3, tmp2); \ + u_ev = __lsx_vpackev_h(in_u, in_v); \ + u_od = __lsx_vpackod_h(in_u, in_v); \ + v_ev = __lsx_vdp2_w_h(u_ev, ugvg); \ + v_od = __lsx_vdp2_w_h(u_od, ugvg); \ + tmp0 = __lsx_vsub_w(y_ev, v_ev); \ + tmp1 = __lsx_vsub_w(y_od, v_od); \ + tmp0 = __lsx_vsrai_w(tmp0, 6); \ + tmp1 = __lsx_vsrai_w(tmp1, 6); \ + tmp0 = __lsx_vclip255_w(tmp0); \ + tmp1 = __lsx_vclip255_w(tmp1); \ + out_g = __lsx_vpackev_h(tmp1, tmp0); \ + } + +// Pack and Store 8 ARGB values. +#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \ + { \ + __m128i temp0, temp1; \ + __m128i dst0, dst1; \ + \ + temp0 = __lsx_vpackev_b(in_g, in_b); \ + temp1 = __lsx_vpackev_b(in_a, in_r); \ + dst0 = __lsx_vilvl_h(temp1, temp0); \ + dst1 = __lsx_vilvh_h(temp1, temp0); \ + __lsx_vst(dst0, pdst_argb, 0); \ + __lsx_vst(dst1, pdst_argb, 16); \ + pdst_argb += 32; \ + } + +#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \ + { \ + __m128i _tmp0, _tmp1, _tmp2, _tmp3; \ + __m128i _reg0, _reg1; \ + _tmp0 = __lsx_vaddwev_h_bu(_tmpb, _nexb); \ + _tmp1 = __lsx_vaddwod_h_bu(_tmpb, _nexb); \ + _tmp2 = __lsx_vaddwev_h_bu(_tmpg, _nexg); \ + _tmp3 = __lsx_vaddwod_h_bu(_tmpg, _nexg); \ + _reg0 = __lsx_vaddwev_h_bu(_tmpr, _nexr); \ + _reg1 = __lsx_vaddwod_h_bu(_tmpr, _nexr); \ + _tmpb = __lsx_vavgr_hu(_tmp0, _tmp1); \ + _tmpg = __lsx_vavgr_hu(_tmp2, _tmp3); \ + _tmpr = __lsx_vavgr_hu(_reg0, _reg1); \ + _reg0 = __lsx_vmadd_h(const_8080, const_112, _tmpb); \ + _reg1 = __lsx_vmadd_h(const_8080, const_112, _tmpr); \ + _reg0 = __lsx_vmsub_h(_reg0, const_74, _tmpg); \ + _reg1 = __lsx_vmsub_h(_reg1, const_94, _tmpg); \ + _reg0 = __lsx_vmsub_h(_reg0, const_38, _tmpr); \ + _reg1 = __lsx_vmsub_h(_reg1, const_18, _tmpb); \ + _dst0 = __lsx_vpickod_b(_reg1, _reg0); \ + } + +void ARGB4444ToARGBRow_LSX(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 16; + __m128i src0, src1; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3; + __m128i dst0, dst1, dst2, dst3; + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_argb4444, 0); + src1 = __lsx_vld(src_argb4444, 16); + tmp0 = __lsx_vandi_b(src0, 0x0F); + tmp1 = __lsx_vandi_b(src0, 0xF0); + tmp2 = __lsx_vandi_b(src1, 0x0F); + tmp3 = __lsx_vandi_b(src1, 0xF0); + reg0 = __lsx_vslli_b(tmp0, 4); + reg2 = __lsx_vslli_b(tmp2, 4); + reg1 = __lsx_vsrli_b(tmp1, 4); + reg3 = __lsx_vsrli_b(tmp3, 4); + 
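// 4 -> 8 bit expansion: OR each nibble with its 4-bit shifted copy so 0xF maps to 0xFF. +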
DUP4_ARG2(__lsx_vor_v, tmp0, reg0, tmp1, reg1, tmp2, reg2, tmp3, reg3, tmp0, + tmp1, tmp2, tmp3); + dst0 = __lsx_vilvl_b(tmp1, tmp0); + dst2 = __lsx_vilvl_b(tmp3, tmp2); + dst1 = __lsx_vilvh_b(tmp1, tmp0); + dst3 = __lsx_vilvh_b(tmp3, tmp2); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + __lsx_vst(dst2, dst_argb, 32); + __lsx_vst(dst3, dst_argb, 48); + dst_argb += 64; + src_argb4444 += 32; + } +} + +void ARGB1555ToARGBRow_LSX(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 16; + __m128i src0, src1; + __m128i tmp0, tmp1, tmpb, tmpg, tmpr, tmpa; + __m128i reg0, reg1, reg2; + __m128i dst0, dst1, dst2, dst3; + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_argb1555, 0); + src1 = __lsx_vld(src_argb1555, 16); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmpb = __lsx_vandi_b(tmp0, 0x1F); + tmpg = __lsx_vsrli_b(tmp0, 5); + reg0 = __lsx_vandi_b(tmp1, 0x03); + reg0 = __lsx_vslli_b(reg0, 3); + tmpg = __lsx_vor_v(tmpg, reg0); + reg1 = __lsx_vandi_b(tmp1, 0x7C); + tmpr = __lsx_vsrli_b(reg1, 2); + tmpa = __lsx_vsrli_b(tmp1, 7); + tmpa = __lsx_vneg_b(tmpa); + reg0 = __lsx_vslli_b(tmpb, 3); + reg1 = __lsx_vslli_b(tmpg, 3); + reg2 = __lsx_vslli_b(tmpr, 3); + tmpb = __lsx_vsrli_b(tmpb, 2); + tmpg = __lsx_vsrli_b(tmpg, 2); + tmpr = __lsx_vsrli_b(tmpr, 2); + tmpb = __lsx_vor_v(reg0, tmpb); + tmpg = __lsx_vor_v(reg1, tmpg); + tmpr = __lsx_vor_v(reg2, tmpr); + DUP2_ARG2(__lsx_vilvl_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1); + dst0 = __lsx_vilvl_h(reg1, reg0); + dst1 = __lsx_vilvh_h(reg1, reg0); + DUP2_ARG2(__lsx_vilvh_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1); + dst2 = __lsx_vilvl_h(reg1, reg0); + dst3 = __lsx_vilvh_h(reg1, reg0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + __lsx_vst(dst2, dst_argb, 32); + __lsx_vst(dst3, dst_argb, 48); + dst_argb += 64; + src_argb1555 += 32; + } +} + +void RGB565ToARGBRow_LSX(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 16; + __m128i src0, src1; + __m128i tmp0, tmp1, tmpb, tmpg, tmpr; + __m128i reg0, reg1, dst0, dst1, dst2, dst3; + __m128i alpha = __lsx_vldi(0xFF); + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_rgb565, 0); + src1 = __lsx_vld(src_rgb565, 16); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmpb = __lsx_vandi_b(tmp0, 0x1F); + tmpr = __lsx_vandi_b(tmp1, 0xF8); + reg1 = __lsx_vandi_b(tmp1, 0x07); + reg0 = __lsx_vsrli_b(tmp0, 5); + reg1 = __lsx_vslli_b(reg1, 3); + tmpg = __lsx_vor_v(reg1, reg0); + reg0 = __lsx_vslli_b(tmpb, 3); + reg1 = __lsx_vsrli_b(tmpb, 2); + tmpb = __lsx_vor_v(reg1, reg0); + reg0 = __lsx_vslli_b(tmpg, 2); + reg1 = __lsx_vsrli_b(tmpg, 4); + tmpg = __lsx_vor_v(reg1, reg0); + reg0 = __lsx_vsrli_b(tmpr, 5); + tmpr = __lsx_vor_v(tmpr, reg0); + DUP2_ARG2(__lsx_vilvl_b, tmpg, tmpb, alpha, tmpr, reg0, reg1); + dst0 = __lsx_vilvl_h(reg1, reg0); + dst1 = __lsx_vilvh_h(reg1, reg0); + DUP2_ARG2(__lsx_vilvh_b, tmpg, tmpb, alpha, tmpr, reg0, reg1); + dst2 = __lsx_vilvl_h(reg1, reg0); + dst3 = __lsx_vilvh_h(reg1, reg0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + __lsx_vst(dst2, dst_argb, 32); + __lsx_vst(dst3, dst_argb, 48); + dst_argb += 64; + src_rgb565 += 32; + } +} + +void RGB24ToARGBRow_LSX(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 16; + __m128i src0, src1, src2; + __m128i tmp0, tmp1, tmp2; + __m128i dst0, dst1, dst2, dst3; + __m128i alpha = __lsx_vldi(0xFF); + __m128i shuf0 = 
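/* shuffle tables for the 48 -> 64 byte RGB24 -> ARGB expansion; in vshuf_b, indices of 0x10 and above pick from the alpha (0xFF) operand */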
{0x131211100F0E0D0C, 0x1B1A191817161514}; + __m128i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100}; + __m128i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C}; + __m128i shuf3 = {0x1005040310020100, 0x100B0A0910080706}; + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_rgb24, 0); + src1 = __lsx_vld(src_rgb24, 16); + src2 = __lsx_vld(src_rgb24, 32); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, tmp1); + tmp2 = __lsx_vshuf_b(src1, src2, shuf2); + DUP4_ARG3(__lsx_vshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha, + tmp1, shuf3, alpha, tmp2, shuf3, dst0, dst1, dst2, dst3); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + __lsx_vst(dst2, dst_argb, 32); + __lsx_vst(dst3, dst_argb, 48); + dst_argb += 64; + src_rgb24 += 48; + } +} + +void RAWToARGBRow_LSX(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + int x; + int len = width / 16; + __m128i src0, src1, src2; + __m128i tmp0, tmp1, tmp2; + __m128i dst0, dst1, dst2, dst3; + __m128i alpha = __lsx_vldi(0xFF); + __m128i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514}; + __m128i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100}; + __m128i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C}; + __m128i shuf3 = {0x1003040510000102, 0x10090A0B10060708}; + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_raw, 0); + src1 = __lsx_vld(src_raw, 16); + src2 = __lsx_vld(src_raw, 32); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, tmp1); + tmp2 = __lsx_vshuf_b(src1, src2, shuf2); + DUP4_ARG3(__lsx_vshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha, + tmp1, shuf3, alpha, tmp2, shuf3, dst0, dst1, dst2, dst3); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + __lsx_vst(dst2, dst_argb, 32); + __lsx_vst(dst3, dst_argb, 48); + dst_argb += 64; + src_raw += 48; + } +} + +void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + int x; + int len = width / 16; + __m128i src0, src1; + __m128i tmp0, tmp1, tmpb, tmpg, tmpr; + __m128i reg0, reg1, reg2, dst0; + __m128i const_66 = __lsx_vldi(66); + __m128i const_129 = __lsx_vldi(129); + __m128i const_25 = __lsx_vldi(25); + __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_argb1555, 0); + src1 = __lsx_vld(src_argb1555, 16); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmpb = __lsx_vandi_b(tmp0, 0x1F); + tmpg = __lsx_vsrli_b(tmp0, 5); + reg0 = __lsx_vandi_b(tmp1, 0x03); + reg0 = __lsx_vslli_b(reg0, 3); + tmpg = __lsx_vor_v(tmpg, reg0); + reg1 = __lsx_vandi_b(tmp1, 0x7C); + tmpr = __lsx_vsrli_b(reg1, 2); + reg0 = __lsx_vslli_b(tmpb, 3); + reg1 = __lsx_vslli_b(tmpg, 3); + reg2 = __lsx_vslli_b(tmpr, 3); + tmpb = __lsx_vsrli_b(tmpb, 2); + tmpg = __lsx_vsrli_b(tmpg, 2); + tmpr = __lsx_vsrli_b(tmpr, 2); + tmpb = __lsx_vor_v(reg0, tmpb); + tmpg = __lsx_vor_v(reg1, tmpg); + tmpr = __lsx_vor_v(reg2, tmpr); + reg0 = __lsx_vmaddwev_h_bu(const_1080, tmpb, const_25); + reg1 = __lsx_vmaddwod_h_bu(const_1080, tmpb, const_25); + reg0 = __lsx_vmaddwev_h_bu(reg0, tmpg, const_129); + reg1 = __lsx_vmaddwod_h_bu(reg1, tmpg, const_129); + reg0 = __lsx_vmaddwev_h_bu(reg0, tmpr, const_66); + reg1 = __lsx_vmaddwod_h_bu(reg1, tmpr, const_66); + dst0 = __lsx_vpackod_b(reg1, reg0); + __lsx_vst(dst0, dst_y, 0); + dst_y += 16; + src_argb1555 += 32; + } +} + +void ARGB1555ToUVRow_LSX(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 16; + 
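+  // U and V are subsampled 2x2, so this row and the row below are read
+  // together; RGBTOUV averages the pairs before its fixed-point dot products.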
const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m128i reg0, reg1, reg2, reg3, dst0; + __m128i const_112 = __lsx_vldi(0x438); + __m128i const_74 = __lsx_vldi(0x425); + __m128i const_38 = __lsx_vldi(0x413); + __m128i const_94 = __lsx_vldi(0x42F); + __m128i const_18 = __lsx_vldi(0x409); + __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb1555, 0, src_argb1555, 16, next_argb1555, 0, + next_argb1555, 16, src0, src1, src2, src3); + DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2); + DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3); + tmpb = __lsx_vandi_b(tmp0, 0x1F); + nexb = __lsx_vandi_b(tmp2, 0x1F); + tmpg = __lsx_vsrli_b(tmp0, 5); + nexg = __lsx_vsrli_b(tmp2, 5); + reg0 = __lsx_vandi_b(tmp1, 0x03); + reg2 = __lsx_vandi_b(tmp3, 0x03); + reg0 = __lsx_vslli_b(reg0, 3); + reg2 = __lsx_vslli_b(reg2, 3); + tmpg = __lsx_vor_v(tmpg, reg0); + nexg = __lsx_vor_v(nexg, reg2); + reg1 = __lsx_vandi_b(tmp1, 0x7C); + reg3 = __lsx_vandi_b(tmp3, 0x7C); + tmpr = __lsx_vsrli_b(reg1, 2); + nexr = __lsx_vsrli_b(reg3, 2); + reg0 = __lsx_vslli_b(tmpb, 3); + reg1 = __lsx_vslli_b(tmpg, 3); + reg2 = __lsx_vslli_b(tmpr, 3); + tmpb = __lsx_vsrli_b(tmpb, 2); + tmpg = __lsx_vsrli_b(tmpg, 2); + tmpr = __lsx_vsrli_b(tmpr, 2); + tmpb = __lsx_vor_v(reg0, tmpb); + tmpg = __lsx_vor_v(reg1, tmpg); + tmpr = __lsx_vor_v(reg2, tmpr); + reg0 = __lsx_vslli_b(nexb, 3); + reg1 = __lsx_vslli_b(nexg, 3); + reg2 = __lsx_vslli_b(nexr, 3); + nexb = __lsx_vsrli_b(nexb, 2); + nexg = __lsx_vsrli_b(nexg, 2); + nexr = __lsx_vsrli_b(nexr, 2); + nexb = __lsx_vor_v(reg0, nexb); + nexg = __lsx_vor_v(reg1, nexg); + nexr = __lsx_vor_v(reg2, nexr); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst0, dst_v, 0, 1); + dst_u += 8; + dst_v += 8; + src_argb1555 += 32; + next_argb1555 += 32; + } +} + +void RGB565ToYRow_LSX(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + int x; + int len = width / 16; + __m128i src0, src1; + __m128i tmp0, tmp1, tmpb, tmpg, tmpr; + __m128i reg0, reg1, dst0; + __m128i const_66 = __lsx_vldi(66); + __m128i const_129 = __lsx_vldi(129); + __m128i const_25 = __lsx_vldi(25); + __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_rgb565, 0); + src1 = __lsx_vld(src_rgb565, 16); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmpb = __lsx_vandi_b(tmp0, 0x1F); + tmpr = __lsx_vandi_b(tmp1, 0xF8); + reg1 = __lsx_vandi_b(tmp1, 0x07); + reg0 = __lsx_vsrli_b(tmp0, 5); + reg1 = __lsx_vslli_b(reg1, 3); + tmpg = __lsx_vor_v(reg1, reg0); + reg0 = __lsx_vslli_b(tmpb, 3); + reg1 = __lsx_vsrli_b(tmpb, 2); + tmpb = __lsx_vor_v(reg1, reg0); + reg0 = __lsx_vslli_b(tmpg, 2); + reg1 = __lsx_vsrli_b(tmpg, 4); + tmpg = __lsx_vor_v(reg1, reg0); + reg0 = __lsx_vsrli_b(tmpr, 5); + tmpr = __lsx_vor_v(tmpr, reg0); + reg0 = __lsx_vmaddwev_h_bu(const_1080, tmpb, const_25); + reg1 = __lsx_vmaddwod_h_bu(const_1080, tmpb, const_25); + reg0 = __lsx_vmaddwev_h_bu(reg0, tmpg, const_129); + reg1 = __lsx_vmaddwod_h_bu(reg1, tmpg, const_129); + reg0 = __lsx_vmaddwev_h_bu(reg0, tmpr, const_66); + reg1 = __lsx_vmaddwod_h_bu(reg1, tmpr, const_66); + dst0 = __lsx_vpackod_b(reg1, reg0); + __lsx_vst(dst0, dst_y, 0); + dst_y += 16; + src_rgb565 += 32; + } +} + +void 
RGB565ToUVRow_LSX(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 16; + const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m128i reg0, reg1, reg2, reg3, dst0; + __m128i const_112 = __lsx_vldi(0x438); + __m128i const_74 = __lsx_vldi(0x425); + __m128i const_38 = __lsx_vldi(0x413); + __m128i const_94 = __lsx_vldi(0x42F); + __m128i const_18 = __lsx_vldi(0x409); + __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_rgb565, 0, src_rgb565, 16, next_rgb565, 0, + next_rgb565, 16, src0, src1, src2, src3); + DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2); + DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3); + tmpb = __lsx_vandi_b(tmp0, 0x1F); + tmpr = __lsx_vandi_b(tmp1, 0xF8); + nexb = __lsx_vandi_b(tmp2, 0x1F); + nexr = __lsx_vandi_b(tmp3, 0xF8); + reg1 = __lsx_vandi_b(tmp1, 0x07); + reg3 = __lsx_vandi_b(tmp3, 0x07); + reg0 = __lsx_vsrli_b(tmp0, 5); + reg1 = __lsx_vslli_b(reg1, 3); + reg2 = __lsx_vsrli_b(tmp2, 5); + reg3 = __lsx_vslli_b(reg3, 3); + tmpg = __lsx_vor_v(reg1, reg0); + nexg = __lsx_vor_v(reg2, reg3); + reg0 = __lsx_vslli_b(tmpb, 3); + reg1 = __lsx_vsrli_b(tmpb, 2); + reg2 = __lsx_vslli_b(nexb, 3); + reg3 = __lsx_vsrli_b(nexb, 2); + tmpb = __lsx_vor_v(reg1, reg0); + nexb = __lsx_vor_v(reg2, reg3); + reg0 = __lsx_vslli_b(tmpg, 2); + reg1 = __lsx_vsrli_b(tmpg, 4); + reg2 = __lsx_vslli_b(nexg, 2); + reg3 = __lsx_vsrli_b(nexg, 4); + tmpg = __lsx_vor_v(reg1, reg0); + nexg = __lsx_vor_v(reg2, reg3); + reg0 = __lsx_vsrli_b(tmpr, 5); + reg2 = __lsx_vsrli_b(nexr, 5); + tmpr = __lsx_vor_v(tmpr, reg0); + nexr = __lsx_vor_v(nexr, reg2); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst0, dst_v, 0, 1); + dst_u += 8; + dst_v += 8; + src_rgb565 += 32; + next_rgb565 += 32; + } +} + +void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + int x; + int len = width / 16; + __m128i src0, src1, src2; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, dst0; + __m128i const_129 = __lsx_vldi(129); + __m128i const_br = {0x4219421942194219, 0x4219421942194219}; + __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; + __m128i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C}; + __m128i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604}; + __m128i shuff2 = {0x000A000700040001, 0x001600130010000D}; + __m128i shuff3 = {0x0002001F001C0019, 0x000E000B00080005}; + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_rgb24, 0); + src1 = __lsx_vld(src_rgb24, 16); + src2 = __lsx_vld(src_rgb24, 32); + tmp0 = __lsx_vshuf_b(src1, src0, shuff0); + tmp1 = __lsx_vshuf_b(src1, src2, shuff1); + tmp2 = __lsx_vshuf_b(src1, src0, shuff2); + tmp3 = __lsx_vshuf_b(src1, src2, shuff3); + reg0 = __lsx_vmaddwev_h_bu(const_1080, tmp2, const_129); + reg1 = __lsx_vmaddwev_h_bu(const_1080, tmp3, const_129); + reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); + reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp1); + dst0 = __lsx_vpickod_b(reg1, reg0); + __lsx_vst(dst0, dst_y, 0); + dst_y += 16; + src_rgb24 += 48; + } +} + +void RGB24ToUVRow_LSX(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* next_rgb24 = src_rgb24 + src_stride_rgb24; + int len = width / 16; + __m128i 
src0, src1, src2; + __m128i nex0, nex1, nex2, dst0; + __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m128i const_112 = __lsx_vldi(0x438); + __m128i const_74 = __lsx_vldi(0x425); + __m128i const_38 = __lsx_vldi(0x413); + __m128i const_94 = __lsx_vldi(0x42F); + __m128i const_18 = __lsx_vldi(0x409); + __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + __m128i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18}; + __m128i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908}; + __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19}; + __m128i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908}; + __m128i shuff0_r = {0x1714110E0B080502, 0x0000000000001D1A}; + __m128i shuff1_r = {0x0706050403020100, 0x1F1C191613100908}; + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_rgb24, 0); + src1 = __lsx_vld(src_rgb24, 16); + src2 = __lsx_vld(src_rgb24, 32); + nex0 = __lsx_vld(next_rgb24, 0); + nex1 = __lsx_vld(next_rgb24, 16); + nex2 = __lsx_vld(next_rgb24, 32); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb, + nexb); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg, + nexg); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr, + nexr); + DUP2_ARG3(__lsx_vshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb, + nexb); + DUP2_ARG3(__lsx_vshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg, + nexg); + DUP2_ARG3(__lsx_vshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr, + nexr); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst0, dst_v, 0, 1); + dst_u += 8; + dst_v += 8; + src_rgb24 += 48; + next_rgb24 += 48; + } +} + +void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width) { + int x; + int len = width / 16; + __m128i src0, src1, src2; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, dst0; + __m128i const_129 = __lsx_vldi(129); + __m128i const_br = {0x1942194219421942, 0x1942194219421942}; + __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; + __m128i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C}; + __m128i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604}; + __m128i shuff2 = {0x000A000700040001, 0x001600130010000D}; + __m128i shuff3 = {0x0002001F001C0019, 0x000E000B00080005}; + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_raw, 0); + src1 = __lsx_vld(src_raw, 16); + src2 = __lsx_vld(src_raw, 32); + tmp0 = __lsx_vshuf_b(src1, src0, shuff0); + tmp1 = __lsx_vshuf_b(src1, src2, shuff1); + tmp2 = __lsx_vshuf_b(src1, src0, shuff2); + tmp3 = __lsx_vshuf_b(src1, src2, shuff3); + reg0 = __lsx_vmaddwev_h_bu(const_1080, tmp2, const_129); + reg1 = __lsx_vmaddwev_h_bu(const_1080, tmp3, const_129); + reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); + reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp1); + dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8); + __lsx_vst(dst0, dst_y, 0); + dst_y += 16; + src_raw += 48; + } +} + +void RAWToUVRow_LSX(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* next_raw = src_raw + src_stride_raw; + int len = width / 16; + __m128i src0, src1, src2; + __m128i nex0, nex1, nex2, dst0; + __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m128i const_112 = __lsx_vldi(0x438); + __m128i const_74 = __lsx_vldi(0x425); + __m128i const_38 = __lsx_vldi(0x413); + __m128i const_94 = __lsx_vldi(0x42F); + __m128i const_18 = __lsx_vldi(0x409); + __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + __m128i shuff0_r = 
{0x15120F0C09060300, 0x00000000001E1B18}; + __m128i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908}; + __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19}; + __m128i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908}; + __m128i shuff0_b = {0x1714110E0B080502, 0x0000000000001D1A}; + __m128i shuff1_b = {0x0706050403020100, 0x1F1C191613100908}; + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_raw, 0); + src1 = __lsx_vld(src_raw, 16); + src2 = __lsx_vld(src_raw, 32); + nex0 = __lsx_vld(next_raw, 0); + nex1 = __lsx_vld(next_raw, 16); + nex2 = __lsx_vld(next_raw, 32); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb, + nexb); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg, + nexg); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr, + nexr); + DUP2_ARG3(__lsx_vshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb, + nexb); + DUP2_ARG3(__lsx_vshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg, + nexg); + DUP2_ARG3(__lsx_vshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr, + nexr); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst0, dst_v, 0, 1); + dst_u += 8; + dst_v += 8; + src_raw += 48; + next_raw += 48; + } +} + +void NV12ToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 8; + __m128i vec_y, vec_vu; + __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb; + __m128i vec_vrub, vec_vgug; + __m128i out_b, out_g, out_r; + __m128i const_80 = __lsx_vldi(0x480); + __m128i alpha = __lsx_vldi(0xFF); + __m128i zero = __lsx_vldi(0); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub); + vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug); + + for (x = 0; x < len; x++) { + vec_y = __lsx_vld(src_y, 0); + vec_vu = __lsx_vld(src_uv, 0); + YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g, + out_r); + STOREARGB(alpha, out_r, out_g, out_b, dst_argb); + src_y += 8; + src_uv += 8; + } +} + +void NV12ToRGB565Row_LSX(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 8; + __m128i vec_y, vec_vu; + __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb; + __m128i vec_vrub, vec_vgug; + __m128i out_b, out_g, out_r; + __m128i const_80 = __lsx_vldi(0x480); + __m128i zero = __lsx_vldi(0); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub); + vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug); + + for (x = 0; x < len; x++) { + vec_y = __lsx_vld(src_y, 0); + vec_vu = __lsx_vld(src_uv, 0); + YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g, + out_r); + out_b = __lsx_vsrli_h(out_b, 3); + out_g = __lsx_vsrli_h(out_g, 2); + out_r = __lsx_vsrli_h(out_r, 3); + out_g = __lsx_vslli_h(out_g, 5); + out_r = __lsx_vslli_h(out_r, 11); + out_r = __lsx_vor_v(out_r, out_g); + out_r = __lsx_vor_v(out_r, out_b); + __lsx_vst(out_r, dst_rgb565, 0); + src_y += 8; + src_uv += 8; + dst_rgb565 += 16; + } +} + +void NV21ToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 8; + __m128i vec_y, vec_uv; + __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb; + __m128i 
vec_ubvr, vec_ugvg; + __m128i out_b, out_g, out_r; + __m128i const_80 = __lsx_vldi(0x480); + __m128i alpha = __lsx_vldi(0xFF); + __m128i zero = __lsx_vldi(0); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + vec_y = __lsx_vld(src_y, 0); + vec_uv = __lsx_vld(src_vu, 0); + YUVTORGB(vec_y, vec_uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, out_r, out_g, + out_b); + STOREARGB(alpha, out_r, out_g, out_b, dst_argb); + src_y += 8; + src_vu += 8; + } +} + +void SobelRow_LSX(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 16; + __m128i src0, src1, tmp0; + __m128i out0, out1, out2, out3; + __m128i alpha = __lsx_vldi(0xFF); + __m128i shuff0 = {0x1001010110000000, 0x1003030310020202}; + __m128i shuff1 = __lsx_vaddi_bu(shuff0, 0x04); + __m128i shuff2 = __lsx_vaddi_bu(shuff1, 0x04); + __m128i shuff3 = __lsx_vaddi_bu(shuff2, 0x04); + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_sobelx, 0); + src1 = __lsx_vld(src_sobely, 0); + tmp0 = __lsx_vsadd_bu(src0, src1); + DUP4_ARG3(__lsx_vshuf_b, alpha, tmp0, shuff0, alpha, tmp0, shuff1, alpha, + tmp0, shuff2, alpha, tmp0, shuff3, out0, out1, out2, out3); + __lsx_vst(out0, dst_argb, 0); + __lsx_vst(out1, dst_argb, 16); + __lsx_vst(out2, dst_argb, 32); + __lsx_vst(out3, dst_argb, 48); + src_sobelx += 16; + src_sobely += 16; + dst_argb += 64; + } +} + +void SobelToPlaneRow_LSX(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + int x; + int len = width / 32; + __m128i src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_sobelx, 0, src_sobelx, 16, src0, src1); + DUP2_ARG2(__lsx_vld, src_sobely, 0, src_sobely, 16, src2, src3); + dst0 = __lsx_vsadd_bu(src0, src2); + dst1 = __lsx_vsadd_bu(src1, src3); + __lsx_vst(dst0, dst_y, 0); + __lsx_vst(dst1, dst_y, 16); + src_sobelx += 32; + src_sobely += 32; + dst_y += 32; + } +} + +void SobelXYRow_LSX(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 16; + __m128i src_r, src_b, src_g; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1, dst2, dst3; + __m128i alpha = __lsx_vldi(0xFF); + + for (x = 0; x < len; x++) { + src_r = __lsx_vld(src_sobelx, 0); + src_b = __lsx_vld(src_sobely, 0); + src_g = __lsx_vsadd_bu(src_r, src_b); + tmp0 = __lsx_vilvl_b(src_g, src_b); + tmp1 = __lsx_vilvh_b(src_g, src_b); + tmp2 = __lsx_vilvl_b(alpha, src_r); + tmp3 = __lsx_vilvh_b(alpha, src_r); + dst0 = __lsx_vilvl_h(tmp2, tmp0); + dst1 = __lsx_vilvh_h(tmp2, tmp0); + dst2 = __lsx_vilvl_h(tmp3, tmp1); + dst3 = __lsx_vilvh_h(tmp3, tmp1); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + __lsx_vst(dst2, dst_argb, 32); + __lsx_vst(dst3, dst_argb, 48); + src_sobelx += 16; + src_sobely += 16; + dst_argb += 64; + } +} + +void ARGBToYJRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width) { + int x; + int len = width / 16; + __m128i src0, src1, src2, src3, dst0; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1; + __m128i const_128 = __lsx_vldi(0x480); + __m128i const_150 = __lsx_vldi(0x96); + __m128i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = 
__lsx_vpickod_b(src1, src0); + tmp2 = __lsx_vpickev_b(src3, src2); + tmp3 = __lsx_vpickod_b(src3, src2); + reg0 = __lsx_vmaddwev_h_bu(const_128, tmp1, const_150); + reg1 = __lsx_vmaddwev_h_bu(const_128, tmp3, const_150); + reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); + reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp2); + dst0 = __lsx_vpickod_b(reg1, reg0); + __lsx_vst(dst0, dst_y, 0); + dst_y += 16; + src_argb += 64; + } +} + +void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + int x; + int len = width / 16; + __m128i src0, src1, src2, src3, dst0; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1; + __m128i const_129 = __lsx_vldi(0x81); + __m128i const_br = {0x1942194219421942, 0x1942194219421942}; + __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_bgra, 0, src_bgra, 16, src_bgra, 32, src_bgra, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vpickod_b(src1, src0); + tmp1 = __lsx_vpickev_b(src1, src0); + tmp2 = __lsx_vpickod_b(src3, src2); + tmp3 = __lsx_vpickev_b(src3, src2); + reg0 = __lsx_vmaddwod_h_bu(const_1080, tmp1, const_129); + reg1 = __lsx_vmaddwod_h_bu(const_1080, tmp3, const_129); + reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); + reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp2); + dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8); + __lsx_vst(dst0, dst_y, 0); + dst_y += 16; + src_bgra += 64; + } +} + +void BGRAToUVRow_LSX(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* next_bgra = src_bgra + src_stride_bgra; + int len = width / 16; + __m128i src0, src1, src2, src3; + __m128i nex0, nex1, nex2, nex3; + __m128i tmp0, tmp1, tmp2, tmp3, dst0; + __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m128i const_112 = __lsx_vldi(0x438); + __m128i const_74 = __lsx_vldi(0x425); + __m128i const_38 = __lsx_vldi(0x413); + __m128i const_94 = __lsx_vldi(0x42F); + __m128i const_18 = __lsx_vldi(0x409); + __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_bgra, 0, src_bgra, 16, src_bgra, 32, src_bgra, 48, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, next_bgra, 0, next_bgra, 16, next_bgra, 32, next_bgra, + 48, nex0, nex1, nex2, nex3); + tmp0 = __lsx_vpickod_b(src1, src0); + tmp1 = __lsx_vpickev_b(src1, src0); + tmp2 = __lsx_vpickod_b(src3, src2); + tmp3 = __lsx_vpickev_b(src3, src2); + tmpb = __lsx_vpickod_b(tmp2, tmp0); + tmpr = __lsx_vpickev_b(tmp2, tmp0); + tmpg = __lsx_vpickod_b(tmp3, tmp1); + tmp0 = __lsx_vpickod_b(nex1, nex0); + tmp1 = __lsx_vpickev_b(nex1, nex0); + tmp2 = __lsx_vpickod_b(nex3, nex2); + tmp3 = __lsx_vpickev_b(nex3, nex2); + nexb = __lsx_vpickod_b(tmp2, tmp0); + nexr = __lsx_vpickev_b(tmp2, tmp0); + nexg = __lsx_vpickod_b(tmp3, tmp1); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst0, dst_v, 0, 1); + dst_u += 8; + dst_v += 8; + src_bgra += 64; + next_bgra += 64; + } +} + +void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + int x; + int len = width / 16; + __m128i src0, src1, src2, src3, dst0; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1; + __m128i const_129 = __lsx_vldi(0x81); + __m128i const_br = {0x1942194219421942, 0x1942194219421942}; + __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_abgr, 0, src_abgr, 16, src_abgr, 32, src_abgr, 48, + src0, src1, src2, src3); + tmp0 = 
__lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmp2 = __lsx_vpickev_b(src3, src2); + tmp3 = __lsx_vpickod_b(src3, src2); + reg0 = __lsx_vmaddwev_h_bu(const_1080, tmp1, const_129); + reg1 = __lsx_vmaddwev_h_bu(const_1080, tmp3, const_129); + reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); + reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp2); + dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8); + __lsx_vst(dst0, dst_y, 0); + dst_y += 16; + src_abgr += 64; + } +} + +void ABGRToUVRow_LSX(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* next_abgr = src_abgr + src_stride_abgr; + int len = width / 16; + __m128i src0, src1, src2, src3; + __m128i nex0, nex1, nex2, nex3; + __m128i tmp0, tmp1, tmp2, tmp3, dst0; + __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m128i const_112 = __lsx_vldi(0x438); + __m128i const_74 = __lsx_vldi(0x425); + __m128i const_38 = __lsx_vldi(0x413); + __m128i const_94 = __lsx_vldi(0x42F); + __m128i const_18 = __lsx_vldi(0x409); + __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_abgr, 0, src_abgr, 16, src_abgr, 32, src_abgr, 48, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, next_abgr, 0, next_abgr, 16, next_abgr, 32, next_abgr, + 48, nex0, nex1, nex2, nex3); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmp2 = __lsx_vpickev_b(src3, src2); + tmp3 = __lsx_vpickod_b(src3, src2); + tmpb = __lsx_vpickod_b(tmp2, tmp0); + tmpr = __lsx_vpickev_b(tmp2, tmp0); + tmpg = __lsx_vpickev_b(tmp3, tmp1); + tmp0 = __lsx_vpickev_b(nex1, nex0); + tmp1 = __lsx_vpickod_b(nex1, nex0); + tmp2 = __lsx_vpickev_b(nex3, nex2); + tmp3 = __lsx_vpickod_b(nex3, nex2); + nexb = __lsx_vpickod_b(tmp2, tmp0); + nexr = __lsx_vpickev_b(tmp2, tmp0); + nexg = __lsx_vpickev_b(tmp3, tmp1); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst0, dst_v, 0, 1); + dst_u += 8; + dst_v += 8; + src_abgr += 64; + next_abgr += 64; + } +} + +void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + int x; + int len = width / 16; + __m128i src0, src1, src2, src3, dst0; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1; + __m128i const_129 = __lsx_vldi(0x81); + __m128i const_br = {0x4219421942194219, 0x4219421942194219}; + __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_rgba, 0, src_rgba, 16, src_rgba, 32, src_rgba, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vpickod_b(src1, src0); + tmp1 = __lsx_vpickev_b(src1, src0); + tmp2 = __lsx_vpickod_b(src3, src2); + tmp3 = __lsx_vpickev_b(src3, src2); + reg0 = __lsx_vmaddwod_h_bu(const_1080, tmp1, const_129); + reg1 = __lsx_vmaddwod_h_bu(const_1080, tmp3, const_129); + reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); + reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp2); + dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8); + __lsx_vst(dst0, dst_y, 0); + dst_y += 16; + src_rgba += 64; + } +} + +void RGBAToUVRow_LSX(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* next_rgba = src_rgba + src_stride_rgba; + int len = width / 16; + __m128i src0, src1, src2, src3; + __m128i nex0, nex1, nex2, nex3; + __m128i tmp0, tmp1, tmp2, tmp3, dst0; + __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m128i const_112 = __lsx_vldi(0x438); + __m128i const_74 = __lsx_vldi(0x425); + __m128i const_38 = 
__lsx_vldi(0x413); + __m128i const_94 = __lsx_vldi(0x42F); + __m128i const_18 = __lsx_vldi(0x409); + __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_rgba, 0, src_rgba, 16, src_rgba, 32, src_rgba, 48, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, next_rgba, 0, next_rgba, 16, next_rgba, 32, next_rgba, + 48, nex0, nex1, nex2, nex3); + tmp0 = __lsx_vpickod_b(src1, src0); + tmp1 = __lsx_vpickev_b(src1, src0); + tmp2 = __lsx_vpickod_b(src3, src2); + tmp3 = __lsx_vpickev_b(src3, src2); + tmpr = __lsx_vpickod_b(tmp2, tmp0); + tmpb = __lsx_vpickev_b(tmp2, tmp0); + tmpg = __lsx_vpickod_b(tmp3, tmp1); + tmp0 = __lsx_vpickod_b(nex1, nex0); + tmp1 = __lsx_vpickev_b(nex1, nex0); + tmp2 = __lsx_vpickod_b(nex3, nex2); + tmp3 = __lsx_vpickev_b(nex3, nex2); + nexr = __lsx_vpickod_b(tmp2, tmp0); + nexb = __lsx_vpickev_b(tmp2, tmp0); + nexg = __lsx_vpickod_b(tmp3, tmp1); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst0, dst_v, 0, 1); + dst_u += 8; + dst_v += 8; + src_rgba += 64; + next_rgba += 64; + } +} + +void ARGBToUVJRow_LSX(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* next_argb = src_argb + src_stride_argb; + int len = width / 16; + __m128i src0, src1, src2, src3; + __m128i nex0, nex1, nex2, nex3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, dst0; + __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m128i const_63 = __lsx_vldi(0x43F); + __m128i const_42 = __lsx_vldi(0x42A); + __m128i const_21 = __lsx_vldi(0x415); + __m128i const_53 = __lsx_vldi(0x435); + __m128i const_10 = __lsx_vldi(0x40A); + __m128i const_8080 = {0x8080808080808080, 0x8080808080808080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, next_argb, 0, next_argb, 16, next_argb, 32, next_argb, + 48, nex0, nex1, nex2, nex3); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmp2 = __lsx_vpickev_b(src3, src2); + tmp3 = __lsx_vpickod_b(src3, src2); + tmpr = __lsx_vpickod_b(tmp2, tmp0); + tmpb = __lsx_vpickev_b(tmp2, tmp0); + tmpg = __lsx_vpickev_b(tmp3, tmp1); + tmp0 = __lsx_vpickev_b(nex1, nex0); + tmp1 = __lsx_vpickod_b(nex1, nex0); + tmp2 = __lsx_vpickev_b(nex3, nex2); + tmp3 = __lsx_vpickod_b(nex3, nex2); + nexr = __lsx_vpickod_b(tmp2, tmp0); + nexb = __lsx_vpickev_b(tmp2, tmp0); + nexg = __lsx_vpickev_b(tmp3, tmp1); + tmp0 = __lsx_vaddwev_h_bu(tmpb, nexb); + tmp1 = __lsx_vaddwod_h_bu(tmpb, nexb); + tmp2 = __lsx_vaddwev_h_bu(tmpg, nexg); + tmp3 = __lsx_vaddwod_h_bu(tmpg, nexg); + reg0 = __lsx_vaddwev_h_bu(tmpr, nexr); + reg1 = __lsx_vaddwod_h_bu(tmpr, nexr); + tmpb = __lsx_vavgr_hu(tmp0, tmp1); + tmpg = __lsx_vavgr_hu(tmp2, tmp3); + tmpr = __lsx_vavgr_hu(reg0, reg1); + reg0 = __lsx_vmadd_h(const_8080, const_63, tmpb); + reg1 = __lsx_vmadd_h(const_8080, const_63, tmpr); + reg0 = __lsx_vmsub_h(reg0, const_42, tmpg); + reg1 = __lsx_vmsub_h(reg1, const_53, tmpg); + reg0 = __lsx_vmsub_h(reg0, const_21, tmpr); + reg1 = __lsx_vmsub_h(reg1, const_10, tmpb); + dst0 = __lsx_vpickod_b(reg1, reg0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst0, dst_v, 0, 1); + dst_u += 8; + dst_v += 8; + src_argb += 64; + next_argb += 64; + } +} + +void I444ToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* 
yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_y, vec_u, vec_v, out_b, out_g, out_r; + __m128i vec_yl, vec_yh, vec_ul, vec_vl, vec_uh, vec_vh; + __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb, vec_ugvg; + __m128i const_80 = __lsx_vldi(0x480); + __m128i alpha = __lsx_vldi(0xFF); + __m128i zero = __lsx_vldi(0); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + vec_y = __lsx_vld(src_y, 0); + vec_u = __lsx_vld(src_u, 0); + vec_v = __lsx_vld(src_v, 0); + vec_yl = __lsx_vilvl_b(vec_y, vec_y); + vec_ul = __lsx_vilvl_b(zero, vec_u); + vec_vl = __lsx_vilvl_b(zero, vec_v); + I444TORGB(vec_yl, vec_ul, vec_vl, vec_ub, vec_vr, vec_ugvg, vec_yg, vec_yb, + out_b, out_g, out_r); + STOREARGB(alpha, out_r, out_g, out_b, dst_argb); + vec_yh = __lsx_vilvh_b(vec_y, vec_y); + vec_uh = __lsx_vilvh_b(zero, vec_u); + vec_vh = __lsx_vilvh_b(zero, vec_v); + I444TORGB(vec_yh, vec_uh, vec_vh, vec_ub, vec_vr, vec_ugvg, vec_yg, vec_yb, + out_b, out_g, out_r); + STOREARGB(alpha, out_r, out_g, out_b, dst_argb); + src_y += 16; + src_u += 16; + src_v += 16; + } +} + +void I400ToARGBRow_LSX(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_y, vec_yl, vec_yh, out0; + __m128i y_ev, y_od, dst0, dst1, dst2, dst3; + __m128i temp0, temp1; + __m128i alpha = __lsx_vldi(0xFF); + __m128i vec_yg = __lsx_vreplgr2vr_h(yuvconstants->kYToRgb[0]); + __m128i vec_yb = __lsx_vreplgr2vr_w(yuvconstants->kYBiasToRgb[0]); + + for (x = 0; x < len; x++) { + vec_y = __lsx_vld(src_y, 0); + vec_yl = __lsx_vilvl_b(vec_y, vec_y); + y_ev = __lsx_vmulwev_w_hu_h(vec_yl, vec_yg); + y_od = __lsx_vmulwod_w_hu_h(vec_yl, vec_yg); + y_ev = __lsx_vsrai_w(y_ev, 16); + y_od = __lsx_vsrai_w(y_od, 16); + y_ev = __lsx_vadd_w(y_ev, vec_yb); + y_od = __lsx_vadd_w(y_od, vec_yb); + y_ev = __lsx_vsrai_w(y_ev, 6); + y_od = __lsx_vsrai_w(y_od, 6); + y_ev = __lsx_vclip255_w(y_ev); + y_od = __lsx_vclip255_w(y_od); + out0 = __lsx_vpackev_h(y_od, y_ev); + temp0 = __lsx_vpackev_b(out0, out0); + temp1 = __lsx_vpackev_b(alpha, out0); + dst0 = __lsx_vilvl_h(temp1, temp0); + dst1 = __lsx_vilvh_h(temp1, temp0); + vec_yh = __lsx_vilvh_b(vec_y, vec_y); + y_ev = __lsx_vmulwev_w_hu_h(vec_yh, vec_yg); + y_od = __lsx_vmulwod_w_hu_h(vec_yh, vec_yg); + y_ev = __lsx_vsrai_w(y_ev, 16); + y_od = __lsx_vsrai_w(y_od, 16); + y_ev = __lsx_vadd_w(y_ev, vec_yb); + y_od = __lsx_vadd_w(y_od, vec_yb); + y_ev = __lsx_vsrai_w(y_ev, 6); + y_od = __lsx_vsrai_w(y_od, 6); + y_ev = __lsx_vclip255_w(y_ev); + y_od = __lsx_vclip255_w(y_od); + out0 = __lsx_vpackev_h(y_od, y_ev); + temp0 = __lsx_vpackev_b(out0, out0); + temp1 = __lsx_vpackev_b(alpha, out0); + dst2 = __lsx_vilvl_h(temp1, temp0); + dst3 = __lsx_vilvh_h(temp1, temp0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + __lsx_vst(dst2, dst_argb, 32); + __lsx_vst(dst3, dst_argb, 48); + dst_argb += 64; + src_y += 16; + } +} + +void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width) { + int x; + int len = width / 16; + __m128i vec_y, dst0, dst1, dst2, dst3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i alpha = __lsx_vldi(0xFF); + + for (x = 0; x < len; x++) { + vec_y = __lsx_vld(src_y, 0); + tmp0 = __lsx_vilvl_b(vec_y, vec_y); + tmp1 = __lsx_vilvh_b(vec_y, vec_y); + tmp2 = __lsx_vilvl_b(alpha, vec_y); + tmp3 = __lsx_vilvh_b(alpha, vec_y); + dst0 = __lsx_vilvl_h(tmp2, tmp0); + dst1 = 
__lsx_vilvh_h(tmp2, tmp0); + dst2 = __lsx_vilvl_h(tmp3, tmp1); + dst3 = __lsx_vilvh_h(tmp3, tmp1); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + __lsx_vst(dst2, dst_argb, 32); + __lsx_vst(dst3, dst_argb, 48); + dst_argb += 64; + src_y += 16; + } +} + +void YUY2ToARGBRow_LSX(const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 8; + __m128i src0, vec_y, vec_vu; + __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb; + __m128i vec_vrub, vec_vgug; + __m128i out_b, out_g, out_r; + __m128i const_80 = __lsx_vldi(0x480); + __m128i zero = __lsx_vldi(0); + __m128i alpha = __lsx_vldi(0xFF); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub); + vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug); + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_yuy2, 0); + vec_y = __lsx_vpickev_b(src0, src0); + vec_vu = __lsx_vpickod_b(src0, src0); + YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g, + out_r); + STOREARGB(alpha, out_r, out_g, out_b, dst_argb); + src_yuy2 += 16; + } +} + +void UYVYToARGBRow_LSX(const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 8; + __m128i src0, vec_y, vec_vu; + __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb; + __m128i vec_vrub, vec_vgug; + __m128i out_b, out_g, out_r; + __m128i const_80 = __lsx_vldi(0x480); + __m128i zero = __lsx_vldi(0); + __m128i alpha = __lsx_vldi(0xFF); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub); + vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug); + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_uyvy, 0); + vec_y = __lsx_vpickod_b(src0, src0); + vec_vu = __lsx_vpickev_b(src0, src0); + YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g, + out_r); + STOREARGB(alpha, out_r, out_g, out_b, dst_argb); + src_uyvy += 16; + } +} + +void InterpolateRow_LSX(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int32_t source_y_fraction) { + int x; + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8_t* nex_ptr = src_ptr + src_stride; + uint16_t y_fractions; + int len = width / 32; + __m128i src0, src1, nex0, nex1; + __m128i dst0, dst1, y_frac; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i const_128 = __lsx_vldi(0x480); + + if (y1_fraction == 0) { + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + __lsx_vst(src0, dst_ptr, 0); + __lsx_vst(src1, dst_ptr, 16); + src_ptr += 32; + dst_ptr += 32; + } + return; + } + + if (y1_fraction == 128) { + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + DUP2_ARG2(__lsx_vld, nex_ptr, 0, nex_ptr, 16, nex0, nex1); + dst0 = __lsx_vavgr_bu(src0, nex0); + dst1 = __lsx_vavgr_bu(src1, nex1); + __lsx_vst(dst0, dst_ptr, 0); + __lsx_vst(dst1, dst_ptr, 16); + src_ptr += 32; + nex_ptr += 32; + dst_ptr += 32; + } + return; + } + + y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8)); + y_frac = __lsx_vreplgr2vr_h(y_fractions); + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + DUP2_ARG2(__lsx_vld, nex_ptr, 0, nex_ptr, 16, nex0, nex1); + tmp0 = __lsx_vilvl_b(nex0, src0); + tmp1 = __lsx_vilvh_b(nex0, src0); + tmp2 = __lsx_vilvl_b(nex1, src1); + tmp3 = __lsx_vilvh_b(nex1, src1); + 
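+    // Dot product of each interleaved (src, next) byte pair with the packed
+    // (y0_fraction, y1_fraction) weights, plus 128 to round; the shift right
+    // by 8 below produces the blended byte.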
tmp0 = __lsx_vdp2add_h_bu(const_128, tmp0, y_frac); + tmp1 = __lsx_vdp2add_h_bu(const_128, tmp1, y_frac); + tmp2 = __lsx_vdp2add_h_bu(const_128, tmp2, y_frac); + tmp3 = __lsx_vdp2add_h_bu(const_128, tmp3, y_frac); + dst0 = __lsx_vsrlni_b_h(tmp1, tmp0, 8); + dst1 = __lsx_vsrlni_b_h(tmp3, tmp2, 8); + __lsx_vst(dst0, dst_ptr, 0); + __lsx_vst(dst1, dst_ptr, 16); + src_ptr += 32; + nex_ptr += 32; + dst_ptr += 32; + } +} + +void ARGBSetRow_LSX(uint8_t* dst_argb, uint32_t v32, int width) { + int x; + int len = width / 4; + __m128i dst0 = __lsx_vreplgr2vr_w(v32); + + for (x = 0; x < len; x++) { + __lsx_vst(dst0, dst_argb, 0); + dst_argb += 16; + } +} + +void RAWToRGB24Row_LSX(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + int x; + int len = width / 16; + __m128i src0, src1, src2; + __m128i dst0, dst1, dst2; + __m128i shuf0 = {0x0708030405000102, 0x110C0D0E090A0B06}; + __m128i shuf1 = {0x1516171213140F10, 0x1F1E1B1C1D18191A}; + __m128i shuf2 = {0x090405060102031E, 0x0D0E0F0A0B0C0708}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_raw, 0, src_raw, 16, src0, src1); + src2 = __lsx_vld(src_raw, 32); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src0, shuf1, dst0, dst1); + dst2 = __lsx_vshuf_b(src1, src2, shuf2); + dst1 = __lsx_vinsgr2vr_b(dst1, src_raw[32], 0x0E); + __lsx_vst(dst0, dst_rgb24, 0); + __lsx_vst(dst1, dst_rgb24, 16); + __lsx_vst(dst2, dst_rgb24, 32); + dst_rgb24 += 48; + src_raw += 48; + } +} + +void MergeUVRow_LSX(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + int x; + int len = width / 16; + __m128i src0, src1, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src0, src1); + dst0 = __lsx_vilvl_b(src1, src0); + dst1 = __lsx_vilvh_b(src1, src0); + __lsx_vst(dst0, dst_uv, 0); + __lsx_vst(dst1, dst_uv, 16); + src_u += 16; + src_v += 16; + dst_uv += 32; + } +} + +void ARGBExtractAlphaRow_LSX(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + int x; + int len = width / 16; + __m128i src0, src1, src2, src3, tmp0, tmp1, dst0; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vpickod_b(src1, src0); + tmp1 = __lsx_vpickod_b(src3, src2); + dst0 = __lsx_vpickod_b(tmp1, tmp0); + __lsx_vst(dst0, dst_a, 0); + src_argb += 64; + dst_a += 16; + } +} + +void ARGBBlendRow_LSX(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, dst0, dst1; + __m128i reg0, reg1, reg2, reg3; + __m128i a0, a1, a2, a3; + __m128i const_256 = __lsx_vldi(0x500); + __m128i zero = __lsx_vldi(0); + __m128i alpha = __lsx_vldi(0xFF); + __m128i control = {0xFF000000FF000000, 0xFF000000FF000000}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb1, 0, src_argb1, 16, + src0, src1, src2, src3); + tmp0 = __lsx_vshuf4i_b(src0, 0xFF); + tmp1 = __lsx_vshuf4i_b(src1, 0xFF); + a0 = __lsx_vilvl_b(zero, tmp0); + a1 = __lsx_vilvh_b(zero, tmp0); + a2 = __lsx_vilvl_b(zero, tmp1); + a3 = __lsx_vilvh_b(zero, tmp1); + reg0 = __lsx_vilvl_b(zero, src2); + reg1 = __lsx_vilvh_b(zero, src2); + reg2 = __lsx_vilvl_b(zero, src3); + reg3 = __lsx_vilvh_b(zero, src3); + DUP4_ARG2(__lsx_vsub_h, const_256, a0, const_256, a1, const_256, a2, + const_256, a3, a0, a1, a2, a3); + DUP4_ARG2(__lsx_vmul_h, a0, reg0, a1, reg1, a2, reg2, a3, reg3, reg0, reg1, + reg2, reg3); + DUP2_ARG3(__lsx_vsrani_b_h, 
reg1, reg0, 8, reg3, reg2, 8, dst0, dst1); + dst0 = __lsx_vsadd_bu(dst0, src0); + dst1 = __lsx_vsadd_bu(dst1, src1); + dst0 = __lsx_vbitsel_v(dst0, alpha, control); + dst1 = __lsx_vbitsel_v(dst1, alpha, control); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + src_argb += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBQuantizeRow_LSX(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + int x; + int len = width / 16; + __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i vec_size = __lsx_vreplgr2vr_b(interval_size); + __m128i vec_offset = __lsx_vreplgr2vr_b(interval_offset); + __m128i vec_scale = __lsx_vreplgr2vr_w(scale); + __m128i zero = __lsx_vldi(0); + __m128i control = {0xFF000000FF000000, 0xFF000000FF000000}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, dst_argb, 0, dst_argb, 16, dst_argb, 32, dst_argb, 48, + src0, src1, src2, src3); + reg0 = __lsx_vilvl_b(zero, src0); + reg1 = __lsx_vilvh_b(zero, src0); + reg2 = __lsx_vilvl_b(zero, src1); + reg3 = __lsx_vilvh_b(zero, src1); + reg4 = __lsx_vilvl_b(zero, src2); + reg5 = __lsx_vilvh_b(zero, src2); + reg6 = __lsx_vilvl_b(zero, src3); + reg7 = __lsx_vilvh_b(zero, src3); + tmp0 = __lsx_vilvl_h(zero, reg0); + tmp1 = __lsx_vilvh_h(zero, reg0); + tmp2 = __lsx_vilvl_h(zero, reg1); + tmp3 = __lsx_vilvh_h(zero, reg1); + tmp4 = __lsx_vilvl_h(zero, reg2); + tmp5 = __lsx_vilvh_h(zero, reg2); + tmp6 = __lsx_vilvl_h(zero, reg3); + tmp7 = __lsx_vilvh_h(zero, reg3); + DUP4_ARG2(__lsx_vmul_w, tmp0, vec_scale, tmp1, vec_scale, tmp2, vec_scale, + tmp3, vec_scale, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vmul_w, tmp4, vec_scale, tmp5, vec_scale, tmp6, vec_scale, + tmp7, vec_scale, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG3(__lsx_vsrani_h_w, tmp1, tmp0, 16, tmp3, tmp2, 16, tmp5, tmp4, 16, + tmp7, tmp6, 16, reg0, reg1, reg2, reg3); + dst0 = __lsx_vpickev_b(reg1, reg0); + dst1 = __lsx_vpickev_b(reg3, reg2); + tmp0 = __lsx_vilvl_h(zero, reg4); + tmp1 = __lsx_vilvh_h(zero, reg4); + tmp2 = __lsx_vilvl_h(zero, reg5); + tmp3 = __lsx_vilvh_h(zero, reg5); + tmp4 = __lsx_vilvl_h(zero, reg6); + tmp5 = __lsx_vilvh_h(zero, reg6); + tmp6 = __lsx_vilvl_h(zero, reg7); + tmp7 = __lsx_vilvh_h(zero, reg7); + DUP4_ARG2(__lsx_vmul_w, tmp0, vec_scale, tmp1, vec_scale, tmp2, vec_scale, + tmp3, vec_scale, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vmul_w, tmp4, vec_scale, tmp5, vec_scale, tmp6, vec_scale, + tmp7, vec_scale, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG3(__lsx_vsrani_h_w, tmp1, tmp0, 16, tmp3, tmp2, 16, tmp5, tmp4, 16, + tmp7, tmp6, 16, reg0, reg1, reg2, reg3); + dst2 = __lsx_vpickev_b(reg1, reg0); + dst3 = __lsx_vpickev_b(reg3, reg2); + DUP4_ARG2(__lsx_vmul_b, dst0, vec_size, dst1, vec_size, dst2, vec_size, + dst3, vec_size, dst0, dst1, dst2, dst3); + DUP4_ARG2(__lsx_vadd_b, dst0, vec_offset, dst1, vec_offset, dst2, + vec_offset, dst3, vec_offset, dst0, dst1, dst2, dst3); + DUP4_ARG3(__lsx_vbitsel_v, dst0, src0, control, dst1, src1, control, dst2, + src2, control, dst3, src3, control, dst0, dst1, dst2, dst3); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + __lsx_vst(dst2, dst_argb, 32); + __lsx_vst(dst3, dst_argb, 48); + dst_argb += 64; + } +} + +void ARGBColorMatrixRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1, dst0, dst1; + __m128i 
tmp_b, tmp_g, tmp_r, tmp_a; + __m128i reg_b, reg_g, reg_r, reg_a; + __m128i matrix_b = __lsx_vldrepl_w(matrix_argb, 0); + __m128i matrix_g = __lsx_vldrepl_w(matrix_argb, 4); + __m128i matrix_r = __lsx_vldrepl_w(matrix_argb, 8); + __m128i matrix_a = __lsx_vldrepl_w(matrix_argb, 12); + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + DUP4_ARG2(__lsx_vdp2_h_bu_b, src0, matrix_b, src0, matrix_g, src0, matrix_r, + src0, matrix_a, tmp_b, tmp_g, tmp_r, tmp_a); + DUP4_ARG2(__lsx_vdp2_h_bu_b, src1, matrix_b, src1, matrix_g, src1, matrix_r, + src1, matrix_a, reg_b, reg_g, reg_r, reg_a); + DUP4_ARG2(__lsx_vhaddw_w_h, tmp_b, tmp_b, tmp_g, tmp_g, tmp_r, tmp_r, tmp_a, + tmp_a, tmp_b, tmp_g, tmp_r, tmp_a); + DUP4_ARG2(__lsx_vhaddw_w_h, reg_b, reg_b, reg_g, reg_g, reg_r, reg_r, reg_a, + reg_a, reg_b, reg_g, reg_r, reg_a); + DUP4_ARG2(__lsx_vsrai_w, tmp_b, 6, tmp_g, 6, tmp_r, 6, tmp_a, 6, tmp_b, + tmp_g, tmp_r, tmp_a); + DUP4_ARG2(__lsx_vsrai_w, reg_b, 6, reg_g, 6, reg_r, 6, reg_a, 6, reg_b, + reg_g, reg_r, reg_a); + DUP4_ARG1(__lsx_vclip255_w, tmp_b, tmp_g, tmp_r, tmp_a, tmp_b, tmp_g, tmp_r, + tmp_a) + DUP4_ARG1(__lsx_vclip255_w, reg_b, reg_g, reg_r, reg_a, reg_b, reg_g, reg_r, + reg_a) + DUP4_ARG2(__lsx_vpickev_h, reg_b, tmp_b, reg_g, tmp_g, reg_r, tmp_r, reg_a, + tmp_a, tmp_b, tmp_g, tmp_r, tmp_a); + tmp0 = __lsx_vpackev_b(tmp_g, tmp_b); + tmp1 = __lsx_vpackev_b(tmp_a, tmp_r); + dst0 = __lsx_vilvl_h(tmp1, tmp0); + dst1 = __lsx_vilvh_h(tmp1, tmp0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void SplitUVRow_LSX(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 32; + __m128i src0, src1, src2, src3; + __m128i dst0, dst1, dst2, dst3; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_uv, 0, src_uv, 16, src_uv, 32, src_uv, 48, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, dst0, dst1); + DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, dst2, dst3); + __lsx_vst(dst0, dst_u, 0); + __lsx_vst(dst1, dst_u, 16); + __lsx_vst(dst2, dst_v, 0); + __lsx_vst(dst3, dst_v, 16); + src_uv += 64; + dst_u += 32; + dst_v += 32; + } +} + +void SetRow_LSX(uint8_t* dst, uint8_t v8, int width) { + int x; + int len = width / 16; + __m128i dst0 = __lsx_vreplgr2vr_b(v8); + + for (x = 0; x < len; x++) { + __lsx_vst(dst0, dst, 0); + dst += 16; + } +} + +void MirrorSplitUVRow_LSX(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 32; + __m128i src0, src1, src2, src3; + __m128i dst0, dst1, dst2, dst3; + __m128i shuff0 = {0x10121416181A1C1E, 0x00020406080A0C0E}; + __m128i shuff1 = {0x11131517191B1D1F, 0x01030507090B0D0F}; + + src_uv += (width << 1); + for (x = 0; x < len; x++) { + src_uv -= 64; + DUP4_ARG2(__lsx_vld, src_uv, 0, src_uv, 16, src_uv, 32, src_uv, 48, src2, + src3, src0, src1); + DUP4_ARG3(__lsx_vshuf_b, src1, src0, shuff1, src3, src2, shuff1, src1, src0, + shuff0, src3, src2, shuff0, dst0, dst1, dst2, dst3); + __lsx_vst(dst0, dst_v, 0); + __lsx_vst(dst1, dst_v, 16); + __lsx_vst(dst2, dst_u, 0); + __lsx_vst(dst3, dst_u, 16); + dst_u += 32; + dst_v += 32; + } +} + +void HalfFloatRow_LSX(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + int x; + int len = width / 32; + float mult = 1.9259299444e-34f * scale; + __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128 reg0, reg1, reg2, 
reg3, reg4, reg5, reg6, reg7; + __m128 vec_mult = (__m128)__lsx_vldrepl_w(&mult, 0); + __m128i zero = __lsx_vldi(0); + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + DUP4_ARG2(__lsx_vilvl_h, zero, src0, zero, src1, zero, src2, zero, src3, + tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vilvh_h, zero, src0, zero, src1, zero, src2, zero, src3, + tmp1, tmp3, tmp5, tmp7); + DUP4_ARG1(__lsx_vffint_s_wu, tmp0, tmp2, tmp4, tmp6, reg0, reg2, reg4, + reg6); + DUP4_ARG1(__lsx_vffint_s_wu, tmp1, tmp3, tmp5, tmp7, reg1, reg3, reg5, + reg7); + DUP4_ARG2(__lsx_vfmul_s, reg0, vec_mult, reg1, vec_mult, reg2, vec_mult, + reg3, vec_mult, reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vfmul_s, reg4, vec_mult, reg5, vec_mult, reg6, vec_mult, + reg7, vec_mult, reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vsrli_w, (v4u32)reg0, 13, (v4u32)reg1, 13, (v4u32)reg2, 13, + (v4u32)reg3, 13, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vsrli_w, (v4u32)reg4, 13, (v4u32)reg5, 13, (v4u32)reg6, 13, + (v4u32)reg7, 13, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG2(__lsx_vpickev_h, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, + dst0, dst1, dst2, dst3); + __lsx_vst(dst0, dst, 0); + __lsx_vst(dst1, dst, 16); + __lsx_vst(dst2, dst, 32); + __lsx_vst(dst3, dst, 48); + src += 32; + dst += 32; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) diff --git a/files/source/row_mmi.cc b/files/source/row_mmi.cc index d8726d09..362fd1cf 100644 --- a/files/source/row_mmi.cc +++ b/files/source/row_mmi.cc @@ -21,6 +21,8 @@ extern "C" { // This module is for Mips MMI. #if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) +// clang-format off + void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { @@ -603,7 +605,7 @@ void ARGBToARGB4444Row_MMI(const uint8_t* src_argb, : "memory"); } -void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ARGBToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -611,8 +613,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -624,8 +626,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -637,8 +639,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] 
\n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -650,8 +652,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -669,35 +671,38 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[src_argb], %[src_argb], 0x20 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void ARGBToUVRow_MMI(const uint8_t* src_rgb0, +void ARGBToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0026004a00700002; - const uint64_t mask_v = 0x00020070005e0012; + const uint64_t mask_u = 0x0013002500380002; + const uint64_t mask_v = 0x00020038002f0009; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -707,15 +712,16 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest0_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -725,7 +731,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 
%[src_hi], %[src0], %[value] \n\t" @@ -741,8 +748,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -752,15 +759,16 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest1_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -770,7 +778,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -786,8 +795,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -797,15 +806,16 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest2_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -815,7 +825,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], 
%[value] \n\t" @@ -831,8 +842,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -842,15 +853,16 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest3_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -860,7 +872,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -888,7 +901,7 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -898,16 +911,17 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } -void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -915,8 +929,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], 
%[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -928,8 +942,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -941,8 +955,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -954,8 +968,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -973,35 +987,38 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[src_argb], %[src_argb], 0x20 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void BGRAToUVRow_MMI(const uint8_t* src_rgb0, +void BGRAToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x00020070004a0026; - const uint64_t mask_v = 0x0012005e00700002; + const uint64_t mask_u = 0x0002003800250013; + const uint64_t mask_v = 0x0009002f00380002; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1011,15 +1028,16 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh 
%[src0], %[src0], %[one] \n\t" "dsrl %[dest0_u], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest0_u], %[dest0_u], %[value] \n\t" "pinsrh_0 %[dest0_v], %[src0], %[value] \n\t" "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1029,7 +1047,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" @@ -1045,8 +1064,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src0], %[src1] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1056,15 +1075,16 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[dest1_u], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest1_u], %[dest1_u], %[value] \n\t" "pinsrh_0 %[dest1_v], %[src0], %[value] \n\t" "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1074,7 +1094,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" @@ -1090,8 +1111,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src0], %[src1] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1101,15 +1122,16 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], 
%[src0], %[one] \n\t" "dsrl %[dest2_u], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest2_u], %[dest2_u], %[value] \n\t" "pinsrh_0 %[dest2_v], %[src0], %[value] \n\t" "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1119,7 +1141,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" @@ -1135,8 +1158,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src0], %[src1] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1146,15 +1169,16 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[dest3_u], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest3_u], %[dest3_u], %[value] \n\t" "pinsrh_0 %[dest3_v], %[src0], %[value] \n\t" "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1164,7 +1188,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" @@ -1192,7 +1217,7 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -1202,16 +1227,17 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + 
[ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } -void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -1219,8 +1245,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1232,8 +1258,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1245,8 +1271,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1258,8 +1284,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1277,35 +1303,38 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[src_argb], %[src_argb], 0x20 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void ABGRToUVRow_MMI(const uint8_t* src_rgb0, +void ABGRToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t 
value = 0x4040; - const uint64_t mask_u = 0x00020070004a0026; - const uint64_t mask_v = 0x0012005e00700002; + const uint64_t mask_u = 0x0002003800250013; + const uint64_t mask_v = 0x0009002F00380002; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1315,15 +1344,16 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" "dsll %[dest0_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1333,7 +1363,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -1349,8 +1380,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src0], %[src1] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1360,15 +1391,16 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" "dsll %[dest1_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1378,7 +1410,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - 
"psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -1394,8 +1427,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src0], %[src1] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1405,15 +1438,16 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" "dsll %[dest2_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1423,7 +1457,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -1439,8 +1474,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src0], %[src1] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1450,15 +1485,16 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" "dsll %[dest3_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1468,7 +1504,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh 
%[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -1496,7 +1533,7 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -1506,16 +1543,17 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } -void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -1523,8 +1561,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1536,8 +1574,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1549,8 +1587,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1562,8 +1600,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ 
-1581,35 +1619,38 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[src_argb], %[src_argb], 0x20 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void RGBAToUVRow_MMI(const uint8_t* src_rgb0, +void RGBAToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0026004a00700002; - const uint64_t mask_v = 0x00020070005e0012; + const uint64_t mask_u = 0x0013002500380002; + const uint64_t mask_v = 0x00020038002f0009; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1619,15 +1660,16 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[dest0_u], %[src0], %[value] \n\t" "dsrl %[dest0_v], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest0_v], %[dest0_v], %[value] \n\t" "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1637,7 +1679,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" "dsrl %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" @@ -1653,8 +1696,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1664,15 +1707,16 @@ void 
RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[dest1_u], %[src0], %[value] \n\t" "dsrl %[dest1_v], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest1_v], %[dest1_v], %[value] \n\t" "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1682,7 +1726,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" "dsrl %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" @@ -1698,8 +1743,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1709,15 +1754,16 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[dest2_u], %[src0], %[value] \n\t" "dsrl %[dest2_v], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest2_v], %[dest2_v], %[value] \n\t" "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1727,7 +1773,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" "dsrl %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" @@ -1743,8 +1790,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1754,15 +1801,16 @@ void 
RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[dest3_u], %[src0], %[value] \n\t" "dsrl %[dest3_v], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest3_v], %[dest3_v], %[value] \n\t" "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1772,7 +1820,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" "dsrl %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" @@ -1800,7 +1849,7 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -1810,16 +1859,17 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } -void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -1827,8 +1877,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1841,8 +1891,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x06(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], 
%[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1855,8 +1905,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x13(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1869,8 +1919,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x19(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x12(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1889,35 +1939,38 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t" + "daddiu %[src_argb], %[src_argb], 0x18 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, +void RGB24ToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0026004a00700002; - const uint64_t mask_v = 0x00020070005e0012; + const uint64_t mask_u = 0x0013002500380002; + const uint64_t mask_v = 0x00020038002f0009; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1929,15 +1982,16 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest0_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" 
"gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1949,7 +2003,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -1965,8 +2020,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1978,15 +2033,16 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest1_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1998,7 +2054,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -2014,8 +2071,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2027,15 +2084,16 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest2_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" "gsldlc1 
%[src1], 0x25(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2047,7 +2105,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -2063,8 +2122,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2076,15 +2135,16 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest3_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2096,7 +2156,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -2124,7 +2185,7 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -2134,16 +2195,17 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } -void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void 
RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -2151,8 +2213,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -2165,8 +2227,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x06(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -2179,8 +2241,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x13(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -2193,8 +2255,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x19(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x12(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -2213,35 +2275,38 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t" + "daddiu %[src_argb], %[src_argb], 0x18 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void RAWToUVRow_MMI(const uint8_t* src_rgb0, +void RAWToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x00020070004a0026; - const uint64_t mask_v = 0x0012005e00700002; + const uint64_t mask_u = 0x0002003800250013; + const uint64_t mask_v = 0x0009002f00380002; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], 
%[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2253,15 +2318,16 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" "dsll %[dest0_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2273,7 +2339,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -2289,8 +2356,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src0], %[src1] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2302,15 +2369,16 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" "dsll %[dest1_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2322,7 +2390,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -2338,8 +2407,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src0], %[src1] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 
0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2351,15 +2420,16 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" "dsll %[dest2_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2371,7 +2441,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -2387,8 +2458,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src0], %[src1] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2400,15 +2471,16 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" "dsll %[dest3_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2420,7 +2492,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -2448,7 +2521,7 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" 
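/* Editorial note (not part of the patch): a sketch, under stated
 * assumptions, of the per-iteration bookkeeping this asm tail implements
 * for RAWToUVRow_MMI. Each pass reads 16 RAW pixels (16 * 3 = 0x30 bytes)
 * from each of two rows and writes 8 U plus 8 V bytes, so width steps
 * down by 0x10 pixels; the function name here is hypothetical. */
#include <stdint.h>
static void RAWToUVRow_skeleton(const uint8_t* src_rgb, int src_stride_rgb,
                                uint8_t* dst_u, uint8_t* dst_v, int width) {
  const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; /* second row */
  while (width > 0) {
    /* ... average 2x2 blocks of src_rgb/src_rgb1 and form 8 U and 8 V
     * samples here (elided; see the vector code above) ... */
    src_rgb += 0x30;  /* 16 pixels * 3 bytes per row pointer */
    src_rgb1 += 0x30;
    dst_u += 8;
    dst_v += 8;
    width -= 0x10;    /* 16 pixels consumed */
  }
}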
"daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -2458,23 +2531,24 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } -void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ARGBToYJRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest, dest0, dest1, dest2, dest3; uint64_t tmp0, tmp1; - const uint64_t shift = 0x07; - const uint64_t value = 0x0040; + const uint64_t shift = 0x08; + const uint64_t value = 0x80; const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x00010026004B000FULL; + const uint64_t mask1 = 0x0001004D0096001DULL; __asm__ volatile( "1: \n\t" @@ -2544,13 +2618,13 @@ void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1) - : [src_ptr] "r"(src_argb0), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0), + : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0), [mask1] "f"(mask1), [shift] "f"(shift), [value] "f"(value), [width] "r"(width) : "memory"); } -void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, +void ARGBToUVJRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -2558,22 +2632,22 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, uint64_t src_rgb1; uint64_t ftmp[12]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x002b0054007f0002; - const uint64_t mask_v = 0x0002007f006b0014; + const uint64_t mask_u = 0x0015002a003f0002; + const uint64_t mask_v = 0x0002003f0035000a; __asm__ volatile( "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[dest0_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" @@ -2581,16 +2655,16 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 
0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" @@ -2607,16 +2681,16 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[dest1_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" @@ -2624,16 +2698,16 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" @@ -2650,16 +2724,16 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[dest2_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" @@ -2667,16 +2741,16 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 
%[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" @@ -2693,16 +2767,16 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[dest3_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" @@ -2710,16 +2784,16 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" @@ -2748,7 +2822,7 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -2759,10 +2833,10 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [sixteen] "f"(0x10) : "memory"); } @@ -4052,10 +4126,10 @@ void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int 
width) { uint64_t tmp0, tmp1; const uint64_t mask0 = 0x0; const uint64_t mask1 = 0x01; - const uint64_t mask2 = 0x00400026004B000FULL; + const uint64_t mask2 = 0x0080004D0096001DULL; const uint64_t mask3 = 0xFF000000FF000000ULL; const uint64_t mask4 = ~mask3; - const uint64_t shift = 0x07; + const uint64_t shift = 0x08; __asm__ volatile( "1: \n\t" @@ -4312,7 +4386,7 @@ void ARGBShadeRow_MMI(const uint8_t* src_argb, : "memory"); } -void ARGBMultiplyRow_MMI(const uint8_t* src_argb0, +void ARGBMultiplyRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -4348,12 +4422,12 @@ void ARGBMultiplyRow_MMI(const uint8_t* src_argb0, [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask) : "memory"); } -void ARGBAddRow_MMI(const uint8_t* src_argb0, +void ARGBAddRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -4375,12 +4449,12 @@ void ARGBAddRow_MMI(const uint8_t* src_argb0, "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [width] "r"(width) : "memory"); } -void ARGBSubtractRow_MMI(const uint8_t* src_argb0, +void ARGBSubtractRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -4402,7 +4476,7 @@ void ARGBSubtractRow_MMI(const uint8_t* src_argb0, "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [width] "r"(width) : "memory"); } @@ -4778,7 +4852,9 @@ void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) { : "memory"); } -void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, int width) { +// TODO: respect YuvConstants +void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, + const struct YuvConstants*, int width) { uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi; const uint64_t mask0 = 0x0; const uint64_t mask1 = 0x55; @@ -4912,10 +4988,10 @@ void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) { : "memory"); } -void MirrorUVRow_MMI(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { +void MirrorSplitUVRow_MMI(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { uint64_t src0, src1, dest0, dest1; const uint64_t mask0 = 0x00ff00ff00ff00ffULL; const uint64_t mask1 = 0x1b; @@ -5476,10 +5552,10 @@ void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { : "memory"); } -// Blend src_argb0 over src_argb1 and store to dst_argb. -// dst_argb may be src_argb0 or src_argb1. +// Blend src_argb over src_argb1 and store to dst_argb. +// dst_argb may be src_argb or src_argb1. // This code mimics the SSSE3 version for better testability. 
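The ARGBToYJRow_MMI and ARGBGrayRow_MMI hunks above double the precision of the full-range (JPEG) luma weights: the packed multiplier moves from the 7-bit set (38, 75, 15) with bias 0x40 and shift 7 to the 8-bit set (77, 150, 29) with bias 0x80 and shift 8. Read low halfword first against the B, G, R, A pixel order, the new mask 0x0001004D0096001DULL is (29, 150, 77, 1). A scalar model of what the pmaddhw/shift sequence computes:

    #include <stdint.h>

    /* Scalar model of the new coefficients (77 + 150 + 29 == 256). */
    static uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)((77 * r + 150 * g + 29 * b + 128) >> 8);
    }
    /* The old mask 0x00010026004B000FULL encoded the same ratios at roughly
       half the precision: (38 * r + 75 * g + 15 * b + 64) >> 7. */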
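The ARGBBlendRow_MMI hunk that follows only renames the first source; the blend itself is unchanged. For reference, the per-channel operation the MMI sequence (and the SSSE3 version it mimics) approximates is source-over compositing with an opaque result, using >> 8 as the usual cheap stand-in for division by 255. This sketch mirrors libyuv's scalar reference formula, not the exact vector instruction order:

    #include <stdint.h>

    /* Per-channel source-over blend: fg + bg * (256 - fg_alpha) / 256,
       clamped to 255. */
    static uint8_t BlendChannel(uint8_t fg, uint8_t bg, uint32_t a) {
      uint32_t v = fg + (((uint32_t)bg * (256 - a)) >> 8);
      return (uint8_t)(v > 255 ? 255 : v);
    }

    /* One BGRA pixel: blend colour channels by src alpha, force alpha 255. */
    static void BlendPixel(const uint8_t* src, const uint8_t* dst, uint8_t* out) {
      uint32_t a = src[3];
      out[0] = BlendChannel(src[0], dst[0], a);
      out[1] = BlendChannel(src[1], dst[1], a);
      out[2] = BlendChannel(src[2], dst[2], a);
      out[3] = 255;
    }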
-void ARGBBlendRow_MMI(const uint8_t* src_argb0, +void ARGBBlendRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -5532,7 +5608,7 @@ void ARGBBlendRow_MMI(const uint8_t* src_argb0, [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo) - : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4), [shift] "f"(shift), [width] "r"(width) @@ -6034,6 +6110,1730 @@ void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) { : "memory"); } +void I444ToARGBRow_MMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y,u,v; + uint64_t b_vec[2],g_vec[2],r_vec[2]; + uint64_t mask = 0xff00ff00ff00ff00ULL; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + __asm__ volatile ( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub + "or %[ub], %[ub], %[mask] \n\t"//must sign extension + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t"//sign extension + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 + + "punpcklbh %[u], %[u], %[zero] \n\t"//u + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + + "punpcklbh %[v], %[v], %[zero] \n\t"//v + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb + "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg + "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x04 
\n\t" + "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [y]"=&f"(y), + [u]"=&f"(u), [v]"=&f"(v), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [alpha]"f"(-1), + [six]"f"(0x6), [five]"f"(0x55), + [mask]"f"(mask) + : "memory" + ); +} + +// Also used for 420 +void I422ToARGBRow_MMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y,u,v; + uint64_t b_vec[2],g_vec[2],r_vec[2]; + uint64_t mask = 0xff00ff00ff00ff00ULL; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub + "or %[ub], %[ub], %[mask] \n\t"//must sign extension + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t"//sign extension + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 + + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t"//u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t"//v + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb + "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg + "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 
%[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), + [u]"=&f"(u), [v]"=&f"(v), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [alpha]"f"(-1), + [six]"f"(0x6), [five]"f"(0x55), + [mask]"f"(mask) + : "memory" + ); +} + +// 10 bit YUV to ARGB +void I210ToARGBRow_MMI(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y,u,v; + uint64_t b_vec[2],g_vec[2],r_vec[2]; + uint64_t mask = 0xff00ff00ff00ff00ULL; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" + + "1: \n\t" + "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t" + "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "psllh %[y], %[y], %[six] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "punpcklhw %[u], %[u], %[u] \n\t" + "psrah %[u], %[u], %[two] \n\t" + "punpcklhw %[v], %[v], %[v] \n\t" + "psrah %[v], %[v], %[two] \n\t" + "pminsh %[u], %[u], %[mask1] \n\t" + "pminsh %[v], %[v], %[mask1] \n\t" + + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t" + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" + "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu 
%[y_ptr], %[y_ptr], 0x08 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), + [u]"=&f"(u), [v]"=&f"(v), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [alpha]"f"(-1), + [six]"f"(0x6), [five]"f"(0x55), + [mask]"f"(mask), [two]"f"(0x02), + [mask1]"f"(0x00ff00ff00ff00ff) + : "memory" + ); +} + +void I422AlphaToARGBRow_MMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y,u,v,a; + uint64_t b_vec[2],g_vec[2],r_vec[2]; + uint64_t mask = 0xff00ff00ff00ff00ULL; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t" + "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 + + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t"//u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t" + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb + "packushb %[g_vec0], %[g_vec0], %[a] \n\t" + "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t"//aaaagggg + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + 
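All of the converters in this block (I444, I422, I422Alpha, I210) share one fixed-point core, visible in the repeated pattern above: Y is widened and scaled by YG with a high-half multiply, the per-channel biases BB/BG/BR are added, the U/V products are subtracted, and the result is shifted down by 6 and packed with saturation. The I210 path differs only in pre-scaling: 10-bit Y is shifted left by 6 instead of byte-replicated, and U/V are shifted right by 2 and clamped to 255 (the pminsh against 0x00ff lanes). A scalar sketch of that core, with parameter names standing in for the constants loaded from byte offsets 0x00-0xc0 of the YuvConstants block:

    #include <stdint.h>

    static uint8_t Clamp8(int32_t v) {
      return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    /* Scalar model of the shared fixed-point core. yg/bb/bg/br/ub/ug/vg/vr
       stand for the values at offsets 0xc0/0x60/0x80/0xa0/0x00/0x20/0x20/0x40
       of the YuvConstants block; the saturating paddsh/psubsh steps are
       modelled with plain 32-bit arithmetic. */
    static void YuvPixel(uint8_t y, uint8_t u, uint8_t v,
                         int yg, int bb, int bg, int br,
                         int ub, int ug, int vg, int vr,
                         uint8_t* b, uint8_t* g, uint8_t* r) {
      /* punpcklbh y,y replicates the byte (y * 0x0101); pmulhuh keeps the
         high 16 bits of the 16x16 product. */
      int32_t y1 = (int32_t)(((uint32_t)y * 0x0101u * (uint32_t)yg) >> 16);
      *b = Clamp8((y1 + bb - u * ub) >> 6);
      *g = Clamp8((y1 + bg - u * ug - v * vg) >> 6);
      *r = Clamp8((y1 + br - v * vr) >> 6);
    }

    /* I210 pre-scaling: 10-bit Y into the high bits, 10-bit U/V down to 8
       (the clamp mirrors the pminsh, although uv10 >> 2 cannot exceed 255). */
    static inline uint16_t ScaleY10(uint16_t y10) { return (uint16_t)(y10 << 6); }
    static inline uint8_t ScaleUV10(uint16_t uv10) {
      uint16_t uv = uv10 >> 2;
      return (uint8_t)(uv > 255 ? 255 : uv);
    }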
"gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), [a]"=&f"(a), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [a_ptr]"r"(src_a), [zero]"f"(0x00), + [six]"f"(0x6), [five]"f"(0x55), + [mask]"f"(mask) + : "memory" + ); +} + +void I422ToRGB24Row_MMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y,u,v; + uint64_t b_vec[2],g_vec[2],r_vec[2]; + uint64_t mask = 0xff00ff00ff00ff00ULL; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 + + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t"//u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t" + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" + "packushb %[g_vec0], %[g_vec0], %[zero] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" + + "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t" + "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t" + "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t" + "psrlw 
%[r_vec1], %[r_vec0], %[rmove1] \n\t" + "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t" + "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" + "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t" + "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" + "pextrh %[r_vec1], %[g_vec1], %[one] \n\t" + "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t" + "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t" + "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t" + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t" + "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask]"f"(mask), + [lmove1]"f"(0x18), [rmove1]"f"(0x8), + [one]"f"(0x1) + : "memory" + ); +} + +void I422ToARGB4444Row_MMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y, u, v; + uint64_t b_vec, g_vec, r_vec, temp; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 + + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t"//u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], 
%[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "and %[g_vec], %[g_vec], %[mask1] \n\t" + "psrlw %[g_vec], %[g_vec], %[four] \n\t" + "psrlw %[r_vec], %[g_vec], %[four] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" + "and %[g_vec], %[g_vec], %[r_vec] \n\t" + + "and %[b_vec], %[b_vec], %[mask1] \n\t" + "psrlw %[b_vec], %[b_vec], %[four] \n\t" + "psrlw %[r_vec], %[b_vec], %[four] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" + "and %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[b_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [dst_argb4444]"r"(dst_argb4444), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask]"f"(0xff00ff00ff00ff00), + [four]"f"(0x4), [mask1]"f"(0xf0f0f0f0f0f0f0f0), + [alpha]"f"(-1) + : "memory" + ); +} + +void I422ToARGB1555Row_MMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y, u, v; + uint64_t b_vec, g_vec, r_vec, temp; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], 
%[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "psrlw %[temp], %[g_vec], %[three] \n\t" + "and %[g_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "or %[g_vec], %[g_vec], %[mask3] \n\t" + + "psrlw %[temp], %[b_vec], %[three] \n\t" + "and %[b_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "or %[b_vec], %[b_vec], %[mask3] \n\t" + + "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" + "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" + "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [dst_argb1555]"r"(dst_argb1555), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [three]"f"(0x3), [mask2]"f"(0x1f0000001f), + [eight]"f"(0x8), [mask3]"f"(0x800000008000), + [lmove5]"f"(0x5) + : "memory" + ); +} + +void I422ToRGB565Row_MMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y, u, v; + uint64_t b_vec, g_vec, r_vec, temp; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + 
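The ARGB4444 and ARGB1555 kernels above, and the RGB565 kernel this setup belongs to, reuse the ARGB math and differ only in the final pack: each 8-bit channel is truncated to 4 or 5 (6 for the 565 green) bits and the fields are OR'd into a halfword; the 1555 variant hard-wires the alpha bit via the 0x800000008000 mask since the rendered alpha is opaque. Per pixel, the vector shift/mask/or sequences reduce to:

    #include <stdint.h>

    /* Per-pixel equivalents of the vector pack sequences (truncating, as
       the asm shifts simply drop the low bits). */
    static uint16_t PackARGB4444(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
      return (uint16_t)((a >> 4) << 12 | (r >> 4) << 8 | (g >> 4) << 4 | (b >> 4));
    }
    static uint16_t PackARGB1555(uint8_t r, uint8_t g, uint8_t b) {
      return (uint16_t)(0x8000 | (r >> 3) << 10 | (g >> 3) << 5 | (b >> 3));
    }
    static uint16_t PackRGB565(uint8_t r, uint8_t g, uint8_t b) {
      return (uint16_t)((r >> 3) << 11 | (g >> 2) << 5 | (b >> 3));
    }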
"punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "psrlh %[temp], %[g_vec], %[three] \n\t" + "and %[g_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + + "psrlh %[temp], %[b_vec], %[three] \n\t" + "and %[b_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + + "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" + "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" + "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [dst_rgb565]"r"(dst_rgb565), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), 
[mask1]"f"(0xff00ff00ff00ff00), + [three]"f"(0x3), [mask2]"f"(0x1f0000001f), + [eight]"f"(0x8), [seven]"f"(0x7), + [lmove5]"f"(0x5) + : "memory" + ); +} + +void NV12ToARGBRow_MMI(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y, u, v; + uint64_t b_vec, g_vec, r_vec, temp; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[vshu] \n\t" + "pshufh %[u], %[u], %[ushu] \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), + [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1) + : "memory" + ); +} + +void NV21ToARGBRow_MMI(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y, u, v; + uint64_t b_vec, g_vec, r_vec, temp; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 
0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[ushu] \n\t" + "pshufh %[u], %[u], %[vshu] \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), + [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1) + : "memory" + ); +} + +void NV12ToRGB24Row_MMI(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y, u, v; + uint64_t b_vec, g_vec, r_vec, temp; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 
0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[vshu] \n\t" + "pshufh %[u], %[u], %[ushu] \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" + "psllw %[temp], %[r_vec], %[lmove1] \n\t" + "or %[g_vec], %[g_vec], %[temp] \n\t" + "psrlw %[temp], %[r_vec], %[rmove1] \n\t" + "pextrh %[temp], %[temp], %[zero] \n\t" + "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[zero] \n\t" + "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[one] \n\t" + "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" + "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" + "or %[b_vec], %[b_vec], %[temp] \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" + "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), + [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1), [lmove1]"f"(0x18), + [one]"f"(0x1), [rmove1]"f"(0x8) + : "memory" + ); +} + +void NV21ToRGB24Row_MMI(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y, u, v; + uint64_t b_vec, g_vec, r_vec, temp; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + 
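Between the NV12 and NV21 kernels the only difference is which pshufh immediate feeds U and which feeds V: after punpcklbh widens four interleaved chroma bytes (C0, C1, C2, C3) to halfwords, the shuffle 0xA0 (lane pattern 0,0,2,2) replicates the even lanes and 0xf5 (pattern 1,1,3,3) the odd lanes, so each chroma sample serves two pixels; the NV21 variants simply swap the two masks. A scalar equivalent of the fetch:

    #include <stdbool.h>
    #include <stdint.h>

    /* Scalar model of the NV12/NV21 chroma fetch: two adjacent pixels share
       one chroma pair; NV21 stores V before U, which the MMI code handles by
       swapping the pshufh masks rather than the loads. */
    static void FetchUV(const uint8_t* chroma, int x, bool nv21,
                        uint8_t* u, uint8_t* v) {
      const uint8_t* pair = chroma + (x / 2) * 2;
      *u = nv21 ? pair[1] : pair[0];
      *v = nv21 ? pair[0] : pair[1];
    }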
"punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[ushu] \n\t" + "pshufh %[u], %[u], %[vshu] \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" + "psllw %[temp], %[r_vec], %[lmove1] \n\t" + "or %[g_vec], %[g_vec], %[temp] \n\t" + "psrlw %[temp], %[r_vec], %[rmove1] \n\t" + "pextrh %[temp], %[temp], %[zero] \n\t" + "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[zero] \n\t" + "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[one] \n\t" + "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" + "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" + "or %[b_vec], %[b_vec], %[temp] \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" + "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), + [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [lmove1]"f"(0x18), [rmove1]"f"(0x8), + [one]"f"(0x1) + : "memory" + ); +} + +void NV12ToRGB565Row_MMI(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y, u, v; + uint64_t b_vec, g_vec, r_vec, temp; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + 
"pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[vshu] \n\t" + "pshufh %[u], %[u], %[ushu] \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "psrlh %[temp], %[g_vec], %[three] \n\t" + "and %[g_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psubb %[y], %[eight], %[three] \n\t"//5 + "psllw %[r_vec], %[r_vec], %[y] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + + "psrlh %[temp], %[b_vec], %[three] \n\t" + "and %[b_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psubb %[y], %[eight], %[three] \n\t"//5 + "psllw %[r_vec], %[r_vec], %[y] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + + "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" + "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" + "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" + "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), + [dst_rgb565]"r"(dst_rgb565), + 
[yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [three]"f"(0x3), [mask2]"f"(0x1f0000001f), + [eight]"f"(0x8), [seven]"f"(0x7) + : "memory" + ); +} + +void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y, u, v; + uint64_t b_vec, g_vec, r_vec, temp; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t" + "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t" + "psrlh %[temp], %[y], %[eight] \n\t" + "pshufh %[u], %[temp], %[ushu] \n\t" + "pshufh %[v], %[temp], %[vshu] \n\t" + + "psrlh %[temp], %[mask1], %[eight] \n\t" + "and %[y], %[y], %[temp] \n\t" + "psllh %[temp], %[y], %[eight] \n\t" + "or %[y], %[y], %[temp] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [yuy2_ptr]"r"(src_yuy2), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1), [eight]"f"(0x8) + : "memory" + ); +} + +void UYVYToARGBRow_MMI(const uint8_t* src_uyvy, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y, u, v; + uint64_t b_vec, g_vec, r_vec, temp; + 
uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t" + "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t" + "psrlh %[temp], %[mask1], %[eight] \n\t" + "and %[temp], %[y], %[temp] \n\t" + "pshufh %[u], %[temp], %[ushu] \n\t" + "pshufh %[v], %[temp], %[vshu] \n\t" + + "psrlh %[y], %[y], %[eight] \n\t" + "psllh %[temp], %[y], %[eight] \n\t" + "or %[y], %[y], %[temp] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [uyvy_ptr]"r"(src_uyvy), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1), [eight]"f"(0x8) + : "memory" + ); +} + +void I422ToRGBARow_MMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + uint64_t y, u, v; + uint64_t b_vec, g_vec, r_vec, temp; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) 
\n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" + + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" + + "punpcklbh %[u], %[u], %[u] \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" + + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" + + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" + + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t" + "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t" + "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [alpha]"f"(-1) + : "memory" + ); +} + +void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width) { + __asm__ volatile ( + "punpcklwd %[v32], %[v32], %[v32] \n\t" + "1: \n\t" + "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t" + "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t" + + "daddi %[width], %[width], -0x04 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" + "bnez %[width], 1b \n\t" + : [v32]"+&f"(v32) + : [dst_ptr]"r"(dst_argb), [width]"r"(width) + : "memory" + ); +} +// clang-format on + +// 10 bit YUV to ARGB #endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) #ifdef __cplusplus diff --git a/files/source/row_msa.cc b/files/source/row_msa.cc index 5c0239a3..b7d5bb5e 100644 --- a/files/source/row_msa.cc +++ b/files/source/row_msa.cc @@ -24,16 +24,14 @@ extern "C" { #define ALPHA_VAL (-1) // Fill YUV -> RGB conversion constants into vectors -#define 
YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \ - { \ - ub = __msa_fill_w(yuvconst->kUVToB[0]); \ - vr = __msa_fill_w(yuvconst->kUVToR[1]); \ - ug = __msa_fill_w(yuvconst->kUVToG[0]); \ - vg = __msa_fill_w(yuvconst->kUVToG[1]); \ - bb = __msa_fill_w(yuvconst->kUVBiasB[0]); \ - bg = __msa_fill_w(yuvconst->kUVBiasG[0]); \ - br = __msa_fill_w(yuvconst->kUVBiasR[0]); \ - yg = __msa_fill_w(yuvconst->kYToRgb[0]); \ +#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \ + { \ + ub = __msa_fill_w(yuvconst->kUVToB[0]); \ + vr = __msa_fill_w(yuvconst->kUVToR[1]); \ + ug = __msa_fill_w(yuvconst->kUVToG[0]); \ + vg = __msa_fill_w(yuvconst->kUVToG[1]); \ + yg = __msa_fill_w(yuvconst->kYToRgb[0]); \ + yb = __msa_fill_w(yuvconst->kYBiasToRgb[0]); \ } // Load YUV 422 pixel data @@ -70,54 +68,52 @@ extern "C" { } // Convert 8 pixels of YUV 420 to RGB. -#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \ - { \ - v8i16 vec0_m, vec1_m; \ - v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ - v4i32 reg5_m, reg6_m, reg7_m; \ - v16i8 zero_m = {0}; \ - \ - vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \ - vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \ - reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \ - reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \ - reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \ - reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \ - reg0_m *= yg; \ - reg1_m *= yg; \ - reg2_m *= ubvr; \ - reg3_m *= ubvr; \ - reg0_m = __msa_srai_w(reg0_m, 16); \ - reg1_m = __msa_srai_w(reg1_m, 16); \ - reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \ - reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \ - reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \ - reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \ - reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \ - reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \ - reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \ - reg5_m = reg0_m - reg5_m; \ - reg6_m = reg1_m - reg6_m; \ - reg2_m = reg0_m - reg2_m; \ - reg3_m = reg1_m - reg3_m; \ - reg7_m = reg0_m - reg7_m; \ - reg4_m = reg1_m - reg4_m; \ - reg5_m += bb; \ - reg6_m += bb; \ - reg7_m += bg; \ - reg4_m += bg; \ - reg2_m += br; \ - reg3_m += br; \ - reg5_m = __msa_srai_w(reg5_m, 6); \ - reg6_m = __msa_srai_w(reg6_m, 6); \ - reg7_m = __msa_srai_w(reg7_m, 6); \ - reg4_m = __msa_srai_w(reg4_m, 6); \ - reg2_m = __msa_srai_w(reg2_m, 6); \ - reg3_m = __msa_srai_w(reg3_m, 6); \ - CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \ - out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \ - out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \ - out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ +#define YUVTORGB(in_y, in_uv, ubvr, ugvg, yg, yb, out_b, out_g, out_r) \ + { \ + v8i16 vec0_m, vec1_m; \ + v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ + v4i32 reg5_m, reg6_m, reg7_m; \ + v16i8 temp_m, zero_m = {0}; \ + \ + vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \ + vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \ + reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \ + reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \ + vec1_m = (v8i16)__msa_subv_h(vec1_m, const_0x80); \ + temp_m = (v16i8)__msa_clti_s_h(vec1_m, 0); \ + reg2_m = (v4i32)__msa_ilvr_h((v8i16)temp_m, (v8i16)vec1_m); \ + reg3_m = (v4i32)__msa_ilvl_h((v8i16)temp_m, (v8i16)vec1_m); \ + reg0_m *= yg; \ + reg1_m *= yg; \ + reg2_m *= ubvr; \ + reg3_m *= ubvr; \ + reg0_m = __msa_srai_w(reg0_m, 16); \ + reg1_m = __msa_srai_w(reg1_m, 16); \ + 
reg0_m += yb; \ + reg1_m += yb; \ + reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \ + reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \ + reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \ + reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \ + reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \ + reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \ + reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \ + reg5_m = reg0_m + reg5_m; \ + reg6_m = reg1_m + reg6_m; \ + reg2_m = reg0_m + reg2_m; \ + reg3_m = reg1_m + reg3_m; \ + reg7_m = reg0_m - reg7_m; \ + reg4_m = reg1_m - reg4_m; \ + reg5_m = __msa_srai_w(reg5_m, 6); \ + reg6_m = __msa_srai_w(reg6_m, 6); \ + reg7_m = __msa_srai_w(reg7_m, 6); \ + reg4_m = __msa_srai_w(reg4_m, 6); \ + reg2_m = __msa_srai_w(reg2_m, 6); \ + reg3_m = __msa_srai_w(reg3_m, 6); \ + CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \ + out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \ + out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \ + out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ } // Pack and Store 8 ARGB values. @@ -155,11 +151,10 @@ extern "C" { } // Loads current and next row of ARGB input and averages it to calculate U and V -#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3) \ +#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3, const_0x0101) \ { \ v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \ v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - v16u8 vec8_m, vec9_m; \ v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \ v8u16 reg8_m, reg9_m; \ \ @@ -195,81 +190,81 @@ extern "C" { reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ - reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \ - reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \ - reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \ - reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \ - argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \ - argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ - src0_m = (v16u8)__msa_ld_b((void*)s, 64); \ - src1_m = (v16u8)__msa_ld_b((void*)s, 80); \ - src2_m = (v16u8)__msa_ld_b((void*)s, 96); \ - src3_m = (v16u8)__msa_ld_b((void*)s, 112); \ - src4_m = (v16u8)__msa_ld_b((void*)t, 64); \ - src5_m = (v16u8)__msa_ld_b((void*)t, 80); \ - src6_m = (v16u8)__msa_ld_b((void*)t, 96); \ - src7_m = (v16u8)__msa_ld_b((void*)t, 112); \ - vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ - vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ - vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ - vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \ - vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \ - vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \ - vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \ - vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \ - reg0_m = __msa_hadd_u_h(vec2_m, vec2_m); \ - reg1_m = __msa_hadd_u_h(vec3_m, vec3_m); \ - reg2_m = __msa_hadd_u_h(vec4_m, vec4_m); \ - reg3_m = __msa_hadd_u_h(vec5_m, vec5_m); \ - reg4_m = __msa_hadd_u_h(vec6_m, vec6_m); \ - reg5_m = __msa_hadd_u_h(vec7_m, vec7_m); \ - reg6_m = __msa_hadd_u_h(vec8_m, vec8_m); \ - reg7_m = __msa_hadd_u_h(vec9_m, vec9_m); \ - reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \ - reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \ - reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, 
(v2i64)reg0_m); \ - reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \ - reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \ - reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ - reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ - reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ - reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \ - reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \ - reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \ - reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \ - argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \ - argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ + reg8_m += const_0x0101; \ + reg9_m += const_0x0101; \ + reg0_m += const_0x0101; \ + reg1_m += const_0x0101; \ + argb0 = (v8u16)__msa_srai_h((v8i16)reg8_m, 1); \ + argb1 = (v8u16)__msa_srai_h((v8i16)reg9_m, 1); \ + argb2 = (v8u16)__msa_srai_h((v8i16)reg0_m, 1); \ + argb3 = (v8u16)__msa_srai_h((v8i16)reg1_m, 1); \ } -// Takes ARGB input and calculates U and V. #define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ - shf0, shf1, shf2, shf3, v_out, u_out) \ + shf0, shf1, shf2, shf3, shift, u_out, v_out) \ { \ - v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - v8u16 reg0_m, reg1_m, reg2_m, reg3_m; \ + v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \ \ - vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0); \ - vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2); \ - vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0); \ - vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2); \ - vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0); \ - vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2); \ - vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0); \ - vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2); \ - reg0_m = __msa_dotp_u_h(vec0_m, const1); \ - reg1_m = __msa_dotp_u_h(vec1_m, const1); \ - reg2_m = __msa_dotp_u_h(vec4_m, const1); \ - reg3_m = __msa_dotp_u_h(vec5_m, const1); \ - reg0_m += const3; \ - reg1_m += const3; \ - reg2_m += const3; \ - reg3_m += const3; \ - reg0_m -= __msa_dotp_u_h(vec2_m, const0); \ - reg1_m -= __msa_dotp_u_h(vec3_m, const0); \ - reg2_m -= __msa_dotp_u_h(vec6_m, const2); \ - reg3_m -= __msa_dotp_u_h(vec7_m, const2); \ - v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m); \ - u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \ + vec0_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \ + vec1_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \ + vec2_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \ + vec3_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \ + vec4_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \ + vec5_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \ + vec6_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \ + vec7_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \ + reg0_m = __msa_dotp_u_w(vec0_m, const0); \ + reg1_m = __msa_dotp_u_w(vec1_m, const0); \ + reg2_m = __msa_dotp_u_w(vec4_m, const0); \ + reg3_m = __msa_dotp_u_w(vec5_m, const0); \ + reg0_m += const1; \ + reg1_m += const1; \ + reg2_m += const1; \ + reg3_m += const1; \ + reg0_m -= (v4u32)__msa_dotp_u_w(vec2_m, const2); \ + reg1_m -= (v4u32)__msa_dotp_u_w(vec3_m, const2); \ + reg2_m -= (v4u32)__msa_dotp_u_w(vec6_m, const3); \ + 
reg3_m -= (v4u32)__msa_dotp_u_w(vec7_m, const3); \ + reg0_m = __msa_srl_w(reg0_m, shift); \ + reg1_m = __msa_srl_w(reg1_m, shift); \ + reg2_m = __msa_srl_w(reg2_m, shift); \ + reg3_m = __msa_srl_w(reg3_m, shift); \ + u_out = (v8u16)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \ + v_out = (v8u16)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ + } + +// Takes ARGB input and calculates U and V. +#define ARGBTOUV_H(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ + shf0, shf1, shf2, shf3, v_out, u_out) \ + { \ + v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \ + \ + vec0_m = __msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \ + vec1_m = __msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \ + vec2_m = __msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \ + vec3_m = __msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \ + vec4_m = __msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \ + vec5_m = __msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \ + vec6_m = __msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \ + vec7_m = __msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \ + reg0_m = __msa_dotp_u_w(vec0_m, const1); \ + reg1_m = __msa_dotp_u_w(vec1_m, const1); \ + reg2_m = __msa_dotp_u_w(vec4_m, const1); \ + reg3_m = __msa_dotp_u_w(vec5_m, const1); \ + reg0_m += (v4u32)const3; \ + reg1_m += (v4u32)const3; \ + reg2_m += (v4u32)const3; \ + reg3_m += (v4u32)const3; \ + reg0_m -= __msa_dotp_u_w(vec2_m, const0); \ + reg1_m -= __msa_dotp_u_w(vec3_m, const0); \ + reg2_m -= __msa_dotp_u_w(vec6_m, const2); \ + reg3_m -= __msa_dotp_u_w(vec7_m, const2); \ + u_out = (v16u8)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ + v_out = (v16u8)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \ + u_out = (v16u8)__msa_pckod_b((v16i8)u_out, (v16i8)u_out); \ + v_out = (v16u8)__msa_pckod_b((v16i8)v_out, (v16i8)v_out); \ } // Load I444 pixel data @@ -285,6 +280,34 @@ extern "C" { out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \ } +#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \ + { \ + v16u8 _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5; \ + v8i16 _reg0, _reg1, _reg2, _reg3, _reg4, _reg5; \ + _tmp0 = (v16u8)__msa_ilvev_b(_tmpb, _nexb); \ + _tmp1 = (v16u8)__msa_ilvod_b(_tmpb, _nexb); \ + _tmp2 = (v16u8)__msa_ilvev_b(_tmpg, _nexg); \ + _tmp3 = (v16u8)__msa_ilvod_b(_tmpg, _nexg); \ + _tmp4 = (v16u8)__msa_ilvev_b(_tmpr, _nexr); \ + _tmp5 = (v16u8)__msa_ilvod_b(_tmpr, _nexr); \ + _reg0 = (v8i16)__msa_hadd_u_h(_tmp0, _tmp0); \ + _reg1 = (v8i16)__msa_hadd_u_h(_tmp1, _tmp1); \ + _reg2 = (v8i16)__msa_hadd_u_h(_tmp2, _tmp2); \ + _reg3 = (v8i16)__msa_hadd_u_h(_tmp3, _tmp3); \ + _reg4 = (v8i16)__msa_hadd_u_h(_tmp4, _tmp4); \ + _reg5 = (v8i16)__msa_hadd_u_h(_tmp5, _tmp5); \ + _reg0 = (v8i16)__msa_aver_u_h(_reg0, _reg1); \ + _reg2 = (v8i16)__msa_aver_u_h(_reg2, _reg3); \ + _reg4 = (v8i16)__msa_aver_u_h(_reg4, _reg5); \ + _reg1 = const_8080 + const_112 * _reg0; \ + _reg3 = const_8080 + const_112 * _reg4; \ + _reg1 = (v8i16)__msa_msubv_h(_reg1, const_74, _reg2); \ + _reg3 = (v8i16)__msa_msubv_h(_reg3, const_94, _reg2); \ + _reg1 = (v8i16)__msa_msubv_h(_reg1, const_38, _reg4); \ + _reg3 = (v8i16)__msa_msubv_h(_reg3, const_18, _reg0); \ + _dst0 = (v16u8)__msa_pckod_b(_reg3, _reg1); \ + } + void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { int x; v16u8 src0, src1, src2, src3; @@ -302,6 +325,20 @@ void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { } } +void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + int x; + v8u16 
src, dst; + v8u16 shuffler = {7, 6, 5, 4, 3, 2, 1, 0}; + src_uv += (width - 8) << 1; + for (x = 0; x < width; x += 8) { + src = LD_UH(src_uv); + dst = __msa_vshf_h(shuffler, src, src); + ST_UH(dst, dst_uv); + src_uv -= 16; + dst_uv += 16; + } +} + void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { int x; v16u8 src0, src1, src2, src3; @@ -376,20 +413,19 @@ void I422ToARGBRow_MSA(const uint8_t* src_y, int x; v16u8 src0, src1, src2; v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8i16 const_0x80 = __msa_ldi_h(0x80); - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { READYUV422(src_y, src_u, src_v, src0, src1, src2); src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); STOREARGB(vec0, vec1, vec2, alpha, dst_argb); src_y += 8; src_u += 4; @@ -407,20 +443,19 @@ void I422ToRGBARow_MSA(const uint8_t* src_y, int x; v16u8 src0, src1, src2; v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8i16 const_0x80 = __msa_ldi_h(0x80); - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { READYUV422(src_y, src_u, src_v, src0, src1, src2); src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); STOREARGB(alpha, vec0, vec1, vec2, dst_argb); src_y += 8; src_u += 4; @@ -440,12 +475,12 @@ void I422AlphaToARGBRow_MSA(const uint8_t* src_y, int64_t data_a; v16u8 src0, src1, src2, src3; v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; v4i32 zero = {0}; + v8i16 const_0x80 = __msa_ldi_h(0x80); - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); @@ -454,8 +489,7 @@ void I422AlphaToARGBRow_MSA(const uint8_t* src_y, READYUV422(src_y, src_u, src_v, src0, src1, src2); src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3); STOREARGB(vec0, vec1, vec2, src3, dst_argb); src_y += 8; 
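The YUVTORGB rewrite above changes the per-pixel arithmetic: U/V are centered at 128 and treated as signed, a single luma bias (yb) applied after the Q16 multiply by yg replaces the three per-channel bias vectors (bb/bg/br), and the chroma products are added or subtracted directly. A minimal scalar sketch of that fixed-point math, one pixel at a time; clamp255 and YuvPixelModel are illustrative helpers, not libyuv functions, and the coefficient arguments stand in for the values the macro loads from YuvConstants:

#include <stdint.h>

/* Clamp a widened intermediate to 0..255, as CLIP_0TO255 does per lane. */
static uint8_t clamp255(int32_t v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* One pixel of the conversion the vector code performs on 8 at once:
   ilvr_b(y, y) replicates y into 16 bits (y * 0x0101), the unsigned Q16
   multiply by yg plus the yb bias forms the luma term, chroma is centered
   at 128, and each channel is narrowed by an arithmetic shift of 6. */
static void YuvPixelModel(uint8_t y, uint8_t u, uint8_t v,
                          int yg, int yb, int ub, int ug, int vg, int vr,
                          uint8_t* b, uint8_t* g, uint8_t* r) {
  int32_t y16 = (int32_t)(((uint32_t)y * 0x0101u * (uint32_t)yg) >> 16) + yb;
  int32_t du = (int32_t)u - 128;  /* subv_h(vec1_m, const_0x80) */
  int32_t dv = (int32_t)v - 128;
  *b = clamp255((y16 + du * ub) >> 6);             /* reg0_m + u*ub      */
  *g = clamp255((y16 - (du * ug + dv * vg)) >> 6); /* reg0_m - dotp ugvg */
  *r = clamp255((y16 + dv * vr) >> 6);             /* reg0_m + v*vr      */
}

The I422* row functions in the surrounding hunks feed this math eight pixels at a time, then pack with STOREARGB or the RGB565/ARGB4444/ARGB1555 bit-packing their hunks show.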
@@ -476,17 +510,17 @@ void I422ToRGB24Row_MSA(const uint8_t* src_y, int64_t data_u, data_v; v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; v8i16 vec0, vec1, vec2, vec3, vec4, vec5; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; v16u8 reg0, reg1, reg2, reg3; v2i64 zero = {0}; + v8i16 const_0x80 = __msa_ldi_h(0x80); v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10}; v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10}; v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10, 11, 29, 12, 13, 30, 14, 15, 31}; - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); @@ -499,10 +533,8 @@ void I422ToRGB24Row_MSA(const uint8_t* src_y, src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8); src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec3, vec4, vec5); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); + YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec3, vec4, vec5); reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3); reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2); @@ -529,24 +561,23 @@ void I422ToRGB565Row_MSA(const uint8_t* src_y, int x; v16u8 src0, src1, src2, dst0; v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; + v8i16 const_0x80 = __msa_ldi_h(0x80); - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { READYUV422(src_y, src_u, src_v, src0, src1, src2); src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec2, vec1); - vec0 = __msa_srai_h(vec0, 3); - vec1 = __msa_srai_h(vec1, 3); - vec2 = __msa_srai_h(vec2, 2); - vec1 = __msa_slli_h(vec1, 11); - vec2 = __msa_slli_h(vec2, 5); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); + vec0 = __msa_srli_h(vec0, 3); + vec1 = __msa_srli_h(vec1, 2); + vec2 = __msa_srli_h(vec2, 3); + vec2 = __msa_slli_h(vec2, 11); + vec1 = __msa_slli_h(vec1, 5); vec0 |= vec1; dst0 = (v16u8)(vec2 | vec0); ST_UB(dst0, dst_rgb565); @@ -568,25 +599,24 @@ void I422ToARGB4444Row_MSA(const uint8_t* src_y, v16u8 src0, src1, src2, dst0; v8i16 vec0, vec1, vec2; v8u16 reg0, reg1, reg2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000); + v8u16 mask = (v8u16)__msa_fill_h(0x00F0); + v8i16 const_0x80 = __msa_ldi_h(0x80); - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + 
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { READYUV422(src_y, src_u, src_v, src0, src1, src2); src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - reg0 = (v8u16)__msa_srai_h(vec0, 4); - reg1 = (v8u16)__msa_srai_h(vec1, 4); - reg2 = (v8u16)__msa_srai_h(vec2, 4); - reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4); - reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); + reg0 = (v8u16)__msa_srli_h(vec0, 4); + reg2 = (v8u16)__msa_srli_h(vec2, 4); + reg1 = (v8u16)__msa_and_v(vec1, mask); + reg2 = (v8u16)__msa_slli_h(reg2, 8); reg1 |= const_0xF000; reg0 |= reg2; dst0 = (v16u8)(reg1 | reg0); @@ -608,23 +638,22 @@ void I422ToARGB1555Row_MSA(const uint8_t* src_y, v16u8 src0, src1, src2, dst0; v8i16 vec0, vec1, vec2; v8u16 reg0, reg1, reg2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000); + v8i16 const_0x80 = __msa_ldi_h(0x80); - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { READYUV422(src_y, src_u, src_v, src0, src1, src2); src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - reg0 = (v8u16)__msa_srai_h(vec0, 3); - reg1 = (v8u16)__msa_srai_h(vec1, 3); - reg2 = (v8u16)__msa_srai_h(vec2, 3); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); + reg0 = (v8u16)__msa_srli_h(vec0, 3); + reg1 = (v8u16)__msa_srli_h(vec1, 3); + reg2 = (v8u16)__msa_srli_h(vec2, 3); reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5); reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10); reg1 |= const_0x8000; @@ -768,7 +797,7 @@ void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, } } -void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ARGBToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; v8u16 reg0, reg1, reg2, reg3, reg4, reg5; @@ -779,10 +808,10 @@ void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48); + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48); vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); @@ -809,38 +838,39 @@ void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); ST_UB(dst0, dst_y); 
- src_argb0 += 64; + src_argb += 64; dst_y += 16; } } -void ARGBToUVRow_MSA(const uint8_t* src_argb0, +void ARGBToUVRow_MSA(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* src_argb0_next = src_argb0 + src_stride_argb; + const uint8_t* src_argb_next = src_argb + src_stride_argb; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; v16u8 dst0, dst1; - v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); - v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); - v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); - v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); - v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); + v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x38); + v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x25); + v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x13); + v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x2f); + v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x09); v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48); - src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64); - src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80); - src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96); - src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112); + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48); + src4 = (v16u8)__msa_ld_b((v16u8*)src_argb, 64); + src5 = (v16u8)__msa_ld_b((v16u8*)src_argb, 80); + src6 = (v16u8)__msa_ld_b((v16u8*)src_argb, 96); + src7 = (v16u8)__msa_ld_b((v16u8*)src_argb, 112); vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); @@ -861,14 +891,14 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0, reg3 = __msa_hadd_u_h(vec5, vec5); reg4 = __msa_hadd_u_h(vec0, vec0); reg5 = __msa_hadd_u_h(vec1, vec1); - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16); - src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32); - src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48); - src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64); - src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80); - src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96); - src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112); + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 48); + src4 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 64); + src5 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 80); + src6 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 96); + src7 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 112); vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); @@ -889,12 +919,18 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0, reg3 += __msa_hadd_u_h(vec5, vec5); reg4 += __msa_hadd_u_h(vec0, vec0); reg5 += __msa_hadd_u_h(vec1, vec1); - reg0 = 
(v8u16)__msa_srai_h((v8i16)reg0, 2); - reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2); - reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2); - reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2); - reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2); - reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2); + reg0 += const_0x0001; + reg1 += const_0x0001; + reg2 += const_0x0001; + reg3 += const_0x0001; + reg4 += const_0x0001; + reg5 += const_0x0001; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 1); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 1); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 1); + reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 1); + reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 1); + reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 1); reg6 = reg0 * const_0x70; reg7 = reg1 * const_0x70; reg8 = reg2 * const_0x4A; @@ -925,8 +961,8 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0, dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); ST_UB(dst0, dst_u); ST_UB(dst1, dst_v); - src_argb0 += 128; - src_argb0_next += 128; + src_argb += 128; + src_argb_next += 128; dst_u += 16; dst_v += 16; } @@ -1153,7 +1189,7 @@ void ARGBToUV444Row_MSA(const uint8_t* src_argb, } } -void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, +void ARGBMultiplyRow_MSA(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -1164,7 +1200,7 @@ void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, v8i16 zero = {0}; for (x = 0; x < width; x += 4) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = (v16u8)__msa_ld_b((void*)src_argb1, 0); vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); @@ -1186,13 +1222,13 @@ void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_argb); - src_argb0 += 16; + src_argb += 16; src_argb1 += 16; dst_argb += 16; } } -void ARGBAddRow_MSA(const uint8_t* src_argb0, +void ARGBAddRow_MSA(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -1200,20 +1236,20 @@ void ARGBAddRow_MSA(const uint8_t* src_argb0, v16u8 src0, src1, src2, src3, dst0, dst1; for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); dst0 = __msa_adds_u_b(src0, src2); dst1 = __msa_adds_u_b(src1, src3); ST_UB2(dst0, dst1, dst_argb, 16); - src_argb0 += 32; + src_argb += 32; src_argb1 += 32; dst_argb += 32; } } -void ARGBSubtractRow_MSA(const uint8_t* src_argb0, +void ARGBSubtractRow_MSA(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -1221,14 +1257,14 @@ void ARGBSubtractRow_MSA(const uint8_t* src_argb0, v16u8 src0, src1, src2, src3, dst0, dst1; for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); dst0 = __msa_subs_u_b(src0, src2); dst1 = __msa_subs_u_b(src1, src3); ST_UB2(dst0, dst1, dst_argb, 16); - src_argb0 += 32; + src_argb += 32; src_argb1 += 32; dst_argb += 32; } @@ -1412,17 +1448,17 @@ void ARGBGrayRow_MSA(const uint8_t* src_argb, 
uint8_t* dst_argb, int width) { int x; v16u8 src0, src1, vec0, vec1, dst0, dst1; v8u16 reg0; - v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26); - v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F); + v16u8 const_0x4D = (v16u8)__msa_ldi_h(0x4D); + v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D); for (x = 0; x < width; x += 8) { src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); - reg0 = __msa_dotp_u_h(vec0, const_0x4B0F); - reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26); - reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7); + reg0 = __msa_dotp_u_h(vec0, const_0x961D); + reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x4D); + reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 8); vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0); vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0); dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0); @@ -1656,56 +1692,51 @@ void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { int x; - v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; - v8u16 reg0, reg1, reg2, reg3, reg4, reg5; - v16u8 dst0; - v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); - v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); - v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); - v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); - v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + v16u8 src0, src1, tmp0, tmp1, tmpb, tmpg, tmpr; + v16u8 reg0, reg1, reg2, dst; + v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r; + v8i16 res0, res1; + v8i16 const_66 = (v8i16)__msa_ldi_h(66); + v8i16 const_129 = (v8i16)__msa_ldi_h(129); + v8i16 const_25 = (v8i16)__msa_ldi_h(25); + v8u16 const_1080 = (v8u16)__msa_fill_h(0x1080); + v16u8 zero = (v16u8)__msa_ldi_b(0); for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_b((void*)src_argb1555, 0); - src1 = (v8u16)__msa_ld_b((void*)src_argb1555, 16); - vec0 = src0 & const_0x1F; - vec1 = src1 & const_0x1F; - src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); - vec2 = src0 & const_0x1F; - vec3 = src1 & const_0x1F; - src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); - vec4 = src0 & const_0x1F; - vec5 = src1 & const_0x1F; - reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); - reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3); - reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2); - reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2); - reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3); - reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); - reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2); - reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2); - reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3); - reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3); - reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2); - reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2); - reg0 *= const_0x19; - reg1 *= const_0x19; - reg2 *= const_0x81; - reg3 *= const_0x81; - reg4 *= const_0x42; - reg5 *= const_0x42; - reg0 += reg2; - reg1 += reg3; - reg0 += reg4; - reg1 += reg5; - reg0 += const_0x1080; - reg1 += const_0x1080; - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); - reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); - ST_UB(dst0, dst_y); + src0 = (v16u8)__msa_ld_b((void*)src_argb1555, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb1555, 16); + tmp0 = (v16u8)__msa_pckev_b(src1, src0); + tmp1 = (v16u8)__msa_pckod_b(src1, src0); + tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F); + tmpg = (v16u8)__msa_srli_b(tmp0, 
5); + reg0 = (v16u8)__msa_andi_b(tmp1, 0x03); + reg0 = (v16u8)__msa_slli_b(reg0, 3); + tmpg = (v16u8)__msa_or_v(tmpg, reg0); + reg1 = (v16u8)__msa_andi_b(tmp1, 0x7C); + tmpr = (v16u8)__msa_srli_b(reg1, 2); + reg0 = (v16u8)__msa_slli_b(tmpb, 3); + reg1 = (v16u8)__msa_slli_b(tmpg, 3); + reg2 = (v16u8)__msa_slli_b(tmpr, 3); + tmpb = (v16u8)__msa_srli_b(tmpb, 2); + tmpg = (v16u8)__msa_srli_b(tmpg, 2); + tmpr = (v16u8)__msa_srli_b(tmpr, 2); + tmpb = (v16u8)__msa_or_v(reg0, tmpb); + tmpg = (v16u8)__msa_or_v(reg1, tmpg); + tmpr = (v16u8)__msa_or_v(reg2, tmpr); + tmpb_r = (v8i16)__msa_ilvr_b(zero, tmpb); + tmpb_l = (v8i16)__msa_ilvl_b(zero, tmpb); + tmpg_r = (v8i16)__msa_ilvr_b(zero, tmpg); + tmpg_l = (v8i16)__msa_ilvl_b(zero, tmpg); + tmpr_r = (v8i16)__msa_ilvr_b(zero, tmpr); + tmpr_l = (v8i16)__msa_ilvl_b(zero, tmpr); + res0 = const_1080 + const_25 * tmpb_r; + res1 = const_1080 + const_25 * tmpb_l; + res0 += const_129 * tmpg_r; + res1 += const_129 * tmpg_l; + res0 += const_66 * tmpr_r; + res1 += const_66 * tmpr_l; + dst = (v16u8)__msa_pckod_b(res1, res0); + ST_UB(dst, dst_y); src_argb1555 += 32; dst_y += 16; } @@ -1713,68 +1744,55 @@ void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { int x; - v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 reg0, reg1, reg2, reg3, reg4, reg5; - v4u32 res0, res1, res2, res3; - v16u8 dst0; - v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019); - v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042); - v8i16 const_0x1080 = __msa_fill_h(0x1080); - v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); - v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); - v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); + v16u8 src0, src1, tmp0, tmp1, tmpb, tmpg, tmpr; + v16u8 reg0, reg1, dst; + v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r; + v8i16 res0, res1; + v8i16 const_66 = (v8i16)__msa_ldi_h(66); + v8i16 const_129 = (v8i16)__msa_ldi_h(129); + v8i16 const_25 = (v8i16)__msa_ldi_h(25); + v8i16 const_1080 = (v8i16)__msa_fill_h(0x1080); + v16u8 zero = __msa_ldi_b(0); for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_b((void*)src_rgb565, 0); - src1 = (v8u16)__msa_ld_b((void*)src_rgb565, 16); - vec0 = src0 & const_0x1F; - vec1 = src0 & const_0x7E0; - vec2 = src0 & const_0xF800; - vec3 = src1 & const_0x1F; - vec4 = src1 & const_0x7E0; - vec5 = src1 & const_0xF800; - reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); - reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); - reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8); - reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); - reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); - reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); - reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); - reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); - reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); - reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); - reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); - reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); - vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0); - vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3); - vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3); - vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2); - vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2); - vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5); - vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5); - res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019); - res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019); - res2 = 
__msa_dotp_u_w(vec2, (v8u16)const_0x810019); - res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019); - res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042); - res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042); - res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042); - res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042); - res0 = (v4u32)__msa_srai_w((v4i32)res0, 8); - res1 = (v4u32)__msa_srai_w((v4i32)res1, 8); - res2 = (v4u32)__msa_srai_w((v4i32)res2, 8); - res3 = (v4u32)__msa_srai_w((v4i32)res3, 8); - vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0); - vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_y); + src0 = (v16u8)__msa_ld_b((void*)src_rgb565, 0); + src1 = (v16u8)__msa_ld_b((void*)src_rgb565, 16); + tmp0 = (v16u8)__msa_pckev_b(src1, src0); + tmp1 = (v16u8)__msa_pckod_b(src1, src0); + tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F); + tmpr = (v16u8)__msa_andi_b(tmp1, 0xF8); + reg1 = (v16u8)__msa_andi_b(tmp1, 0x07); + reg0 = (v16u8)__msa_srli_b(tmp0, 5); + reg1 = (v16u8)__msa_slli_b(reg1, 3); + tmpg = (v16u8)__msa_or_v(reg1, reg0); + reg0 = (v16u8)__msa_slli_b(tmpb, 3); + reg1 = (v16u8)__msa_srli_b(tmpb, 2); + tmpb = (v16u8)__msa_or_v(reg1, reg0); + reg0 = (v16u8)__msa_slli_b(tmpg, 2); + reg1 = (v16u8)__msa_srli_b(tmpg, 4); + tmpg = (v16u8)__msa_or_v(reg1, reg0); + reg0 = (v16u8)__msa_srli_b(tmpr, 5); + tmpr = (v16u8)__msa_or_v(tmpr, reg0); + tmpb_r = (v8i16)__msa_ilvr_b(zero, tmpb); + tmpb_l = (v8i16)__msa_ilvl_b(zero, tmpb); + tmpg_r = (v8i16)__msa_ilvr_b(zero, tmpg); + tmpg_l = (v8i16)__msa_ilvl_b(zero, tmpg); + tmpr_r = (v8i16)__msa_ilvr_b(zero, tmpr); + tmpr_l = (v8i16)__msa_ilvl_b(zero, tmpr); + res0 = const_1080 + const_25 * tmpb_r; + res1 = const_1080 + const_25 * tmpb_l; + res0 += const_129 * tmpg_r; + res1 += const_129 * tmpg_l; + res0 += const_66 * tmpr_r; + res1 += const_66 * tmpr_l; + dst = (v16u8)__msa_pckod_b(res1, res0); + ST_UB(dst, dst_y); src_rgb565 += 32; dst_y += 16; } } -void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; v8u16 vec0, vec1, vec2, vec3; @@ -1789,9 +1807,9 @@ void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v16i8 zero = {0}; for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); @@ -1810,12 +1828,12 @@ void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_y); - src_argb0 += 48; + src_argb += 48; dst_y += 16; } } -void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; v8u16 vec0, vec1, vec2, vec3; @@ -1830,9 +1848,9 @@ void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v16i8 zero = {0}; for (x = 0; x < width; x += 16) { 
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); @@ -1851,7 +1869,7 @@ void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_y); - src_argb0 += 48; + src_argb += 48; dst_y += 16; } } @@ -1865,69 +1883,61 @@ void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, const uint16_t* s = (const uint16_t*)src_argb1555; const uint16_t* t = (const uint16_t*)(src_argb1555 + src_stride_argb1555); int64_t res0, res1; - v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; - v16u8 dst0; - v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); - v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); - v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); - v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); - v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); - v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v16u8 src0, src1, src2, src3, dst; + v16u8 tmp0, tmp1, tmp2, tmp3; + v16u8 reg0, reg1, reg2, reg3; + v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr; + v8i16 const_112 = (v8i16)__msa_ldi_h(0x38); + v8i16 const_74 = (v8i16)__msa_ldi_h(0x25); + v8i16 const_38 = (v8i16)__msa_ldi_h(0x13); + v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F); + v8i16 const_18 = (v8i16)__msa_ldi_h(0x09); + v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080); for (x = 0; x < width; x += 16) { src0 = (v8u16)__msa_ld_b((void*)s, 0); src1 = (v8u16)__msa_ld_b((void*)s, 16); src2 = (v8u16)__msa_ld_b((void*)t, 0); src3 = (v8u16)__msa_ld_b((void*)t, 16); - vec0 = src0 & const_0x1F; - vec1 = src1 & const_0x1F; - vec0 += src2 & const_0x1F; - vec1 += src3 & const_0x1F; - vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); - src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); - src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); - vec2 = src0 & const_0x1F; - vec3 = src1 & const_0x1F; - vec2 += src2 & const_0x1F; - vec3 += src3 & const_0x1F; - vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); - src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); - src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); - vec4 = src0 & const_0x1F; - vec5 = src1 & const_0x1F; - vec4 += src2 & const_0x1F; - vec5 += src3 & const_0x1F; - vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); - vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); - vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); - vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); - vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1); - vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); - vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1); - vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); - vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1); - vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6); - reg0 = vec6 * const_0x70; - reg1 = vec0 * const_0x4A; - reg2 = vec2 * const_0x70; - reg3 = vec0 * const_0x5E; - reg0 += const_0x8080; - reg1 += vec2 * const_0x26; - reg2 += const_0x8080; - reg3 += vec6 * 
const_0x12; - reg0 -= reg1; - reg2 -= reg3; - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); - reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); - res0 = __msa_copy_u_d((v2i64)dst0, 0); - res1 = __msa_copy_u_d((v2i64)dst0, 1); + tmp0 = (v16u8)__msa_pckev_b(src1, src0); + tmp1 = (v16u8)__msa_pckod_b(src1, src0); + tmp2 = (v16u8)__msa_pckev_b(src3, src2); + tmp3 = (v16u8)__msa_pckod_b(src3, src2); + tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F); + nexb = (v16u8)__msa_andi_b(tmp2, 0x1F); + tmpg = (v16u8)__msa_srli_b(tmp0, 5); + nexg = (v16u8)__msa_srli_b(tmp2, 5); + reg0 = (v16u8)__msa_andi_b(tmp1, 0x03); + reg2 = (v16u8)__msa_andi_b(tmp3, 0x03); + reg0 = (v16u8)__msa_slli_b(reg0, 3); + reg2 = (v16u8)__msa_slli_b(reg2, 3); + tmpg = (v16u8)__msa_or_v(tmpg, reg0); + nexg = (v16u8)__msa_or_v(nexg, reg2); + reg1 = (v16u8)__msa_andi_b(tmp1, 0x7C); + reg3 = (v16u8)__msa_andi_b(tmp3, 0x7C); + tmpr = (v16u8)__msa_srli_b(reg1, 2); + nexr = (v16u8)__msa_srli_b(reg3, 2); + reg0 = (v16u8)__msa_slli_b(tmpb, 3); + reg1 = (v16u8)__msa_slli_b(tmpg, 3); + reg2 = (v16u8)__msa_slli_b(tmpr, 3); + tmpb = (v16u8)__msa_srli_b(tmpb, 2); + tmpg = (v16u8)__msa_srli_b(tmpg, 2); + tmpr = (v16u8)__msa_srli_b(tmpr, 2); + tmpb = (v16u8)__msa_or_v(reg0, tmpb); + tmpg = (v16u8)__msa_or_v(reg1, tmpg); + tmpr = (v16u8)__msa_or_v(reg2, tmpr); + reg0 = (v16u8)__msa_slli_b(nexb, 3); + reg1 = (v16u8)__msa_slli_b(nexg, 3); + reg2 = (v16u8)__msa_slli_b(nexr, 3); + nexb = (v16u8)__msa_srli_b(nexb, 2); + nexg = (v16u8)__msa_srli_b(nexg, 2); + nexr = (v16u8)__msa_srli_b(nexr, 2); + nexb = (v16u8)__msa_or_v(reg0, nexb); + nexg = (v16u8)__msa_or_v(reg1, nexg); + nexr = (v16u8)__msa_or_v(reg2, nexr); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst); + res0 = __msa_copy_u_d((v2i64)dst, 0); + res1 = __msa_copy_u_d((v2i64)dst, 1); SD(res0, dst_u); SD(res1, dst_v); s += 16; @@ -1946,68 +1956,57 @@ void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, const uint16_t* s = (const uint16_t*)src_rgb565; const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565); int64_t res0, res1; - v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5; - v16u8 dst0; - v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); - v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); - v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); - v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); - v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); - v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080); - v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); - v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F); + v16u8 src0, src1, src2, src3, dst; + v16u8 tmp0, tmp1, tmp2, tmp3; + v16u8 reg0, reg1, reg2, reg3; + v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr; + v8i16 const_112 = (v8i16)__msa_ldi_h(0x38); + v8i16 const_74 = (v8i16)__msa_ldi_h(0x25); + v8i16 const_38 = (v8i16)__msa_ldi_h(0x13); + v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F); + v8i16 const_18 = (v8i16)__msa_ldi_h(0x09); + v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080); for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_b((void*)s, 0); - src1 = (v8u16)__msa_ld_b((void*)s, 16); - src2 = (v8u16)__msa_ld_b((void*)t, 0); - src3 = (v8u16)__msa_ld_b((void*)t, 16); - vec0 = src0 & const_0x1F; - vec1 = src1 & const_0x1F; - vec0 += src2 & const_0x1F; - vec1 += src3 & const_0x1F; - vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); - src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); - src3 = 
(v8u16)__msa_srai_h((v8i16)src3, 5); - vec2 = src0 & const_0x3F; - vec3 = src1 & const_0x3F; - vec2 += src2 & const_0x3F; - vec3 += src3 & const_0x3F; - vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - src0 = (v8u16)__msa_srai_h((v8i16)src0, 6); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 6); - src2 = (v8u16)__msa_srai_h((v8i16)src2, 6); - src3 = (v8u16)__msa_srai_h((v8i16)src3, 6); - vec4 = src0 & const_0x1F; - vec5 = src1 & const_0x1F; - vec4 += src2 & const_0x1F; - vec5 += src3 & const_0x1F; - vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); - vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); - vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); - vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); - vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1); - vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); - vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1); - vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); - reg0 = vec3 * const_0x70; - reg1 = vec1 * const_0x4A; - reg2 = vec4 * const_0x70; - reg3 = vec1 * const_0x5E; - reg0 += const_32896; - reg1 += vec4 * const_0x26; - reg2 += const_32896; - reg3 += vec3 * const_0x12; - reg0 -= reg1; - reg2 -= reg3; - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); - reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); - res0 = __msa_copy_u_d((v2i64)dst0, 0); - res1 = __msa_copy_u_d((v2i64)dst0, 1); + src0 = (v16u8)__msa_ld_b((void*)s, 0); + src1 = (v16u8)__msa_ld_b((void*)s, 16); + src2 = (v16u8)__msa_ld_b((void*)t, 0); + src3 = (v16u8)__msa_ld_b((void*)t, 16); + tmp0 = (v16u8)__msa_pckev_b(src1, src0); + tmp1 = (v16u8)__msa_pckod_b(src1, src0); + tmp2 = (v16u8)__msa_pckev_b(src3, src2); + tmp3 = (v16u8)__msa_pckod_b(src3, src2); + tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F); + tmpr = (v16u8)__msa_andi_b(tmp1, 0xF8); + nexb = (v16u8)__msa_andi_b(tmp2, 0x1F); + nexr = (v16u8)__msa_andi_b(tmp3, 0xF8); + reg1 = (v16u8)__msa_andi_b(tmp1, 0x07); + reg3 = (v16u8)__msa_andi_b(tmp3, 0x07); + reg0 = (v16u8)__msa_srli_b(tmp0, 5); + reg1 = (v16u8)__msa_slli_b(reg1, 3); + reg2 = (v16u8)__msa_srli_b(tmp2, 5); + reg3 = (v16u8)__msa_slli_b(reg3, 3); + tmpg = (v16u8)__msa_or_v(reg1, reg0); + nexg = (v16u8)__msa_or_v(reg2, reg3); + reg0 = (v16u8)__msa_slli_b(tmpb, 3); + reg1 = (v16u8)__msa_srli_b(tmpb, 2); + reg2 = (v16u8)__msa_slli_b(nexb, 3); + reg3 = (v16u8)__msa_srli_b(nexb, 2); + tmpb = (v16u8)__msa_or_v(reg1, reg0); + nexb = (v16u8)__msa_or_v(reg2, reg3); + reg0 = (v16u8)__msa_slli_b(tmpg, 2); + reg1 = (v16u8)__msa_srli_b(tmpg, 4); + reg2 = (v16u8)__msa_slli_b(nexg, 2); + reg3 = (v16u8)__msa_srli_b(nexg, 4); + tmpg = (v16u8)__msa_or_v(reg1, reg0); + nexg = (v16u8)__msa_or_v(reg2, reg3); + reg0 = (v16u8)__msa_srli_b(tmpr, 5); + reg2 = (v16u8)__msa_srli_b(nexr, 5); + tmpr = (v16u8)__msa_or_v(tmpr, reg0); + nexr = (v16u8)__msa_or_v(nexr, reg2); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst); + res0 = __msa_copy_u_d((v2i64)dst, 0); + res1 = __msa_copy_u_d((v2i64)dst, 1); SD(res0, dst_u); SD(res1, dst_v); s += 16; @@ -2017,26 +2016,27 @@ void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, } } -void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, +void RGB24ToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; int64_t res0, res1; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 inp0, inp1, inp2, inp3, inp4, inp5; v8u16 vec0, vec1, vec2, 
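Both 16-bit-pixel UV rows (ARGB1555ToUVRow_MSA and RGB565ToUVRow_MSA) keep the conventional coefficient names but now load halved values: 0x38/0x25/0x13/0x2F/0x09 where the removed code used 0x70/0x4A/0x26/0x5E/0x12. The likely reason is that the shared RGBTOUV macro consumes two-row sums carrying an extra factor of two; the macro body is outside this hunk, so treat that as inference. For reference, the full-strength BT.601 chroma equations the removed code evaluated inline:

    #include <stdint.h>

    static uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);  /* 128.5*256 bias */
    }

    static uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
    }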
vec3, vec4, vec5, vec6, vec7; v8i16 reg0, reg1, reg2, reg3; v16u8 dst0; - v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70); - v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A); - v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26); - v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E); - v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12); + v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38); + v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25); + v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13); + v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f); + v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09); v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; v16i8 zero = {0}; @@ -2085,10 +2085,14 @@ void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); - reg0 = __msa_srai_h((v8i16)reg0, 2); - reg1 = __msa_srai_h((v8i16)reg1, 2); - reg2 = __msa_srai_h((v8i16)reg2, 2); - reg3 = __msa_srai_h((v8i16)reg3, 2); + reg0 += const_0x0001; + reg1 += const_0x0001; + reg2 += const_0x0001; + reg3 += const_0x0001; + reg0 = __msa_srai_h((v8i16)reg0, 1); + reg1 = __msa_srai_h((v8i16)reg1, 1); + reg2 = __msa_srai_h((v8i16)reg2, 1); + reg3 = __msa_srai_h((v8i16)reg3, 1); vec4 = (v8u16)__msa_pckev_h(reg1, reg0); vec5 = (v8u16)__msa_pckev_h(reg3, reg2); vec6 = (v8u16)__msa_pckod_h(reg1, reg0); @@ -2122,26 +2126,27 @@ void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, } } -void RAWToUVRow_MSA(const uint8_t* src_rgb0, +void RAWToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; int64_t res0, res1; v16u8 inp0, inp1, inp2, inp3, inp4, inp5; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v8i16 reg0, reg1, reg2, reg3; v16u8 dst0; - v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70); - v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A); - v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26); - v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E); - v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12); + v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38); + v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25); + v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13); + v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f); + v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09); v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; v16i8 zero = {0}; @@ -2190,10 +2195,14 @@ void RAWToUVRow_MSA(const uint8_t* src_rgb0, reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); - reg0 = __msa_srai_h(reg0, 2); - reg1 = __msa_srai_h(reg1, 2); - reg2 = __msa_srai_h(reg2, 2); - reg3 = __msa_srai_h(reg3, 2); + reg0 += const_0x0001; + reg1 += const_0x0001; + reg2 += const_0x0001; + reg3 += const_0x0001; + reg0 = __msa_srai_h(reg0, 1); + reg1 = __msa_srai_h(reg1, 1); + reg2 = __msa_srai_h(reg2, 1); + reg3 = __msa_srai_h(reg3, 1); vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); @@ -2236,13 
+2245,13 @@ void NV12ToARGBRow_MSA(const uint8_t* src_y, uint64_t val0, val1; v16u8 src0, src1, res0, res1, dst0, dst1; v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; v16u8 zero = {0}; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8i16 const_0x80 = __msa_ldi_h(0x80); - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); @@ -2251,8 +2260,7 @@ void NV12ToARGBRow_MSA(const uint8_t* src_y, val1 = LD(src_uv); src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); @@ -2273,12 +2281,12 @@ void NV12ToRGB565Row_MSA(const uint8_t* src_y, uint64_t val0, val1; v16u8 src0, src1, dst0; v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; + v8i16 const_0x80 = __msa_ldi_h(0x80); v16u8 zero = {0}; - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); @@ -2287,8 +2295,7 @@ void NV12ToRGB565Row_MSA(const uint8_t* src_y, val1 = LD(src_uv); src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); vec0 = vec0 >> 3; vec1 = (vec1 >> 2) << 5; vec2 = (vec2 >> 3) << 11; @@ -2309,14 +2316,14 @@ void NV21ToARGBRow_MSA(const uint8_t* src_y, uint64_t val0, val1; v16u8 src0, src1, res0, res1, dst0, dst1; v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); v16u8 zero = {0}; v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + v8i16 const_0x80 = __msa_ldi_h(0x80); - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); @@ -2326,8 +2333,7 @@ void NV21ToARGBRow_MSA(const uint8_t* src_y, src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, 
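The RGB24ToUVRow_MSA and RAWToUVRow_MSA hunks above swap the truncating >> 2 (mean of a 2x2 sum) for an add of const_0x0001 followed by >> 1, i.e. a rounded halving; paired with the halved chroma coefficients those functions now load, the overall scale appears unchanged while the truncation bias disappears. A sketch under that assumption:

    static int Subsample2x2(int b00, int b01, int b10, int b11) {
      int sum4 = b00 + b01 + b10 + b11;  /* 2x2 block sum, 0..1020 */
      /* removed: sum4 >> 2 (truncates); added: rounded halving that
         leaves a 2x scale for the halved weights to absorb. */
      return (sum4 + 1) >> 1;
    }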
(v16i8)vec1); dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); @@ -2416,27 +2422,27 @@ void SobelXYRow_MSA(const uint8_t* src_sobelx, } } -void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ARGBToYJRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; - v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F); - v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26); - v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40); + v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D); + v16u8 const_0x4D = (v16u8)__msa_fill_h(0x4D); + v8u16 const_0x80 = (v8u16)__msa_fill_h(0x80); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48); - ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7, + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); + ARGBTOY(src0, src1, src2, src3, const_0x961D, const_0x4D, const_0x80, 8, dst0); ST_UB(dst0, dst_y); - src_argb0 += 64; + src_argb += 64; dst_y += 16; } } -void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200); @@ -2444,19 +2450,19 @@ void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8, dst0); ST_UB(dst0, dst_y); - src_argb0 += 64; + src_argb += 64; dst_y += 16; } } -void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142); @@ -2464,19 +2470,19 @@ void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8, dst0); ST_UB(dst0, dst_y); - src_argb0 += 64; + src_argb += 64; dst_y += 16; } } -void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900); @@ -2484,81 +2490,143 @@ void RGBAToYRow_MSA(const uint8_t* 
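ARGBToYJRow_MSA moves from 7-bit weights (15, 75, 38 with bias 0x40 and shift 7) to 8-bit full-range JPEG luma: 0x1D = 29 for B, 0x96 = 150 for G, 0x4D = 77 for R, bias 0x80, shift 8. The new weights sum to 256, so 0 and 255 map to themselves, with one extra bit of precision over the old path:

    #include <stdint.h>

    static uint8_t ARGBPixelToYJ(uint8_t b, uint8_t g, uint8_t r) {
      return (uint8_t)((29 * b + 150 * g + 77 * r + 128) >> 8);
    }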
src_argb0, uint8_t* dst_y, int width) { v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8, dst0); ST_UB(dst0, dst_y); - src_argb0 += 64; + src_argb += 64; dst_y += 16; } } -void ARGBToUVJRow_MSA(const uint8_t* src_rgb0, +void ARGBToUVJRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* t = src_rgb0 + src_stride_rgb; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 vec0, vec1, vec2, vec3; - v16u8 dst0, dst1; - v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; - v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, - 18, 19, 22, 23, 26, 27, 30, 31}; - v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; - v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30}; - v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F); - v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14); - v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; + v8u16 src0, src1, src2, src3, src4, src5, src6, src7; + v8u16 vec0, vec1, vec2, vec3; + v8u16 dst0, dst1, dst2, dst3; + v16u8 zero = {0}; + v8i16 shuffler0 = {0, 3, 4, 7, 8, 11, 12, 15}; + v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14}; + v8i16 shuffler2 = {2, 3, 6, 7, 10, 11, 14, 15}; + v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13}; + v8u16 const_0x0000003f = (v8u16)__msa_fill_w(0x0000003f); + v4u32 const_0x00008080 = (v8u16)__msa_fill_w(0x00008080); + v8u16 const_0x0015002a = (v8u16)__msa_fill_w(0x0015002a); + v8u16 const_0x0035000a = (v8u16)__msa_fill_w(0x0035000a); + v4i32 shift = __msa_fill_w(0x00000008); for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((void*)s, 0); - src1 = (v16u8)__msa_ld_b((void*)s, 16); - src2 = (v16u8)__msa_ld_b((void*)s, 32); - src3 = (v16u8)__msa_ld_b((void*)s, 48); - src4 = (v16u8)__msa_ld_b((void*)t, 0); - src5 = (v16u8)__msa_ld_b((void*)t, 16); - src6 = (v16u8)__msa_ld_b((void*)t, 32); - src7 = (v16u8)__msa_ld_b((void*)t, 48); - src0 = __msa_aver_u_b(src0, src4); - src1 = __msa_aver_u_b(src1, src5); - src2 = __msa_aver_u_b(src2, src6); - src3 = __msa_aver_u_b(src3, src7); - src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); - src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2); - src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); - src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); - vec0 = __msa_aver_u_b(src4, src6); - vec1 = __msa_aver_u_b(src5, src7); - src0 = (v16u8)__msa_ld_b((void*)s, 64); - src1 = (v16u8)__msa_ld_b((void*)s, 80); - src2 = (v16u8)__msa_ld_b((void*)s, 96); - src3 = (v16u8)__msa_ld_b((void*)s, 112); - src4 = (v16u8)__msa_ld_b((void*)t, 64); - src5 = (v16u8)__msa_ld_b((void*)t, 80); - src6 = (v16u8)__msa_ld_b((void*)t, 96); - src7 = (v16u8)__msa_ld_b((void*)t, 112); - src0 = __msa_aver_u_b(src0, src4); - src1 = __msa_aver_u_b(src1, src5); - src2 = __msa_aver_u_b(src2, src6); - src3 = __msa_aver_u_b(src3, 
src7); - src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); - src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2); - src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); - src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); - vec2 = __msa_aver_u_b(src4, src6); - vec3 = __msa_aver_u_b(src5, src7); - ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54, - const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0, - dst1); - ST_UB(dst0, dst_v); - ST_UB(dst1, dst_u); + src1 = __msa_ld_b((void*)s, 0); + src3 = __msa_ld_b((void*)s, 16); + src5 = __msa_ld_b((void*)t, 0); + src7 = __msa_ld_b((void*)t, 16); + src0 = __msa_ilvr_b(zero, src1); + src1 = __msa_ilvl_b(zero, src1); + src2 = __msa_ilvr_b(zero, src3); + src3 = __msa_ilvl_b(zero, src3); + src4 = __msa_ilvr_b(zero, src5); + src5 = __msa_ilvl_b(zero, src5); + src6 = __msa_ilvr_b(zero, src7); + src7 = __msa_ilvl_b(zero, src7); + src0 += src4; + src1 += src5; + src2 += src6; + src3 += src7; + src4 = __msa_ilvev_d(src1, src0); + src5 = __msa_ilvod_d(src1, src0); + src6 = __msa_ilvev_d(src3, src2); + src7 = __msa_ilvod_d(src3, src2); + vec0 = __msa_aver_u_h(src4, src5); + vec1 = __msa_aver_u_h(src6, src7); + + src1 = __msa_ld_b((void*)s, 32); + src3 = __msa_ld_b((void*)s, 48); + src5 = __msa_ld_b((void*)t, 32); + src7 = __msa_ld_b((void*)t, 48); + src0 = __msa_ilvr_b(zero, src1); + src1 = __msa_ilvl_b(zero, src1); + src2 = __msa_ilvr_b(zero, src3); + src3 = __msa_ilvl_b(zero, src3); + src4 = __msa_ilvr_b(zero, src5); + src5 = __msa_ilvl_b(zero, src5); + src6 = __msa_ilvr_b(zero, src7); + src7 = __msa_ilvl_b(zero, src7); + src0 += src4; + src1 += src5; + src2 += src6; + src3 += src7; + src4 = __msa_ilvev_d(src1, src0); + src5 = __msa_ilvod_d(src1, src0); + src6 = __msa_ilvev_d(src3, src2); + src7 = __msa_ilvod_d(src3, src2); + vec2 = __msa_aver_u_h(src4, src5); + vec3 = __msa_aver_u_h(src6, src7); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080, + const_0x0015002a, const_0x0035000a, shuffler0, shuffler1, + shuffler2, shuffler3, shift, dst0, dst1); + + src1 = __msa_ld_b((void*)s, 64); + src3 = __msa_ld_b((void*)s, 80); + src5 = __msa_ld_b((void*)t, 64); + src7 = __msa_ld_b((void*)t, 80); + src0 = __msa_ilvr_b(zero, src1); + src1 = __msa_ilvl_b(zero, src1); + src2 = __msa_ilvr_b(zero, src3); + src3 = __msa_ilvl_b(zero, src3); + src4 = __msa_ilvr_b(zero, src5); + src5 = __msa_ilvl_b(zero, src5); + src6 = __msa_ilvr_b(zero, src7); + src7 = __msa_ilvl_b(zero, src7); + src0 += src4; + src1 += src5; + src2 += src6; + src3 += src7; + src4 = __msa_ilvev_d(src1, src0); + src5 = __msa_ilvod_d(src1, src0); + src6 = __msa_ilvev_d(src3, src2); + src7 = __msa_ilvod_d(src3, src2); + vec0 = __msa_aver_u_h(src4, src5); + vec1 = __msa_aver_u_h(src6, src7); + + src1 = __msa_ld_b((void*)s, 96); + src3 = __msa_ld_b((void*)s, 112); + src5 = __msa_ld_b((void*)t, 96); + src7 = __msa_ld_b((void*)t, 112); + src0 = __msa_ilvr_b(zero, src1); + src1 = __msa_ilvl_b(zero, src1); + src2 = __msa_ilvr_b(zero, src3); + src3 = __msa_ilvl_b(zero, src3); + src4 = __msa_ilvr_b(zero, src5); + src5 = __msa_ilvl_b(zero, src5); + src6 = __msa_ilvr_b(zero, src7); + src7 = __msa_ilvl_b(zero, src7); + src0 += src4; + src1 += src5; + src2 += src6; + src3 += src7; + src4 = __msa_ilvev_d(src1, src0); + src5 = __msa_ilvod_d(src1, src0); + src6 = __msa_ilvev_d(src3, src2); + src7 = __msa_ilvod_d(src3, src2); + vec2 = __msa_aver_u_h(src4, src5); + vec3 = __msa_aver_u_h(src6, src7); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, 
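The repeated load/ilvr/ilvl/add/ilvev_d/ilvod_d/aver_u_h pattern in the rewritten ARGBToUVJRow_MSA is a rounded 2x2 box filter done in 16 bits: each widened pixel occupies one 64-bit lane (four 16-bit channels), so the even/odd doubleword interleaves pair horizontally adjacent pixels and __msa_aver_u_h takes their rounded mean after the vertical add. One chroma site in scalar form (the result deliberately retains 2x scale):

    #include <stdint.h>

    static uint16_t Box2x2(uint8_t p00, uint8_t p01, uint8_t p10, uint8_t p11) {
      int col0 = p00 + p10;                        /* vertical sums, 0..510 */
      int col1 = p01 + p11;
      return (uint16_t)((col0 + col1 + 1) >> 1);   /* aver_u_h rounds */
    }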
const_0x00008080, + const_0x0015002a, const_0x0035000a, shuffler0, shuffler1, + shuffler2, shuffler3, shift, dst2, dst3); + + dst0 = (v8u16)__msa_pckev_b(dst2, dst0); + dst1 = (v8u16)__msa_pckev_b(dst3, dst1); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); s += 128; t += 128; dst_v += 16; @@ -2566,103 +2634,108 @@ void ARGBToUVJRow_MSA(const uint8_t* src_rgb0, } } -void BGRAToUVRow_MSA(const uint8_t* src_rgb0, +void BGRAToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* t = src_rgb0 + src_stride_rgb; - v16u8 dst0, dst1, vec0, vec1, vec2, vec3; - v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; - v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, - 18, 19, 22, 23, 26, 27, 30, 31}; - v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; - v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29}; - v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); - v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000); - v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; + const uint8_t unused = 0xf; + v8u16 src0, src1, src2, src3; + v16u8 dst0, dst1; + v8i16 shuffler0 = {1, unused, 5, unused, 9, unused, 13, unused}; + v8i16 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15}; + v8i16 shuffler2 = {3, unused, 7, unused, 11, unused, 15, unused}; + v8i16 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14}; + v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); + v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038); + v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); + v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - for (x = 0; x < width; x += 32) { - READ_ARGB(s, t, vec0, vec1, vec2, vec3); - ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A, - const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, - dst1); - ST_UB(dst0, dst_v); - ST_UB(dst1, dst_u); - s += 128; - t += 128; - dst_v += 16; - dst_u += 16; + for (x = 0; x < width; x += 16) { + READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); + ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, + const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2, + shuffler3, dst0, dst1); + *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); + *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); + s += 64; + t += 64; + dst_u += 8; + dst_v += 8; } } -void ABGRToUVRow_MSA(const uint8_t* src_rgb0, +void ABGRToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* t = src_rgb0 + src_stride_rgb; - v16u8 src0, src1, src2, src3; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; + const uint8_t unused = 0xf; + v8u16 src0, src1, src2, src3; v16u8 dst0, dst1; - v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; - v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, - 18, 19, 22, 23, 26, 27, 30, 31}; - v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; - v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30}; - v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26); - v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070); - v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); - v8u16 const_0x8080 = 
(v8u16)__msa_fill_h(0x8080); + v8i16 shuffler0 = {0, unused, 4, unused, 8, unused, 12, unused}; + v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14}; + v8i16 shuffler2 = {2, unused, 6, unused, 10, unused, 14, unused}; + v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13}; + v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); + v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038); + v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); + v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - for (x = 0; x < width; x += 32) { - READ_ARGB(s, t, src0, src1, src2, src3); - ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E, - const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0, - dst1); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - s += 128; - t += 128; - dst_u += 16; - dst_v += 16; + for (x = 0; x < width; x += 16) { + READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); + ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, + const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2, + shuffler3, dst0, dst1); + *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); + *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); + s += 64; + t += 64; + dst_u += 8; + dst_v += 8; } } -void RGBAToUVRow_MSA(const uint8_t* src_rgb0, +void RGBAToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* t = src_rgb0 + src_stride_rgb; - v16u8 dst0, dst1, vec0, vec1, vec2, vec3; - v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; - v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, - 18, 19, 22, 23, 26, 27, 30, 31}; - v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; - v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29}; - v16u8 const_0x125E = (v16u8)__msa_fill_h(0x264A); - v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000); - v16u8 const_0x264A = (v16u8)__msa_fill_h(0x125E); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; + const uint8_t unused = 0xf; + v8u16 src0, src1, src2, src3; + v16u8 dst0, dst1; + v8i16 shuffler0 = {3, unused, 7, unused, 11, unused, 15, unused}; + v8i16 shuffler1 = {2, 1, 6, 5, 10, 9, 14, 13}; + v8i16 shuffler2 = {1, unused, 5, unused, 9, unused, 13, unused}; + v8i16 shuffler3 = {3, 2, 7, 6, 11, 10, 15, 14}; + v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); + v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038); + v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); + v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - for (x = 0; x < width; x += 32) { - READ_ARGB(s, t, vec0, vec1, vec2, vec3); - ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A, - const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, - dst1); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - s += 128; - t += 128; - dst_u += 16; - dst_v += 16; + for (x = 0; x < width; x += 16) { + READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); + ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, + const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2, + shuffler3, dst0, dst1); + *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); + *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); + s += 64; + t += 64; + dst_u += 8; + dst_v += 8; } } @@ -2674,54 +2747,57 @@ void 
I444ToARGBRow_MSA(const uint8_t* src_y, int width) { int x; v16u8 src0, src1, src2, dst0, dst1; - v8u16 vec0, vec1, vec2; + v8i16 vec0, vec1, vec2; v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); v8i16 zero = {0}; + v4i32 const_0x80 = __msa_fill_w(0x80); - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); for (x = 0; x < width; x += 8) { READI444(src_y, src_u, src_v, src0, src1, src2); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); reg0 *= vec_yg; reg1 *= vec_yg; reg0 = __msa_srai_w(reg0, 16); reg1 = __msa_srai_w(reg1, 16); - reg4 = reg0 + vec_br; - reg5 = reg1 + vec_br; - reg2 = reg0 + vec_bg; - reg3 = reg1 + vec_bg; - reg0 += vec_bb; - reg1 += vec_bb; + reg0 += vec_yb; + reg1 += vec_yb; vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2); reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); - reg0 -= reg6 * vec_ub; - reg1 -= reg7 * vec_ub; - reg2 -= reg6 * vec_ug; - reg3 -= reg7 * vec_ug; - reg4 -= reg8 * vec_vr; - reg5 -= reg9 * vec_vr; - reg2 -= reg8 * vec_vg; - reg3 -= reg9 * vec_vg; - reg0 = __msa_srai_w(reg0, 6); - reg1 = __msa_srai_w(reg1, 6); - reg2 = __msa_srai_w(reg2, 6); - reg3 = __msa_srai_w(reg3, 6); - reg4 = __msa_srai_w(reg4, 6); - reg5 = __msa_srai_w(reg5, 6); + reg6 -= const_0x80; + reg7 -= const_0x80; + reg8 -= const_0x80; + reg9 -= const_0x80; + tmp0 = reg0 + reg6 * vec_ub; + tmp1 = reg1 + reg7 * vec_ub; + tmp2 = reg0 + reg8 * vec_vr; + tmp3 = reg1 + reg9 * vec_vr; + tmp4 = reg6 * vec_ug; + tmp5 = reg7 * vec_ug; + tmp4 += reg8 * vec_vg; + tmp5 += reg9 * vec_vg; + tmp4 = reg0 - tmp4; + tmp5 = reg1 - tmp5; + reg0 = __msa_srai_w(tmp0, 6); + reg1 = __msa_srai_w(tmp1, 6); + reg2 = __msa_srai_w(tmp2, 6); + reg3 = __msa_srai_w(tmp3, 6); + reg4 = __msa_srai_w(tmp4, 6); + reg5 = __msa_srai_w(tmp5, 6); CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5); vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); + vec2 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2); dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0); @@ -2734,13 +2810,24 @@ void I444ToARGBRow_MSA(const uint8_t* src_y, } } -void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) { +// TODO - respect YuvConstants +void I400ToARGBRow_MSA(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { int x; +#if defined(__aarch64__) || defined(__arm__) + int ygb = yuvconstants->kUVBiasBGR[3]; + int yg = yuvconstants->kYToRgb[1]; +#else + int ygb = yuvconstants->kYBiasToRgb[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif v16u8 src0, res0, res1, res2, 
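I400ToARGBRow_MSA now reads its Y scale and bias from the caller's YuvConstants (the struct layout differs between the ARM and x86/MSA builds, hence the #if) instead of hard-coding the BT.601 pair 0x4A35 / 0xFB78 (-1160 as int16). Assuming the usual libyuv fixed-point Y path, the per-pixel math is approximately:

    #include <stdint.h>

    static uint8_t I400PixelToGray(uint8_t y, int yg, int ygb) {
      /* yg: 16.16 scale; ygb: bias in 1/64 units -- both assumed. */
      int y1 = (int)(((uint32_t)y * 0x0101u * (uint32_t)yg) >> 16);
      int c = (y1 + ygb) >> 6;
      return (uint8_t)(c < 0 ? 0 : c > 255 ? 255 : c);
    }

With the old constants this maps 16 to 0 and 235 to 255, i.e. studio range to full range.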
res3, res4, dst0, dst1, dst2, dst3; v8i16 vec0, vec1; v4i32 reg0, reg1, reg2, reg3; - v4i32 vec_yg = __msa_fill_w(0x4A35); - v8i16 vec_ygb = __msa_fill_h(0xFB78); + v4i32 vec_yg = __msa_fill_w(yg); + v8i16 vec_ygb = __msa_fill_h(ygb); v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); v8i16 max = __msa_ldi_h(0xFF); v8i16 zero = {0}; @@ -2814,12 +2901,12 @@ void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2, int x; v16u8 src0, src1, src2; v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8i16 const_0x80 = __msa_ldi_h(0x80); - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); @@ -2827,8 +2914,7 @@ void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2, src0 = (v16u8)__msa_ld_b((void*)src_yuy2, 0); src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); - YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); + YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); STOREARGB(vec0, vec1, vec2, alpha, dst_argb); src_yuy2 += 16; dst_argb += 32; @@ -2842,12 +2928,12 @@ void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, int x; v16u8 src0, src1, src2; v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; + v8i16 const_0x80 = __msa_ldi_h(0x80); v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); @@ -2855,8 +2941,7 @@ void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, src0 = (v16u8)__msa_ld_b((void*)src_uyvy, 0); src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); - YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); + YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); STOREARGB(vec0, vec1, vec2, alpha, dst_argb); src_uyvy += 16; dst_argb += 32; @@ -3001,12 +3086,12 @@ void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, } } -void ARGBBlendRow_MSA(const uint8_t* src_argb0, +void ARGBBlendRow_MSA(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int x; - v16u8 src0, src1, src2, src3, dst0, dst1; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v8u16 vec8, vec9, vec10, vec11, vec12, vec13; v8u16 const_256 = (v8u16)__msa_ldi_h(256); @@ -3015,8 +3100,8 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0, v16i8 zero = {0}; for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); @@ -3051,16 +3136,16 @@ 
void ARGBBlendRow_MSA(const uint8_t* src_argb0, vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8); vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8); vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8); - vec0 += vec8; - vec1 += vec9; - vec2 += vec10; - vec3 += vec11; dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst2 = (v16u8)__msa_pckev_b((v16i8)vec9, (v16i8)vec8); + dst3 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10); + dst0 = (v16u8)__msa_adds_u_b(dst0, dst2); + dst1 = (v16u8)__msa_adds_u_b(dst1, dst3); dst0 = __msa_bmnz_v(dst0, const_255, mask); dst1 = __msa_bmnz_v(dst1, const_255, mask); ST_UB2(dst0, dst1, dst_argb, 16); - src_argb0 += 32; + src_argb += 32; src_argb1 += 32; dst_argb += 32; } @@ -3082,7 +3167,7 @@ void ARGBQuantizeRow_MSA(uint8_t* dst_argb, v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31}; v16i8 zero = {0}; - for (x = 0; x < width; x += 8) { + for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((void*)dst_argb, 0); src1 = (v16u8)__msa_ld_b((void*)dst_argb, 16); src2 = (v16u8)__msa_ld_b((void*)dst_argb, 32); @@ -3315,10 +3400,10 @@ void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) { } } -void MirrorUVRow_MSA(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { +void MirrorSplitUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { int x; v16u8 src0, src1, src2, src3; v16u8 dst0, dst1, dst2, dst3; diff --git a/files/source/row_neon.cc b/files/source/row_neon.cc index a12fa790..804ff839 100644 --- a/files/source/row_neon.cc +++ b/files/source/row_neon.cc @@ -10,8 +10,6 @@ #include "libyuv/row.h" -#include <stdio.h> - #ifdef __cplusplus namespace libyuv { extern "C" { @@ -21,90 +19,118 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ !defined(__aarch64__) +// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are +// reserved. + +// q0: Y uint16x8_t +// d2: U uint8x8_t +// d3: V uint8x8_t + // Read 8 Y, 4 U and 4 V from 422 #define READYUV422 \ - "vld1.8 {d0}, [%0]! \n" \ - "vld1.32 {d2[0]}, [%1]! \n" \ - "vld1.32 {d2[1]}, [%2]! \n" + "vld1.8 {d0}, [%[src_y]]! \n" \ + "vld1.32 {d2[0]}, [%[src_u]]! \n" \ + "vld1.32 {d2[1]}, [%[src_v]]! \n" \ + "vmov.u8 d1, d0 \n" \ + "vmovl.u8 q1, d2 \n" \ + "vzip.u8 d0, d1 \n" \ + "vsli.u16 q1, q1, #8 \n" // Read 8 Y, 8 U and 8 V from 444 #define READYUV444 \ - "vld1.8 {d0}, [%0]! \n" \ - "vld1.8 {d2}, [%1]! \n" \ - "vld1.8 {d3}, [%2]! \n" \ - "vpaddl.u8 q1, q1 \n" \ - "vrshrn.u16 d2, q1, #1 \n" + "vld1.8 {d0}, [%[src_y]]! \n" \ + "vld1.8 {d2}, [%[src_u]]! \n" \ + "vmovl.u8 q0, d0 \n" \ + "vld1.8 {d3}, [%[src_v]]! \n" \ + "vsli.u16 q0, q0, #8 \n" // Read 8 Y, and set 4 U and 4 V to 128 #define READYUV400 \ - "vld1.8 {d0}, [%0]! \n" \ - "vmov.u8 d2, #128 \n" + "vld1.8 {d0}, [%[src_y]]! \n" \ + "vmov.u8 q1, #128 \n" \ + "vmovl.u8 q0, d0 \n" \ + "vsli.u16 q0, q0, #8 \n" // Read 8 Y and 4 UV from NV12 -#define READNV12 \ - "vld1.8 {d0}, [%0]! \n" \ - "vld1.8 {d2}, [%1]! \n" \ - "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" +#define READNV12 \ + "vld1.8 {d0}, [%[src_y]]! \n" \ + "vld1.8 {d2}, [%[src_uv]]! \n" \ + "vmov.u8 d1, d0 \n" \ + "vmov.u8 d3, d2 \n" \ + "vzip.u8 d0, d1 \n" \ + "vsli.u16 d2, d2, #8 \n" /* Duplicate low byte (U) */ \ + "vsri.u16 d3, d3, #8 \n" /* Duplicate high byte (V) */ // Read 8 Y and 4 VU from NV21 #define READNV21 \ - "vld1.8 {d0}, [%0]! \n" \ - "vld1.8 {d2}, [%1]! 
\n" \ - "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \ - "vuzp.u8 d3, d2 \n" \ - "vtrn.u32 d2, d3 \n" + "vld1.8 {d0}, [%[src_y]]! \n" \ + "vld1.8 {d2}, [%[src_vu]]! \n" \ + "vmov.u8 d1, d0 \n" \ + "vmov.u8 d3, d2 \n" \ + "vzip.u8 d0, d1 \n" \ + "vsri.u16 d2, d2, #8 \n" /* Duplicate high byte (U) */ \ + "vsli.u16 d3, d3, #8 \n" /* Duplicate low byte (V) */ // Read 8 YUY2 #define READYUY2 \ - "vld2.8 {d0, d2}, [%0]! \n" \ + "vld2.8 {d0, d2}, [%[src_yuy2]]! \n" \ + "vmovl.u8 q0, d0 \n" \ "vmov.u8 d3, d2 \n" \ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" + "vsli.u16 q0, q0, #8 \n" \ + "vsli.u16 d2, d2, #8 \n" \ + "vsri.u16 d3, d3, #8 \n" // Read 8 UYVY #define READUYVY \ - "vld2.8 {d2, d3}, [%0]! \n" \ - "vmov.u8 d0, d3 \n" \ + "vld2.8 {d2, d3}, [%[src_uyvy]]! \n" \ + "vmovl.u8 q0, d3 \n" \ "vmov.u8 d3, d2 \n" \ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" - -#define YUVTORGB_SETUP \ - "vld1.8 {d24}, [%[kUVToRB]] \n" \ - "vld1.8 {d25}, [%[kUVToG]] \n" \ - "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \ - "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \ - "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \ - "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n" - -#define YUVTORGB \ - "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */ \ - "vmull.u8 q9, d2, d25 \n" /* u/v G component */ \ - "vmovl.u8 q0, d0 \n" /* Y */ \ - "vmovl.s16 q10, d1 \n" \ - "vmovl.s16 q0, d0 \n" \ - "vmul.s32 q10, q10, q15 \n" \ - "vmul.s32 q0, q0, q15 \n" \ - "vqshrun.s32 d0, q0, #16 \n" \ - "vqshrun.s32 d1, q10, #16 \n" /* Y */ \ - "vadd.s16 d18, d19 \n" \ - "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */ \ - "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */ \ - "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/ \ - "vaddw.u16 q1, q1, d16 \n" \ - "vaddw.u16 q10, q10, d17 \n" \ - "vaddw.u16 q3, q3, d18 \n" \ - "vqadd.s16 q8, q0, q13 \n" /* B */ \ - "vqadd.s16 q9, q0, q14 \n" /* R */ \ - "vqadd.s16 q0, q0, q4 \n" /* G */ \ - "vqadd.s16 q8, q8, q1 \n" /* B */ \ - "vqadd.s16 q9, q9, q10 \n" /* R */ \ - "vqsub.s16 q0, q0, q3 \n" /* G */ \ - "vqshrun.s16 d20, q8, #6 \n" /* B */ \ - "vqshrun.s16 d22, q9, #6 \n" /* R */ \ - "vqshrun.s16 d21, q0, #6 \n" /* G */ + "vsli.u16 q0, q0, #8 \n" \ + "vsli.u16 d2, d2, #8 \n" \ + "vsri.u16 d3, d3, #8 \n" + +#define YUVTORGB_SETUP \ + "vld4.8 {d26[], d27[], d28[], d29[]}, [%[kUVCoeff]] \n" \ + "vld1.16 {d31[]}, [%[kRGBCoeffBias]]! \n" \ + "vld1.16 {d20[], d21[]}, [%[kRGBCoeffBias]]! \n" \ + "vld1.16 {d22[], d23[]}, [%[kRGBCoeffBias]]! 
\n" \ + "vld1.16 {d24[], d25[]}, [%[kRGBCoeffBias]] \n" + +// q0: B uint16x8_t +// q1: G uint16x8_t +// q2: R uint16x8_t + +// Convert from YUV to 2.14 fixed point RGB +#define YUVTORGB \ + "vmull.u16 q2, d1, d31 \n" \ + "vmull.u8 q8, d3, d29 \n" /* DGV */ \ + "vmull.u16 q0, d0, d31 \n" \ + "vmlal.u8 q8, d2, d28 \n" /* DG */ \ + "vqshrn.u32 d0, q0, #16 \n" \ + "vqshrn.u32 d1, q2, #16 \n" /* Y */ \ + "vmull.u8 q9, d2, d26 \n" /* DB */ \ + "vmull.u8 q2, d3, d27 \n" /* DR */ \ + "vadd.u16 q4, q0, q11 \n" /* G */ \ + "vadd.u16 q2, q0, q2 \n" /* R */ \ + "vadd.u16 q0, q0, q9 \n" /* B */ \ + "vqsub.u16 q1, q4, q8 \n" /* G */ \ + "vqsub.u16 q0, q0, q10 \n" /* B */ \ + "vqsub.u16 q2, q2, q12 \n" /* R */ + +// Convert from 2.14 fixed point RGB To 8 bit RGB +#define RGBTORGB8 \ + "vqshrn.u16 d4, q2, #6 \n" /* R */ \ + "vqshrn.u16 d2, q1, #6 \n" /* G */ \ + "vqshrn.u16 d0, q0, #6 \n" /* B */ + +#define YUVTORGB_REGS \ + "q0", "q1", "q2", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "d31" + +#define STORERGBA \ + "vmov.u8 d1, d0 \n" \ + "vmov.u8 d3, d4 \n" \ + "vmov.u8 d0, d6 \n" \ + "vst4.8 {d0, d1, d2, d3}, [%[dst_rgba]]! \n" void I444ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, @@ -114,22 +140,20 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d6, #255 \n" "1: \n" READYUV444 YUVTORGB - "subs %4, %4, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void I422ToARGBRow_NEON(const uint8_t* src_y, @@ -140,22 +164,46 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); +} + +void I444AlphaToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV444 YUVTORGB + RGBTORGB8 + "vld1.8 {d6}, [%[src_a]]! \n" + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [src_a] "+r"(src_a), // %[src_a] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void I422AlphaToARGBRow_NEON(const uint8_t* src_y, @@ -168,22 +216,20 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB - "subs %5, %5, #8 \n" - "vld1.8 {d23}, [%3]! \n" - "vst4.8 {d20, d21, d22, d23}, [%4]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + RGBTORGB8 + "vld1.8 {d6}, [%[src_a]]! \n" + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [src_a] "+r"(src_a), // %[src_a] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void I422ToRGBARow_NEON(const uint8_t* src_y, @@ -194,22 +240,18 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d19, #255 \n" // YUVTORGB modified d19 - "vst4.8 {d19, d20, d21, d22}, [%3]! 
\n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgba), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + RGBTORGB8 "subs %[width], %[width], #8 \n" STORERGBA + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgba] "+r"(dst_rgba), // %[dst_rgba] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void I422ToRGB24Row_NEON(const uint8_t* src_y, @@ -220,29 +262,28 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vst3.8 {d20, d21, d22}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb24), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); } #define ARGBTORGB565 \ - "vshll.u8 q0, d22, #8 \n" /* R */ \ - "vshll.u8 q8, d21, #8 \n" /* G */ \ - "vshll.u8 q9, d20, #8 \n" /* B */ \ - "vsri.16 q0, q8, #5 \n" /* RG */ \ - "vsri.16 q0, q9, #11 \n" /* RGB */ + "vshll.u8 q2, d4, #8 \n" /* R */ \ + "vshll.u8 q1, d2, #8 \n" /* G */ \ + "vshll.u8 q0, d0, #8 \n" /* B */ \ + "vsri.16 q2, q1, #5 \n" /* RG */ \ + "vsri.16 q2, q0, #11 \n" /* RGB */ void I422ToRGB565Row_NEON(const uint8_t* src_y, const uint8_t* src_u, @@ -252,31 +293,29 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" ARGBTORGB565 - "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb565), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + RGBTORGB8 "subs %[width], %[width], #8 \n" ARGBTORGB565 + "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565. 
+ "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); } #define ARGBTOARGB1555 \ - "vshll.u8 q0, d23, #8 \n" /* A */ \ - "vshll.u8 q8, d22, #8 \n" /* R */ \ - "vshll.u8 q9, d21, #8 \n" /* G */ \ - "vshll.u8 q10, d20, #8 \n" /* B */ \ - "vsri.16 q0, q8, #1 \n" /* AR */ \ - "vsri.16 q0, q9, #6 \n" /* ARG */ \ - "vsri.16 q0, q10, #11 \n" /* ARGB */ + "vshll.u8 q3, d6, #8 \n" /* A */ \ + "vshll.u8 q2, d4, #8 \n" /* R */ \ + "vshll.u8 q1, d2, #8 \n" /* G */ \ + "vshll.u8 q0, d0, #8 \n" /* B */ \ + "vsri.16 q3, q2, #1 \n" /* AR */ \ + "vsri.16 q3, q1, #6 \n" /* ARG */ \ + "vsri.16 q3, q0, #11 \n" /* ARGB */ void I422ToARGB1555Row_NEON(const uint8_t* src_y, const uint8_t* src_u, @@ -287,30 +326,28 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" ARGBTOARGB1555 - "vst1.8 {q0}, [%3]! \n" // store 8 pixels - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb1555), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vmov.u8 d6, #0xff \n" ARGBTOARGB1555 + "vst1.8 {q3}, [%[dst_argb1555]]! \n" // store 8 pixels RGB1555. + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "q3"); } #define ARGBTOARGB4444 \ - "vshr.u8 d20, d20, #4 \n" /* B */ \ - "vbic.32 d21, d21, d4 \n" /* G */ \ - "vshr.u8 d22, d22, #4 \n" /* R */ \ - "vbic.32 d23, d23, d4 \n" /* A */ \ - "vorr d0, d20, d21 \n" /* BG */ \ - "vorr d1, d22, d23 \n" /* RA */ \ + "vshr.u8 d0, d0, #4 \n" /* B */ \ + "vbic.32 d2, d2, d7 \n" /* G */ \ + "vshr.u8 d4, d4, #4 \n" /* R */ \ + "vbic.32 d6, d6, d7 \n" /* A */ \ + "vorr d0, d0, d2 \n" /* BG */ \ + "vorr d1, d4, d6 \n" /* RA */ \ "vzip.u8 d0, d1 \n" /* BGRA */ void I422ToARGB4444Row_NEON(const uint8_t* src_y, @@ -321,56 +358,53 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "vmov.u8 d4, #0x0f \n" // vbic bits to clear - "1: \n" - - READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" ARGBTOARGB4444 - "vst1.8 {q0}, [%3]! 
\n" // store 8 pixels - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb4444), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + "vmov.u8 d6, #255 \n" + "vmov.u8 d7, #0x0f \n" // vbic bits to clear + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %[width], %[width], #8 \n" ARGBTOARGB4444 + "vst1.8 {q0}, [%[dst_argb4444]]! \n" // store 8 pixels + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "q3"); } -void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { +void I400ToARGBRow_NEON(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { asm volatile( YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d6, #255 \n" "1: \n" READYUV400 YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB] "r"(&kYuvI601Constants.kUVToRB), - [kUVToG] "r"(&kYuvI601Constants.kUVToG), - [kUVBiasBGR] "r"(&kYuvI601Constants.kUVBiasBGR), - [kYToRgb] "r"(&kYuvI601Constants.kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d23, #255 \n" + "vmov.u8 d23, #255 \n" "1: \n" - "vld1.8 {d20}, [%0]! \n" - "vmov d21, d20 \n" - "vmov d22, d20 \n" - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" + "vld1.8 {d20}, [%0]! \n" + "vmov d21, d20 \n" + "vmov d22, d20 \n" + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -383,22 +417,20 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READNV12 YUVTORGB - "subs %3, %3, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15"); + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void NV21ToARGBRow_NEON(const uint8_t* src_y, @@ -406,22 +438,20 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READNV21 YUVTORGB - "subs %3, %3, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15"); + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READNV21 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_vu] "+r"(src_vu), // %[src_vu] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void NV12ToRGB24Row_NEON(const uint8_t* src_y, @@ -430,25 +460,19 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile( - YUVTORGB_SETUP - - "1: \n" - - READNV12 YUVTORGB - "subs %3, %3, #8 \n" - "vst3.8 {d20, d21, d22}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_rgb24), // %2 - "+r"(width) // %3 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + "vmov.u8 d6, #255 \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); } void NV21ToRGB24Row_NEON(const uint8_t* src_y, @@ -457,25 +481,19 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile( - YUVTORGB_SETUP - - "1: \n" - - READNV21 YUVTORGB - "subs %3, %3, #8 \n" - "vst3.8 {d20, d21, d22}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_rgb24), // %2 - "+r"(width) // %3 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + "vmov.u8 d6, #255 \n" + "1: \n" READNV21 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! 
\n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_vu] "+r"(src_vu), // %[src_vu] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); } void NV12ToRGB565Row_NEON(const uint8_t* src_y, @@ -485,62 +503,56 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READNV12 YUVTORGB - "subs %3, %3, #8 \n" ARGBTORGB565 - "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_rgb565), // %2 - "+r"(width) // %3 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + "vmov.u8 d6, #255 \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" ARGBTORGB565 + "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); } void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READYUY2 YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15"); + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READYUY2 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" + "bgt 1b \n" + : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READUYVY YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15"); + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READUYVY YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" + "bgt 1b \n" + : [src_uyvy] "+r"(src_uyvy), // %[src_uyvy] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. @@ -550,11 +562,11 @@ void SplitUVRow_NEON(const uint8_t* src_uv, int width) { asm volatile( "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV - "subs %3, %3, #16 \n" // 16 processed per loop - "vst1.8 {q0}, [%1]! \n" // store U - "vst1.8 {q1}, [%2]! \n" // store V - "bgt 1b \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV + "subs %3, %3, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%1]! \n" // store U + "vst1.8 {q1}, [%2]! \n" // store V + "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -564,6 +576,52 @@ void SplitUVRow_NEON(const uint8_t* src_uv, ); } +// Reads 16 byte Y's from tile and writes out 16 Y's. +// MM21 Y tiles are 16x32 so src_tile_stride = 512 bytes +// MM21 UV tiles are 8x16 so src_tile_stride = 256 bytes +// width measured in bytes so 8 UV = 16. +void DetileRow_NEON(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width) { + asm volatile( + "1: \n" + "vld1.16 {q0}, [%0], %3 \n" // load 16 bytes + "subs %2, %2, #16 \n" // 16 processed per loop + "pld [%0, 1792] \n" + "vst1.16 {q0}, [%1]! \n" // store 16 bytes + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride) // %3 + : "cc", "memory", "q0" // Clobber List + ); +} + +// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V. +void DetileSplitUVRow_NEON(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "vld2.8 {d0, d1}, [%0], %4 \n" + "subs %3, %3, #16 \n" + "pld [%0, 1792] \n" + "vst1.8 {d0}, [%1]! \n" + "vst1.8 {d1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(src_tile_stride) // %4 + : "cc", "memory", "d0", "d1" // Clobber List + ); +} + // Reads 16 U's and V's and writes out 16 pairs of UV. void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, @@ -571,11 +629,11 @@ void MergeUVRow_NEON(const uint8_t* src_u, int width) { asm volatile( "1: \n" - "vld1.8 {q0}, [%0]! \n" // load U - "vld1.8 {q1}, [%1]! \n" // load V - "subs %3, %3, #16 \n" // 16 processed per loop - "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV - "bgt 1b \n" + "vld1.8 {q0}, [%0]! \n" // load U + "vld1.8 {q1}, [%1]! \n" // load V + "subs %3, %3, #16 \n" // 16 processed per loop + "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV + "bgt 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -593,13 +651,13 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, int width) { asm volatile( "1: \n" - "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB - "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB - "subs %4, %4, #16 \n" // 16 processed per loop - "vst1.8 {q0}, [%1]! \n" // store R - "vst1.8 {q1}, [%2]! \n" // store G - "vst1.8 {q2}, [%3]! \n" // store B - "bgt 1b \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB + "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB + "subs %4, %4, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%1]! \n" // store R + "vst1.8 {q1}, [%2]! \n" // store G + "vst1.8 {q2}, [%3]! 
\n" // store B + "bgt 1b \n" : "+r"(src_rgb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 @@ -618,13 +676,13 @@ void MergeRGBRow_NEON(const uint8_t* src_r, int width) { asm volatile( "1: \n" - "vld1.8 {q0}, [%0]! \n" // load R - "vld1.8 {q1}, [%1]! \n" // load G - "vld1.8 {q2}, [%2]! \n" // load B - "subs %4, %4, #16 \n" // 16 processed per loop - "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB - "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB - "bgt 1b \n" + "vld1.8 {q0}, [%0]! \n" // load R + "vld1.8 {q1}, [%1]! \n" // load G + "vld1.8 {q2}, [%2]! \n" // load B + "subs %4, %4, #16 \n" // 16 processed per loop + "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB + "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB + "bgt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 @@ -635,14 +693,341 @@ void MergeRGBRow_NEON(const uint8_t* src_r, ); } +// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b, dst_a. +void SplitARGBRow_NEON(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB + "subs %5, %5, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%3]! \n" // store B + "vst1.8 {q1}, [%2]! \n" // store G + "vst1.8 {q2}, [%1]! \n" // store R + "vst1.8 {q3}, [%4]! \n" // store A + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(dst_a), // %4 + "+r"(width) // %5 + : // Input registers + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time +void MergeARGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q2}, [%0]! \n" // load R + "vld1.8 {q1}, [%1]! \n" // load G + "vld1.8 {q0}, [%2]! \n" // load B + "vld1.8 {q3}, [%3]! \n" // load A + "subs %5, %5, #16 \n" // 16 processed per loop + "vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB + "vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : // Input registers + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b. +void SplitXRGBRow_NEON(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB + "subs %4, %4, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%3]! \n" // store B + "vst1.8 {q1}, [%2]! \n" // store G + "vst1.8 {q2}, [%1]! \n" // store R + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time +void MergeXRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 q3, #255 \n" // load A(255) + "1: \n" + "vld1.8 {q2}, [%0]! \n" // load R + "vld1.8 {q1}, [%1]! \n" // load G + "vld1.8 {q0}, [%2]! 
\n" // load B + "subs %4, %4, #16 \n" // 16 processed per loop + "vst4.8 {d0, d2, d4, d6}, [%3]! \n" // store 8 ARGB + "vst4.8 {d1, d3, d5, d7}, [%3]! \n" // next 8 ARGB + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void MergeXR30Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width) { + int shift = 10 - depth; + asm volatile( + "vmov.u32 q14, #1023 \n" + "vdup.32 q15, %5 \n" + "1: \n" + "vld1.16 {d4}, [%2]! \n" // B + "vld1.16 {d2}, [%1]! \n" // G + "vld1.16 {d0}, [%0]! \n" // R + "vmovl.u16 q2, d4 \n" // B + "vmovl.u16 q1, d2 \n" // G + "vmovl.u16 q0, d0 \n" // R + "vshl.u32 q2, q2, q15 \n" // 000B + "vshl.u32 q1, q1, q15 \n" + "vshl.u32 q0, q0, q15 \n" + "vmin.u32 q2, q2, q14 \n" + "vmin.u32 q1, q1, q14 \n" + "vmin.u32 q0, q0, q14 \n" + "vsli.u32 q2, q1, #10 \n" // 00GB + "vsli.u32 q2, q0, #20 \n" // 0RGB + "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30) + "subs %4, %4, #4 \n" + "vst1.8 {q2}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 + : "r"(shift) // %5 + : "memory", "cc", "q0", "q1", "q2", "q14", "q15"); +} + +void MergeXR30Row_10_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int /* depth */, + int width) { + asm volatile( + "vmov.u32 q14, #1023 \n" + "1: \n" + "vld1.16 {d4}, [%2]! \n" // B + "vld1.16 {d2}, [%1]! \n" // G + "vld1.16 {d0}, [%0]! \n" // R + "vmovl.u16 q2, d4 \n" // 000B + "vmovl.u16 q1, d2 \n" // G + "vmovl.u16 q0, d0 \n" // R + "vmin.u32 q2, q2, q14 \n" + "vmin.u32 q1, q1, q14 \n" + "vmin.u32 q0, q0, q14 \n" + "vsli.u32 q2, q1, #10 \n" // 00GB + "vsli.u32 q2, q0, #20 \n" // 0RGB + "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30) + "subs %4, %4, #4 \n" + "vst1.8 {q2}, [%3]! \n" + "bgt 1b \n" + "3: \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 + : + : "memory", "cc", "q0", "q1", "q2", "q14"); +} + +void MergeAR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + asm volatile( + + "vdup.u16 q15, %6 \n" + "vdup.u16 q14, %7 \n" + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! \n" // B + "vld1.16 {q3}, [%3]! \n" // A + "vmin.u16 q2, q2, q14 \n" + "vmin.u16 q1, q1, q14 \n" + "vmin.u16 q0, q0, q14 \n" + "vmin.u16 q3, q3, q14 \n" + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "vshl.u16 q3, q3, q15 \n" + "subs %5, %5, #8 \n" + "vst4.16 {d0, d2, d4, d6}, [%4]! \n" + "vst4.16 {d1, d3, d5, d7}, [%4]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_ar64), // %4 + "+r"(width) // %5 + : "r"(shift), // %6 + "r"(mask) // %7 + : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); +} + +void MergeXR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + asm volatile( + + "vmov.u8 q3, #0xff \n" // A (0xffff) + "vdup.u16 q15, %5 \n" + "vdup.u16 q14, %6 \n" + "1: \n" + "vld1.16 {q2}, [%0]! 
\n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! \n" // B + "vmin.u16 q2, q2, q14 \n" + "vmin.u16 q1, q1, q14 \n" + "vmin.u16 q0, q0, q14 \n" + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "subs %4, %4, #8 \n" + "vst4.16 {d0, d2, d4, d6}, [%3]! \n" + "vst4.16 {d1, d3, d5, d7}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar64), // %3 + "+r"(width) // %4 + : "r"(shift), // %5 + "r"(mask) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); +} + +void MergeARGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width) { + int shift = 8 - depth; + asm volatile( + + "vdup.16 q15, %6 \n" + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! \n" // B + "vld1.16 {q3}, [%3]! \n" // A + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "vshl.u16 q3, q3, q15 \n" + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d1, q1 \n" + "vqmovn.u16 d2, q2 \n" + "vqmovn.u16 d3, q3 \n" + "subs %5, %5, #8 \n" + "vst4.8 {d0, d1, d2, d3}, [%4]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : "r"(shift) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); +} + +void MergeXRGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width) { + int shift = 8 - depth; + asm volatile( + + "vdup.16 q15, %5 \n" + "vmov.u8 d6, #0xff \n" // A (0xff) + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! \n" // B + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "vqmovn.u16 d5, q2 \n" + "vqmovn.u16 d4, q1 \n" + "vqmovn.u16 d3, q0 \n" + "subs %4, %4, #8 \n" + "vst4.u8 {d3, d4, d5, d6}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(shift) // %5 + : "memory", "cc", "q0", "q1", "q2", "d6", "q15"); +} + // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "1: \n" - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 - "subs %2, %2, #32 \n" // 32 processed per loop - "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 - "bgt 1b \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 + "subs %2, %2, #32 \n" // 32 processed per loop + "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 + "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 // Output registers @@ -654,11 +1039,11 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { // SetRow writes 'width' bytes using an 8 bit value repeated. void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { asm volatile( - "vdup.8 q0, %2 \n" // duplicate 16 bytes + "vdup.8 q0, %2 \n" // duplicate 16 bytes "1: \n" - "subs %1, %1, #16 \n" // 16 bytes per loop - "vst1.8 {q0}, [%0]! \n" // store - "bgt 1b \n" + "subs %1, %1, #16 \n" // 16 bytes per loop + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" : "+r"(dst), // %0 "+r"(width) // %1 : "r"(v8) // %2 @@ -668,11 +1053,11 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { // ARGBSetRow writes 'width' pixels using an 32 bit value repeated. 
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { asm volatile( - "vdup.u32 q0, %2 \n" // duplicate 4 ints + "vdup.u32 q0, %2 \n" // duplicate 4 ints "1: \n" - "subs %1, %1, #4 \n" // 4 pixels per loop - "vst1.8 {q0}, [%0]! \n" // store - "bgt 1b \n" + "subs %1, %1, #4 \n" // 4 pixels per loop + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" : "+r"(dst), // %0 "+r"(width) // %1 : "r"(v32) // %2 @@ -682,41 +1067,62 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( // Start at end of source row. - "mov r3, #-16 \n" - "add %0, %0, %2 \n" - "sub %0, #16 \n" + "add %0, %0, %2 \n" + "sub %0, %0, #32 \n" // 32 bytes per loop "1: \n" - "vld1.8 {q0}, [%0], r3 \n" // src -= 16 - "subs %2, #16 \n" // 16 pixels per loop. - "vrev64.8 q0, q0 \n" - "vst1.8 {d1}, [%1]! \n" // dst += 16 - "vst1.8 {d0}, [%1]! \n" - "bgt 1b \n" + "vld1.8 {q1, q2}, [%0], %3 \n" // src -= 32 + "subs %2, #32 \n" // 32 pixels per loop. + "vrev64.8 q0, q2 \n" + "vrev64.8 q1, q1 \n" + "vswp d0, d1 \n" + "vswp d2, d3 \n" + "vst1.8 {q0, q1}, [%1]! \n" // dst += 32 + "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 + : "r"(-32) // %3 + : "cc", "memory", "q0", "q1", "q2"); +} + +void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + asm volatile( + // Start at end of source row. + "mov r12, #-16 \n" + "add %0, %0, %2, lsl #1 \n" + "sub %0, #16 \n" + + "1: \n" + "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 + "subs %2, #8 \n" // 8 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst2.8 {d0, d1}, [%1]! \n" // dst += 16 + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 : - : "cc", "memory", "r3", "q0"); + : "cc", "memory", "r12", "q0"); } -void MirrorUVRow_NEON(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { +void MirrorSplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile( // Start at end of source row. - "mov r12, #-16 \n" - "add %0, %0, %3, lsl #1 \n" - "sub %0, #16 \n" + "mov r12, #-16 \n" + "add %0, %0, %3, lsl #1 \n" + "sub %0, #16 \n" "1: \n" - "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 - "subs %3, #8 \n" // 8 pixels per loop. - "vrev64.8 q0, q0 \n" - "vst1.8 {d0}, [%1]! \n" // dst += 8 - "vst1.8 {d1}, [%2]! \n" - "bgt 1b \n" + "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 + "subs %3, #8 \n" // 8 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst1.8 {d0}, [%1]! \n" // dst += 8 + "vst1.8 {d1}, [%2]! \n" + "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -725,37 +1131,57 @@ void MirrorUVRow_NEON(const uint8_t* src_uv, : "cc", "memory", "r12", "q0"); } -void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { +void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - // Start at end of source row. - "mov r3, #-16 \n" - "add %0, %0, %2, lsl #2 \n" - "sub %0, #16 \n" + "add %0, %0, %2, lsl #2 \n" + "sub %0, #32 \n" "1: \n" - "vld1.8 {q0}, [%0], r3 \n" // src -= 16 - "subs %2, #4 \n" // 4 pixels per loop. - "vrev64.32 q0, q0 \n" - "vst1.8 {d1}, [%1]! \n" // dst += 16 - "vst1.8 {d0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "r3", "q0"); + "vld4.8 {d0, d1, d2, d3}, [%0], %3 \n" // src -= 32 + "subs %2, #8 \n" // 8 pixels per loop. + "vrev64.8 d0, d0 \n" + "vrev64.8 d1, d1 \n" + "vrev64.8 d2, d2 \n" + "vrev64.8 d3, d3 \n" + "vst4.8 {d0, d1, d2, d3}, [%1]! 
\n" // dst += 32 + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(-32) // %3 + : "cc", "memory", "d0", "d1", "d2", "d3"); +} + +void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_rgb24, + int width) { + src_rgb24 += width * 3 - 24; + asm volatile( + "1: \n" + "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24 + "subs %2, #8 \n" // 8 pixels per loop. + "vrev64.8 d0, d0 \n" + "vrev64.8 d1, d1 \n" + "vrev64.8 d2, d2 \n" + "vst3.8 {d0, d1, d2}, [%1]! \n" // dst += 24 + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : "r"(-24) // %3 + : "cc", "memory", "d0", "d1", "d2"); } void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d4, #255 \n" // Alpha + "vmov.u8 d4, #255 \n" // Alpha "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -766,13 +1192,13 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d4, #255 \n" // Alpha + "vmov.u8 d4, #255 \n" // Alpha "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -781,15 +1207,31 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { ); } +void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { + asm volatile( + "vmov.u8 d0, #255 \n" // Alpha + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgba), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { asm volatile( "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of // RGB24. - "bgt 1b \n" + "bgt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_rgb24), // %1 "+r"(width) // %2 @@ -814,13 +1256,13 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d3, #255 \n" // Alpha + "vmov.u8 d3, #255 \n" // Alpha "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld1.8 {q0}, [%0]! 
\n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. RGB565TOARGB - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -860,13 +1302,13 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d3, #255 \n" // Alpha + "vmov.u8 d3, #255 \n" // Alpha "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. ARGB1555TOARGB - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -889,13 +1331,13 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d3, #255 \n" // Alpha + "vmov.u8 d3, #255 \n" // Alpha "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. ARGB4444TOARGB - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -909,27 +1351,28 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of - // RGB24. - "bgt 1b \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst3.8 {d0, d2, d4}, [%1]! \n" // store 16 RGB24 pixels. + "vst3.8 {d1, d3, d5}, [%1]! \n" + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb24), // %1 "+r"(width) // %2 : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List ); } void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { asm volatile( "1: \n" - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. - "bgt 1b \n" + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_raw), // %1 "+r"(width) // %2 @@ -941,10 +1384,10 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { asm volatile( "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. - "subs %2, %2, #16 \n" // 16 processed per loop. - "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. - "bgt 1b \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. 
+ "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -956,10 +1399,10 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { asm volatile( "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. - "subs %2, %2, #16 \n" // 16 processed per loop. - "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. - "bgt 1b \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -974,11 +1417,11 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, int width) { asm volatile( "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. - "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - "vst1.8 {d1}, [%1]! \n" // store 8 U. - "vst1.8 {d3}, [%2]! \n" // store 8 V. - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.8 {d1}, [%1]! \n" // store 8 U. + "vst1.8 {d3}, [%2]! \n" // store 8 V. + "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -994,11 +1437,11 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, int width) { asm volatile( "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. - "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - "vst1.8 {d0}, [%1]! \n" // store 8 U. - "vst1.8 {d2}, [%2]! \n" // store 8 V. - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.8 {d0}, [%1]! \n" // store 8 U. + "vst1.8 {d2}, [%2]! \n" // store 8 V. + "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1014,16 +1457,16 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_v, int width) { asm volatile( - "add %1, %0, %1 \n" // stride + src_yuy2 + "add %1, %0, %1 \n" // stride + src_yuy2 "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. - "vrhadd.u8 d1, d1, d5 \n" // average rows of U - "vrhadd.u8 d3, d3, d7 \n" // average rows of V - "vst1.8 {d1}, [%2]! \n" // store 8 U. - "vst1.8 {d3}, [%3]! \n" // store 8 V. - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. + "vrhadd.u8 d1, d1, d5 \n" // average rows of U + "vrhadd.u8 d3, d3, d7 \n" // average rows of V + "vst1.8 {d1}, [%2]! \n" // store 8 U. + "vst1.8 {d3}, [%3]! \n" // store 8 V. + "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(stride_yuy2), // %1 "+r"(dst_u), // %2 @@ -1041,16 +1484,16 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_v, int width) { asm volatile( - "add %1, %0, %1 \n" // stride + src_uyvy + "add %1, %0, %1 \n" // stride + src_uyvy "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. - "vrhadd.u8 d0, d0, d4 \n" // average rows of U - "vrhadd.u8 d2, d2, d6 \n" // average rows of V - "vst1.8 {d0}, [%2]! \n" // store 8 U. - "vst1.8 {d2}, [%3]! \n" // store 8 V. - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. 
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. + "vrhadd.u8 d0, d0, d4 \n" // average rows of U + "vrhadd.u8 d2, d2, d6 \n" // average rows of V + "vst1.8 {d0}, [%2]! \n" // store 8 U. + "vst1.8 {d2}, [%3]! \n" // store 8 V. + "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(stride_uyvy), // %1 "+r"(dst_u), // %2 @@ -1068,14 +1511,14 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, const uint8_t* shuffler, int width) { asm volatile( - "vld1.8 {q2}, [%3] \n" // shuffler + "vld1.8 {q2}, [%3] \n" // shuffler "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 4 pixels. - "subs %2, %2, #4 \n" // 4 processed per loop - "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels - "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels - "vst1.8 {q1}, [%1]! \n" // store 4. - "bgt 1b \n" + "vld1.8 {q0}, [%0]! \n" // load 4 pixels. + "subs %2, %2, #4 \n" // 4 processed per loop + "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels + "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + "vst1.8 {q1}, [%1]! \n" // store 4. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -1091,12 +1534,12 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y, int width) { asm volatile( "1: \n" - "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys - "vld1.8 {d1}, [%1]! \n" // load 8 Us - "vld1.8 {d3}, [%2]! \n" // load 8 Vs - "subs %4, %4, #16 \n" // 16 pixels - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. - "bgt 1b \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + "vld1.8 {d1}, [%1]! \n" // load 8 Us + "vld1.8 {d3}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -1113,12 +1556,12 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, int width) { asm volatile( "1: \n" - "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys - "vld1.8 {d0}, [%1]! \n" // load 8 Us - "vld1.8 {d2}, [%2]! \n" // load 8 Vs - "subs %4, %4, #16 \n" // 16 pixels - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. - "bgt 1b \n" + "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + "vld1.8 {d0}, [%1]! \n" // load 8 Us + "vld1.8 {d2}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -1133,16 +1576,16 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTORGB565 - "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. - "bgt 1b \n" + "vst1.8 {q2}, [%1]! \n" // store 8 pixels RGB565. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb565), // %1 "+r"(width) // %2 : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); + : "cc", "memory", "q0", "q1", "q2", "d6"); } void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, @@ -1150,21 +1593,21 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, const uint32_t dither4, int width) { asm volatile( - "vdup.32 d2, %2 \n" // dither4 + "vdup.32 d7, %2 \n" // dither4 "1: \n" - "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d20, d20, d2 \n" - "vqadd.u8 d21, d21, d2 \n" - "vqadd.u8 d22, d22, d2 \n" // add for dither + "vld4.8 {d0, d2, d4, d6}, [%1]! 
\n" // load 8 pixels of ARGB. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d0, d0, d7 \n" + "vqadd.u8 d2, d2, d7 \n" + "vqadd.u8 d4, d4, d7 \n" // add for dither ARGBTORGB565 - "vst1.8 {q0}, [%0]! \n" // store 8 RGB565. - "bgt 1b \n" + "vst1.8 {q2}, [%0]! \n" // store 8 RGB565. + "bgt 1b \n" : "+r"(dst_rgb) // %0 : "r"(src_argb), // %1 "r"(dither4), // %2 "r"(width) // %3 - : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"); + : "cc", "memory", "q0", "q1", "q2", "q3"); } void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, @@ -1172,58 +1615,35 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB1555 - "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555. - "bgt 1b \n" + "vst1.8 {q3}, [%1]! \n" // store 8 ARGB1555. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb1555), // %1 "+r"(width) // %2 : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); + : "cc", "memory", "q0", "q1", "q2", "q3"); } void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb4444, int width) { asm volatile( - "vmov.u8 d4, #0x0f \n" // bits to clear with + "vmov.u8 d7, #0x0f \n" // bits to clear with // vbic. "1: \n" - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB4444 - "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444. - "bgt 1b \n" + "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb4444), // %1 "+r"(width) // %2 : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); -} - -void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); + : "cc", "memory", "q0", "q1", "q2", "q3"); } void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, @@ -1231,11 +1651,11 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "vst1.8 {q3}, [%1]! \n" // store 16 A's. - "bgt 1b \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q3}, [%1]! \n" // store 16 A's. 
+ "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_a), // %1 "+r"(width) // %2 @@ -1244,59 +1664,36 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, ); } -void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient - "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient - "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); -} - // 8x1 pixels. void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( - "vmov.u8 d24, #112 \n" // UB / VR 0.875 + "vmov.u8 d24, #112 \n" // UB / VR 0.875 // coefficient - "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient - "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient - "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient - "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlsl.u8 q2, d1, d25 \n" // G - "vmlsl.u8 q2, d2, d26 \n" // R - "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned - - "vmull.u8 q3, d2, d24 \n" // R - "vmlsl.u8 q3, d1, d28 \n" // G - "vmlsl.u8 q3, d0, d27 \n" // B - "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned - - "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V - - "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. - "bgt 1b \n" + "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient + "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient + "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient + "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlsl.u8 q2, d1, d25 \n" // G + "vmlsl.u8 q2, d2, d26 \n" // R + + "vmull.u8 q3, d2, d24 \n" // R + "vmlsl.u8 q3, d1, d28 \n" // G + "vmlsl.u8 q3, d0, d27 \n" // B + + "vaddhn.u16 d0, q2, q15 \n" // +128 -> unsigned + "vaddhn.u16 d1, q3, q15 \n" // +128 -> unsigned + + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1312,13 +1709,11 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ - "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ - "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ - "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ - "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ + "vaddhn.u16 d0, q8, q15 \n" /* +128 -> unsigned */ \ + "vaddhn.u16 d1, q9, q15 \n" /* +128 -> unsigned */ // clang-format on // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. 
@@ -1328,34 +1723,34 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, uint8_t* dst_v, int width) { asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 32 processed per loop. + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(src_stride_argb), // %1 "+r"(dst_u), // %2 @@ -1374,34 +1769,34 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, uint8_t* dst_v, int width) { asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient - "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient - "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient - "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient - "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. 
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 32 processed per loop. + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(src_stride_argb), // %1 "+r"(dst_u), // %2 @@ -1413,40 +1808,132 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, ); } +// TODO(fbarchard): Subsample match C code. +void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgb24 + "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(src_stride_rgb24), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// TODO(fbarchard): Subsample match C code. 
+void RAWToUVJRow_NEON(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_raw + "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + RGBTOUV(q2, q1, q0) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(src_stride_raw), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + void BGRAToUVRow_NEON(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_bgra - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. - "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. - "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q1, q1, #1 \n" // 2x average - "vrshr.u16 q2, q2, #1 \n" - "vrshr.u16 q3, q3, #1 \n" - - "subs %4, %4, #16 \n" // 32 processed per loop. + "add %1, %0, %1 \n" // src_stride + src_bgra + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. + "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. 
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. + "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q1, q1, #1 \n" // 2x average + "vrshr.u16 q2, q2, #1 \n" + "vrshr.u16 q3, q3, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q3, q2, q1) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_bgra), // %0 "+r"(src_stride_bgra), // %1 "+r"(dst_u), // %2 @@ -1464,34 +1951,34 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr, uint8_t* dst_v, int width) { asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_abgr - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. - "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. - "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 32 processed per loop. + "add %1, %0, %1 \n" // src_stride + src_abgr + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q2, q1, q0) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" : "+r"(src_abgr), // %0 "+r"(src_stride_abgr), // %1 "+r"(dst_u), // %2 @@ -1509,34 +1996,34 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba, uint8_t* dst_v, int width) { asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_rgba - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. - "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. - "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 32 processed per loop. + "add %1, %0, %1 \n" // src_stride + src_rgba + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. + "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. + "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_rgba), // %0 "+r"(src_stride_rgba), // %1 "+r"(dst_u), // %2 @@ -1554,34 +2041,34 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_v, int width) { asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_rgb24 - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. - "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. 
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 32 processed per loop. + "add %1, %0, %1 \n" // src_stride + src_rgb24 + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_rgb24), // %0 "+r"(src_stride_rgb24), // %1 "+r"(dst_u), // %2 @@ -1599,34 +2086,34 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, uint8_t* dst_v, int width) { asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_raw - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. - "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. - "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. - "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. - "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. - "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 32 processed per loop. + "add %1, %0, %1 \n" // src_stride + src_raw + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. 
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q2, q1, q0) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_raw), // %0 "+r"(src_stride_raw), // %1 "+r"(dst_u), // %2 @@ -1645,55 +2132,55 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_v, int width) { asm volatile( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 // coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. RGB565TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. RGB565TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. + "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. RGB565TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. RGB565TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. 
- "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_rgb565), // %0 "+r"(src_stride_rgb565), // %1 "+r"(dst_u), // %2 @@ -1711,55 +2198,55 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_v, int width) { asm volatile( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 // coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. RGB555TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. RGB555TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. RGB555TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. 
RGB555TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_argb1555), // %0 "+r"(src_stride_argb1555), // %1 "+r"(dst_u), // %2 @@ -1777,55 +2264,46 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_v, int width) { asm volatile( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 // coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. 
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q0, q4, #1 \n" // 2x average + "vrshr.u16 q1, q5, #1 \n" + "vrshr.u16 q2, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" : "+r"(src_argb4444), // %0 "+r"(src_stride_argb4444), // %1 "+r"(dst_u), // %2 @@ -1838,21 +2316,21 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant + "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. RGB565TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1864,21 +2342,21 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant + "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. 
- "subs %2, %2, #8 \n" // 8 processed per loop. + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. ARGB1555TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1890,21 +2368,21 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { asm volatile( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant + "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. ARGB4444TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1912,119 +2390,276 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); } -void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { +void ARGBToAR64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width) { asm volatile( - "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // R - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // B - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 + "1: \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q2}, [%0]! \n" + "vmov.u8 q1, q0 \n" + "vmov.u8 q3, q2 \n" + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels + "vst2.8 {q2, q3}, [%1]! 
\n" // store 4 pixels + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 "+r"(width) // %2 : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); + : "cc", "memory", "q0", "q1", "q2", "q3"); } -void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { +static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, + 10, 9, 8, 11, 14, 13, 12, 15}; + +void ARGBToAB64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width) { asm volatile( - "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // R - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // B - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 + "vld1.8 {q4}, [%3] \n" // shuffler + + "1: \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q2}, [%0]! \n" + "vtbl.8 d2, {d0, d1}, d8 \n" + "vtbl.8 d3, {d0, d1}, d9 \n" + "vtbl.8 d6, {d4, d5}, d8 \n" + "vtbl.8 d7, {d4, d5}, d9 \n" + "vmov.u8 q0, q1 \n" + "vmov.u8 q2, q3 \n" + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels + "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "r"(&kShuffleARGBToABGR) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); +} + +void AR64ToARGBRow_NEON(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width) { + asm volatile( + "1: \n" + "vld1.16 {q0}, [%0]! \n" + "vld1.16 {q1}, [%0]! \n" + "vld1.16 {q2}, [%0]! \n" + "vld1.16 {q3}, [%0]! \n" + "vshrn.u16 d0, q0, #8 \n" + "vshrn.u16 d1, q1, #8 \n" + "vshrn.u16 d4, q2, #8 \n" + "vshrn.u16 d5, q3, #8 \n" + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst1.8 {q0}, [%1]! \n" // store 4 pixels + "vst1.8 {q2}, [%1]! \n" // store 4 pixels + "bgt 1b \n" + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 "+r"(width) // %2 : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15}; + +void AB64ToARGBRow_NEON(const uint16_t* src_ab64, + uint8_t* dst_argb, + int width) { + asm volatile( + "vld1.8 {d8}, [%3] \n" // shuffler + + "1: \n" + "vld1.16 {q0}, [%0]! \n" + "vld1.16 {q1}, [%0]! \n" + "vld1.16 {q2}, [%0]! \n" + "vld1.16 {q3}, [%0]! \n" + "vtbl.8 d0, {d0, d1}, d8 \n" + "vtbl.8 d1, {d2, d3}, d8 \n" + "vtbl.8 d4, {d4, d5}, d8 \n" + "vtbl.8 d5, {d6, d7}, d8 \n" + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst1.8 {q0}, [%1]! \n" // store 4 pixels + "vst1.8 {q2}, [%1]! 
\n" // store 4 pixels + "bgt 1b \n" + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kShuffleAB64ToARGB) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); +} + +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; + uint16_t pad; +}; + +// RGB to JPeg coefficients +// B * 0.1140 coefficient = 29 +// G * 0.5870 coefficient = 150 +// R * 0.2990 coefficient = 77 +// Add 0.5 = 0x80 +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 128, + 0}; + +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; + +// RGB to BT.601 coefficients +// B * 0.1016 coefficient = 25 +// G * 0.5078 coefficient = 129 +// R * 0.2578 coefficient = 66 +// Add 16.5 = 0x1080 + +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080, + 0}; + +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, + 0x1080, + 0}; + +// ARGB expects first 3 values to contain RGB and 4th value is ignored. +void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + asm volatile( + "vld1.8 {d0}, [%3] \n" // load rgbconstants + "vdup.u8 d20, d0[0] \n" + "vdup.u8 d21, d0[1] \n" + "vdup.u8 d22, d0[2] \n" + "vdup.u16 q12, d0[2] \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop. + "vmull.u8 q8, d0, d20 \n" // B + "vmull.u8 q9, d1, d20 \n" + "vmlal.u8 q8, d2, d21 \n" // G + "vmlal.u8 q9, d3, d21 \n" + "vmlal.u8 q8, d4, d22 \n" // R + "vmlal.u8 q9, d5, d22 \n" + "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y + "vaddhn.u16 d1, q9, q12 \n" + "vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(rgbconstants) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22", + "q12"); +} + +void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants); +} + +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants); +} + +void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants); +} + +// RGBA expects first value to be A and ignored, then 3 values to contain RGB. +// Same code as ARGB, except the LD4 +void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + asm volatile( + "vld1.8 {d0}, [%3] \n" // load rgbconstants + "vdup.u8 d20, d0[0] \n" + "vdup.u8 d21, d0[1] \n" + "vdup.u8 d22, d0[2] \n" + "vdup.u16 q12, d0[2] \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of RGBA + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop. + "vmull.u8 q8, d2, d20 \n" // B + "vmull.u8 q9, d3, d20 \n" + "vmlal.u8 q8, d4, d21 \n" // G + "vmlal.u8 q9, d5, d21 \n" + "vmlal.u8 q8, d6, d22 \n" // R + "vmlal.u8 q9, d7, d22 \n" + "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y + "vaddhn.u16 d1, q9, q12 \n" + "vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(rgbconstants) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22", + "q12"); } void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants); +} + +void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { + RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants); +} + +void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants); +} + +void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { asm volatile( - "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // B - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // R - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); + "vld1.8 {d0}, [%3] \n" // load rgbconstants + "vdup.u8 d20, d0[0] \n" + "vdup.u8 d21, d0[1] \n" + "vdup.u8 d22, d0[2] \n" + "vdup.u16 q12, d0[2] \n" + "1: \n" + "vld3.8 {d2, d4, d6}, [%0]! \n" // load 16 pixels of + // RGB24. + "vld3.8 {d3, d5, d7}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop. + "vmull.u8 q8, d2, d20 \n" // B + "vmull.u8 q9, d3, d20 \n" + "vmlal.u8 q8, d4, d21 \n" // G + "vmlal.u8 q9, d5, d21 \n" + "vmlal.u8 q8, d6, d22 \n" // R + "vmlal.u8 q9, d7, d22 \n" + "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y + "vaddhn.u16 d1, q9, q12 \n" + "vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(rgbconstants) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22", + "q12"); +} + +void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} + +void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants); } void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
- "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); + RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants); } void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); + RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants); } // Bilinear filter 16x2 -> 16x1 @@ -2035,46 +2670,46 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, int source_y_fraction) { int y1_fraction = source_y_fraction; asm volatile( - "cmp %4, #0 \n" - "beq 100f \n" - "add %2, %1 \n" - "cmp %4, #128 \n" - "beq 50f \n" + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #128 \n" + "beq 50f \n" - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" // General purpose row blend. "1: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 1b \n" - "b 99f \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" // Blend 50 / 50. "50: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 50b \n" - "b 99f \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" - "vld1.8 {q0}, [%1]! \n" - "subs %3, %3, #16 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 100b \n" + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" "99: \n" : "+r"(dst_ptr), // %0 @@ -2086,59 +2721,119 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"); } +// Bilinear filter 8x2 -> 8x1 +void InterpolateRow_16_NEON(uint16_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint16_t* src_ptr1 = src_ptr + src_stride; + + asm volatile( + "cmp %4, #0 \n" + "beq 100f \n" + "cmp %4, #128 \n" + "beq 50f \n" + + "vdup.16 d17, %4 \n" + "vdup.16 d16, %5 \n" + // General purpose row blend. + "1: \n" + "vld1.16 {q0}, [%1]! \n" + "vld1.16 {q1}, [%2]! 
\n" + "subs %3, %3, #8 \n" + "vmull.u16 q2, d0, d16 \n" + "vmull.u16 q3, d1, d16 \n" + "vmlal.u16 q2, d2, d17 \n" + "vmlal.u16 q3, d3, d17 \n" + "vrshrn.u32 d0, q2, #8 \n" + "vrshrn.u32 d1, q3, #8 \n" + "vst1.16 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "vld1.16 {q0}, [%1]! \n" + "vld1.16 {q1}, [%2]! \n" + "subs %3, %3, #8 \n" + "vrhadd.u16 q0, q1 \n" + "vst1.16 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "vld1.16 {q0}, [%1]! \n" + "subs %3, %3, #8 \n" + "vst1.16 {q0}, [%0]! \n" + "bgt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(dst_width) // %3 + : "r"(y1_fraction), // %4 + "r"(y0_fraction) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8"); +} + // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8_t* src_argb0, +void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { asm volatile( - "subs %3, #8 \n" - "blt 89f \n" + "subs %3, #8 \n" + "blt 89f \n" // Blend 8 pixels. "8: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. - "bge 8b \n" + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. + "bge 8b \n" "89: \n" - "adds %3, #8-1 \n" - "blt 99f \n" + "adds %3, #8-1 \n" + "blt 99f \n" // Blend 1 pixels. "1: \n" - "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. - "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. - "subs %3, %3, #1 \n" // 1 processed per loop. - "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a - "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 - "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 - "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 - "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. - "bge 1b \n" - - "99: \n" - - : "+r"(src_argb0), // %0 + "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. + "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. + "subs %3, %3, #1 \n" // 1 processed per loop. 
+ "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. + "bge 1b \n" + + "99: \n" + + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2153,16 +2848,16 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, asm volatile( // Attenuate 8 pixels. "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q10, d0, d3 \n" // b * a - "vmull.u8 q11, d1, d3 \n" // g * a - "vmull.u8 q12, d2, d3 \n" // r * a + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d0, d3 \n" // b * a + "vmull.u8 q11, d1, d3 \n" // g * a + "vmull.u8 q12, d2, d3 \n" // r * a "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2178,32 +2873,32 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb, int interval_offset, int width) { asm volatile( - "vdup.u16 q8, %2 \n" - "vshr.u16 q8, q8, #1 \n" // scale >>= 1 - "vdup.u16 q9, %3 \n" // interval multiply. - "vdup.u16 q10, %4 \n" // interval add + "vdup.u16 q8, %2 \n" + "vshr.u16 q8, q8, #1 \n" // scale >>= 1 + "vdup.u16 q9, %3 \n" // interval multiply. + "vdup.u16 q10, %4 \n" // interval add // 8 pixel loop. "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. - "subs %1, %1, #8 \n" // 8 processed per loop. - "vmovl.u8 q0, d0 \n" // b (0 .. 255) - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q2, d4 \n" + "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmovl.u8 q0, d0 \n" // b (0 .. 255) + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q2, d4 \n" "vqdmulh.s16 q0, q0, q8 \n" // b * scale "vqdmulh.s16 q1, q1, q8 \n" // g "vqdmulh.s16 q2, q2, q8 \n" // r - "vmul.u16 q0, q0, q9 \n" // b * interval_size - "vmul.u16 q1, q1, q9 \n" // g - "vmul.u16 q2, q2, q9 \n" // r - "vadd.u16 q0, q0, q10 \n" // b + interval_offset - "vadd.u16 q1, q1, q10 \n" // g - "vadd.u16 q2, q2, q10 \n" // r - "vqmovn.u16 d0, q0 \n" - "vqmovn.u16 d2, q1 \n" - "vqmovn.u16 d4, q2 \n" - "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vmul.u16 q0, q0, q9 \n" // b * interval_size + "vmul.u16 q1, q1, q9 \n" // g + "vmul.u16 q2, q2, q9 \n" // r + "vadd.u16 q0, q0, q10 \n" // b + interval_offset + "vadd.u16 q1, q1, q10 \n" // g + "vadd.u16 q2, q2, q10 \n" // r + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d2, q1 \n" + "vqmovn.u16 d4, q2 \n" + "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : "r"(scale), // %2 @@ -2220,28 +2915,28 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, int width, uint32_t value) { asm volatile( - "vdup.u32 q0, %3 \n" // duplicate scale value. - "vzip.u8 d0, d1 \n" // d0 aarrggbb. 
- "vshr.u16 q0, q0, #1 \n" // scale / 2. + "vdup.u32 q0, %3 \n" // duplicate scale value. + "vzip.u8 d0, d1 \n" // d0 aarrggbb. + "vshr.u16 q0, q0, #1 \n" // scale / 2. // 8 pixel loop. "1: \n" - "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q10, d20 \n" // b (0 .. 255) - "vmovl.u8 q11, d22 \n" - "vmovl.u8 q12, d24 \n" - "vmovl.u8 q13, d26 \n" + "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q10, d20 \n" // b (0 .. 255) + "vmovl.u8 q11, d22 \n" + "vmovl.u8 q12, d24 \n" + "vmovl.u8 q13, d26 \n" "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 "vqrdmulh.s16 q11, q11, d0[1] \n" // g "vqrdmulh.s16 q12, q12, d0[2] \n" // r "vqrdmulh.s16 q13, q13, d0[3] \n" // a - "vqmovn.u16 d20, q10 \n" - "vqmovn.u16 d22, q11 \n" - "vqmovn.u16 d24, q12 \n" - "vqmovn.u16 d26, q13 \n" - "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "vqmovn.u16 d20, q10 \n" + "vqmovn.u16 d22, q11 \n" + "vqmovn.u16 d24, q12 \n" + "vqmovn.u16 d26, q13 \n" + "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2251,23 +2946,23 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels // Similar to ARGBToYJ but stores ARGB. -// C code is (15 * b + 75 * g + 38 * r + 64) >> 7; +// C code is (29 * b + 150 * g + 77 * r + 128) >> 8; void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient - "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient - "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B - "vmov d1, d0 \n" // G - "vmov d2, d0 \n" // R - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. - "bgt 1b \n" + "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient + "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient + "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit B + "vmov d1, d0 \n" // G + "vmov d2, d0 \n" // R + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2281,32 +2976,32 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { // r = (r * 50 + g * 98 + b * 24) >> 7 void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d20, #17 \n" // BB coefficient - "vmov.u8 d21, #68 \n" // BG coefficient - "vmov.u8 d22, #35 \n" // BR coefficient - "vmov.u8 d24, #22 \n" // GB coefficient - "vmov.u8 d25, #88 \n" // GG coefficient - "vmov.u8 d26, #45 \n" // GR coefficient - "vmov.u8 d28, #24 \n" // BB coefficient - "vmov.u8 d29, #98 \n" // BG coefficient - "vmov.u8 d30, #50 \n" // BR coefficient - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. - "subs %1, %1, #8 \n" // 8 processed per loop. 
- "vmull.u8 q2, d0, d20 \n" // B to Sepia B - "vmlal.u8 q2, d1, d21 \n" // G - "vmlal.u8 q2, d2, d22 \n" // R - "vmull.u8 q3, d0, d24 \n" // B to Sepia G - "vmlal.u8 q3, d1, d25 \n" // G - "vmlal.u8 q3, d2, d26 \n" // R - "vmull.u8 q8, d0, d28 \n" // B to Sepia R - "vmlal.u8 q8, d1, d29 \n" // G - "vmlal.u8 q8, d2, d30 \n" // R - "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B - "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G - "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R - "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. - "bgt 1b \n" + "vmov.u8 d20, #17 \n" // BB coefficient + "vmov.u8 d21, #68 \n" // BG coefficient + "vmov.u8 d22, #35 \n" // BR coefficient + "vmov.u8 d24, #22 \n" // GB coefficient + "vmov.u8 d25, #88 \n" // GG coefficient + "vmov.u8 d26, #45 \n" // GR coefficient + "vmov.u8 d28, #24 \n" // BB coefficient + "vmov.u8 d29, #98 \n" // BG coefficient + "vmov.u8 d30, #50 \n" // BR coefficient + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d20 \n" // B to Sepia B + "vmlal.u8 q2, d1, d21 \n" // G + "vmlal.u8 q2, d2, d22 \n" // R + "vmull.u8 q3, d0, d24 \n" // B to Sepia G + "vmlal.u8 q3, d1, d25 \n" // G + "vmlal.u8 q3, d2, d26 \n" // R + "vmull.u8 q8, d0, d28 \n" // B to Sepia R + "vmlal.u8 q8, d1, d29 \n" // G + "vmlal.u8 q8, d2, d30 \n" // R + "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B + "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G + "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R + "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. + "bgt 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : @@ -2322,51 +3017,51 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, const int8_t* matrix_argb, int width) { asm volatile( - "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. - "vmovl.s8 q0, d4 \n" // B,G coefficients s16. - "vmovl.s8 q1, d5 \n" // R,A coefficients s16. - - "1: \n" - "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q8, d16 \n" // b (0 .. 
255) 16 bit - "vmovl.u8 q9, d18 \n" // g - "vmovl.u8 q10, d20 \n" // r - "vmovl.u8 q11, d22 \n" // a - "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B - "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G - "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R - "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A - "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B - "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G - "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R - "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B - "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G - "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R - "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B - "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G - "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R - "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. + "vmovl.s8 q0, d4 \n" // B,G coefficients s16. + "vmovl.s8 q1, d5 \n" // R,A coefficients s16. + + "1: \n" + "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit + "vmovl.u8 q9, d18 \n" // g + "vmovl.u8 q10, d20 \n" // r + "vmovl.u8 q11, d22 \n" // a + "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B + "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G + "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R + "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A + "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B + "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G + "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R + "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B + "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G + "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R + "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B + "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G + "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R + "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A - "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. 
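// Matrix entries are signed 8-bit values with 6 fractional bits: B, G,
// R and A are each multiplied by a matrix column, accumulated with
// saturating adds, then vqshrun #6 rescales the sums back to unsigned
// 8-bit.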
- "bgt 1b \n" + "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2376,27 +3071,27 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, } // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, +void ARGBMultiplyRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { asm volatile( // 8 pixel loop. "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q0, d0, d1 \n" // multiply B - "vmull.u8 q1, d2, d3 \n" // multiply G - "vmull.u8 q2, d4, d5 \n" // multiply R - "vmull.u8 q3, d6, d7 \n" // multiply A - "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B - "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G - "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R - "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_argb0), // %0 + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q0, d0, d1 \n" // multiply B + "vmull.u8 q1, d2, d3 \n" // multiply G + "vmull.u8 q2, d4, d5 \n" // multiply R + "vmull.u8 q3, d6, d7 \n" // multiply A + "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B + "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G + "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R + "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2405,21 +3100,21 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, } // Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8_t* src_argb0, +void ARGBAddRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { asm volatile( // 8 pixel loop. "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 q0, q0, q2 \n" // add B, G - "vqadd.u8 q1, q1, q3 \n" // add R, A - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_argb0), // %0 + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 q0, q0, q2 \n" // add B, G + "vqadd.u8 q1, q1, q3 \n" // add R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2428,21 +3123,21 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0, } // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_NEON(const uint8_t* src_argb0, +void ARGBSubtractRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { asm volatile( // 8 pixel loop. "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqsub.u8 q0, q0, q2 \n" // subtract B, G - "vqsub.u8 q1, q1, q3 \n" // subtract R, A - "vst4.8 {d0, d1, d2, d3}, [%2]! 
\n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_argb0), // %0 + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqsub.u8 q0, q0, q2 \n" // subtract B, G + "vqsub.u8 q1, q1, q3 \n" // subtract R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2460,17 +3155,17 @@ void SobelRow_NEON(const uint8_t* src_sobelx, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d3, #255 \n" // alpha + "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. "1: \n" - "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. - "vld1.8 {d1}, [%1]! \n" // load 8 sobely. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d0, d0, d1 \n" // add - "vmov.u8 d1, d0 \n" - "vmov.u8 d2, d0 \n" - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" + "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d1}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d0, d0, d1 \n" // add + "vmov.u8 d1, d0 \n" + "vmov.u8 d2, d0 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 @@ -2487,12 +3182,12 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, asm volatile( // 16 pixel loop. "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. - "vld1.8 {q1}, [%1]! \n" // load 16 sobely. - "subs %3, %3, #16 \n" // 16 processed per loop. - "vqadd.u8 q0, q0, q1 \n" // add - "vst1.8 {q0}, [%2]! \n" // store 16 pixels. - "bgt 1b \n" + "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + "vld1.8 {q1}, [%1]! \n" // load 16 sobely. + "subs %3, %3, #16 \n" // 16 processed per loop. + "vqadd.u8 q0, q0, q1 \n" // add + "vst1.8 {q0}, [%2]! \n" // store 16 pixels. + "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_y), // %2 @@ -2511,15 +3206,15 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx, uint8_t* dst_argb, int width) { asm volatile( - "vmov.u8 d3, #255 \n" // alpha + "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. "1: \n" - "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. - "vld1.8 {d0}, [%1]! \n" // load 8 sobely. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d1, d0, d2 \n" // add - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" + "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d0}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d1, d0, d2 \n" // add + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 @@ -2539,23 +3234,23 @@ void SobelXRow_NEON(const uint8_t* src_y0, int width) { asm volatile( "1: \n" - "vld1.8 {d0}, [%0],%5 \n" // top - "vld1.8 {d1}, [%0],%6 \n" - "vsubl.u8 q0, d0, d1 \n" - "vld1.8 {d2}, [%1],%5 \n" // center * 2 - "vld1.8 {d3}, [%1],%6 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" - "vld1.8 {d2}, [%2],%5 \n" // bottom - "vld1.8 {d3}, [%2],%6 \n" - "subs %4, %4, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" - "vst1.8 {d0}, [%3]! 
\n" // store 8 sobelx - "bgt 1b \n" + "vld1.8 {d0}, [%0],%5 \n" // top + "vld1.8 {d1}, [%0],%6 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.8 {d2}, [%1],%5 \n" // center * 2 + "vld1.8 {d3}, [%1],%6 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.8 {d2}, [%2],%5 \n" // bottom + "vld1.8 {d3}, [%2],%6 \n" + "subs %4, %4, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.8 {d0}, [%3]! \n" // store 8 sobelx + "bgt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(src_y2), // %2 @@ -2577,23 +3272,23 @@ void SobelYRow_NEON(const uint8_t* src_y0, int width) { asm volatile( "1: \n" - "vld1.8 {d0}, [%0],%4 \n" // left - "vld1.8 {d1}, [%1],%4 \n" - "vsubl.u8 q0, d0, d1 \n" - "vld1.8 {d2}, [%0],%4 \n" // center * 2 - "vld1.8 {d3}, [%1],%4 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" - "vld1.8 {d2}, [%0],%5 \n" // right - "vld1.8 {d3}, [%1],%5 \n" - "subs %3, %3, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" - "vst1.8 {d0}, [%2]! \n" // store 8 sobely - "bgt 1b \n" + "vld1.8 {d0}, [%0],%4 \n" // left + "vld1.8 {d1}, [%1],%4 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.8 {d2}, [%0],%4 \n" // center * 2 + "vld1.8 {d3}, [%1],%4 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.8 {d2}, [%0],%5 \n" // right + "vld1.8 {d3}, [%1],%5 \n" + "subs %3, %3, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.8 {d0}, [%2]! \n" // store 8 sobely + "bgt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(dst_sobely), // %2 @@ -2615,18 +3310,18 @@ void HalfFloat1Row_NEON(const uint16_t* src, asm volatile( "1: \n" - "vld1.8 {q1}, [%0]! \n" // load 8 shorts - "subs %2, %2, #8 \n" // 8 pixels per loop - "vmovl.u16 q2, d2 \n" // 8 int's - "vmovl.u16 q3, d3 \n" - "vcvt.f32.u32 q2, q2 \n" // 8 floats - "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, %y3 \n" // adjust exponent - "vmul.f32 q3, q3, %y3 \n" - "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat - "vqshrn.u32 d3, q3, #13 \n" - "vst1.8 {q1}, [%1]! \n" - "bgt 1b \n" + "vld1.8 {q1}, [%0]! \n" // load 8 shorts + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // adjust exponent + "vmul.f32 q3, q3, %y3 \n" + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d3, q3, #13 \n" + "vst1.8 {q1}, [%1]! \n" + "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -2641,18 +3336,18 @@ void HalfFloatRow_NEON(const uint16_t* src, asm volatile( "1: \n" - "vld1.8 {q1}, [%0]! \n" // load 8 shorts - "subs %2, %2, #8 \n" // 8 pixels per loop - "vmovl.u16 q2, d2 \n" // 8 int's - "vmovl.u16 q3, d3 \n" - "vcvt.f32.u32 q2, q2 \n" // 8 floats - "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, %y3 \n" // adjust exponent - "vmul.f32 q3, q3, %y3 \n" - "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat - "vqshrn.u32 d3, q3, #13 \n" - "vst1.8 {q1}, [%1]! \n" - "bgt 1b \n" + "vld1.8 {q1}, [%0]! 
\n" // load 8 shorts + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // adjust exponent + "vmul.f32 q3, q3, %y3 \n" + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d3, q3, #13 \n" + "vst1.8 {q1}, [%1]! \n" + "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -2667,17 +3362,17 @@ void ByteToFloatRow_NEON(const uint8_t* src, asm volatile( "1: \n" - "vld1.8 {d2}, [%0]! \n" // load 8 bytes - "subs %2, %2, #8 \n" // 8 pixels per loop - "vmovl.u8 q1, d2 \n" // 8 shorts - "vmovl.u16 q2, d2 \n" // 8 ints - "vmovl.u16 q3, d3 \n" - "vcvt.f32.u32 q2, q2 \n" // 8 floats - "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, %y3 \n" // scale - "vmul.f32 q3, q3, %y3 \n" - "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats - "bgt 1b \n" + "vld1.8 {d2}, [%0]! \n" // load 8 bytes + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u8 q1, d2 \n" // 8 shorts + "vmovl.u16 q2, d2 \n" // 8 ints + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // scale + "vmul.f32 q3, q3, %y3 \n" + "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats + "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -2694,26 +3389,26 @@ void GaussCol_NEON(const uint16_t* src0, uint32_t* dst, int width) { asm volatile( - "vmov.u16 d6, #4 \n" // constant 4 - "vmov.u16 d7, #6 \n" // constant 6 - - "1: \n" - "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows - "vld1.16 {q2}, [%4]! \n" - "vaddl.u16 q0, d2, d4 \n" // * 1 - "vaddl.u16 q1, d3, d5 \n" // * 1 - "vld1.16 {q2}, [%1]! \n" - "vmlal.u16 q0, d4, d6 \n" // * 4 - "vmlal.u16 q1, d5, d6 \n" // * 4 - "vld1.16 {q2}, [%2]! \n" - "vmlal.u16 q0, d4, d7 \n" // * 6 - "vmlal.u16 q1, d5, d7 \n" // * 6 - "vld1.16 {q2}, [%3]! \n" - "vmlal.u16 q0, d4, d6 \n" // * 4 - "vmlal.u16 q1, d5, d6 \n" // * 4 - "subs %6, %6, #8 \n" // 8 processed per loop - "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples - "bgt 1b \n" + "vmov.u16 d6, #4 \n" // constant 4 + "vmov.u16 d7, #6 \n" // constant 6 + + "1: \n" + "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows + "vld1.16 {q2}, [%4]! \n" + "vaddl.u16 q0, d2, d4 \n" // * 1 + "vaddl.u16 q1, d3, d5 \n" // * 1 + "vld1.16 {q2}, [%1]! \n" + "vmlal.u16 q0, d4, d6 \n" // * 4 + "vmlal.u16 q1, d5, d6 \n" // * 4 + "vld1.16 {q2}, [%2]! \n" + "vmlal.u16 q0, d4, d7 \n" // * 6 + "vmlal.u16 q1, d5, d7 \n" // * 6 + "vld1.16 {q2}, [%3]! \n" + "vmlal.u16 q0, d4, d6 \n" // * 4 + "vmlal.u16 q1, d5, d6 \n" // * 4 + "subs %6, %6, #8 \n" // 8 processed per loop + "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples + "bgt 1b \n" : "+r"(src0), // %0 "+r"(src1), // %1 "+r"(src2), // %2 @@ -2731,8 +3426,8 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { const uint32_t* src2 = src + 2; const uint32_t* src3 = src + 3; asm volatile( - "vmov.u32 q10, #4 \n" // constant 4 - "vmov.u32 q11, #6 \n" // constant 6 + "vmov.u32 q10, #4 \n" // constant 4 + "vmov.u32 q11, #6 \n" // constant 6 "1: \n" "vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples @@ -2769,17 +3464,17 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, uint8_t* dst_yuv24, int width) { asm volatile( - "1: \n" - "vld1.8 {q2}, [%0]! \n" // load 16 Y values - "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values - "vmov d1, d0 \n" - "vzip.u8 d0, d1 \n" // VV - "vmov d3, d2 \n" - "vzip.u8 d2, d3 \n" // UU - "subs %3, %3, #16 \n" // 16 pixels per loop - "vst3.8 {d0, d2, d4}, [%2]! 
\n" // store 16 YUV pixels - "vst3.8 {d1, d3, d5}, [%2]! \n" - "bgt 1b \n" + "1: \n" + "vld1.8 {q2}, [%0]! \n" // load 16 Y values + "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values + "vmov d1, d0 \n" + "vzip.u8 d0, d1 \n" // VV + "vmov d3, d2 \n" + "vzip.u8 d2, d3 \n" // UU + "subs %3, %3, #16 \n" // 16 pixels per loop + "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels + "vst3.8 {d1, d3, d5}, [%2]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 "+r"(dst_yuv24), // %2 @@ -2793,24 +3488,24 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_uv, int width) { asm volatile( - "add %1, %0, %1 \n" // src_stride + src_AYUV + "add %1, %0, %1 \n" // src_stride + src_AYUV "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV // pixels. - "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV + "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV // pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV // pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vqrshrun.s16 d1, q0, #2 \n" // 2x2 average "vqrshrun.s16 d0, q1, #2 \n" - "subs %3, %3, #16 \n" // 16 processed per loop. - "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV. - "bgt 1b \n" + "subs %3, %3, #16 \n" // 16 processed per loop. + "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV. + "bgt 1b \n" : "+r"(src_ayuv), // %0 "+r"(src_stride_ayuv), // %1 "+r"(dst_uv), // %2 @@ -2824,24 +3519,24 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, uint8_t* dst_vu, int width) { asm volatile( - "add %1, %0, %1 \n" // src_stride + src_AYUV + "add %1, %0, %1 \n" // src_stride + src_AYUV "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV // pixels. - "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV + "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV // pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV // pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average "vqrshrun.s16 d1, q1, #2 \n" - "subs %3, %3, #16 \n" // 16 processed per loop. - "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU. - "bgt 1b \n" + "subs %3, %3, #16 \n" // 16 processed per loop. + "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU. 
+ "bgt 1b \n" : "+r"(src_ayuv), // %0 "+r"(src_stride_ayuv), // %1 "+r"(dst_vu), // %2 @@ -2854,12 +3549,12 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, // Similar to ARGBExtractAlphaRow_NEON void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { asm volatile( - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "vst1.8 {q2}, [%1]! \n" // store 16 Y's. - "bgt 1b \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q2}, [%1]! \n" // store 16 Y's. + "bgt 1b \n" : "+r"(src_ayuv), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2867,16 +3562,16 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { : "cc", "memory", "q0", "q1", "q2", "q3"); } -// Convert biplanar UV channel of NV12 to NV21 -void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { +// Convert UV plane of NV12 to VU of NV21. +void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { asm volatile( - "1: \n" - "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values - "vld2.8 {d1, d3}, [%0]! \n" - "vorr.u8 q2, q0, q0 \n" // move U after V - "subs %2, %2, #16 \n" // 16 pixels per loop - "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels - "bgt 1b \n" + "1: \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values + "vld2.8 {d1, d3}, [%0]! \n" + "vorr.u8 q2, q0, q0 \n" // move U after V + "subs %2, %2, #16 \n" // 16 pixels per loop + "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels + "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_vu), // %1 "+r"(width) // %2 @@ -2884,6 +3579,170 @@ void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { : "cc", "memory", "q0", "q1", "q2"); } +void HalfMergeUVRow_NEON(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int width) { + const uint8_t* src_u_1 = src_u + src_stride_u; + const uint8_t* src_v_1 = src_v + src_stride_v; + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 16 U values + "vld1.8 {q1}, [%2]! \n" // load 16 V values + "vld1.8 {q2}, [%1]! \n" + "vld1.8 {q3}, [%3]! \n" + "vpaddl.u8 q0, q0 \n" // half size + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q1, q3 \n" + "vqrshrn.u16 d0, q0, #2 \n" + "vqrshrn.u16 d1, q1, #2 \n" + "subs %5, %5, #16 \n" // 16 src pixels per loop + "vst2.8 {d0, d1}, [%4]! \n" // store 8 UV pixels + "bgt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_u_1), // %1 + "+r"(src_v), // %2 + "+r"(src_v_1), // %3 + "+r"(dst_uv), // %4 + "+r"(width) // %5 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +void SplitUVRow_16_NEON(const uint16_t* src_uv, + uint16_t* dst_u, + uint16_t* dst_v, + int depth, + int width) { + int shift = depth - 16; // Negative for right shift. + asm volatile( + "vdup.16 q2, %4 \n" + "1: \n" + "vld2.16 {q0, q1}, [%0]! \n" // load 8 UV + "vshl.u16 q0, q0, q2 \n" + "vshl.u16 q1, q1, q2 \n" + "subs %3, %3, #8 \n" // 8 src pixels per loop + "vst1.16 {q0}, [%1]! \n" // store 8 U pixels + "vst1.16 {q1}, [%2]! 
\n" // store 8 V pixels + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(shift) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); +} + +void MergeUVRow_16_NEON(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int depth, + int width) { + int shift = 16 - depth; + asm volatile( + "vdup.16 q2, %4 \n" + "1: \n" + "vld1.16 {q0}, [%0]! \n" // load 8 U + "vld1.16 {q1}, [%1]! \n" // load 8 V + "vshl.u16 q0, q0, q2 \n" + "vshl.u16 q1, q1, q2 \n" + "subs %3, %3, #8 \n" // 8 src pixels per loop + "vst2.16 {q0, q1}, [%2]! \n" // store 8 UV pixels + "bgt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"(shift) // %4 + : "cc", "memory", "q0", "q1", "q2"); +} + +void MultiplyRow_16_NEON(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + asm volatile( + "vdup.16 q2, %3 \n" + "1: \n" + "vld1.16 {q0}, [%0]! \n" + "vld1.16 {q1}, [%0]! \n" + "vmul.u16 q0, q0, q2 \n" + "vmul.u16 q1, q1, q2 \n" + "vst1.16 {q0}, [%1]! \n" + "vst1.16 {q1}, [%1]! \n" + "subs %2, %2, #16 \n" // 16 src pixels per loop + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "cc", "memory", "q0", "q1", "q2"); +} + +void DivideRow_16_NEON(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + asm volatile( + "vdup.16 q0, %3 \n" + "1: \n" + "vld1.16 {q1}, [%0]! \n" + "vld1.16 {q2}, [%0]! \n" + "vmovl.u16 q3, d2 \n" + "vmovl.u16 q1, d3 \n" + "vmovl.u16 q4, d4 \n" + "vmovl.u16 q2, d5 \n" + "vshl.u32 q3, q3, q0 \n" + "vshl.u32 q4, q4, q0 \n" + "vshl.u32 q1, q1, q0 \n" + "vshl.u32 q2, q2, q0 \n" + "vmovn.u32 d2, q3 \n" + "vmovn.u32 d3, q1 \n" + "vmovn.u32 d4, q4 \n" + "vmovn.u32 d5, q2 \n" + "vst1.16 {q1}, [%1]! \n" + "vst1.16 {q2}, [%1]! \n" + "subs %2, %2, #16 \n" // 16 src pixels per loop + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits = shr 1 +// 16384 = 10 bits = shr 2 +// 4096 = 12 bits = shr 4 +// 256 = 16 bits = shr 8 +void Convert16To8Row_NEON(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr + asm volatile( + "vdup.16 q2, %3 \n" + "1: \n" + "vld1.16 {q0}, [%0]! \n" + "vld1.16 {q1}, [%0]! \n" + "vshl.u16 q0, q0, q2 \n" // shr = q2 is negative + "vshl.u16 q1, q1, q2 \n" + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d1, q1 \n" + "subs %2, %2, #16 \n" // 16 src pixels per loop + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(shift) // %3 + : "cc", "memory", "q0", "q1", "q2"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__).. #ifdef __cplusplus diff --git a/files/source/row_neon64.cc b/files/source/row_neon64.cc index f5cbb470..0f120373 100644 --- a/files/source/row_neon64.cc +++ b/files/source/row_neon64.cc @@ -15,102 +15,108 @@ namespace libyuv { extern "C" { #endif +// Enable LIBYUV_USE_ST2, LIBYUV_USE_ST3, LIBYUV_USE_ST4 for CPUs that prefer +// STn over ZIP1+ST1 +// Exynos M1, M2, M3 are slow with ST2, ST3 and ST4 instructions. + // This module is for GCC Neon armv8 64 bit. 
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +// v0.8h: Y +// v1.16b: 8U, 8V + // Read 8 Y, 4 U and 4 V from 422 #define READYUV422 \ - "ld1 {v0.8b}, [%0], #8 \n" \ - "ld1 {v1.s}[0], [%1], #4 \n" \ - "ld1 {v1.s}[1], [%2], #4 \n" + "ldr d0, [%[src_y]], #8 \n" \ + "ld1 {v1.s}[0], [%[src_u]], #4 \n" \ + "ld1 {v1.s}[1], [%[src_v]], #4 \n" \ + "zip1 v0.16b, v0.16b, v0.16b \n" \ + "prfm pldl1keep, [%[src_y], 448] \n" \ + "zip1 v1.16b, v1.16b, v1.16b \n" \ + "prfm pldl1keep, [%[src_u], 128] \n" \ + "prfm pldl1keep, [%[src_v], 128] \n" // Read 8 Y, 8 U and 8 V from 444 #define READYUV444 \ - "ld1 {v0.8b}, [%0], #8 \n" \ - "ld1 {v1.d}[0], [%1], #8 \n" \ - "ld1 {v1.d}[1], [%2], #8 \n" \ - "uaddlp v1.8h, v1.16b \n" \ - "rshrn v1.8b, v1.8h, #1 \n" + "ldr d0, [%[src_y]], #8 \n" \ + "ld1 {v1.d}[0], [%[src_u]], #8 \n" \ + "prfm pldl1keep, [%[src_y], 448] \n" \ + "ld1 {v1.d}[1], [%[src_v]], #8 \n" \ + "prfm pldl1keep, [%[src_u], 448] \n" \ + "zip1 v0.16b, v0.16b, v0.16b \n" \ + "prfm pldl1keep, [%[src_v], 448] \n" // Read 8 Y, and set 4 U and 4 V to 128 #define READYUV400 \ - "ld1 {v0.8b}, [%0], #8 \n" \ - "movi v1.8b , #128 \n" + "ldr d0, [%[src_y]], #8 \n" \ + "movi v1.16b, #128 \n" \ + "prfm pldl1keep, [%[src_y], 448] \n" \ + "zip1 v0.16b, v0.16b, v0.16b \n" -// Read 8 Y and 4 UV from NV12 +static const uvec8 kNV12Table = {0, 0, 2, 2, 4, 4, 6, 6, + 1, 1, 3, 3, 5, 5, 7, 7}; +static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7, + 0, 0, 2, 2, 4, 4, 6, 6}; + +// Read 8 Y and 4 UV from NV12 or NV21 #define READNV12 \ - "ld1 {v0.8b}, [%0], #8 \n" \ - "ld1 {v2.8b}, [%1], #8 \n" \ - "uzp1 v1.8b, v2.8b, v2.8b \n" \ - "uzp2 v3.8b, v2.8b, v2.8b \n" \ - "ins v1.s[1], v3.s[0] \n" - -// Read 8 Y and 4 VU from NV21 -#define READNV21 \ - "ld1 {v0.8b}, [%0], #8 \n" \ - "ld1 {v2.8b}, [%1], #8 \n" \ - "uzp1 v3.8b, v2.8b, v2.8b \n" \ - "uzp2 v1.8b, v2.8b, v2.8b \n" \ - "ins v1.s[1], v3.s[0] \n" + "ldr d0, [%[src_y]], #8 \n" \ + "ldr d1, [%[src_uv]], #8 \n" \ + "zip1 v0.16b, v0.16b, v0.16b \n" \ + "prfm pldl1keep, [%[src_y], 448] \n" \ + "tbl v1.16b, {v1.16b}, v2.16b \n" \ + "prfm pldl1keep, [%[src_uv], 448] \n" // Read 8 YUY2 -#define READYUY2 \ - "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \ - "uzp2 v3.8b, v1.8b, v1.8b \n" \ - "uzp1 v1.8b, v1.8b, v1.8b \n" \ - "ins v1.s[1], v3.s[0] \n" +#define READYUY2 \ + "ld2 {v0.8b, v1.8b}, [%[src_yuy2]], #16 \n" \ + "zip1 v0.16b, v0.16b, v0.16b \n" \ + "prfm pldl1keep, [%[src_yuy2], 448] \n" \ + "tbl v1.16b, {v1.16b}, v2.16b \n" // Read 8 UYVY -#define READUYVY \ - "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \ - "orr v0.8b, v3.8b, v3.8b \n" \ - "uzp1 v1.8b, v2.8b, v2.8b \n" \ - "uzp2 v3.8b, v2.8b, v2.8b \n" \ - "ins v1.s[1], v3.s[0] \n" - -#define YUVTORGB_SETUP \ - "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \ - "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \ - "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \ - "ld1r {v31.4s}, [%[kYToRgb]] \n" \ - "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \ - "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n" - -#define YUVTORGB(vR, vG, vB) \ - "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ - "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ - "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ - "ushll v0.4s, v0.4h, #0 \n" \ - "mul v3.4s, v3.4s, v31.4s \n" \ - "mul v0.4s, v0.4s, v31.4s \n" \ - "sqshrun v0.4h, v0.4s, #16 \n" \ - "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \ - "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \ - "mov v2.d[0], v1.d[1] \n" /* Extract V */ \ - "uxtl v2.8h, v2.8b \n" \ - "uxtl v1.8h, v1.8b \n" /* Extract U */ \ - "mul v3.8h, v1.8h, v27.8h \n" \ - "mul v5.8h, v1.8h, v29.8h \n" \ 
- "mul v6.8h, v2.8h, v30.8h \n" \ - "mul v7.8h, v2.8h, v28.8h \n" \ - "sqadd v6.8h, v6.8h, v5.8h \n" \ - "sqadd " #vB \ - ".8h, v24.8h, v0.8h \n" /* B */ \ - "sqadd " #vG \ - ".8h, v25.8h, v0.8h \n" /* G */ \ - "sqadd " #vR \ - ".8h, v26.8h, v0.8h \n" /* R */ \ - "sqadd " #vB ".8h, " #vB \ - ".8h, v3.8h \n" /* B */ \ - "sqsub " #vG ".8h, " #vG \ - ".8h, v6.8h \n" /* G */ \ - "sqadd " #vR ".8h, " #vR \ - ".8h, v7.8h \n" /* R */ \ - "sqshrun " #vB ".8b, " #vB \ - ".8h, #6 \n" /* B */ \ - "sqshrun " #vG ".8b, " #vG \ - ".8h, #6 \n" /* G */ \ - "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ +#define READUYVY \ + "ld2 {v3.8b, v4.8b}, [%[src_uyvy]], #16 \n" \ + "zip1 v0.16b, v4.16b, v4.16b \n" \ + "prfm pldl1keep, [%[src_uyvy], 448] \n" \ + "tbl v1.16b, {v3.16b}, v2.16b \n" + +// UB VR UG VG +// YG BB BG BR +#define YUVTORGB_SETUP \ + "ld4r {v28.16b, v29.16b, v30.16b, v31.16b}, [%[kUVCoeff]] \n" \ + "ld4r {v24.8h, v25.8h, v26.8h, v27.8h}, [%[kRGBCoeffBias]] \n" + +// v16.8h: B +// v17.8h: G +// v18.8h: R + +// Convert from YUV to 2.14 fixed point RGB +#define YUVTORGB \ + "umull2 v3.4s, v0.8h, v24.8h \n" \ + "umull v6.8h, v1.8b, v30.8b \n" \ + "umull v0.4s, v0.4h, v24.4h \n" \ + "umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */ \ + "uqshrn v0.4h, v0.4s, #16 \n" \ + "uqshrn2 v0.8h, v3.4s, #16 \n" /* Y */ \ + "umull v4.8h, v1.8b, v28.8b \n" /* DB */ \ + "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */ \ + "add v17.8h, v0.8h, v26.8h \n" /* G */ \ + "add v16.8h, v0.8h, v4.8h \n" /* B */ \ + "add v18.8h, v0.8h, v5.8h \n" /* R */ \ + "uqsub v17.8h, v17.8h, v6.8h \n" /* G */ \ + "uqsub v16.8h, v16.8h, v25.8h \n" /* B */ \ + "uqsub v18.8h, v18.8h, v27.8h \n" /* R */ + +// Convert from 2.14 fixed point RGB To 8 bit RGB +#define RGBTORGB8 \ + "uqshrn v17.8b, v17.8h, #6 \n" \ + "uqshrn v16.8b, v16.8h, #6 \n" \ + "uqshrn v18.8b, v18.8h, #6 \n" + +#define YUVTORGB_REGS \ + "v0", "v1", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", "v25", \ + "v26", "v27", "v28", "v29", "v30", "v31" void I444ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, @@ -118,27 +124,22 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" /* A */ - "1: \n" - READYUV444 - YUVTORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" /* A */ + "1: \n" READYUV444 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); } void I422ToARGBRow_NEON(const uint8_t* src_y, @@ -147,27 +148,48 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, uint8_t* 
dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" /* A */ - "1: \n" - READYUV422 - YUVTORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" /* A */ + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); +} + +void I444AlphaToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" + "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444 + "prfm pldl1keep, [%[src_a], 448] \n" YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [src_a] "+r"(src_a), // %[src_a] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); } void I422AlphaToARGBRow_NEON(const uint8_t* src_y, @@ -177,28 +199,23 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB(v22, v21, v20) - "ld1 {v23.8b}, [%3], #8 \n" - "subs %w5, %w5, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "1: \n" + "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422 + "prfm pldl1keep, [%[src_a], 448] \n" YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [src_a] "+r"(src_a), // %[src_a] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] 
"r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); } void I422ToRGBARow_NEON(const uint8_t* src_y, @@ -207,27 +224,22 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v20.8b, #255 \n" /* A */ - "1: \n" - READYUV422 - YUVTORGB(v23, v22, v21) - "subs %w4, %w4, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgba), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v15.8b, #255 \n" /* A */ + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v15.8b,v16.8b,v17.8b,v18.8b}, [%[dst_rgba]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgba] "+r"(dst_rgba), // %[dst_rgba] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v15"); } void I422ToRGB24Row_NEON(const uint8_t* src_y, @@ -236,34 +248,29 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" - "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb24), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); } #define ARGBTORGB565 \ - "shll v0.8h, v22.8b, #8 \n" /* R */ \ - "shll v21.8h, v21.8b, #8 \n" /* G */ \ - "shll v20.8h, v20.8b, #8 \n" /* B */ \ - "sri v0.8h, v21.8h, #5 \n" /* RG */ \ - "sri v0.8h, v20.8h, #11 \n" /* RGB */ + "shll v18.8h, v18.8b, #8 \n" /* R */ \ + "shll v17.8h, v17.8b, #8 \n" /* G */ \ + "shll v16.8h, v16.8b, #8 \n" /* B */ \ + "sri v18.8h, v17.8h, #5 \n" /* RG */ \ + "sri v18.8h, v16.8h, #11 \n" /* RGB */ void I422ToRGB565Row_NEON(const uint8_t* src_y, const uint8_t* src_u, @@ -273,33 +280,28 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV422 YUVTORGB( - v22, v21, - v20) "subs %w4, %w4, #8 \n" ARGBTORGB565 - "st1 {v0.8h}, [%3], #16 \n" 
// store 8 pixels - // RGB565. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb565), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 "subs %w[width], %w[width], #8 \n" ARGBTORGB565 + "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); } #define ARGBTOARGB1555 \ - "shll v0.8h, v23.8b, #8 \n" /* A */ \ - "shll v22.8h, v22.8b, #8 \n" /* R */ \ - "shll v21.8h, v21.8b, #8 \n" /* G */ \ - "shll v20.8h, v20.8b, #8 \n" /* B */ \ - "sri v0.8h, v22.8h, #1 \n" /* AR */ \ - "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ - "sri v0.8h, v20.8h, #11 \n" /* ARGB */ + "shll v0.8h, v19.8b, #8 \n" /* A */ \ + "shll v18.8h, v18.8b, #8 \n" /* R */ \ + "shll v17.8h, v17.8b, #8 \n" /* G */ \ + "shll v16.8h, v16.8b, #8 \n" /* B */ \ + "sri v0.8h, v18.8h, #1 \n" /* AR */ \ + "sri v0.8h, v17.8h, #6 \n" /* ARG */ \ + "sri v0.8h, v16.8h, #11 \n" /* ARGB */ void I422ToARGB1555Row_NEON(const uint8_t* src_y, const uint8_t* src_u, @@ -309,34 +311,31 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" READYUV422 YUVTORGB( - v22, v21, - v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555 - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels - // RGB565. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb1555), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); + "movi v19.8b, #255 \n" + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" ARGBTOARGB1555 + "st1 {v0.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels + // RGB565. 
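// The ARGBTORGB565 and ARGBTOARGB1555 helpers above are a vector form of
// ordinary bit packing: shll places a channel in the top 8 bits of a 16-bit
// lane, and each sri shifts the next channel in, keeping only the destination
// bits above the insert point.  Scalar equivalents of the two layouts
// (a sketch, one pixel at a time):

#include <stdint.h>

static uint16_t PackRGB565(uint8_t r, uint8_t g, uint8_t b) {
  return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}

static uint16_t PackARGB1555(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
  return (uint16_t)(((a >> 7) << 15) | ((r >> 3) << 10) | ((g >> 3) << 5) |
                    (b >> 3));
}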
+ "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); } #define ARGBTOARGB4444 \ - /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ - "ushr v20.8b, v20.8b, #4 \n" /* B */ \ - "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ - "ushr v22.8b, v22.8b, #4 \n" /* R */ \ - "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ - "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ - "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ + /* Input v16.8b<=B, v17.8b<=G, v18.8b<=R, v19.8b<=A, v23.8b<=0x0f */ \ + "ushr v16.8b, v16.8b, #4 \n" /* B */ \ + "bic v17.8b, v17.8b, v23.8b \n" /* G */ \ + "ushr v18.8b, v18.8b, #4 \n" /* R */ \ + "bic v19.8b, v19.8b, v23.8b \n" /* A */ \ + "orr v0.8b, v16.8b, v17.8b \n" /* BG */ \ + "orr v1.8b, v18.8b, v19.8b \n" /* RA */ \ "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ void I422ToARGB4444Row_NEON(const uint8_t* src_y, @@ -345,95 +344,109 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v4.16b, #0x0f \n" // bits to clear with vbic. - "1: \n" - READYUV422 - YUVTORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" - "movi v23.8b, #255 \n" - ARGBTOARGB4444 - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb4444), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v23.16b, #0x0f \n" // bits to clear with + // vbic. + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "movi v19.8b, #255 \n" ARGBTOARGB4444 + "st1 {v0.8h}, [%[dst_argb4444]], #16 \n" // store 8 + // pixels + // ARGB4444. 
+ "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19", "v23"); } -void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" - READYUV400 - YUVTORGB(v22, v21, v20) - "subs %w2, %w2, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB), - [kUVToG]"r"(&kYuvI601Constants.kUVToG), - [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR), - [kYToRgb]"r"(&kYuvI601Constants.kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); +void I400ToARGBRow_NEON(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "1: \n" READYUV400 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); } +#if LIBYUV_USE_ST4 void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { asm volatile( - "movi v23.8b, #255 \n" + "movi v23.8b, #255 \n" "1: \n" - "ld1 {v20.8b}, [%0], #8 \n" - "orr v21.8b, v20.8b, v20.8b \n" - "orr v22.8b, v20.8b, v20.8b \n" - "subs %w2, %w2, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" - "b.gt 1b \n" + "ld1 {v20.8b}, [%0], #8 \n" + "prfm pldl1keep, [%0, 448] \n" + "orr v21.8b, v20.8b, v20.8b \n" + "orr v22.8b, v20.8b, v20.8b \n" + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : : "cc", "memory", "v20", "v21", "v22", "v23"); } +#else +void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "movi v20.8b, #255 \n" + "1: \n" + "ldr d16, [%0], #8 \n" + "subs %w2, %w2, #8 \n" + "zip1 v18.16b, v16.16b, v16.16b \n" // YY + "zip1 v19.16b, v16.16b, v20.16b \n" // YA + "prfm pldl1keep, [%0, 448] \n" + "zip1 v16.16b, v18.16b, v19.16b \n" // YYYA + "zip2 v17.16b, v18.16b, v19.16b \n" + "stp q16, q17, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v16", "v17", "v18", "v19", "v20"); +} +#endif // LIBYUV_USE_ST4 void NV12ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" - READNV12 - YUVTORGB(v22, v21, v20) - "subs %w3, %w3, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - 
[kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV12Table) + : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); } void NV21ToARGBRow_NEON(const uint8_t* src_y, @@ -441,26 +454,22 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" - READNV21 - YUVTORGB(v22, v21, v20) - "subs %w3, %w3, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_vu), // %[src_uv] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV21Table) + : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); } void NV12ToRGB24Row_NEON(const uint8_t* src_y, @@ -468,25 +477,21 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READNV12 - YUVTORGB(v22, v21, v20) - "subs %w3, %w3, #8 \n" - "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_rgb24), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV12Table) + : "cc", "memory", YUVTORGB_REGS, "v2"); } void 
NV21ToRGB24Row_NEON(const uint8_t* src_y, @@ -494,25 +499,21 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READNV21 - YUVTORGB(v22, v21, v20) - "subs %w3, %w3, #8 \n" - "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_rgb24), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_vu), // %[src_uv] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV21Table) + : "cc", "memory", YUVTORGB_REGS, "v2"); } void NV12ToRGB565Row_NEON(const uint8_t* src_y, @@ -522,72 +523,63 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READNV12 YUVTORGB( - v22, v21, - v20) "subs %w3, %w3, #8 \n" ARGBTORGB565 - "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels - // RGB565. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_rgb565), // %2 - "+r"(width) // %3 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" ARGBTORGB565 + "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 + // pixels + // RGB565. 
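// The NV12/NV21 converters in this stretch of the diff all share READNV12;
// only the tbl permutation constant differs.  kNV12Table replicates each
// chroma byte for the two pixels that share it and splits U and V into the
// two register halves; kNV21Table is the same table with its halves swapped
// to undo the VU ordering.  A scalar sketch of that byte shuffle:

#include <stdint.h>

// With kNV12Table = {0,0,2,2,4,4,6,6, 1,1,3,3,5,5,7,7}, 8 interleaved
// UVUV bytes become 8 duplicated U bytes followed by 8 duplicated V bytes.
static void ExpandChroma(const uint8_t uv[8], const uint8_t table[16],
                         uint8_t out[16]) {
  int i;
  for (i = 0; i < 16; ++i) {
    out[i] = uv[table[i]];  // tbl v1.16b, {v1.16b}, v2.16b
  }
}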
+ "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV12Table) + : "cc", "memory", YUVTORGB_REGS, "v2"); } void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" - READYUY2 - YUVTORGB(v22, v21, v20) - "subs %w2, %w2, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" - "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READYUY2 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV12Table) + : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); } void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" - READUYVY - YUVTORGB(v22, v21, v20) - "subs %w2, %w2, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READUYVY YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_uyvy] "+r"(src_uyvy), // %[src_yuy2] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV12Table) + : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); } // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. 
@@ -597,11 +589,12 @@ void SplitUVRow_NEON(const uint8_t* src_uv, int width) { asm volatile( "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV - "subs %w3, %w3, #16 \n" // 16 processed per loop - "st1 {v0.16b}, [%1], #16 \n" // store U - "st1 {v1.16b}, [%2], #16 \n" // store V - "b.gt 1b \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV + "subs %w3, %w3, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" + "st1 {v0.16b}, [%1], #16 \n" // store U + "st1 {v1.16b}, [%2], #16 \n" // store V + "b.gt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -611,6 +604,53 @@ void SplitUVRow_NEON(const uint8_t* src_uv, ); } +// Reads 16 byte Y's from tile and writes out 16 Y's. +// MM21 Y tiles are 16x32 so src_tile_stride = 512 bytes +// MM21 UV tiles are 8x16 so src_tile_stride = 256 bytes +// width measured in bytes so 8 UV = 16. +void DetileRow_NEON(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], %3 \n" // load 16 bytes + "subs %w2, %w2, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 1792] \n" // 7 tiles of 256b ahead + "st1 {v0.16b}, [%1], #16 \n" // store 16 bytes + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride) // %3 + : "cc", "memory", "v0" // Clobber List + ); +} + +// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V. +void DetileSplitUVRow_NEON(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "ld2 {v0.8b,v1.8b}, [%0], %4 \n" + "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%0, 1792] \n" + "st1 {v0.8b}, [%1], #8 \n" + "st1 {v1.8b}, [%2], #8 \n" + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(src_tile_stride) // %4 + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +#if LIBYUV_USE_ST2 // Reads 16 U's and V's and writes out 16 pairs of UV. 
void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, @@ -618,11 +658,13 @@ void MergeUVRow_NEON(const uint8_t* src_u, int width) { asm volatile( "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load U - "ld1 {v1.16b}, [%1], #16 \n" // load V - "subs %w3, %w3, #16 \n" // 16 processed per loop - "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV - "b.gt 1b \n" + "ld1 {v0.16b}, [%0], #16 \n" // load U + "ld1 {v1.16b}, [%1], #16 \n" // load V + "subs %w3, %w3, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV + "b.gt 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -632,6 +674,86 @@ void MergeUVRow_NEON(const uint8_t* src_u, ); } +void MergeUVRow_16_NEON(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int depth, + int width) { + int shift = 16 - depth; + asm volatile( + "dup v2.8h, %w4 \n" + "1: \n" + "ld1 {v0.8h}, [%0], #16 \n" // load 8 U + "subs %w3, %w3, #8 \n" // 8 src pixels per loop + "ld1 {v1.8h}, [%1], #16 \n" // load 8 V + "ushl v0.8h, v0.8h, v2.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "ushl v1.8h, v1.8h, v2.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels + "b.gt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"(shift) // %4 + : "cc", "memory", "v0", "v1", "v2"); +} +#else +// Reads 16 U's and V's and writes out 16 pairs of UV. +void MergeUVRow_NEON(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load U + "ld1 {v1.16b}, [%1], #16 \n" // load V + "subs %w3, %w3, #16 \n" // 16 processed per loop + "zip1 v2.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "zip2 v3.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%1, 448] \n" + "st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 16 pairs of UV + "b.gt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void MergeUVRow_16_NEON(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int depth, + int width) { + int shift = 16 - depth; + asm volatile( + "dup v4.8h, %w4 \n" + "1: \n" + "ld1 {v0.8h}, [%0], #16 \n" // load 8 U + "subs %w3, %w3, #8 \n" // 8 src pixels per loop + "ld1 {v1.8h}, [%1], #16 \n" // load 8 V + "ushl v0.8h, v0.8h, v4.8h \n" + "ushl v1.8h, v1.8h, v4.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "zip1 v2.8h, v0.8h, v1.8h \n" + "zip2 v3.8h, v0.8h, v1.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store 8 UV pixels + "b.gt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"(shift) // %4 + : "cc", "memory", "v0", "v1", "v2", "v1", "v2", "v3", "v4"); +} +#endif // LIBYUV_USE_ST2 + // Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. 
void SplitRGBRow_NEON(const uint8_t* src_rgb, uint8_t* dst_r, @@ -640,12 +762,13 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, int width) { asm volatile( "1: \n" - "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB - "subs %w4, %w4, #16 \n" // 16 processed per loop - "st1 {v0.16b}, [%1], #16 \n" // store R - "st1 {v1.16b}, [%2], #16 \n" // store G - "st1 {v2.16b}, [%3], #16 \n" // store B - "b.gt 1b \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB + "subs %w4, %w4, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" + "st1 {v0.16b}, [%1], #16 \n" // store R + "st1 {v1.16b}, [%2], #16 \n" // store G + "st1 {v2.16b}, [%3], #16 \n" // store B + "b.gt 1b \n" : "+r"(src_rgb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 @@ -664,12 +787,15 @@ void MergeRGBRow_NEON(const uint8_t* src_r, int width) { asm volatile( "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load R - "ld1 {v1.16b}, [%1], #16 \n" // load G - "ld1 {v2.16b}, [%2], #16 \n" // load B - "subs %w4, %w4, #16 \n" // 16 processed per loop - "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB - "b.gt 1b \n" + "ld1 {v0.16b}, [%0], #16 \n" // load R + "ld1 {v1.16b}, [%1], #16 \n" // load G + "ld1 {v2.16b}, [%2], #16 \n" // load B + "subs %w4, %w4, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "prfm pldl1keep, [%2, 448] \n" + "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB + "b.gt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 @@ -680,14 +806,403 @@ void MergeRGBRow_NEON(const uint8_t* src_r, ); } +// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b, dst_a. +void SplitARGBRow_NEON(const uint8_t* src_rgba, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB + "subs %w5, %w5, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" + "st1 {v0.16b}, [%3], #16 \n" // store B + "st1 {v1.16b}, [%2], #16 \n" // store G + "st1 {v2.16b}, [%1], #16 \n" // store R + "st1 {v3.16b}, [%4], #16 \n" // store A + "b.gt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(dst_a), // %4 + "+r"(width) // %5 + : // Input registers + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +#if LIBYUV_USE_ST4 +// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time +void MergeARGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%2], #16 \n" // load B + "ld1 {v1.16b}, [%1], #16 \n" // load G + "ld1 {v2.16b}, [%0], #16 \n" // load R + "ld1 {v3.16b}, [%3], #16 \n" // load A + "subs %w5, %w5, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "prfm pldl1keep, [%2, 448] \n" + "prfm pldl1keep, [%3, 448] \n" + "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : // Input registers + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#else +// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time +void MergeARGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int 
width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%2], #16 \n" // load B + "ld1 {v1.16b}, [%1], #16 \n" // load G + "ld1 {v2.16b}, [%0], #16 \n" // load R + "ld1 {v3.16b}, [%3], #16 \n" // load A + "subs %w5, %w5, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%2, 448] \n" + "zip1 v4.16b, v0.16b, v1.16b \n" // BG + "zip1 v5.16b, v2.16b, v3.16b \n" // RA + "prfm pldl1keep, [%1, 448] \n" + "zip2 v6.16b, v0.16b, v1.16b \n" // BG + "zip2 v7.16b, v2.16b, v3.16b \n" // RA + "prfm pldl1keep, [%0, 448] \n" + "zip1 v0.8h, v4.8h, v5.8h \n" // BGRA + "zip2 v1.8h, v4.8h, v5.8h \n" + "prfm pldl1keep, [%3, 448] \n" + "zip1 v2.8h, v6.8h, v7.8h \n" + "zip2 v3.8h, v6.8h, v7.8h \n" + "st1 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : // Input registers + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7" // Clobber List + ); +} +#endif // LIBYUV_USE_ST4 + +// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b. +void SplitXRGBRow_NEON(const uint8_t* src_rgba, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB + "subs %w4, %w4, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" + "st1 {v0.16b}, [%3], #16 \n" // store B + "st1 {v1.16b}, [%2], #16 \n" // store G + "st1 {v2.16b}, [%1], #16 \n" // store R + "b.gt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time +void MergeXRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.16b, #255 \n" // load A(255) + "1: \n" + "ld1 {v2.16b}, [%0], #16 \n" // load R + "ld1 {v1.16b}, [%1], #16 \n" // load G + "ld1 {v0.16b}, [%2], #16 \n" // load B + "subs %w4, %w4, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "prfm pldl1keep, [%2, 448] \n" + "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%3], #64 \n" // store 16ARGB + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void MergeXR30Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width) { + int shift = 10 - depth; + asm volatile( + "movi v30.16b, #255 \n" + "ushr v30.4s, v30.4s, #22 \n" // 1023 + "dup v31.4s, %w5 \n" + "1: \n" + "ldr d2, [%2], #8 \n" // B + "ldr d1, [%1], #8 \n" // G + "ldr d0, [%0], #8 \n" // R + "ushll v2.4s, v2.4h, #0 \n" // B + "ushll v1.4s, v1.4h, #0 \n" // G + "ushll v0.4s, v0.4h, #0 \n" // R + "ushl v2.4s, v2.4s, v31.4s \n" // 000B + "ushl v1.4s, v1.4s, v31.4s \n" // G + "ushl v0.4s, v0.4s, v31.4s \n" // R + "umin v2.4s, v2.4s, v30.4s \n" + "umin v1.4s, v1.4s, v30.4s \n" + "umin v0.4s, v0.4s, v30.4s \n" + "sli v2.4s, v1.4s, #10 \n" // 00GB + "sli v2.4s, v0.4s, #20 \n" // 0RGB + "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30) + "subs %w4, %w4, #4 \n" + "str q2, [%3], #16 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + 
"+r"(width) // %4 + : "r"(shift) // %5 + : "memory", "cc", "v0", "v1", "v2", "v30", "v31"); +} + +void MergeXR30Row_10_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int /* depth */, + int width) { + asm volatile( + "movi v30.16b, #255 \n" + "ushr v30.4s, v30.4s, #22 \n" // 1023 + "1: \n" + "ldr d2, [%2], #8 \n" // B + "ldr d1, [%1], #8 \n" // G + "ldr d0, [%0], #8 \n" // R + "ushll v2.4s, v2.4h, #0 \n" // 000B + "ushll v1.4s, v1.4h, #0 \n" // G + "ushll v0.4s, v0.4h, #0 \n" // R + "umin v2.4s, v2.4s, v30.4s \n" + "umin v1.4s, v1.4s, v30.4s \n" + "umin v0.4s, v0.4s, v30.4s \n" + "sli v2.4s, v1.4s, #10 \n" // 00GB + "sli v2.4s, v0.4s, #20 \n" // 0RGB + "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30) + "subs %w4, %w4, #4 \n" + "str q2, [%3], #16 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 + : + : "memory", "cc", "v0", "v1", "v2", "v30"); +} + +void MergeAR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + asm volatile( + + "dup v30.8h, %w7 \n" + "dup v31.8h, %w6 \n" + "1: \n" + "ldr q2, [%0], #16 \n" // R + "ldr q1, [%1], #16 \n" // G + "ldr q0, [%2], #16 \n" // B + "ldr q3, [%3], #16 \n" // A + "umin v2.8h, v2.8h, v30.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "umin v1.8h, v1.8h, v30.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "umin v0.8h, v0.8h, v30.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "umin v3.8h, v3.8h, v30.8h \n" + "prfm pldl1keep, [%3, 448] \n" + "ushl v2.8h, v2.8h, v31.8h \n" + "ushl v1.8h, v1.8h, v31.8h \n" + "ushl v0.8h, v0.8h, v31.8h \n" + "ushl v3.8h, v3.8h, v31.8h \n" + "subs %w5, %w5, #8 \n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%4], #64 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_ar64), // %4 + "+r"(width) // %5 + : "r"(shift), // %6 + "r"(mask) // %7 + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); +} + +void MergeXR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + asm volatile( + + "movi v3.16b, #0xff \n" // A (0xffff) + "dup v30.8h, %w6 \n" + "dup v31.8h, %w5 \n" + + "1: \n" + "ldr q2, [%0], #16 \n" // R + "ldr q1, [%1], #16 \n" // G + "ldr q0, [%2], #16 \n" // B + "umin v2.8h, v2.8h, v30.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "umin v1.8h, v1.8h, v30.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "umin v0.8h, v0.8h, v30.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "ushl v2.8h, v2.8h, v31.8h \n" + "ushl v1.8h, v1.8h, v31.8h \n" + "ushl v0.8h, v0.8h, v31.8h \n" + "subs %w4, %w4, #8 \n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar64), // %3 + "+r"(width) // %4 + : "r"(shift), // %5 + "r"(mask) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); +} + +void MergeARGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width) { + int shift = 8 - depth; + asm volatile( + + "dup v31.8h, %w6 \n" + "1: \n" + "ldr q2, [%0], #16 \n" // R + "ldr q1, [%1], #16 \n" // G + "ldr q0, [%2], #16 \n" // B + "ldr q3, [%3], #16 \n" // A + "ushl v2.8h, v2.8h, v31.8h \n" + "prfm pldl1keep, 
[%0, 448] \n" + "ushl v1.8h, v1.8h, v31.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "ushl v0.8h, v0.8h, v31.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "ushl v3.8h, v3.8h, v31.8h \n" + "prfm pldl1keep, [%3, 448] \n" + "uqxtn v2.8b, v2.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v0.8b, v0.8h \n" + "uqxtn v3.8b, v3.8h \n" + "subs %w5, %w5, #8 \n" + "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%4], #32 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : "r"(shift) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); +} + +void MergeXRGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width) { + int shift = 8 - depth; + asm volatile( + + "dup v31.8h, %w5 \n" + "movi v3.8b, #0xff \n" // A (0xff) + "1: \n" + "ldr q2, [%0], #16 \n" // R + "ldr q1, [%1], #16 \n" // G + "ldr q0, [%2], #16 \n" // B + "ushl v2.8h, v2.8h, v31.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "ushl v1.8h, v1.8h, v31.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "ushl v0.8h, v0.8h, v31.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "uqxtn v2.8b, v2.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v0.8b, v0.8h \n" + "subs %w4, %w4, #8 \n" + "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(shift) // %5 + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); +} + // Copy multiple of 32. void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "1: \n" - "ldp q0, q1, [%0], #32 \n" - "subs %w2, %w2, #32 \n" // 32 processed per loop - "stp q0, q1, [%1], #32 \n" - "b.gt 1b \n" + "ldp q0, q1, [%0], #32 \n" + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #32 \n" // 32 processed per loop + "stp q0, q1, [%1], #32 \n" + "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 // Output registers @@ -699,11 +1214,11 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { // SetRow writes 'width' bytes using an 8 bit value repeated. void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { asm volatile( - "dup v0.16b, %w2 \n" // duplicate 16 bytes + "dup v0.16b, %w2 \n" // duplicate 16 bytes "1: \n" - "subs %w1, %w1, #16 \n" // 16 bytes per loop - "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" + "subs %w1, %w1, #16 \n" // 16 bytes per loop + "st1 {v0.16b}, [%0], #16 \n" // store + "b.gt 1b \n" : "+r"(dst), // %0 "+r"(width) // %1 : "r"(v8) // %2 @@ -712,89 +1227,157 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { asm volatile( - "dup v0.4s, %w2 \n" // duplicate 4 ints + "dup v0.4s, %w2 \n" // duplicate 4 ints "1: \n" - "subs %w1, %w1, #4 \n" // 4 ints per loop - "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" + "subs %w1, %w1, #4 \n" // 4 ints per loop + "st1 {v0.16b}, [%0], #16 \n" // store + "b.gt 1b \n" : "+r"(dst), // %0 "+r"(width) // %1 : "r"(v32) // %2 : "cc", "memory", "v0"); } +// Shuffle table for reversing the bytes. +static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; + void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( // Start at end of source row. - "add %0, %0, %w2, sxtw \n" - "sub %0, %0, #16 \n" - "1: \n" - "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 - "subs %w2, %w2, #16 \n" // 16 pixels per loop. 
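Aside: in MergeARGB16To8Row_NEON and MergeXRGB16To8Row_NEON the shift is 8 - depth, which is negative for anything deeper than 8 bits, and ushl with a negative per-lane count shifts right; uqxtn then narrows with unsigned saturation. The same negative-count trick appears in MergeXR30Row_NEON above, where the shift is 10 - depth. A scalar sketch under those semantics (helper names ours):

    #include <stdint.h>

    static uint8_t Saturate8(int v) { return (uint8_t)(v > 255 ? 255 : v); }

    // Scalar equivalent of the ushl (negative count = right shift) + uqxtn pair.
    static void MergeXRGB16To8Row_Scalar(const uint16_t* src_r,
                                         const uint16_t* src_g,
                                         const uint16_t* src_b,
                                         uint8_t* dst_argb, int depth,
                                         int width) {
      int shift = 8 - depth;  // e.g. -2 for 10-bit input
      for (int i = 0; i < width; ++i) {
        int b = shift >= 0 ? src_b[i] << shift : src_b[i] >> -shift;
        int g = shift >= 0 ? src_g[i] << shift : src_g[i] >> -shift;
        int r = shift >= 0 ? src_r[i] << shift : src_r[i] >> -shift;
        dst_argb[4 * i + 0] = Saturate8(b);
        dst_argb[4 * i + 1] = Saturate8(g);
        dst_argb[4 * i + 2] = Saturate8(r);
        dst_argb[4 * i + 3] = 255;  // movi v3.8b, #0xff in the XRGB variant
      }
    }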
- "rev64 v0.16b, v0.16b \n" - "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 - "st1 {v0.D}[0], [%1], #8 \n" - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)-16) // %3 - : "cc", "memory", "v0"); + "ld1 {v3.16b}, [%3] \n" // shuffler + "add %0, %0, %w2, sxtw \n" + "sub %0, %0, #32 \n" + "1: \n" + "ldr q2, [%0, 16] \n" + "ldr q1, [%0], -32 \n" // src -= 32 + "subs %w2, %w2, #32 \n" // 32 pixels per loop. + "tbl v0.16b, {v2.16b}, v3.16b \n" + "tbl v1.16b, {v1.16b}, v3.16b \n" + "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(&kShuffleMirror) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3"); } -void MirrorUVRow_NEON(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { +// Shuffle table for reversing the UV. +static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, + 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; + +void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { asm volatile( // Start at end of source row. - "add %0, %0, %w3, sxtw #1 \n" - "sub %0, %0, #16 \n" - "1: \n" - "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 - "subs %w3, %w3, #8 \n" // 8 pixels per loop. - "rev64 v0.8b, v0.8b \n" - "rev64 v1.8b, v1.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // dst += 8 - "st1 {v1.8b}, [%2], #8 \n" - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)-16) // %4 - : "cc", "memory", "v0", "v1"); + "ld1 {v4.16b}, [%3] \n" // shuffler + "add %0, %0, %w2, sxtw #1 \n" + "sub %0, %0, #32 \n" + "1: \n" + "ldr q1, [%0, 16] \n" + "ldr q0, [%0], -32 \n" // src -= 32 + "subs %w2, %w2, #16 \n" // 16 pixels per loop. + "tbl v2.16b, {v1.16b}, v4.16b \n" + "tbl v3.16b, {v0.16b}, v4.16b \n" + "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32 + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 + : "r"(&kShuffleMirrorUV) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); } -void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { +void MirrorSplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile( // Start at end of source row. - "add %0, %0, %w2, sxtw #2 \n" - "sub %0, %0, #16 \n" - "1: \n" - "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 - "subs %w2, %w2, #4 \n" // 4 pixels per loop. - "rev64 v0.4s, v0.4s \n" - "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 - "st1 {v0.D}[0], [%1], #8 \n" - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)-16) // %3 - : "cc", "memory", "v0"); + "ld1 {v4.16b}, [%4] \n" // shuffler + "add %0, %0, %w3, sxtw #1 \n" + "sub %0, %0, #32 \n" + "1: \n" + "ldr q1, [%0, 16] \n" + "ldr q0, [%0], -32 \n" // src -= 32 + "subs %w3, %w3, #16 \n" // 16 pixels per loop. + "tbl v2.16b, {v1.16b}, v4.16b \n" + "tbl v3.16b, {v0.16b}, v4.16b \n" + "uzp1 v0.16b, v2.16b, v3.16b \n" // U + "uzp2 v1.16b, v2.16b, v3.16b \n" // V + "st1 {v0.16b}, [%1], #16 \n" // dst += 16 + "st1 {v1.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(&kShuffleMirrorUV) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + +// Shuffle table for reversing the ARGB. 
+static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, + 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u}; + +void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + // Start at end of source row. + "ld1 {v4.16b}, [%3] \n" // shuffler + "add %0, %0, %w2, sxtw #2 \n" + "sub %0, %0, #32 \n" + "1: \n" + "ldr q1, [%0, 16] \n" + "ldr q0, [%0], -32 \n" // src -= 32 + "subs %w2, %w2, #8 \n" // 8 pixels per loop. + "tbl v2.16b, {v1.16b}, v4.16b \n" + "tbl v3.16b, {v0.16b}, v4.16b \n" + "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32 + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kShuffleMirrorARGB) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + +void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_rgb24, + int width) { + asm volatile( + "ld1 {v3.16b}, [%4] \n" // shuffler + "add %0, %0, %w2, sxtw #1 \n" // Start at end of row. + "add %0, %0, %w2, sxtw \n" + "sub %0, %0, #48 \n" + + "1: \n" + "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n" // src -= 48 + "subs %w2, %w2, #16 \n" // 16 pixels per loop. + "tbl v0.16b, {v0.16b}, v3.16b \n" + "tbl v1.16b, {v1.16b}, v3.16b \n" + "tbl v2.16b, {v2.16b}, v3.16b \n" + "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48 + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)-48), // %3 + "r"(&kShuffleMirror) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3"); } void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { asm volatile( - "movi v4.8b, #255 \n" // Alpha + "movi v4.8b, #255 \n" // Alpha "1: \n" - "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of + // RGB24. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -805,14 +1388,15 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { asm volatile( - "movi v5.8b, #255 \n" // Alpha + "movi v5.8b, #255 \n" // Alpha "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v3.8b, v1.8b, v1.8b \n" // move g - "orr v4.8b, v0.8b, v0.8b \n" // move r - "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a - "b.gt 1b \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v3.8b, v1.8b, v1.8b \n" // move g + "prfm pldl1keep, [%0, 448] \n" + "orr v4.8b, v0.8b, v0.8b \n" // move r + "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a + "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -821,15 +1405,35 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { ); } +void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { + asm volatile( + "movi v0.8b, #255 \n" // Alpha + "1: \n" + "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
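Aside: RGB24MirrorRow_NEON has no addressing mode that scales by 3, so the two add instructions compute src += width * 2 + width before backing up 48 bytes to the last 16-pixel group; because ld3 has already deinterleaved the channels into separate registers, the kShuffleMirror tbl reverses pixel order without disturbing channel order. Scalar equivalent (name ours):

    #include <stdint.h>

    static void RGB24MirrorRow_Scalar(const uint8_t* src_rgb24,
                                      uint8_t* dst_rgb24, int width) {
      src_rgb24 += (width - 1) * 3;  // last pixel of the row
      for (int i = 0; i < width; ++i) {
        dst_rgb24[3 * i + 0] = src_rgb24[0];  // channel order is preserved
        dst_rgb24[3 * i + 1] = src_rgb24[1];
        dst_rgb24[3 * i + 2] = src_rgb24[2];
        src_rgb24 -= 3;
      }
    }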
+ "orr v2.8b, v4.8b, v4.8b \n" // move g + "prfm pldl1keep, [%0, 448] \n" + "orr v1.8b, v5.8b, v5.8b \n" // move r + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgba), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} + void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { asm volatile( "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v3.8b, v1.8b, v1.8b \n" // move g - "orr v4.8b, v0.8b, v0.8b \n" // move r - "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r - "b.gt 1b \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v3.8b, v1.8b, v1.8b \n" // move g + "prfm pldl1keep, [%0, 448] \n" + "orr v4.8b, v0.8b, v0.8b \n" // move r + "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r + "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_rgb24), // %1 "+r"(width) // %2 @@ -855,13 +1459,13 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { asm volatile( - "movi v3.8b, #255 \n" // Alpha + "movi v3.8b, #255 \n" // Alpha "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - RGB565TOARGB - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -911,14 +1515,14 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { asm volatile( - "movi v3.8b, #255 \n" // Alpha + "movi v3.8b, #255 \n" // Alpha "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB1555TOARGB - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - // pixels - "b.gt 1b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -927,6 +1531,8 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, ); } +// Convert v0.8h to b = v0.8b g = v1.8b r = v2.8b +// clobbers v3 #define ARGB4444TOARGB \ "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ @@ -944,12 +1550,11 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, int width) { asm volatile( "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - // pixels - "b.gt 1b \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -963,28 +1568,29 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of - // RGB24. - "b.gt 1b \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB + "subs %w2, %w2, #16 \n" // 16 pixels per loop. + "prfm pldl1keep, [%0, 448] \n" + "st3 {v0.16b,v1.16b,v2.16b}, [%1], #48 \n" // store 8 RGB24 + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb24), // %1 "+r"(width) // %2 : - : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { asm volatile( "1: \n" - "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v4.8b, v2.8b, v2.8b \n" // mov g - "orr v5.8b, v1.8b, v1.8b \n" // mov b - "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b - "b.gt 1b \n" + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v4.8b, v2.8b, v2.8b \n" // mov g + "prfm pldl1keep, [%0, 448] \n" + "orr v5.8b, v1.8b, v1.8b \n" // mov b + "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_raw), // %1 "+r"(width) // %2 @@ -996,10 +1602,11 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { asm volatile( "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. - "b.gt 1b \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "prfm pldl1keep, [%0, 448] \n" + "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1011,10 +1618,11 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { asm volatile( "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. - "b.gt 1b \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "prfm pldl1keep, [%0, 448] \n" + "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1029,11 +1637,12 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, int width) { asm volatile( "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 - "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. - "st1 {v1.8b}, [%1], #8 \n" // store 8 U. - "st1 {v3.8b}, [%2], #8 \n" // store 8 V. - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "prfm pldl1keep, [%0, 448] \n" + "st1 {v1.8b}, [%1], #8 \n" // store 8 U. + "st1 {v3.8b}, [%2], #8 \n" // store 8 V. 
+ "b.gt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1049,11 +1658,12 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, int width) { asm volatile( "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY - "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. - "st1 {v0.8b}, [%1], #8 \n" // store 8 U. - "st1 {v2.8b}, [%2], #8 \n" // store 8 V. - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "prfm pldl1keep, [%0, 448] \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 U. + "st1 {v2.8b}, [%2], #8 \n" // store 8 V. + "b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1071,14 +1681,15 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; asm volatile( "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row - "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U - "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V - "st1 {v1.8b}, [%2], #8 \n" // store 8 U. - "st1 {v3.8b}, [%3], #8 \n" // store 8 V. - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U + "prfm pldl1keep, [%0, 448] \n" + "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V + "st1 {v1.8b}, [%2], #8 \n" // store 8 U. + "st1 {v3.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" : "+r"(src_yuy2), // %0 "+r"(src_yuy2b), // %1 "+r"(dst_u), // %2 @@ -1098,14 +1709,15 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, const uint8_t* src_uyvyb = src_uyvy + stride_uyvy; asm volatile( "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row - "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U - "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V - "st1 {v0.8b}, [%2], #8 \n" // store 8 U. - "st1 {v2.8b}, [%3], #8 \n" // store 8 V. - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U + "prfm pldl1keep, [%0, 448] \n" + "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V + "st1 {v0.8b}, [%2], #8 \n" // store 8 U. + "st1 {v2.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(src_uyvyb), // %1 "+r"(dst_u), // %2 @@ -1123,13 +1735,14 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, const uint8_t* shuffler, int width) { asm volatile( - "ld1 {v2.16b}, [%3] \n" // shuffler + "ld1 {v2.16b}, [%3] \n" // shuffler "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. - "subs %w2, %w2, #4 \n" // 4 processed per loop - "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels - "st1 {v1.16b}, [%1], #16 \n" // store 4. - "b.gt 1b \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. + "subs %w2, %w2, #4 \n" // 4 processed per loop + "prfm pldl1keep, [%0, 448] \n" + "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels + "st1 {v1.16b}, [%1], #16 \n" // store 4. 
+ "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -1145,13 +1758,14 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y, int width) { asm volatile( "1: \n" - "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys - "orr v2.8b, v1.8b, v1.8b \n" - "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us - "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs - "subs %w4, %w4, #16 \n" // 16 pixels - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. - "b.gt 1b \n" + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys + "subs %w4, %w4, #16 \n" // 16 pixels + "orr v2.8b, v1.8b, v1.8b \n" + "prfm pldl1keep, [%0, 448] \n" + "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us + "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -1168,13 +1782,14 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, int width) { asm volatile( "1: \n" - "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys - "orr v3.8b, v2.8b, v2.8b \n" - "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us - "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs - "subs %w4, %w4, #16 \n" // 16 pixels - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. - "b.gt 1b \n" + "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys + "orr v3.8b, v2.8b, v2.8b \n" + "prfm pldl1keep, [%0, 448] \n" + "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us + "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs + "subs %w4, %w4, #16 \n" // 16 pixels + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -1189,16 +1804,17 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGBTORGB565 - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 + // pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" ARGBTORGB565 + "st1 {v18.16b}, [%1], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb565), // %1 "+r"(width) // %2 : - : "cc", "memory", "v0", "v20", "v21", "v22", "v23"); + : "cc", "memory", "v16", "v17", "v18", "v19"); } void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, @@ -1206,20 +1822,22 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, const uint32_t dither4, int width) { asm volatile( - "dup v1.4s, %w2 \n" // dither4 + "dup v1.4s, %w2 \n" // dither4 "1: \n" - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v20.8b, v20.8b, v1.8b \n" - "uqadd v21.8b, v21.8b, v1.8b \n" - "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565 - "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // load 8 + // pixels + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v16.8b, v16.8b, v1.8b \n" + "prfm pldl1keep, [%0, 448] \n" + "uqadd v17.8b, v17.8b, v1.8b \n" + "uqadd v18.8b, v18.8b, v1.8b \n" ARGBTORGB565 + "st1 {v18.16b}, [%0], #16 \n" // store 8 pixels RGB565. 
+ "b.gt 1b \n" : "+r"(dst_rgb) // %0 : "r"(src_argb), // %1 "r"(dither4), // %2 "r"(width) // %3 - : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"); + : "cc", "memory", "v1", "v16", "v17", "v18", "v19"); } void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, @@ -1227,99 +1845,198 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGBTOARGB1555 - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels - // ARGB1555. - "b.gt 1b \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 + // pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB1555 + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb1555), // %1 "+r"(width) // %2 : - : "cc", "memory", "v0", "v20", "v21", "v22", "v23"); + : "cc", "memory", "v0", "v16", "v17", "v18", "v19"); } void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb4444, int width) { asm volatile( - "movi v4.16b, #0x0f \n" // bits to clear with + "movi v23.16b, #0x0f \n" // bits to clear with // vbic. "1: \n" - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGBTOARGB4444 - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels - // ARGB4444. - "b.gt 1b \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 + // pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB4444 + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb4444), // %1 "+r"(width) // %2 : - : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"); + : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v23"); } -void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { +#if LIBYUV_USE_ST2 +void ARGBToAR64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width) { asm volatile( - "movi v4.8b, #13 \n" // B * 0.1016 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #33 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v3.8h, v0.8b, v4.8b \n" // B - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "1: \n" + "ldp q0, q2, [%0], #32 \n" // load 8 pixels + "mov v1.16b, v0.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "mov v3.16b, v2.16b \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels + "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels + "b.gt 1b \n" : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 + "+r"(dst_ar64), // %1 "+r"(width) // %2 : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); + : "cc", "memory", "v0", "v1", "v2", "v3"); } -void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { +static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, + 10, 9, 8, 11, 14, 13, 12, 15}; + +void ARGBToAB64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width) { asm volatile( + "ldr q4, [%3] \n" // shuffler "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 - // pixels - "subs %w2, %w2, #16 \n" // 16 processed per loop - "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. - "b.gt 1b \n" + "ldp q0, q2, [%0], #32 \n" // load 8 pixels + "tbl v0.16b, {v0.16b}, v4.16b \n" + "tbl v2.16b, {v2.16b}, v4.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "mov v1.16b, v0.16b \n" + "mov v3.16b, v2.16b \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels + "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "r"(&kShuffleARGBToABGR) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} +#else +void ARGBToAR64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width) { + asm volatile( + "1: \n" + "ldp q0, q1, [%0], #32 \n" // load 8 ARGB pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "zip1 v2.16b, v0.16b, v0.16b \n" + "zip2 v3.16b, v0.16b, v0.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "zip1 v4.16b, v1.16b, v1.16b \n" + "zip2 v5.16b, v1.16b, v1.16b \n" + "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AR64 + "b.gt 1b \n" : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 + "+r"(dst_ar64), // %1 "+r"(width) // %2 : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"); +} + +static const uvec8 kShuffleARGBToAB64[2] = { + {2, 2, 1, 1, 0, 0, 3, 3, 6, 6, 5, 5, 4, 4, 7, 7}, + {10, 10, 9, 9, 8, 8, 11, 11, 14, 14, 13, 13, 12, 12, 15, 15}}; + +void ARGBToAB64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width) { + asm volatile( + "ldp q6, q7, [%3] \n" // 2 shufflers + "1: \n" + "ldp q0, q1, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "tbl v2.16b, {v0.16b}, v6.16b \n" // ARGB to AB64 + "tbl v3.16b, {v0.16b}, v7.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "tbl v4.16b, {v1.16b}, v6.16b \n" + "tbl v5.16b, {v1.16b}, v7.16b \n" + "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AR64 + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "r"(&kShuffleARGBToAB64[0]) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} +#endif // LIBYUV_USE_ST2 + +static const uvec8 kShuffleAR64ToARGB = {1, 3, 5, 7, 9, 11, 13, 15, + 17, 19, 21, 23, 25, 27, 29, 31}; + +void AR64ToARGBRow_NEON(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width) { + asm volatile( + "ldr q4, [%3] \n" // shuffler + "1: \n" + "ldp q0, q1, [%0], #32 \n" // load 4 pixels + "ldp q2, q3, [%0], #32 \n" // load 4 pixels + "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "stp q0, q2, [%1], #32 \n" // store 8 pixels + "b.gt 1b \n" + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kShuffleAR64ToARGB) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + +static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15, + 21, 19, 17, 23, 29, 27, 25, 31}; + +void AB64ToARGBRow_NEON(const uint16_t* src_ab64, + uint8_t* dst_argb, + int width) { + asm volatile( + "ldr q4, [%3] \n" // shuffler + "1: \n" + "ldp q0, q1, [%0], #32 \n" // load 4 pixels + "ldp q2, q3, [%0], #32 \n" // load 4 pixels + "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "stp q0, q2, [%1], #32 \n" // store 8 pixels + "b.gt 1b \n" + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kShuffleAB64ToARGB) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); } -void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { +void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { asm volatile( - "movi v4.8b, #15 \n" // B * 0.11400 coefficient - "movi v5.8b, #75 \n" // G * 0.58700 coefficient - "movi v6.8b, #38 \n" // R * 0.29900 coefficient "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v3.8h, v0.8b, v4.8b \n" // B - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. + "b.gt 1b \n" : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 + "+r"(dst_a), // %1 "+r"(width) // %2 : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); } // 8x1 pixels. @@ -1328,33 +2045,31 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_v, int width) { asm volatile( - "movi v24.8b, #112 \n" // UB / VR 0.875 + "movi v24.8b, #112 \n" // UB / VR 0.875 // coefficient - "movi v25.8b, #74 \n" // UG -0.5781 coefficient - "movi v26.8b, #38 \n" // UR -0.2969 coefficient - "movi v27.8b, #18 \n" // VB -0.1406 coefficient - "movi v28.8b, #94 \n" // VG -0.7344 coefficient - "movi v29.16b,#0x80 \n" // 128.5 - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - // pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v24.8b \n" // B - "umlsl v4.8h, v1.8b, v25.8b \n" // G - "umlsl v4.8h, v2.8b, v26.8b \n" // R - "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned - - "umull v3.8h, v2.8b, v24.8b \n" // R - "umlsl v3.8h, v1.8b, v28.8b \n" // G - "umlsl v3.8h, v0.8b, v27.8b \n" // B - "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned - - "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V - - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. 
- "b.gt 1b \n" + "movi v25.8b, #74 \n" // UG -0.5781 coefficient + "movi v26.8b, #38 \n" // UR -0.2969 coefficient + "movi v27.8b, #18 \n" // VB -0.1406 coefficient + "movi v28.8b, #94 \n" // VG -0.7344 coefficient + "movi v29.16b,#0x80 \n" // 128.5 + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlsl v4.8h, v1.8b, v25.8b \n" // G + "umlsl v4.8h, v2.8b, v26.8b \n" // R + "prfm pldl1keep, [%0, 448] \n" + + "umull v3.8h, v2.8b, v24.8b \n" // R + "umlsl v3.8h, v1.8b, v28.8b \n" // G + "umlsl v3.8h, v0.8b, v27.8b \n" // B + + "addhn v0.8b, v4.8h, v29.8h \n" // +128 -> unsigned + "addhn v1.8b, v3.8h, v29.8h \n" // +128 -> unsigned + + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1381,10 +2096,8 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ - "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ - "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ - "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ - "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ + "addhn v0.8b, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ + "addhn v1.8b, v4.8h, v25.8h \n" /* +128 -> unsigned */ // clang-format on // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. @@ -1398,26 +2111,28 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb_1 = src_argb + src_stride_argb; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(src_argb_1), // %1 "+r"(dst_u), // %2 @@ -1429,7 +2144,6 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, ); } -// TODO(fbarchard): Subsample match C code. 
void ARGBToUVJRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -1437,31 +2151,33 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, int width) { const uint8_t* src_argb_1 = src_argb + src_stride_argb; asm volatile ( - "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 - "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 - "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 - "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 - "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 - "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. + "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 + "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 + "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 + "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 + "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 + "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(src_argb_1), // %1 "+r"(dst_u), // %2 @@ -1473,6 +2189,96 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, ); } +void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; + asm volatile ( + "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 + "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 + "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 + "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 + "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 + "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "1: \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 
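Aside: ARGBToUVJRow_NEON and the new RGB24/RAW UVJ variants reuse the same structure with full-range JPEG constants, again pre-halved (63, 42, 21, 10, 53; note 63 = 42 + 21 and 63 = 53 + 10, which keeps the sums balanced). The RAW variant passes its planes to RGBTOUV as (v2, v1, v0) because RAW loads R, G, B in that memory order. Per 2x-averaged sample, the scalar formulas are (helper names ours, coefficient mapping inferred from the movi comments above):

    #include <stdint.h>

    // b, g, r hold twice the 2x2 box average, as in the sketch above.
    static inline uint8_t UJ(int b, int g, int r) {
      return (uint8_t)((63 * b - 42 * g - 21 * r + 0x8080) >> 8);
    }
    static inline uint8_t VJ(int b, int g, int r) {
      return (uint8_t)((63 * r - 53 * g - 10 * b + 0x8080) >> 8);
    }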
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(src_rgb24_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + +void RAWToUVJRow_NEON(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_raw_1 = src_raw + src_stride_raw; + asm volatile ( + "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 + "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 + "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 + "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 + "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 + "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "1: \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + RGBTOUV(v2.8h, v1.8h, v0.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(src_raw_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + void BGRAToUVRow_NEON(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, @@ -1481,25 +2287,27 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra, const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more - "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v3.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. 
+ "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more + "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v3.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_bgra), // %0 "+r"(src_bgra_1), // %1 "+r"(dst_u), // %2 @@ -1519,25 +2327,27 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr, const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. - "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v3.8h, #1 \n" // 2x average - "urshr v2.8h, v2.8h, #1 \n" - "urshr v1.8h, v1.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. + "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v3.8h, #1 \n" // 2x average + "urshr v2.8h, v2.8h, #1 \n" + "urshr v1.8h, v1.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v2.8h, v1.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_abgr), // %0 "+r"(src_abgr_1), // %1 "+r"(dst_u), // %2 @@ -1557,25 +2367,27 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba, const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. - "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 
+ "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. + "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_rgba), // %0 "+r"(src_rgba_1), // %1 "+r"(dst_u), // %2 @@ -1595,25 +2407,27 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" - "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. - "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. + "1: \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(src_rgb24_1), // %1 "+r"(dst_u), // %2 @@ -1633,25 +2447,27 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, const uint8_t* src_raw_1 = src_raw + src_stride_raw; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" - "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. - "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. - "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels - "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v2.8h, v2.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v0.8h, v0.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. 
+ "1: \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels. + "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels + "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v2.8h, v2.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v0.8h, v0.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v2.8h, v1.8h, v0.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(src_raw_1), // %1 "+r"(dst_u), // %2 @@ -1663,7 +2479,7 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, ); } -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +// 16x2 pixels -> 8x1. width is number of rgb pixels. e.g. 16. void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, @@ -1671,67 +2487,54 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, int width) { const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565; asm volatile( - "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / - // 2 - "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 - "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 - "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 - "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 - "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit + RGBTOUV_SETUP_REG "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. RGB565TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. RGB565TOARGB - "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. + "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. RGB565TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. RGB565TOARGB - "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 
- "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ins v16.D[1], v17.D[0] \n" - "ins v18.D[1], v19.D[0] \n" - "ins v20.D[1], v21.D[0] \n" - - "urshr v4.8h, v16.8h, #1 \n" // 2x average - "urshr v5.8h, v18.8h, #1 \n" - "urshr v6.8h, v20.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - "mul v16.8h, v4.8h, v22.8h \n" // B - "mls v16.8h, v5.8h, v23.8h \n" // G - "mls v16.8h, v6.8h, v24.8h \n" // R - "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned - "mul v17.8h, v6.8h, v22.8h \n" // R - "mls v17.8h, v5.8h, v26.8h \n" // G - "mls v17.8h, v4.8h, v25.8h \n" // B - "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v0.8h, v16.8h, #1 \n" // 2x average + "urshr v1.8h, v17.8h, #1 \n" + "urshr v2.8h, v18.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(src_rgb565_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27"); + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + "v28"); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. @@ -1744,50 +2547,43 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, asm volatile( RGBTOUV_SETUP_REG "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. RGB555TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. RGB555TOARGB - "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. RGB555TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. RGB555TOARGB - "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ins v16.D[1], v26.D[0] \n" - "ins v17.D[1], v27.D[0] \n" - "ins v18.D[1], v28.D[0] \n" - - "urshr v4.8h, v16.8h, #1 \n" // 2x average - "urshr v5.8h, v17.8h, #1 \n" - "urshr v6.8h, v18.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - "mul v2.8h, v4.8h, v20.8h \n" // B - "mls v2.8h, v5.8h, v21.8h \n" // G - "mls v2.8h, v6.8h, v22.8h \n" // R - "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned - "mul v3.8h, v6.8h, v20.8h \n" // R - "mls v3.8h, v5.8h, v24.8h \n" // G - "mls v3.8h, v4.8h, v23.8h \n" // B - "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v0.8h, v16.8h, #1 \n" // 2x average + "urshr v1.8h, v17.8h, #1 \n" + "urshr v2.8h, v18.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(src_argb1555_1), // %1 "+r"(dst_u), // %2 @@ -1807,52 +2603,45 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, int width) { const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444; asm volatile( - RGBTOUV_SETUP_REG + RGBTOUV_SETUP_REG // sets v20-v25 "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB - "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. 
ARGB4444TOARGB - "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ins v16.D[1], v26.D[0] \n" - "ins v17.D[1], v27.D[0] \n" - "ins v18.D[1], v28.D[0] \n" - - "urshr v4.8h, v16.8h, #1 \n" // 2x average - "urshr v5.8h, v17.8h, #1 \n" - "urshr v6.8h, v18.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - "mul v2.8h, v4.8h, v20.8h \n" // B - "mls v2.8h, v5.8h, v21.8h \n" // G - "mls v2.8h, v6.8h, v22.8h \n" // R - "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned - "mul v3.8h, v6.8h, v20.8h \n" // R - "mls v3.8h, v5.8h, v24.8h \n" // G - "mls v3.8h, v4.8h, v23.8h \n" // B - "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v0.8h, v16.8h, #1 \n" // 2x average + "urshr v1.8h, v17.8h, #1 \n" + "urshr v2.8h, v18.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(src_argb4444_1), // %1 "+r"(dst_u), // %2 @@ -1868,21 +2657,22 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { asm volatile( - "movi v24.8b, #13 \n" // B * 0.1016 coefficient - "movi v25.8b, #65 \n" // G * 0.5078 coefficient - "movi v26.8b, #33 \n" // R * 0.2578 coefficient - "movi v27.8b, #16 \n" // Add 16 constant + "movi v24.8b, #25 \n" // B * 0.1016 coefficient + "movi v25.8b, #129 \n" // G * 0.5078 coefficient + "movi v26.8b, #66 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. RGB565TOARGB - "umull v3.8h, v0.8b, v24.8b \n" // B - "umlal v3.8h, v1.8b, v25.8b \n" // G - "umlal v3.8h, v2.8b, v26.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v27.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "umull v3.8h, v0.8b, v24.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1895,21 +2685,22 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { asm volatile( - "movi v4.8b, #13 \n" // B * 0.1016 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #33 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant + "movi v4.8b, #25 \n" // B * 0.1016 coefficient + "movi v5.8b, #129 \n" // G * 0.5078 coefficient + "movi v6.8b, #66 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB1555TOARGB - "umull v3.8h, v0.8b, v4.8b \n" // B - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "umull v3.8h, v0.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1921,21 +2712,22 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { asm volatile( - "movi v24.8b, #13 \n" // B * 0.1016 coefficient - "movi v25.8b, #65 \n" // G * 0.5078 coefficient - "movi v26.8b, #33 \n" // R * 0.2578 coefficient - "movi v27.8b, #16 \n" // Add 16 constant + "movi v24.8b, #25 \n" // B * 0.1016 coefficient + "movi v25.8b, #129 \n" // G * 0.5078 coefficient + "movi v26.8b, #66 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB4444TOARGB - "umull v3.8h, v0.8b, v24.8b \n" // B - "umlal v3.8h, v1.8b, v25.8b \n" // G - "umlal v3.8h, v2.8b, v26.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v27.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" + "umull v3.8h, v0.8b, v24.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1943,119 +2735,175 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"); } -void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; + uint16_t pad; +}; + +// RGB to JPeg coefficients +// B * 0.1140 coefficient = 29 +// G * 0.5870 coefficient = 150 +// R * 0.2990 coefficient = 77 +// Add 0.5 = 0x80 +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 128, + 0}; + +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; + +// RGB to BT.601 coefficients +// B * 0.1016 coefficient = 25 +// G * 0.5078 coefficient = 129 +// R * 0.2578 coefficient = 66 +// Add 16.5 = 0x1080 + +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080, + 0}; + +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, + 0x1080, + 0}; + +// ARGB expects first 3 values to contain RGB and 4th value is ignored. +void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { asm volatile( - "movi v4.8b, #33 \n" // R * 0.2578 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #13 \n" // B * 0.1016 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v1.8b, v4.8b \n" // R - "umlal v16.8h, v2.8b, v5.8b \n" // G - "umlal v16.8h, v3.8b, v6.8b \n" // B - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); + "ldr d0, [%3] \n" // load rgbconstants + "dup v6.16b, v0.b[0] \n" + "dup v7.16b, v0.b[1] \n" + "dup v16.16b, v0.b[2] \n" + "dup v17.8h, v0.h[2] \n" + "1: \n" + "ld4 {v2.16b,v3.16b,v4.16b,v5.16b}, [%0], #64 \n" // load 16 + // pixels. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "umull v0.8h, v2.8b, v6.8b \n" // B + "umull2 v1.8h, v2.16b, v6.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "umlal v0.8h, v3.8b, v7.8b \n" // G + "umlal2 v1.8h, v3.16b, v7.16b \n" + "umlal v0.8h, v4.8b, v16.8b \n" // R + "umlal2 v1.8h, v4.16b, v16.16b \n" + "addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y + "addhn v1.8b, v1.8h, v17.8h \n" + "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(rgbconstants) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17"); +} + +void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants); +} + +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants); } void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants); +} + +// RGBA expects first value to be A and ignored, then 3 values to contain RGB. 
+// Same code as ARGB, except the LD4 +void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { asm volatile( - "movi v4.8b, #33 \n" // R * 0.2578 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #13 \n" // B * 0.1016 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // R - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // B - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); + "ldr d0, [%3] \n" // load rgbconstants + "dup v6.16b, v0.b[0] \n" + "dup v7.16b, v0.b[1] \n" + "dup v16.16b, v0.b[2] \n" + "dup v17.8h, v0.h[2] \n" + "1: \n" + "ld4 {v1.16b,v2.16b,v3.16b,v4.16b}, [%0], #64 \n" // load 16 + // pixels. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "umull v0.8h, v2.8b, v6.8b \n" // B + "umull2 v1.8h, v2.16b, v6.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "umlal v0.8h, v3.8b, v7.8b \n" // G + "umlal2 v1.8h, v3.16b, v7.16b \n" + "umlal v0.8h, v4.8b, v16.8b \n" // R + "umlal2 v1.8h, v4.16b, v16.16b \n" + "addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y + "addhn v1.8b, v1.8h, v17.8h \n" + "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(rgbconstants) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17"); } void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants); +} + +void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { + RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants); +} + +void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants); +} + +void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { asm volatile( - "movi v4.8b, #13 \n" // B * 0.1016 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #33 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v1.8b, v4.8b \n" // B - "umlal v16.8h, v2.8b, v5.8b \n" // G - "umlal v16.8h, v3.8b, v6.8b \n" // R - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : + "ldr d0, [%3] \n" // load rgbconstants + "dup v5.16b, v0.b[0] \n" + "dup v6.16b, v0.b[1] \n" + "dup v7.16b, v0.b[2] \n" + "dup v16.8h, v0.h[2] \n" + "1: \n" + "ld3 {v2.16b,v3.16b,v4.16b}, [%0], #48 \n" // load 16 pixels. + "subs %w2, %w2, #16 \n" // 16 processed per loop. 
+ "umull v0.8h, v2.8b, v5.8b \n" // B + "umull2 v1.8h, v2.16b, v5.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "umlal v0.8h, v3.8b, v6.8b \n" // G + "umlal2 v1.8h, v3.16b, v6.16b \n" + "umlal v0.8h, v4.8b, v7.8b \n" // R + "umlal2 v1.8h, v4.16b, v7.16b \n" + "addhn v0.8b, v0.8h, v16.8h \n" // 16 bit to 8 bit Y + "addhn v1.8b, v1.8h, v16.8h \n" + "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(rgbconstants) // %3 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } +void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} + +void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants); +} + void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { - asm volatile( - "movi v4.8b, #13 \n" // B * 0.1016 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #33 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); + RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants); } void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { - asm volatile( - "movi v4.8b, #33 \n" // R * 0.2578 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #13 \n" // B * 0.1016 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); + RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants); } // Bilinear filter 16x2 -> 16x1 @@ -2068,44 +2916,49 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, int y0_fraction = 256 - y1_fraction; const uint8_t* src_ptr1 = src_ptr + src_stride; asm volatile( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" + "cmp %w4, #0 \n" + "b.eq 100f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" - "dup v5.16b, %w4 \n" - "dup v4.16b, %w5 \n" + "dup v5.16b, %w4 \n" + "dup v4.16b, %w5 \n" // General purpose row blend. 
"1: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "umull v2.8h, v0.8b, v4.8b \n" - "umull2 v3.8h, v0.16b, v4.16b \n" - "umlal v2.8h, v1.8b, v5.8b \n" - "umlal2 v3.8h, v1.16b, v5.16b \n" - "rshrn v0.8b, v2.8h, #8 \n" - "rshrn2 v0.16b, v3.8h, #8 \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v2.8h, v0.8b, v4.8b \n" + "prfm pldl1keep, [%1, 448] \n" + "umull2 v3.8h, v0.16b, v4.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "umlal v2.8h, v1.8b, v5.8b \n" + "umlal2 v3.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v2.8h, #8 \n" + "rshrn2 v0.16b, v3.8h, #8 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" // Blend 50 / 50. "50: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%1, 448] \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "subs %w3, %w3, #16 \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 100b \n" + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%1, 448] \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" "99: \n" : "+r"(dst_ptr), // %0 @@ -2118,66 +2971,215 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, : "cc", "memory", "v0", "v1", "v3", "v4", "v5"); } +// Bilinear filter 8x2 -> 8x1 +void InterpolateRow_16_NEON(uint16_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint16_t* src_ptr1 = src_ptr + src_stride; + + asm volatile( + "cmp %w4, #0 \n" + "b.eq 100f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + + "dup v5.8h, %w4 \n" + "dup v4.8h, %w5 \n" + // General purpose row blend. + "1: \n" + "ld1 {v0.8h}, [%1], #16 \n" + "ld1 {v1.8h}, [%2], #16 \n" + "subs %w3, %w3, #8 \n" + "umull v2.4s, v0.4h, v4.4h \n" + "prfm pldl1keep, [%1, 448] \n" + "umull2 v3.4s, v0.8h, v4.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "umlal v2.4s, v1.4h, v5.4h \n" + "umlal2 v3.4s, v1.8h, v5.8h \n" + "rshrn v0.4h, v2.4s, #8 \n" + "rshrn2 v0.8h, v3.4s, #8 \n" + "st1 {v0.8h}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "ld1 {v0.8h}, [%1], #16 \n" + "ld1 {v1.8h}, [%2], #16 \n" + "subs %w3, %w3, #8 \n" + "prfm pldl1keep, [%1, 448] \n" + "urhadd v0.8h, v0.8h, v1.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "st1 {v0.8h}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. 
+ "100: \n" + "ld1 {v0.8h}, [%1], #16 \n" + "subs %w3, %w3, #8 \n" + "prfm pldl1keep, [%1, 448] \n" + "st1 {v0.8h}, [%0], #16 \n" + "b.gt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(dst_width) // %3 + : "r"(y1_fraction), // %4 + "r"(y0_fraction) // %5 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"); +} + +// Bilinear filter 8x2 -> 8x1 +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +void InterpolateRow_16To8_NEON(uint8_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int scale, + int dst_width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint16_t* src_ptr1 = src_ptr + src_stride; + int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr + + asm volatile( + "dup v6.8h, %w6 \n" + "cmp %w4, #0 \n" + "b.eq 100f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + + "dup v5.8h, %w4 \n" + "dup v4.8h, %w5 \n" + // General purpose row blend. + "1: \n" + "ld1 {v0.8h}, [%1], #16 \n" + "ld1 {v1.8h}, [%2], #16 \n" + "subs %w3, %w3, #8 \n" + "umull v2.4s, v0.4h, v4.4h \n" + "prfm pldl1keep, [%1, 448] \n" + "umull2 v3.4s, v0.8h, v4.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "umlal v2.4s, v1.4h, v5.4h \n" + "umlal2 v3.4s, v1.8h, v5.8h \n" + "rshrn v0.4h, v2.4s, #8 \n" + "rshrn2 v0.8h, v3.4s, #8 \n" + "ushl v0.8h, v0.8h, v6.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%0], #8 \n" + "b.gt 1b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "ld1 {v0.8h}, [%1], #16 \n" + "ld1 {v1.8h}, [%2], #16 \n" + "subs %w3, %w3, #8 \n" + "prfm pldl1keep, [%1, 448] \n" + "urhadd v0.8h, v0.8h, v1.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "ushl v0.8h, v0.8h, v6.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%0], #8 \n" + "b.gt 50b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "ldr q0, [%1], #16 \n" + "ushl v0.8h, v0.8h, v2.8h \n" // shr = v2 is negative + "prfm pldl1keep, [%1, 448] \n" + "uqxtn v0.8b, v0.8h \n" + "subs %w3, %w3, #8 \n" // 8 src pixels per loop + "str d0, [%0], #8 \n" // store 8 pixels + "b.gt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(dst_width) // %3 + : "r"(y1_fraction), // %4 + "r"(y0_fraction), // %5 + "r"(shift) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); +} + // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8_t* src_argb0, +void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { asm volatile( - "subs %w3, %w3, #8 \n" - "b.lt 89f \n" + "subs %w3, %w3, #8 \n" + "b.lt 89f \n" // Blend 8 pixels. "8: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 - // pixels - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 - // pixels - "subs %w3, %w3, #8 \n" // 8 processed per loop. 
- "umull v16.8h, v4.8b, v3.8b \n" // db * a - "umull v17.8h, v5.8b, v3.8b \n" // dg * a - "umull v18.8h, v6.8b, v3.8b \n" // dr * a - "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 - "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 - "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 - "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) - "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) - "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) - "uqadd v0.8b, v0.8b, v4.8b \n" // + sb - "uqadd v1.8b, v1.8b, v5.8b \n" // + sg - "uqadd v2.8b, v2.8b, v6.8b \n" // + sr - "movi v3.8b, #255 \n" // a = 255 - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - // pixels - "b.ge 8b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "prfm pldl1keep, [%0, 448] \n" + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "prfm pldl1keep, [%1, 448] \n" + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + // pixels + "b.ge 8b \n" "89: \n" - "adds %w3, %w3, #8-1 \n" - "b.lt 99f \n" + "adds %w3, %w3, #8-1 \n" + "b.lt 99f \n" // Blend 1 pixels. "1: \n" - "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. - "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. - "subs %w3, %w3, #1 \n" // 1 processed per loop. - "umull v16.8h, v4.8b, v3.8b \n" // db * a - "umull v17.8h, v5.8b, v3.8b \n" // dg * a - "umull v18.8h, v6.8b, v3.8b \n" // dr * a - "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 - "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 - "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 - "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) - "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) - "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) - "uqadd v0.8b, v0.8b, v4.8b \n" // + sb - "uqadd v1.8b, v1.8b, v5.8b \n" // + sg - "uqadd v2.8b, v2.8b, v6.8b \n" // + sr - "movi v3.8b, #255 \n" // a = 255 - "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. - "b.ge 1b \n" + "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel + // ARGB0. + "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel + // ARGB1. + "subs %w3, %w3, #1 \n" // 1 processed per loop. + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "prfm pldl1keep, [%0, 448] \n" + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "prfm pldl1keep, [%1, 448] \n" + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. 
+ "b.ge 1b \n" "99: \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2193,17 +3195,17 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, asm volatile( // Attenuate 8 pixels. "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v3.8b \n" // b * a - "umull v5.8h, v1.8b, v3.8b \n" // g * a - "umull v6.8h, v2.8b, v3.8b \n" // r * a - "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 - "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 - "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - // pixels - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v3.8b \n" // b * a + "prfm pldl1keep, [%0, 448] \n" + "umull v5.8h, v1.8b, v3.8b \n" // g * a + "umull v6.8h, v2.8b, v3.8b \n" // r * a + "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 + "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 + "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2219,32 +3221,33 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb, int interval_offset, int width) { asm volatile( - "dup v4.8h, %w2 \n" - "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 - "dup v5.8h, %w3 \n" // interval multiply. - "dup v6.8h, %w4 \n" // interval add + "dup v4.8h, %w2 \n" + "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 + "dup v5.8h, %w3 \n" // interval multiply. + "dup v6.8h, %w4 \n" // interval add // 8 pixel loop. "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "uxtl v0.8h, v0.8b \n" // b (0 .. 255) - "uxtl v1.8h, v1.8b \n" - "uxtl v2.8h, v2.8b \n" - "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale - "sqdmulh v1.8h, v1.8h, v4.8h \n" // g - "sqdmulh v2.8h, v2.8h, v4.8h \n" // r - "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size - "mul v1.8h, v1.8h, v5.8h \n" // g - "mul v2.8h, v2.8h, v5.8h \n" // r - "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset - "add v1.8h, v1.8h, v6.8h \n" // g - "add v2.8h, v2.8h, v6.8h \n" // r - "uqxtn v0.8b, v0.8h \n" - "uqxtn v1.8b, v1.8h \n" - "uqxtn v2.8b, v2.8h \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "uxtl v0.8h, v0.8b \n" // b (0 .. 255) + "prfm pldl1keep, [%0, 448] \n" + "uxtl v1.8h, v1.8b \n" + "uxtl v2.8h, v2.8b \n" + "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale + "sqdmulh v1.8h, v1.8h, v4.8h \n" // g + "sqdmulh v2.8h, v2.8h, v4.8h \n" // r + "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size + "mul v1.8h, v1.8h, v5.8h \n" // g + "mul v2.8h, v2.8h, v5.8h \n" // r + "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset + "add v1.8h, v1.8h, v6.8h \n" // g + "add v2.8h, v2.8h, v6.8h \n" // r + "uqxtn v0.8b, v0.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v2.8b, v2.8h \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : "r"(scale), // %2 @@ -2261,28 +3264,29 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, int width, uint32_t value) { asm volatile( - "dup v0.4s, %w3 \n" // duplicate scale value. - "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. - "ushr v0.8h, v0.8h, #1 \n" // scale / 2. 
+ "dup v0.4s, %w3 \n" // duplicate scale value. + "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. + "ushr v0.8h, v0.8h, #1 \n" // scale / 2. // 8 pixel loop. "1: \n" - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uxtl v4.8h, v4.8b \n" // b (0 .. 255) - "uxtl v5.8h, v5.8b \n" - "uxtl v6.8h, v6.8b \n" - "uxtl v7.8h, v7.8b \n" - "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 - "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g - "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r - "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a - "uqxtn v4.8b, v4.8h \n" - "uqxtn v5.8b, v5.8h \n" - "uqxtn v6.8b, v6.8h \n" - "uqxtn v7.8b, v7.8h \n" - "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v4.8h, v4.8b \n" // b (0 .. 255) + "prfm pldl1keep, [%0, 448] \n" + "uxtl v5.8h, v5.8b \n" + "uxtl v6.8h, v6.8b \n" + "uxtl v7.8h, v7.8b \n" + "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 + "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g + "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r + "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a + "uqxtn v4.8b, v4.8h \n" + "uqxtn v5.8b, v5.8h \n" + "uqxtn v6.8b, v6.8h \n" + "uqxtn v7.8b, v7.8h \n" + "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2292,23 +3296,24 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels // Similar to ARGBToYJ but stores ARGB. -// C code is (15 * b + 75 * g + 38 * r + 64) >> 7; +// C code is (29 * b + 150 * g + 77 * r + 128) >> 8; void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "movi v24.8b, #15 \n" // B * 0.11400 coefficient - "movi v25.8b, #75 \n" // G * 0.58700 coefficient - "movi v26.8b, #38 \n" // R * 0.29900 coefficient - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v24.8b \n" // B - "umlal v4.8h, v1.8b, v25.8b \n" // G - "umlal v4.8h, v2.8b, v26.8b \n" // R - "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B - "orr v1.8b, v0.8b, v0.8b \n" // G - "orr v2.8b, v0.8b, v0.8b \n" // R - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. - "b.gt 1b \n" + "movi v24.8b, #29 \n" // B * 0.1140 coefficient + "movi v25.8b, #150 \n" // G * 0.5870 coefficient + "movi v26.8b, #77 \n" // R * 0.2990 coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" + "umlal v4.8h, v1.8b, v25.8b \n" // G + "umlal v4.8h, v2.8b, v26.8b \n" // R + "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B + "orr v1.8b, v0.8b, v0.8b \n" // G + "orr v2.8b, v0.8b, v0.8b \n" // R + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. 
+ "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2323,32 +3328,33 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { asm volatile( - "movi v20.8b, #17 \n" // BB coefficient - "movi v21.8b, #68 \n" // BG coefficient - "movi v22.8b, #35 \n" // BR coefficient - "movi v24.8b, #22 \n" // GB coefficient - "movi v25.8b, #88 \n" // GG coefficient - "movi v26.8b, #45 \n" // GR coefficient - "movi v28.8b, #24 \n" // BB coefficient - "movi v29.8b, #98 \n" // BG coefficient - "movi v30.8b, #50 \n" // BR coefficient - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B - "umlal v4.8h, v1.8b, v21.8b \n" // G - "umlal v4.8h, v2.8b, v22.8b \n" // R - "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G - "umlal v5.8h, v1.8b, v25.8b \n" // G - "umlal v5.8h, v2.8b, v26.8b \n" // R - "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R - "umlal v6.8h, v1.8b, v29.8b \n" // G - "umlal v6.8h, v2.8b, v30.8b \n" // R - "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B - "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G - "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. - "b.gt 1b \n" + "movi v20.8b, #17 \n" // BB coefficient + "movi v21.8b, #68 \n" // BG coefficient + "movi v22.8b, #35 \n" // BR coefficient + "movi v24.8b, #22 \n" // GB coefficient + "movi v25.8b, #88 \n" // GG coefficient + "movi v26.8b, #45 \n" // GR coefficient + "movi v28.8b, #24 \n" // BB coefficient + "movi v29.8b, #98 \n" // BG coefficient + "movi v30.8b, #50 \n" // BR coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B + "prfm pldl1keep, [%0, 448] \n" + "umlal v4.8h, v1.8b, v21.8b \n" // G + "umlal v4.8h, v2.8b, v22.8b \n" // R + "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G + "umlal v5.8h, v1.8b, v25.8b \n" // G + "umlal v5.8h, v2.8b, v26.8b \n" // R + "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R + "umlal v6.8h, v1.8b, v29.8b \n" // G + "umlal v6.8h, v2.8b, v30.8b \n" // R + "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B + "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G + "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. + "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : @@ -2364,51 +3370,52 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, const int8_t* matrix_argb, int width) { asm volatile( - "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. - "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. - "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. - - "1: \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uxtl v16.8h, v16.8b \n" // b (0 .. 
255) 16 bit - "uxtl v17.8h, v17.8b \n" // g - "uxtl v18.8h, v18.8b \n" // r - "uxtl v19.8h, v19.8b \n" // a - "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B - "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G - "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R - "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A - "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B - "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G - "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R - "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B - "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G - "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R - "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B - "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G - "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R - "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B - "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G - "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R - "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. + "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. + "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. + + "1: \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v16.8h, v16.8b \n" // b (0 .. 
255) 16 bit + "prfm pldl1keep, [%0, 448] \n" + "uxtl v17.8h, v17.8b \n" // g + "uxtl v18.8h, v18.8b \n" // r + "uxtl v19.8h, v19.8b \n" // a + "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B + "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G + "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R + "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A + "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B + "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G + "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R + "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B + "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G + "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R + "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B + "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G + "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R + "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B + "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G + "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R + "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2419,27 +3426,29 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, +void ARGBMultiplyRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { asm volatile( // 8 pixel loop. "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // multiply B - "umull v1.8h, v1.8b, v5.8b \n" // multiply G - "umull v2.8h, v2.8b, v6.8b \n" // multiply R - "umull v3.8h, v3.8b, v7.8b \n" // multiply A - "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B - "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G - "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R - "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb0), // %0 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. 
+ "umull v0.8h, v0.8b, v4.8b \n" // multiply B + "prfm pldl1keep, [%0, 448] \n" + "umull v1.8h, v1.8b, v5.8b \n" // multiply G + "prfm pldl1keep, [%1, 448] \n" + "umull v2.8h, v2.8b, v6.8b \n" // multiply R + "umull v3.8h, v3.8b, v7.8b \n" // multiply A + "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B + "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G + "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R + "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2448,23 +3457,25 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, } // Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8_t* src_argb0, +void ARGBAddRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { asm volatile( // 8 pixel loop. "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v0.8b, v0.8b, v4.8b \n" - "uqadd v1.8b, v1.8b, v5.8b \n" - "uqadd v2.8b, v2.8b, v6.8b \n" - "uqadd v3.8b, v3.8b, v7.8b \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb0), // %0 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v4.8b \n" + "prfm pldl1keep, [%0, 448] \n" + "uqadd v1.8b, v1.8b, v5.8b \n" + "prfm pldl1keep, [%1, 448] \n" + "uqadd v2.8b, v2.8b, v6.8b \n" + "uqadd v3.8b, v3.8b, v7.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2473,23 +3484,25 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0, } // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_NEON(const uint8_t* src_argb0, +void ARGBSubtractRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { asm volatile( // 8 pixel loop. "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqsub v0.8b, v0.8b, v4.8b \n" - "uqsub v1.8b, v1.8b, v5.8b \n" - "uqsub v2.8b, v2.8b, v6.8b \n" - "uqsub v3.8b, v3.8b, v7.8b \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb0), // %0 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqsub v0.8b, v0.8b, v4.8b \n" + "prfm pldl1keep, [%0, 448] \n" + "uqsub v1.8b, v1.8b, v5.8b \n" + "prfm pldl1keep, [%1, 448] \n" + "uqsub v2.8b, v2.8b, v6.8b \n" + "uqsub v3.8b, v3.8b, v7.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2507,17 +3520,19 @@ void SobelRow_NEON(const uint8_t* src_sobelx, uint8_t* dst_argb, int width) { asm volatile( - "movi v3.8b, #255 \n" // alpha + "movi v3.8b, #255 \n" // alpha // 8 pixel loop. "1: \n" - "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. - "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. - "subs %w3, %w3, #8 \n" // 8 processed per loop. 
- "uqadd v0.8b, v0.8b, v1.8b \n" // add - "orr v1.8b, v0.8b, v0.8b \n" - "orr v2.8b, v0.8b, v0.8b \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. + "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v1.8b \n" // add + "prfm pldl1keep, [%0, 448] \n" + "orr v1.8b, v0.8b, v0.8b \n" + "prfm pldl1keep, [%1, 448] \n" + "orr v2.8b, v0.8b, v0.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 @@ -2534,12 +3549,14 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, asm volatile( // 16 pixel loop. "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. - "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "uqadd v0.16b, v0.16b, v1.16b \n" // add - "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. - "b.gt 1b \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. + "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "prfm pldl1keep, [%0, 448] \n" + "uqadd v0.16b, v0.16b, v1.16b \n" // add + "prfm pldl1keep, [%1, 448] \n" + "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. + "b.gt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_y), // %2 @@ -2558,15 +3575,17 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx, uint8_t* dst_argb, int width) { asm volatile( - "movi v3.8b, #255 \n" // alpha + "movi v3.8b, #255 \n" // alpha // 8 pixel loop. "1: \n" - "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. - "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v1.8b, v0.8b, v2.8b \n" // add - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. + "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. + "subs %w3, %w3, #8 \n" // 8 processed per loop. 
+ "prfm pldl1keep, [%0, 448] \n" + "uqadd v1.8b, v0.8b, v2.8b \n" // add + "prfm pldl1keep, [%1, 448] \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 @@ -2586,23 +3605,26 @@ void SobelXRow_NEON(const uint8_t* src_y0, int width) { asm volatile( "1: \n" - "ld1 {v0.8b}, [%0],%5 \n" // top - "ld1 {v1.8b}, [%0],%6 \n" - "usubl v0.8h, v0.8b, v1.8b \n" - "ld1 {v2.8b}, [%1],%5 \n" // center * 2 - "ld1 {v3.8b}, [%1],%6 \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "add v0.8h, v0.8h, v1.8h \n" - "ld1 {v2.8b}, [%2],%5 \n" // bottom - "ld1 {v3.8b}, [%2],%6 \n" - "subs %w4, %w4, #8 \n" // 8 pixels - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "abs v0.8h, v0.8h \n" - "uqxtn v0.8b, v0.8h \n" - "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx - "b.gt 1b \n" + "ld1 {v0.8b}, [%0],%5 \n" // top + "ld1 {v1.8b}, [%0],%6 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + "prfm pldl1keep, [%0, 448] \n" + "ld1 {v2.8b}, [%1],%5 \n" // center * 2 + "ld1 {v3.8b}, [%1],%6 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "prfm pldl1keep, [%1, 448] \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + "ld1 {v2.8b}, [%2],%5 \n" // bottom + "ld1 {v3.8b}, [%2],%6 \n" + "subs %w4, %w4, #8 \n" // 8 pixels + "prfm pldl1keep, [%2, 448] \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx + "b.gt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(src_y2), // %2 @@ -2624,23 +3646,25 @@ void SobelYRow_NEON(const uint8_t* src_y0, int width) { asm volatile( "1: \n" - "ld1 {v0.8b}, [%0],%4 \n" // left - "ld1 {v1.8b}, [%1],%4 \n" - "usubl v0.8h, v0.8b, v1.8b \n" - "ld1 {v2.8b}, [%0],%4 \n" // center * 2 - "ld1 {v3.8b}, [%1],%4 \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "add v0.8h, v0.8h, v1.8h \n" - "ld1 {v2.8b}, [%0],%5 \n" // right - "ld1 {v3.8b}, [%1],%5 \n" - "subs %w3, %w3, #8 \n" // 8 pixels - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "abs v0.8h, v0.8h \n" - "uqxtn v0.8b, v0.8h \n" - "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely - "b.gt 1b \n" + "ld1 {v0.8b}, [%0],%4 \n" // left + "ld1 {v1.8b}, [%1],%4 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + "ld1 {v2.8b}, [%0],%4 \n" // center * 2 + "ld1 {v3.8b}, [%1],%4 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + "ld1 {v2.8b}, [%0],%5 \n" // right + "ld1 {v3.8b}, [%1],%5 \n" + "subs %w3, %w3, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "prfm pldl1keep, [%0, 448] \n" + "add v0.8h, v0.8h, v1.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely + "b.gt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(dst_sobely), // %2 @@ -2658,16 +3682,17 @@ void HalfFloat1Row_NEON(const uint16_t* src, int width) { asm volatile( "1: \n" - "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts - "subs %w2, %w2, #8 \n" // 8 pixels per loop - "uxtl v2.4s, v1.4h \n" // 8 int's - "uxtl2 v3.4s, v1.8h \n" - "scvtf v2.4s, v2.4s \n" // 8 floats - "scvtf v3.4s, v3.4s \n" - "fcvtn v1.4h, v2.4s \n" // 8 half floats - "fcvtn2 v1.8h, v3.4s \n" - "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts - "b.gt 1b \n" + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "prfm pldl1keep, [%0, 448] \n" + "uxtl2 v3.4s, 
v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fcvtn v1.4h, v2.4s \n" // 8 half floats + "fcvtn2 v1.8h, v3.4s \n" + "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -2681,18 +3706,19 @@ void HalfFloatRow_NEON(const uint16_t* src, int width) { asm volatile( "1: \n" - "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts - "subs %w2, %w2, #8 \n" // 8 pixels per loop - "uxtl v2.4s, v1.4h \n" // 8 int's - "uxtl2 v3.4s, v1.8h \n" - "scvtf v2.4s, v2.4s \n" // 8 floats - "scvtf v3.4s, v3.4s \n" - "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent - "fmul v3.4s, v3.4s, %3.s[0] \n" - "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat - "uqshrn2 v1.8h, v3.4s, #13 \n" - "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts - "b.gt 1b \n" + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "prfm pldl1keep, [%0, 448] \n" + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent + "fmul v3.4s, v3.4s, %3.s[0] \n" + "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat + "uqshrn2 v1.8h, v3.4s, #13 \n" + "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -2706,17 +3732,18 @@ void ByteToFloatRow_NEON(const uint8_t* src, int width) { asm volatile( "1: \n" - "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes - "subs %w2, %w2, #8 \n" // 8 pixels per loop - "uxtl v1.8h, v1.8b \n" // 8 shorts - "uxtl v2.4s, v1.4h \n" // 8 ints - "uxtl2 v3.4s, v1.8h \n" - "scvtf v2.4s, v2.4s \n" // 8 floats - "scvtf v3.4s, v3.4s \n" - "fmul v2.4s, v2.4s, %3.s[0] \n" // scale - "fmul v3.4s, v3.4s, %3.s[0] \n" - "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats - "b.gt 1b \n" + "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v1.8h, v1.8b \n" // 8 shorts + "prfm pldl1keep, [%0, 448] \n" + "uxtl v2.4s, v1.4h \n" // 8 ints + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fmul v2.4s, v2.4s, %3.s[0] \n" // scale + "fmul v3.4s, v3.4s, %3.s[0] \n" + "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats + "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -2730,20 +3757,21 @@ float ScaleMaxSamples_NEON(const float* src, int width) { float fmax; asm volatile( - "movi v5.4s, #0 \n" // max - "movi v6.4s, #0 \n" + "movi v5.4s, #0 \n" // max + "movi v6.4s, #0 \n" "1: \n" - "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "subs %w2, %w2, #8 \n" // 8 processed per loop - "fmul v3.4s, v1.4s, %4.s[0] \n" // scale - "fmul v4.4s, v2.4s, %4.s[0] \n" // scale - "fmax v5.4s, v5.4s, v1.4s \n" // max - "fmax v6.4s, v6.4s, v2.4s \n" - "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" - "fmax v5.4s, v5.4s, v6.4s \n" // max - "fmaxv %s3, v5.4s \n" // signed max acculator + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "prfm pldl1keep, [%0, 448] \n" + "fmul v4.4s, v2.4s, %4.s[0] \n" // scale + "fmax v5.4s, v5.4s, v1.4s \n" // max + "fmax v6.4s, v6.4s, v2.4s \n" + "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + "fmax v5.4s, v5.4s, v6.4s \n" // max + "fmaxv %s3, v5.4s \n" // signed max acculator : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width), // %2 @@ -2759,21 +3787,22 @@ float 
ScaleSumSamples_NEON(const float* src, int width) { float fsum; asm volatile( - "movi v5.4s, #0 \n" // max - "movi v6.4s, #0 \n" // max - - "1: \n" - "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "subs %w2, %w2, #8 \n" // 8 processed per loop - "fmul v3.4s, v1.4s, %4.s[0] \n" // scale - "fmul v4.4s, v2.4s, %4.s[0] \n" - "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares - "fmla v6.4s, v2.4s, v2.4s \n" - "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" - "faddp v5.4s, v5.4s, v6.4s \n" - "faddp v5.4s, v5.4s, v5.4s \n" - "faddp %3.4s, v5.4s, v5.4s \n" // sum + "movi v5.4s, #0 \n" // max + "movi v6.4s, #0 \n" // max + + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "prfm pldl1keep, [%0, 448] \n" + "fmul v4.4s, v2.4s, %4.s[0] \n" + "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares + "fmla v6.4s, v2.4s, v2.4s \n" + "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + "faddp v5.4s, v5.4s, v6.4s \n" + "faddp v5.4s, v5.4s, v5.4s \n" + "faddp %3.4s, v5.4s, v5.4s \n" // sum : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width), // %2 @@ -2786,12 +3815,13 @@ float ScaleSumSamples_NEON(const float* src, void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { asm volatile( "1: \n" - "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "subs %w2, %w2, #8 \n" // 8 processed per loop - "fmul v1.4s, v1.4s, %3.s[0] \n" // scale - "fmul v2.4s, v2.4s, %3.s[0] \n" // scale - "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v1.4s, v1.4s, %3.s[0] \n" // scale + "fmul v2.4s, v2.4s, %3.s[0] \n" // scale + "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -2808,26 +3838,31 @@ void GaussCol_NEON(const uint16_t* src0, uint32_t* dst, int width) { asm volatile( - "movi v6.8h, #4 \n" // constant 4 - "movi v7.8h, #6 \n" // constant 6 - - "1: \n" - "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows - "ld1 {v2.8h}, [%4], #16 \n" - "uaddl v0.4s, v1.4h, v2.4h \n" // * 1 - "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1 - "ld1 {v2.8h}, [%1], #16 \n" - "umlal v0.4s, v2.4h, v6.4h \n" // * 4 - "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 - "ld1 {v2.8h}, [%2], #16 \n" - "umlal v0.4s, v2.4h, v7.4h \n" // * 6 - "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6 - "ld1 {v2.8h}, [%3], #16 \n" - "umlal v0.4s, v2.4h, v6.4h \n" // * 4 - "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 - "subs %w6, %w6, #8 \n" // 8 processed per loop - "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples - "b.gt 1b \n" + "movi v6.8h, #4 \n" // constant 4 + "movi v7.8h, #6 \n" // constant 6 + + "1: \n" + "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows + "ld1 {v2.8h}, [%4], #16 \n" + "uaddl v0.4s, v1.4h, v2.4h \n" // * 1 + "prfm pldl1keep, [%0, 448] \n" + "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1 + "ld1 {v2.8h}, [%1], #16 \n" + "umlal v0.4s, v2.4h, v6.4h \n" // * 4 + "prfm pldl1keep, [%1, 448] \n" + "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 + "ld1 {v2.8h}, [%2], #16 \n" + "umlal v0.4s, v2.4h, v7.4h \n" // * 6 + "prfm pldl1keep, [%2, 448] \n" + "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6 + "ld1 {v2.8h}, [%3], #16 \n" + "umlal v0.4s, v2.4h, v6.4h \n" // * 4 + "prfm pldl1keep, [%3, 448] \n" + "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 + "subs %w6, %w6, #8 \n" // 8 processed per loop + "st1 
{v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples + "prfm pldl1keep, [%4, 448] \n" + "b.gt 1b \n" : "+r"(src0), // %0 "+r"(src1), // %1 "+r"(src2), // %2 @@ -2845,27 +3880,28 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { const uint32_t* src2 = src + 2; const uint32_t* src3 = src + 3; asm volatile( - "movi v6.4s, #4 \n" // constant 4 - "movi v7.4s, #6 \n" // constant 6 - - "1: \n" - "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples - "add v0.4s, v0.4s, v1.4s \n" // * 1 - "add v1.4s, v1.4s, v2.4s \n" // * 1 - "ld1 {v2.4s,v3.4s}, [%2], #32 \n" - "mla v0.4s, v2.4s, v7.4s \n" // * 6 - "mla v1.4s, v3.4s, v7.4s \n" // * 6 - "ld1 {v2.4s,v3.4s}, [%1], #32 \n" - "ld1 {v4.4s,v5.4s}, [%3], #32 \n" - "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4 - "add v3.4s, v3.4s, v5.4s \n" - "mla v0.4s, v2.4s, v6.4s \n" // * 4 - "mla v1.4s, v3.4s, v6.4s \n" // * 4 - "subs %w5, %w5, #8 \n" // 8 processed per loop - "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack - "uqrshrn2 v0.8h, v1.4s, #8 \n" - "st1 {v0.8h}, [%4], #16 \n" // store 8 samples - "b.gt 1b \n" + "movi v6.4s, #4 \n" // constant 4 + "movi v7.4s, #6 \n" // constant 6 + + "1: \n" + "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples + "add v0.4s, v0.4s, v1.4s \n" // * 1 + "add v1.4s, v1.4s, v2.4s \n" // * 1 + "ld1 {v2.4s,v3.4s}, [%2], #32 \n" + "mla v0.4s, v2.4s, v7.4s \n" // * 6 + "mla v1.4s, v3.4s, v7.4s \n" // * 6 + "ld1 {v2.4s,v3.4s}, [%1], #32 \n" + "ld1 {v4.4s,v5.4s}, [%3], #32 \n" + "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4 + "add v3.4s, v3.4s, v5.4s \n" + "prfm pldl1keep, [%0, 448] \n" + "mla v0.4s, v2.4s, v6.4s \n" // * 4 + "mla v1.4s, v3.4s, v6.4s \n" // * 4 + "subs %w5, %w5, #8 \n" // 8 processed per loop + "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack + "uqrshrn2 v0.8h, v1.4s, #8 \n" + "st1 {v0.8h}, [%4], #16 \n" // store 8 samples + "b.gt 1b \n" : "+r"(src), // %0 "+r"(src1), // %1 "+r"(src2), // %2 @@ -2876,20 +3912,104 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } +static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f}; + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 
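For orientation, a scalar model of that 1, 4, 6, 4, 1 vertical pass (a reference sketch, not part of the patch; the helper name is hypothetical). The kernel sums to 16 and the matching row pass contributes another factor of 16, so the rounded >>8 in GaussRow_NEON above, or the 1.0f/256.0f entry of kGaussCoefficients used by the float row pass, normalizes the full 2D kernel:

#include <stdint.h>
// Hypothetical scalar reference for the 5-tap Gaussian column filter.
static void GaussColRef_C(const uint16_t* s0, const uint16_t* s1,
                          const uint16_t* s2, const uint16_t* s3,
                          const uint16_t* s4, uint32_t* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst[i] = s0[i] + 4 * s1[i] + 6 * s2[i] + 4 * s3[i] + s4[i];  // 1 4 6 4 1
  }
}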
+void GaussCol_F32_NEON(const float* src0, + const float* src1, + const float* src2, + const float* src3, + const float* src4, + float* dst, + int width) { + asm volatile( + "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6 + + "1: \n" + "ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows + "ld1 {v2.4s, v3.4s}, [%1], #32 \n" + "fmla v0.4s, v2.4s, v6.4s \n" // * 4 + "ld1 {v4.4s, v5.4s}, [%2], #32 \n" + "fmla v1.4s, v3.4s, v6.4s \n" + "prfm pldl1keep, [%0, 448] \n" + "fmla v0.4s, v4.4s, v7.4s \n" // * 6 + "ld1 {v2.4s, v3.4s}, [%3], #32 \n" + "fmla v1.4s, v5.4s, v7.4s \n" + "prfm pldl1keep, [%1, 448] \n" + "fmla v0.4s, v2.4s, v6.4s \n" // * 4 + "ld1 {v4.4s, v5.4s}, [%4], #32 \n" + "fmla v1.4s, v3.4s, v6.4s \n" + "prfm pldl1keep, [%2, 448] \n" + "fadd v0.4s, v0.4s, v4.4s \n" // * 1 + "prfm pldl1keep, [%3, 448] \n" + "fadd v1.4s, v1.4s, v5.4s \n" + "prfm pldl1keep, [%4, 448] \n" + "subs %w6, %w6, #8 \n" // 8 processed per loop + "st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(src4), // %4 + "+r"(dst), // %5 + "+r"(width) // %6 + : "r"(&kGaussCoefficients) // %7 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussRow_F32_NEON(const float* src, float* dst, int width) { + asm volatile( + "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256 + + "1: \n" + "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n" // load 12 samples, 5 + // rows + "fadd v0.4s, v0.4s, v1.4s \n" // * 1 + "ld1 {v4.4s, v5.4s}, [%0], %5 \n" + "fadd v1.4s, v1.4s, v2.4s \n" + "fmla v0.4s, v4.4s, v7.4s \n" // * 6 + "ld1 {v2.4s, v3.4s}, [%0], %4 \n" + "fmla v1.4s, v5.4s, v7.4s \n" + "ld1 {v4.4s, v5.4s}, [%0], %6 \n" + "fadd v2.4s, v2.4s, v4.4s \n" + "fadd v3.4s, v3.4s, v5.4s \n" + "fmla v0.4s, v2.4s, v6.4s \n" // * 4 + "fmla v1.4s, v3.4s, v6.4s \n" + "prfm pldl1keep, [%0, 448] \n" + "fmul v0.4s, v0.4s, v8.4s \n" // / 256 + "fmul v1.4s, v1.4s, v8.4s \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(&kGaussCoefficients), // %3 + "r"(8LL), // %4 + "r"(-4LL), // %5 + "r"(20LL) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8"); +} + +#if LIBYUV_USE_ST3 // Convert biplanar NV21 to packed YUV24 void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { asm volatile( - "1: \n" - "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values - "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values - "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values - "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values - "subs %w3, %w3, #16 \n" // 16 pixels per loop - "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels - "b.gt 1b \n" + "1: \n" + "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values + "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values + "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values + "prfm pldl1keep, [%0, 448] \n" + "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values + "prfm pldl1keep, [%1, 448] \n" + "subs %w3, %w3, #16 \n" // 16 pixels per loop + "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 "+r"(dst_yuv24), // %2 @@ -2897,7 +4017,44 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, : : "cc", "memory", "v0", "v1", 
"v2"); } +#else +static const uvec8 kYUV24Shuffle[3] = { + {16, 17, 0, 16, 17, 1, 18, 19, 2, 18, 19, 3, 20, 21, 4, 20}, + {21, 5, 22, 23, 6, 22, 23, 7, 24, 25, 8, 24, 25, 9, 26, 27}, + {10, 26, 27, 11, 28, 29, 12, 28, 29, 13, 30, 31, 14, 30, 31, 15}}; + +// Convert biplanar NV21 to packed YUV24 +// NV21 has VU in memory for chroma. +// YUV24 is VUY in memory +void NV21ToYUV24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + asm volatile( + "ld1 {v5.16b,v6.16b,v7.16b}, [%4] \n" // 3 shuffler constants + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values + "ld1 {v1.16b}, [%1], #16 \n" // load 8 VU values + "tbl v2.16b, {v0.16b,v1.16b}, v5.16b \n" // weave into YUV24 + "prfm pldl1keep, [%0, 448] \n" + "tbl v3.16b, {v0.16b,v1.16b}, v6.16b \n" + "prfm pldl1keep, [%1, 448] \n" + "tbl v4.16b, {v0.16b,v1.16b}, v7.16b \n" + "subs %w3, %w3, #16 \n" // 16 pixels per loop + "st1 {v2.16b,v3.16b,v4.16b}, [%2], #48 \n" // store 16 YUV pixels + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : "r"(&kYUV24Shuffle[0]) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} +#endif // LIBYUV_USE_ST3 + +// Note ST2 8b version is faster than zip+ST1 +// AYUV is VUYA in memory. UV for NV12 is UV order in memory. void AYUVToUVRow_NEON(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_uv, @@ -2905,19 +4062,20 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; asm volatile( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 - // pixels. - "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. - "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average - "uqrshrn v2.8b, v1.8h, #2 \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV. - "b.gt 1b \n" + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv + "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average + "uqrshrn v2.8b, v1.8h, #2 \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV. + "b.gt 1b \n" : "+r"(src_ayuv), // %0 "+r"(src_ayuv_1), // %1 "+r"(dst_uv), // %2 @@ -2933,19 +4091,20 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; asm volatile( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 - // pixels. - "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. - "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average - "uqrshrn v1.8b, v1.8h, #2 \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU. 
- "b.gt 1b \n" + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv + "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average + "uqrshrn v1.8b, v1.8h, #2 \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU. + "b.gt 1b \n" : "+r"(src_ayuv), // %0 "+r"(src_ayuv_1), // %1 "+r"(dst_vu), // %2 @@ -2957,12 +4116,12 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, // Copy row of AYUV Y's into Y void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { asm volatile( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 - // pixels - "subs %w2, %w2, #16 \n" // 16 pixels per loop - "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels - "b.gt 1b \n" + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + "subs %w2, %w2, #16 \n" // 16 pixels per loop + "prfm pldl1keep, [%0, 448] \n" + "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels + "b.gt 1b \n" : "+r"(src_ayuv), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2970,61 +4129,170 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { : "cc", "memory", "v0", "v1", "v2", "v3"); } -void FloatDivToByteRow_NEON(const float* src_weights, - const float* src_values, - uint8_t* dst_out, - uint8_t* dst_mask, - int width) { - asm volatile( - "movi v0.4s, #0 \n" +// Shuffle table for swapping UV bytes. +static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, + 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u}; +// Convert UV plane of NV12 to VU of NV21. 
+void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + asm volatile( + "ld1 {v2.16b}, [%3] \n" // shuffler "1: \n" - "ld1 {v1.4s,v2.4s}, [%0], #32 \n" // load 8 float weights - "ld1 {v3.4s,v4.4s}, [%1], #32 \n" // load 8 float values - "subs %w4, %w4, #8 \n" // 8 pixels per loop - - "fdiv v1.4s, v3.4s, v1.4s \n" // values / weights - "fdiv v2.4s, v4.4s, v2.4s \n" - - "fcvtas v1.4s, v1.4s \n" // float to int - "fcvtas v2.4s, v2.4s \n" // float to int - "uqxtn v1.4h, v1.4s \n" // 8 shorts - "uqxtn2 v1.8h, v2.4s \n" - "uqxtn v1.8b, v1.8h \n" // 8 bytes + "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values + "ld1 {v1.16b}, [%0], 16 \n" + "subs %w2, %w2, #16 \n" // 16 pixels per loop + "tbl v0.16b, {v0.16b}, v2.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "tbl v1.16b, {v1.16b}, v2.16b \n" + "stp q0, q1, [%1], 32 \n" // store 16 VU pixels + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : "r"(&kShuffleSwapUV) // %3 + : "cc", "memory", "v0", "v1", "v2"); +} - "st1 {v1.8b}, [%2], #8 \n" // store 8 byte out +void HalfMergeUVRow_NEON(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int width) { + const uint8_t* src_u_1 = src_u + src_stride_u; + const uint8_t* src_v_1 = src_v + src_stride_v; + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values + "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values + "ld1 {v2.16b}, [%1], #16 \n" + "ld1 {v3.16b}, [%3], #16 \n" + "uaddlp v0.8h, v0.16b \n" // half size + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "uadalp v0.8h, v2.16b \n" + "prfm pldl1keep, [%1, 448] \n" + "uadalp v1.8h, v3.16b \n" + "prfm pldl1keep, [%3, 448] \n" + "uqrshrn v0.8b, v0.8h, #2 \n" + "uqrshrn v1.8b, v1.8h, #2 \n" + "subs %w5, %w5, #16 \n" // 16 src pixels per loop + "st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels + "b.gt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_u_1), // %1 + "+r"(src_v), // %2 + "+r"(src_v_1), // %3 + "+r"(dst_uv), // %4 + "+r"(width) // %5 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} - "fcmgt v5.4s, v1.4s, v0.4s \n" // cmp weight to zero - "fcmgt v6.4s, v2.4s, v0.4s \n" - "uqxtn v5.4h, v5.4s \n" // 8 shorts - "uqxtn2 v5.8h, v6.4s \n" - "uqxtn v5.8b, v1.8h \n" // 8 bytes +void SplitUVRow_16_NEON(const uint16_t* src_uv, + uint16_t* dst_u, + uint16_t* dst_v, + int depth, + int width) { + int shift = depth - 16; // Negative for right shift. 
+ asm volatile(
+ "dup v2.8h, %w4 \n"
+ "1: \n"
+ "ld2 {v0.8h, v1.8h}, [%0], #32 \n" // load 8 UV
+ "subs %w3, %w3, #8 \n" // 8 src pixels per loop
+ "ushl v0.8h, v0.8h, v2.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "ushl v1.8h, v1.8h, v2.8h \n"
+ "st1 {v0.8h}, [%1], #16 \n" // store 8 U pixels
+ "st1 {v1.8h}, [%2], #16 \n" // store 8 V pixels
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(shift) // %4
+ : "cc", "memory", "v0", "v1", "v2");
+}
-
void MultiplyRow_16_NEON(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ asm volatile(
+ "dup v2.8h, %w3 \n"
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n"
+ "mul v0.8h, v0.8h, v2.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "mul v1.8h, v1.8h, v2.8h \n"
+ "stp q0, q1, [%1], #32 \n" // store 16 pixels
+ "subs %w2, %w2, #16 \n" // 16 src pixels per loop
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "cc", "memory", "v0", "v1", "v2");
+}
-
void DivideRow_16_NEON(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ asm volatile(
+ "dup v0.8h, %w3 \n"
+ "1: \n"
+ "ldp q1, q2, [%0], #32 \n"
+ "ushll v3.4s, v1.4h, #0 \n"
+ "ushll v4.4s, v2.4h, #0 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "ushll2 v1.4s, v1.8h, #0 \n"
+ "ushll2 v2.4s, v2.8h, #0 \n"
+ "mul v3.4s, v0.4s, v3.4s \n"
+ "mul v4.4s, v0.4s, v4.4s \n"
+ "mul v1.4s, v0.4s, v1.4s \n"
+ "mul v2.4s, v0.4s, v2.4s \n"
+ "shrn v3.4h, v3.4s, #16 \n"
+ "shrn v4.4h, v4.4s, #16 \n"
+ "shrn2 v3.8h, v1.4s, #16 \n"
+ "shrn2 v4.8h, v2.4s, #16 \n"
+ "stp q3, q4, [%1], #32 \n" // store 16 pixels
+ "subs %w2, %w2, #16 \n" // 16 src pixels per loop
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
-// Convert biplanar UV channel of NV12 to NV21
-void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 32768 = 9 bits = shr 1
+// 16384 = 10 bits = shr 2
+// 4096 = 12 bits = shr 4
+// 256 = 16 bits = shr 8
+void Convert16To8Row_NEON(const uint16_t* src_y,
+ uint8_t* dst_y,
+ int scale,
+ int width) {
+ int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
asm volatile(
- "1: \n"
- "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 UV values
- "orr v2.16b, v0.16b, v0.16b \n" // move U after V
- "subs %w2, %w2, #16 \n" // 16 pixels per loop
- "st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels
- "b.gt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_vu), // %1
- "+r"(width) // %2
- :
+ "dup v2.8h, %w3 \n"
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n"
+ "ushl v0.8h, v0.8h, v2.8h \n" // shr = v2 is negative
+ "ushl v1.8h, v1.8h, v2.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "uqxtn2 v0.16b, v1.8h \n"
+ "subs %w2, %w2, #16 \n" // 16 src pixels per loop
+ "str q0, [%1], #16 \n" // store 16 pixels
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(shift) // %3
: "cc", "memory", "v0", "v1", "v2");
}
diff --git a/files/source/row_win.cc b/files/source/row_win.cc
index 27e3da7b..c7c1ff60 100644
--- a/files/source/row_win.cc
+++ b/files/source/row_win.cc
@@ -10,9 +10,9 @@
#include 
"libyuv/row.h" -// This module is for Visual C 32/64 bit and clangcl 32 bit +// This module is for Visual C 32/64 bit #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__))) + !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) #if defined(_M_X64) #include <emmintrin.h> @@ -27,12 +27,34 @@ extern "C" { // 64 bit #if defined(_M_X64) +// Read 8 UV from 444 +#define READYUV444 \ + xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \ + xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \ + xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ + u_buf += 8; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + y_buf += 8; + +// Read 8 UV from 444, With 8 Alpha. +#define READYUVA444 \ + xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \ + xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \ + xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ + u_buf += 8; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + y_buf += 8; \ + xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ + a_buf += 8; + // Read 4 UV from 422, upsample to 8 UV. #define READYUV422 \ - xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ + xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ + xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \ u_buf += 4; \ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ @@ -40,10 +62,10 @@ extern "C" { // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. #define READYUVA422 \ - xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ + xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ + xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \ u_buf += 4; \ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ @@ -52,24 +74,21 @@ extern "C" { a_buf += 8; // Convert 8 pixels: 8 UV and 8 Y. 
-#define YUVTORGB(yuvconstants) \ - xmm1 = _mm_loadu_si128(&xmm0); \ - xmm2 = _mm_loadu_si128(&xmm0); \ - xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \ - xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \ - xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \ - xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \ - xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \ - xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \ - xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ - xmm0 = _mm_adds_epi16(xmm0, xmm4); \ - xmm1 = _mm_adds_epi16(xmm1, xmm4); \ - xmm2 = _mm_adds_epi16(xmm2, xmm4); \ - xmm0 = _mm_srai_epi16(xmm0, 6); \ - xmm1 = _mm_srai_epi16(xmm1, 6); \ - xmm2 = _mm_srai_epi16(xmm2, 6); \ - xmm0 = _mm_packus_epi16(xmm0, xmm0); \ - xmm1 = _mm_packus_epi16(xmm1, xmm1); \ +#define YUVTORGB(yuvconstants) \ + xmm3 = _mm_sub_epi8(xmm3, _mm_set1_epi8((char)0x80)); \ + xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ + xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kYBiasToRgb); \ + xmm0 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToB, xmm3); \ + xmm1 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToG, xmm3); \ + xmm2 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToR, xmm3); \ + xmm0 = _mm_adds_epi16(xmm4, xmm0); \ + xmm1 = _mm_subs_epi16(xmm4, xmm1); \ + xmm2 = _mm_adds_epi16(xmm4, xmm2); \ + xmm0 = _mm_srai_epi16(xmm0, 6); \ + xmm1 = _mm_srai_epi16(xmm1, 6); \ + xmm2 = _mm_srai_epi16(xmm2, 6); \ + xmm0 = _mm_packus_epi16(xmm0, xmm0); \ + xmm1 = _mm_packus_epi16(xmm1, xmm1); \ xmm2 = _mm_packus_epi16(xmm2, xmm2); // Store 8 ARGB values. @@ -90,7 +109,7 @@ void I422ToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - __m128i xmm0, xmm1, xmm2, xmm4; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; const __m128i xmm5 = _mm_set1_epi8(-1); const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; while (width > 0) { @@ -110,7 +129,7 @@ void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - __m128i xmm0, xmm1, xmm2, xmm4, xmm5; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5; const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; while (width > 0) { READYUVA422 @@ -121,6 +140,44 @@ void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, } #endif +#if defined(HAS_I444TOARGBROW_SSSE3) +void I444ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + const __m128i xmm5 = _mm_set1_epi8(-1); + const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; + while (width > 0) { + READYUV444 + YUVTORGB(yuvconstants) + STOREARGB + width -= 8; + } +} +#endif + +#if defined(HAS_I444ALPHATOARGBROW_SSSE3) +void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5; + const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; + while (width > 0) { + READYUVA444 + YUVTORGB(yuvconstants) + STOREARGB + width -= 8; + } +} +#endif + // 32 bit #else // defined(_M_X64) #ifdef HAS_ARGBTOYROW_SSSE3 @@ -187,11 +244,11 @@ static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, // 7 bit fixed point 0.5. 
static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; -static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, - 0x8080u, 0x8080u, 0x8080u, 0x8080u}; +// 8 bit fixed point 0.5, for bias of UV. +static const ulvec8 kBiasUV128 = { + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // Shuffle table for converting RGB24 to ARGB. static const uvec8 kShuffleMaskRGB24ToARGB = { @@ -1367,7 +1424,7 @@ __declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb, } } -__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1380,7 +1437,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm5, xmmword ptr kBiasUV128 movdqa xmm6, xmmword ptr kARGBToV movdqa xmm7, xmmword ptr kARGBToU sub edi, edx // stride from u to v @@ -1439,7 +1496,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, } } -__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1452,7 +1509,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kAddUVJ128 + movdqa xmm5, xmmword ptr kBiasUV128 movdqa xmm6, xmmword ptr kARGBToVJ movdqa xmm7, xmmword ptr kARGBToUJ sub edi, edx // stride from u to v @@ -1513,7 +1570,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, } #ifdef HAS_ARGBTOUVROW_AVX2 -__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0, +__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1526,7 +1583,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width - vbroadcastf128 ymm5, xmmword ptr kAddUV128 + vbroadcastf128 ymm5, xmmword ptr kBiasUV128 vbroadcastf128 ymm6, xmmword ptr kARGBToV vbroadcastf128 ymm7, xmmword ptr kARGBToU sub edi, edx // stride from u to v @@ -1581,7 +1638,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0, #endif // HAS_ARGBTOUVROW_AVX2 #ifdef HAS_ARGBTOUVJROW_AVX2 -__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, +__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1594,9 +1651,9 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width - vbroadcastf128 ymm5, xmmword ptr kAddUV128 - vbroadcastf128 ymm6, xmmword ptr kARGBToV - vbroadcastf128 ymm7, xmmword ptr kARGBToU + vbroadcastf128 ymm5, xmmword ptr kBiasUV128 + vbroadcastf128 ymm6, xmmword ptr kARGBToVJ + vbroadcastf128 ymm7, xmmword ptr kARGBToUJ sub edi, edx // stride from u to v convertloop: @@ -1649,7 +1706,7 @@ 
__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, } #endif // HAS_ARGBTOUVJROW_AVX2 -__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { @@ -1659,7 +1716,7 @@ __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0, mov edx, [esp + 4 + 8] // dst_u mov edi, [esp + 4 + 12] // dst_v mov ecx, [esp + 4 + 16] // width - movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm5, xmmword ptr kBiasUV128 movdqa xmm6, xmmword ptr kARGBToV movdqa xmm7, xmmword ptr kARGBToU sub edi, edx // stride from u to v @@ -1707,7 +1764,7 @@ __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0, } } -__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1720,7 +1777,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm5, xmmword ptr kBiasUV128 movdqa xmm6, xmmword ptr kBGRAToV movdqa xmm7, xmmword ptr kBGRAToU sub edi, edx // stride from u to v @@ -1779,7 +1836,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0, } } -__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1792,7 +1849,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm5, xmmword ptr kBiasUV128 movdqa xmm6, xmmword ptr kABGRToV movdqa xmm7, xmmword ptr kABGRToU sub edi, edx // stride from u to v @@ -1851,7 +1908,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0, } } -__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1864,7 +1921,7 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm5, xmmword ptr kBiasUV128 movdqa xmm6, xmmword ptr kRGBAToV movdqa xmm7, xmmword ptr kRGBAToU sub edi, edx // stride from u to v @@ -1926,137 +1983,153 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0, // Read 16 UV from 444 #define READYUV444_AVX2 \ - __asm { \ - __asm vmovdqu xmm0, [esi] /* U */ \ - __asm vmovdqu xmm1, [esi + edi] /* V */ \ + __asm { \ + __asm vmovdqu xmm3, [esi] /* U */ \ + __asm vmovdqu xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 16] \ - __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpermq ymm3, ymm3, 0xd8 \ __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16]} +// Read 16 UV from 444. With 16 Alpha. 
+#define READYUVA444_AVX2 \ + __asm { \ + __asm vmovdqu xmm3, [esi] /* U */ \ + __asm vmovdqu xmm1, [esi + edi] /* V */ \ + __asm lea esi, [esi + 16] \ + __asm vpermq ymm3, ymm3, 0xd8 \ + __asm vpermq ymm1, ymm1, 0xd8 \ + __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ + __asm lea eax, [eax + 16] \ + __asm vmovdqu xmm5, [ebp] /* A */ \ + __asm vpermq ymm5, ymm5, 0xd8 \ + __asm lea ebp, [ebp + 16]} + // Read 8 UV from 422, upsample to 16 UV. #define READYUV422_AVX2 \ - __asm { \ - __asm vmovq xmm0, qword ptr [esi] /* U */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ + __asm { \ + __asm vmovq xmm3, qword ptr [esi] /* U */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ - __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ + __asm vpermq ymm3, ymm3, 0xd8 \ + __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16]} // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. #define READYUVA422_AVX2 \ - __asm { \ - __asm vmovq xmm0, qword ptr [esi] /* U */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ + __asm { \ + __asm vmovq xmm3, qword ptr [esi] /* U */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ - __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ + __asm vpermq ymm3, ymm3, 0xd8 \ + __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16] \ - __asm vmovdqu xmm5, [ebp] /* A */ \ + __asm vmovdqu xmm5, [ebp] /* A */ \ __asm vpermq ymm5, ymm5, 0xd8 \ __asm lea ebp, [ebp + 16]} // Read 8 UV from NV12, upsample to 16 UV. #define READNV12_AVX2 \ - __asm { \ - __asm vmovdqu xmm0, [esi] /* UV */ \ + __asm { \ + __asm vmovdqu xmm3, [esi] /* UV */ \ __asm lea esi, [esi + 16] \ - __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm3, ymm3, 0xd8 \ + __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16]} // Read 8 UV from NV21, upsample to 16 UV. #define READNV21_AVX2 \ - __asm { \ - __asm vmovdqu xmm0, [esi] /* UV */ \ + __asm { \ + __asm vmovdqu xmm3, [esi] /* UV */ \ __asm lea esi, [esi + 16] \ - __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm3, ymm3, 0xd8 \ + __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleNV21 \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16]} // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. 
#define READYUY2_AVX2 \
- __asm { \
- __asm vmovdqu ymm4, [eax] /* YUY2 */ \
+ __asm { \
+ __asm vmovdqu ymm4, [eax] /* YUY2 */ \
__asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
- __asm vmovdqu ymm0, [eax] /* UV */ \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
+ __asm vmovdqu ymm3, [eax] /* UV */ \
+ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleYUY2UV \
__asm lea eax, [eax + 32]}

// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
#define READUYVY_AVX2 \
- __asm { \
- __asm vmovdqu ymm4, [eax] /* UYVY */ \
+ __asm { \
+ __asm vmovdqu ymm4, [eax] /* UYVY */ \
__asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
- __asm vmovdqu ymm0, [eax] /* UV */ \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
+ __asm vmovdqu ymm3, [eax] /* UV */ \
+ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleUYVYUV \
__asm lea eax, [eax + 32]}

// Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_AVX2(YuvConstants) \
- __asm { \
- __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
- __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
- __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
- __asm vpsubw ymm2, ymm3, ymm2 \
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
- __asm vpsubw ymm1, ymm3, ymm1 \
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
- __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \
+ __asm { \
+ __asm vpsubb ymm3, ymm3, ymmword ptr kBiasUV128 \
__asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
- __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
- __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
- __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
+ __asm vmovdqa ymm0, ymmword ptr [YuvConstants + KUVTOB] \
+ __asm vmovdqa ymm1, ymmword ptr [YuvConstants + KUVTOG] \
+ __asm vmovdqa ymm2, ymmword ptr [YuvConstants + KUVTOR] \
+ __asm vpmaddubsw ymm0, ymm0, ymm3 /* B UV */ \
+ __asm vpmaddubsw ymm1, ymm1, ymm3 /* G UV */ \
+ __asm vpmaddubsw ymm2, ymm2, ymm3 /* R UV */ \
+ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KYBIASTORGB] \
+ __asm vpaddw ymm4, ymm3, ymm4 \
+ __asm vpaddsw ymm0, ymm0, ymm4 \
+ __asm vpsubsw ymm1, ymm4, ymm1 \
+ __asm vpaddsw ymm2, ymm2, ymm4 \
__asm vpsraw ymm0, ymm0, 6 \
__asm vpsraw ymm1, ymm1, 6 \
__asm vpsraw ymm2, ymm2, 6 \
- __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
- __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
- __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
- }
+ __asm vpackuswb ymm0, ymm0, ymm0 \
+ __asm vpackuswb ymm1, ymm1, ymm1 \
+ __asm vpackuswb ymm2, ymm2, ymm2}

// Store 16 ARGB values.
#define STOREARGB_AVX2 \
- __asm { \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
+ __asm { \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
__asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
+ __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
__asm vpermq ymm2, ymm2, 0xd8 \
- __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
- __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
+ __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
+ __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
__asm vmovdqu 0[edx], ymm1 \
__asm vmovdqu 32[edx], ymm0 \
__asm lea edx, [edx + 64]}

// Store 16 RGBA values.
#define STORERGBA_AVX2 \
- __asm { \
- __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
+ __asm { \
+ __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
__asm vpermq ymm1, ymm1, 0xd8 \
- __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
+ __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
__asm vpermq ymm2, ymm2, 0xd8 \
- __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
- __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
+ __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
+ __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
__asm vmovdqu [edx], ymm0 \
__asm vmovdqu [edx + 32], ymm1 \
__asm lea edx, [edx + 64]}
@@ -2183,6 +2256,48 @@ __declspec(naked) void I444ToARGBRow_AVX2(
}
#endif // HAS_I444TOARGBROW_AVX2

+#ifdef HAS_I444ALPHATOARGBROW_AVX2
+// 16 pixels
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked) void I444AlphaToARGBRow_AVX2(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    const uint8_t* a_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
+  __asm {
+    push esi
+    push edi
+    push ebx
+    push ebp
+    mov eax, [esp + 16 + 4] // Y
+    mov esi, [esp + 16 + 8] // U
+    mov edi, [esp + 16 + 12] // V
+    mov ebp, [esp + 16 + 16] // A
+    mov edx, [esp + 16 + 20] // argb
+    mov ebx, [esp + 16 + 24] // yuvconstants
+    mov ecx, [esp + 16 + 28] // width
+    sub edi, esi
+  convertloop:
+    READYUVA444_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
+
+    sub ecx, 16
+    jg convertloop
+
+    pop ebp
+    pop ebx
+    pop edi
+    pop esi
+    vzeroupper
+    ret
+  }
+}
+#endif // HAS_I444ALPHATOARGBROW_AVX2
+
#ifdef HAS_NV12TOARGBROW_AVX2
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
@@ -2361,191 +2476,202 @@ __declspec(naked) void I422ToRGBARow_AVX2(

// Read 8 UV from 444.
#define READYUV444 \
- __asm { \
- __asm movq xmm0, qword ptr [esi] /* U */ \
+ __asm { \
+ __asm movq xmm3, qword ptr [esi] /* U */ \
__asm movq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8]}

+// Read 8 UV from 444. With 8 Alpha.
+#define READYUVA444 \
+ __asm { \
+ __asm movq xmm3, qword ptr [esi] /* U */ \
+ __asm movq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm lea esi, [esi + 8] \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
+ __asm movq xmm4, qword ptr [eax] \
+ __asm punpcklbw xmm4, xmm4 \
+ __asm lea eax, [eax + 8] \
+ __asm movq xmm5, qword ptr [ebp] /* A */ \
+ __asm lea ebp, [ebp + 8]}
+
// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 \
- __asm { \
- __asm movd xmm0, [esi] /* U */ \
- __asm movd xmm1, [esi + edi] /* V */ \
+ __asm { \
+ __asm movd xmm3, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 4] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
+ __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8]}

// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \ - __asm { \ - __asm movd xmm0, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ + __asm { \ + __asm movd xmm3, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 4] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] /* Y */ \ + __asm punpcklbw xmm3, xmm1 /* UV */ \ + __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ + __asm movq xmm4, qword ptr [eax] /* Y */ \ __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8] \ - __asm movq xmm5, qword ptr [ebp] /* A */ \ + __asm movq xmm5, qword ptr [ebp] /* A */ \ __asm lea ebp, [ebp + 8]} // Read 4 UV from NV12, upsample to 8 UV. #define READNV12 \ - __asm { \ - __asm movq xmm0, qword ptr [esi] /* UV */ \ + __asm { \ + __asm movq xmm3, qword ptr [esi] /* UV */ \ __asm lea esi, [esi + 8] \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8]} // Read 4 VU from NV21, upsample to 8 UV. #define READNV21 \ - __asm { \ - __asm movq xmm0, qword ptr [esi] /* UV */ \ + __asm { \ + __asm movq xmm3, qword ptr [esi] /* UV */ \ __asm lea esi, [esi + 8] \ - __asm pshufb xmm0, xmmword ptr kShuffleNV21 \ + __asm pshufb xmm3, xmmword ptr kShuffleNV21 \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8]} // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV. #define READYUY2 \ - __asm { \ - __asm movdqu xmm4, [eax] /* YUY2 */ \ + __asm { \ + __asm movdqu xmm4, [eax] /* YUY2 */ \ __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \ - __asm movdqu xmm0, [eax] /* UV */ \ - __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \ + __asm movdqu xmm3, [eax] /* UV */ \ + __asm pshufb xmm3, xmmword ptr kShuffleYUY2UV \ __asm lea eax, [eax + 16]} // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV. #define READUYVY \ - __asm { \ - __asm movdqu xmm4, [eax] /* UYVY */ \ + __asm { \ + __asm movdqu xmm4, [eax] /* UYVY */ \ __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \ - __asm movdqu xmm0, [eax] /* UV */ \ - __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \ + __asm movdqu xmm3, [eax] /* UV */ \ + __asm pshufb xmm3, xmmword ptr kShuffleUYVYUV \ __asm lea eax, [eax + 16]} // Convert 8 pixels: 8 UV and 8 Y. 
#define YUVTORGB(YuvConstants) \ - __asm { \ - __asm movdqa xmm1, xmm0 \ - __asm movdqa xmm2, xmm0 \ - __asm movdqa xmm3, xmm0 \ - __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \ - __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \ - __asm psubw xmm0, xmm1 \ - __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \ - __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \ - __asm psubw xmm1, xmm2 \ - __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \ - __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \ - __asm psubw xmm2, xmm3 \ + __asm { \ + __asm psubb xmm3, xmmword ptr kBiasUV128 \ __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \ - __asm paddsw xmm0, xmm4 /* B += Y */ \ - __asm paddsw xmm1, xmm4 /* G += Y */ \ - __asm paddsw xmm2, xmm4 /* R += Y */ \ + __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVTOB] \ + __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVTOG] \ + __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVTOR] \ + __asm pmaddubsw xmm0, xmm3 \ + __asm pmaddubsw xmm1, xmm3 \ + __asm pmaddubsw xmm2, xmm3 \ + __asm movdqa xmm3, xmmword ptr [YuvConstants + KYBIASTORGB] \ + __asm paddw xmm4, xmm3 \ + __asm paddsw xmm0, xmm4 \ + __asm paddsw xmm2, xmm4 \ + __asm psubsw xmm4, xmm1 \ + __asm movdqa xmm1, xmm4 \ __asm psraw xmm0, 6 \ __asm psraw xmm1, 6 \ __asm psraw xmm2, 6 \ - __asm packuswb xmm0, xmm0 /* B */ \ - __asm packuswb xmm1, xmm1 /* G */ \ + __asm packuswb xmm0, xmm0 /* B */ \ + __asm packuswb xmm1, xmm1 /* G */ \ __asm packuswb xmm2, xmm2 /* R */ \ } // Store 8 ARGB values. #define STOREARGB \ - __asm { \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm5 /* RA */ \ + __asm { \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm5 /* RA */ \ __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ + __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ __asm movdqu 0[edx], xmm0 \ __asm movdqu 16[edx], xmm1 \ __asm lea edx, [edx + 32]} // Store 8 BGRA values. #define STOREBGRA \ - __asm { \ - __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ - __asm punpcklbw xmm1, xmm0 /* GB */ \ - __asm punpcklbw xmm5, xmm2 /* AR */ \ + __asm { \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm0 /* GB */ \ + __asm punpcklbw xmm5, xmm2 /* AR */ \ __asm movdqa xmm0, xmm5 \ - __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ - __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ + __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ __asm movdqu 0[edx], xmm5 \ __asm movdqu 16[edx], xmm0 \ __asm lea edx, [edx + 32]} // Store 8 RGBA values. #define STORERGBA \ - __asm { \ - __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ - __asm punpcklbw xmm1, xmm2 /* GR */ \ - __asm punpcklbw xmm5, xmm0 /* AB */ \ + __asm { \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm2 /* GR */ \ + __asm punpcklbw xmm5, xmm0 /* AB */ \ __asm movdqa xmm0, xmm5 \ - __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ - __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ + __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ __asm movdqu 0[edx], xmm5 \ __asm movdqu 16[edx], xmm0 \ __asm lea edx, [edx + 32]} // Store 8 RGB24 values. 
#define STORERGB24 \ - __asm {/* Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ + __asm {/* Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \ - __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ - __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ - __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ - __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ - __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \ + __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ + __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ + __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ + __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ + __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ __asm lea edx, [edx + 24]} // Store 8 RGB565 values. #define STORERGB565 \ - __asm {/* Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ + __asm {/* Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \ - __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ - __asm movdqa xmm2, xmm0 /* G */ \ - __asm pslld xmm0, 8 /* R */ \ - __asm psrld xmm3, 3 /* B */ \ - __asm psrld xmm2, 5 /* G */ \ - __asm psrad xmm0, 16 /* R */ \ - __asm pand xmm3, xmm5 /* B */ \ - __asm pand xmm2, xmm6 /* G */ \ - __asm pand xmm0, xmm7 /* R */ \ - __asm por xmm3, xmm2 /* BG */ \ - __asm por xmm0, xmm3 /* BGR */ \ - __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ - __asm movdqa xmm2, xmm1 /* G */ \ - __asm pslld xmm1, 8 /* R */ \ - __asm psrld xmm3, 3 /* B */ \ - __asm psrld xmm2, 5 /* G */ \ - __asm psrad xmm1, 16 /* R */ \ - __asm pand xmm3, xmm5 /* B */ \ - __asm pand xmm2, xmm6 /* G */ \ - __asm pand xmm1, xmm7 /* R */ \ - __asm por xmm3, xmm2 /* BG */ \ - __asm por xmm1, xmm3 /* BGR */ \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \ + __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ + __asm movdqa xmm2, xmm0 /* G */ \ + __asm pslld xmm0, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm0, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm0, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm0, xmm3 /* BGR */ \ + __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ + __asm movdqa xmm2, xmm1 /* G */ \ + __asm pslld xmm1, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm1, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm1, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm1, xmm3 /* BGR */ \ __asm packssdw xmm0, xmm1 \ - __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ + __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ __asm lea edx, [edx + 16]} // 8 pixels. 
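The STORERGB24 and STORERGB565 hunks above are re-indentation only. For reference, the pslld/psrld/pand/por sequence in STORERGB565 packs each pixel as follows (a scalar sketch, not part of the patch):

#include <stdint.h>
// Hypothetical scalar reference: keep the top 5/6/5 bits of R/G/B.
static uint16_t PackRGB565(uint8_t r, uint8_t g, uint8_t b) {
  return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}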
@@ -2586,6 +2712,46 @@ __declspec(naked) void I444ToARGBRow_SSSE3( } // 8 pixels. +// 8 UV values, mixed with 8 Y and 8A producing 8 ARGB (32 bytes). +__declspec(naked) void I444AlphaToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + push ebp + mov eax, [esp + 16 + 4] // Y + mov esi, [esp + 16 + 8] // U + mov edi, [esp + 16 + 12] // V + mov ebp, [esp + 16 + 16] // A + mov edx, [esp + 16 + 20] // argb + mov ebx, [esp + 16 + 24] // yuvconstants + mov ecx, [esp + 16 + 28] // width + sub edi, esi + + convertloop: + READYUVA444 + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebp + pop ebx + pop edi + pop esi + ret + } +} + +// 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). __declspec(naked) void I422ToRGB24Row_SSSE3( const uint8_t* y_buf, @@ -2898,10 +3064,12 @@ __declspec(naked) void I422ToRGBARow_SSSE3( } #endif // HAS_I422TOARGBROW_SSSE3 +// I400ToARGBRow_SSE2 is disabled due to new yuvconstant parameter #ifdef HAS_I400TOARGBROW_SSE2 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). __declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* rgb_buf, + const struct YuvConstants*, int width) { __asm { mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) @@ -2949,6 +3117,7 @@ __declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf, // note: vpunpcklbw mutates and vpackuswb unmutates. __declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* rgb_buf, + const struct YuvConstants*, int width) { __asm { mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) @@ -3045,15 +3214,15 @@ __declspec(naked) void MirrorRow_AVX2(const uint8_t* src, } #endif // HAS_MIRRORROW_AVX2 -#ifdef HAS_MIRRORUVROW_SSSE3 +#ifdef HAS_MIRRORSPLITUVROW_SSSE3 // Shuffle table for reversing the bytes of UV channels. static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; -__declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { +__declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi mov eax, [esp + 4 + 4] // src @@ -3078,7 +3247,7 @@ __declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src, ret } } -#endif // HAS_MIRRORUVROW_SSSE3 +#endif // HAS_MIRRORSPLITUVROW_SSSE3 #ifdef HAS_ARGBMIRRORROW_SSE2 __declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src, @@ -4172,13 +4341,13 @@ static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; // Blend 8 pixels at a time. 
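The blend hunk below only renames src_argb0 to src_argb. Per channel, with the foreground assumed premultiplied by its alpha, the operation it implements is sketched here (this mirrors the intent of the scalar fallback; the exact rounding is illustrative, not authoritative):

#include <stdint.h>
// Hypothetical scalar reference for "foreground over background".
static uint8_t BlendChannel(uint8_t f, uint8_t b, uint8_t a) {
  int v = f + (((256 - a) * b) >> 8);  // f premultiplied, b attenuated
  return (uint8_t)(v > 255 ? 255 : v);
}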
-__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 + mov eax, [esp + 4 + 4] // src_argb mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width @@ -4267,7 +4436,7 @@ __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { __asm { - mov eax, [esp + 4] // src_argb0 + mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width pcmpeqb xmm3, xmm3 // generate mask 0xff000000 @@ -4312,7 +4481,7 @@ __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { __asm { - mov eax, [esp + 4] // src_argb0 + mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax @@ -4406,7 +4575,7 @@ __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { __asm { - mov eax, [esp + 4] // src_argb0 + mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax @@ -4762,20 +4931,20 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb, #ifdef HAS_ARGBMULTIPLYROW_SSE2 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, +__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 + mov eax, [esp + 4 + 4] // src_argb mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width pxor xmm5, xmm5 // constant 0 convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm0, [eax] // read 4 pixels from src_argb movdqu xmm2, [esi] // read 4 pixels from src_argb1 movdqu xmm1, xmm0 movdqu xmm3, xmm2 @@ -4783,8 +4952,8 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, punpckhbw xmm1, xmm1 // next 2 punpcklbw xmm2, xmm5 // first 2 punpckhbw xmm3, xmm5 // next 2 - pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 - pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 + pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2 + pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2 lea eax, [eax + 16] lea esi, [esi + 16] packuswb xmm0, xmm1 @@ -4802,13 +4971,13 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. // TODO(fbarchard): Port this to posix, neon and other math functions. 
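The add rows lean entirely on paddusb, which adds unsigned bytes with saturation, so each channel clamps at 255 instead of wrapping. Scalar model of one channel (illustrative helper):

  #include <stdint.h>

  // What paddusb does for each byte lane.
  static inline uint8_t AddSat(uint8_t a, uint8_t b) {
    int sum = a + b;
    return (uint8_t)(sum > 255 ? 255 : sum);
  }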
-__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, +__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 + mov eax, [esp + 4 + 4] // src_argb mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width @@ -4817,11 +4986,11 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, jl convertloop49 convertloop4: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm0, [eax] // read 4 pixels from src_argb lea eax, [eax + 16] movdqu xmm1, [esi] // read 4 pixels from src_argb1 lea esi, [esi + 16] - paddusb xmm0, xmm1 // src_argb0 + src_argb1 + paddusb xmm0, xmm1 // src_argb + src_argb1 movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -4832,11 +5001,11 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, jl convertloop19 convertloop1: - movd xmm0, [eax] // read 1 pixels from src_argb0 + movd xmm0, [eax] // read 1 pixels from src_argb lea eax, [eax + 4] movd xmm1, [esi] // read 1 pixels from src_argb1 lea esi, [esi + 4] - paddusb xmm0, xmm1 // src_argb0 + src_argb1 + paddusb xmm0, xmm1 // src_argb + src_argb1 movd [edx], xmm0 lea edx, [edx + 4] sub ecx, 1 @@ -4851,23 +5020,23 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. -__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, +__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 + mov eax, [esp + 4 + 4] // src_argb mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm0, [eax] // read 4 pixels from src_argb lea eax, [eax + 16] movdqu xmm1, [esi] // read 4 pixels from src_argb1 lea esi, [esi + 16] - psubusb xmm0, xmm1 // src_argb0 - src_argb1 + psubusb xmm0, xmm1 // src_argb - src_argb1 movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -4881,20 +5050,20 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. 
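Both multiply rows use the same trick visible in the SSE2 loop above: punpcklbw of a register with itself widens each byte a to the 16-bit value a*257, the other operand is zero-extended against a zero register, and pmulhuw keeps only the high 16 bits of the product. Each channel therefore becomes (a*257*b) >> 16, which is within one of the exact a*b/255. Scalar model (illustrative helper):

  #include <stdint.h>

  static inline uint8_t MulApprox(uint8_t a, uint8_t b) {
    // 255 * 257 * 255 < 2^24, so the intermediate fits in an int.
    return (uint8_t)((a * 257 * b) >> 16);
  }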
-__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, +__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 + mov eax, [esp + 4 + 4] // src_argb mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width vpxor ymm5, ymm5, ymm5 // constant 0 convertloop: - vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm1, [eax] // read 8 pixels from src_argb lea eax, [eax + 32] vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 lea esi, [esi + 32] @@ -4902,8 +5071,8 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, vpunpckhbw ymm1, ymm1, ymm1 // high 4 vpunpcklbw ymm2, ymm3, ymm5 // low 4 vpunpckhbw ymm3, ymm3, ymm5 // high 4 - vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 - vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 + vpmulhuw ymm0, ymm0, ymm2 // src_argb * src_argb1 low 4 + vpmulhuw ymm1, ymm1, ymm3 // src_argb * src_argb1 high 4 vpackuswb ymm0, ymm0, ymm1 vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -4919,19 +5088,19 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0, +__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 + mov eax, [esp + 4 + 4] // src_argb mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm0, [eax] // read 8 pixels from src_argb lea eax, [eax + 32] vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 lea esi, [esi + 32] @@ -4949,21 +5118,21 @@ __declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0, #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, +__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 + mov eax, [esp + 4 + 4] // src_argb mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm0, [eax] // read 8 pixels from src_argb lea eax, [eax + 32] - vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 + vpsubusb ymm0, ymm0, [esi] // src_argb - src_argb1 lea esi, [esi + 32] vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -5450,7 +5619,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, // 1 pixel loop l1: - movd xmm2, dword ptr [eax] // 1 argb pixel, 4 bytes. + movd xmm2, dword ptr [eax] // 1 argb pixel lea eax, [eax + 4] punpcklbw xmm2, xmm1 punpcklwd xmm2, xmm1 diff --git a/files/source/scale.cc b/files/source/scale.cc index ab085496..e1335f1e 100644 --- a/files/source/scale.cc +++ b/files/source/scale.cc @@ -17,6 +17,7 @@ #include "libyuv/planar_functions.h" // For CopyPlane #include "libyuv/row.h" #include "libyuv/scale_row.h" +#include "libyuv/scale_uv.h" // For UVScale #ifdef __cplusplus namespace libyuv { @@ -28,6 +29,7 @@ static __inline int Abs(int v) { } #define SUBSAMPLE(v, a, s) (v < 0) ? 
(-((-v + a) >> s)) : ((v + a) >> s) +#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) // Scale plane, 1/2 // This is an optimized version for scaling down a plane to 1/2 of @@ -49,7 +51,7 @@ static void ScalePlaneDown2(int src_width, ? ScaleRowDown2_C : (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C); - int row_stride = src_stride << 1; + int row_stride = src_stride * 2; (void)src_width; (void)src_height; if (!filtering) { @@ -118,18 +120,18 @@ static void ScalePlaneDown2(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN2_MMI) - if (TestCpuFlag(kCpuHasMMI)) { +#if defined(HAS_SCALEROWDOWN2_LSX) + if (TestCpuFlag(kCpuHasLSX)) { ScaleRowDown2 = filtering == kFilterNone - ? ScaleRowDown2_Any_MMI - : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MMI - : ScaleRowDown2Box_Any_MMI); - if (IS_ALIGNED(dst_width, 8)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MMI + ? ScaleRowDown2_Any_LSX + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_LSX + : ScaleRowDown2Box_Any_LSX); + if (IS_ALIGNED(dst_width, 32)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_LSX : (filtering == kFilterLinear - ? ScaleRowDown2Linear_MMI - : ScaleRowDown2Box_MMI); + ? ScaleRowDown2Linear_LSX + : ScaleRowDown2Box_LSX); } } #endif @@ -161,7 +163,7 @@ static void ScalePlaneDown2_16(int src_width, ? ScaleRowDown2_16_C : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C : ScaleRowDown2Box_16_C); - int row_stride = src_stride << 1; + int row_stride = src_stride * 2; (void)src_width; (void)src_height; if (!filtering) { @@ -184,14 +186,6 @@ static void ScalePlaneDown2_16(int src_width, : ScaleRowDown2Box_16_SSE2); } #endif -#if defined(HAS_SCALEROWDOWN2_16_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_MMI - : (filtering == kFilterLinear - ? ScaleRowDown2Linear_16_MMI - : ScaleRowDown2Box_16_MMI); - } -#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -221,7 +215,7 @@ static void ScalePlaneDown4(int src_width, void (*ScaleRowDown4)(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) = filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C; - int row_stride = src_stride << 2; + int row_stride = src_stride * 4; (void)src_width; (void)src_height; if (!filtering) { @@ -264,12 +258,12 @@ static void ScalePlaneDown4(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN4_MMI) - if (TestCpuFlag(kCpuHasMMI)) { +#if defined(HAS_SCALEROWDOWN4_LSX) + if (TestCpuFlag(kCpuHasLSX)) { ScaleRowDown4 = - filtering ? ScaleRowDown4Box_Any_MMI : ScaleRowDown4_Any_MMI; - if (IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_MMI : ScaleRowDown4_MMI; + filtering ? ScaleRowDown4Box_Any_LSX : ScaleRowDown4_Any_LSX; + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_LSX : ScaleRowDown4_LSX; } } #endif @@ -297,7 +291,7 @@ static void ScalePlaneDown4_16(int src_width, void (*ScaleRowDown4)(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, int dst_width) = filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C; - int row_stride = src_stride << 2; + int row_stride = src_stride * 4; (void)src_width; (void)src_height; if (!filtering) { @@ -316,11 +310,6 @@ static void ScalePlaneDown4_16(int src_width, filtering ? 
ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2; } #endif -#if defined(HAS_SCALEROWDOWN4_16_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_MMI : ScaleRowDown4_16_MMI; - } -#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -398,6 +387,26 @@ static void ScalePlaneDown34(int src_width, } } #endif +#if defined(HAS_SCALEROWDOWN34_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_LSX; + ScaleRowDown34_1 = ScaleRowDown34_Any_LSX; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_LSX; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_LSX; + } + if (dst_width % 48 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_LSX; + ScaleRowDown34_1 = ScaleRowDown34_LSX; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_LSX; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_LSX; + } + } + } +#endif #if defined(HAS_SCALEROWDOWN34_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { if (!filtering) { @@ -613,6 +622,26 @@ static void ScalePlaneDown38(int src_width, } } #endif +#if defined(HAS_SCALEROWDOWN38_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_Any_LSX; + ScaleRowDown38_2 = ScaleRowDown38_Any_LSX; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_LSX; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_LSX; + } + if (dst_width % 12 == 0) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_LSX; + ScaleRowDown38_2 = ScaleRowDown38_LSX; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_LSX; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_LSX; + } + } + } +#endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); @@ -886,11 +915,11 @@ static void ScalePlaneBox(int src_width, } } #endif -#if defined(HAS_SCALEADDROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleAddRow = ScaleAddRow_Any_MMI; - if (IS_ALIGNED(src_width, 8)) { - ScaleAddRow = ScaleAddRow_MMI; +#if defined(HAS_SCALEADDROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ScaleAddRow = ScaleAddRow_Any_LSX; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_LSX; } } #endif @@ -898,7 +927,7 @@ static void ScalePlaneBox(int src_width, for (j = 0; j < dst_height; ++j) { int boxheight; int iy = y >> 16; - const uint8_t* src = src_ptr + iy * src_stride; + const uint8_t* src = src_ptr + iy * (int64_t)src_stride; y += dy; if (y > max_y) { y = max_y; @@ -949,15 +978,10 @@ static void ScalePlaneBox_16(int src_width, } #endif -#if defined(HAS_SCALEADDROW_16_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(src_width, 4)) { - ScaleAddRow = ScaleAddRow_16_MMI; - } -#endif for (j = 0; j < dst_height; ++j) { int boxheight; int iy = y >> 16; - const uint16_t* src = src_ptr + iy * src_stride; + const uint16_t* src = src_ptr + iy * (int64_t)src_stride; y += dy; if (y > max_y) { y = max_y; @@ -1038,11 +1062,11 @@ void ScalePlaneBilinearDown(int src_width, } } #endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_MMI; +#if defined(HAS_INTERPOLATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow = InterpolateRow_Any_LSX; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_LSX; } } #endif @@ -1068,13 +1092,21 @@ void ScalePlaneBilinearDown(int src_width, } } #endif +#if defined(HAS_SCALEFILTERCOLS_LSX) + if (TestCpuFlag(kCpuHasLSX) && src_width < 32768) { + ScaleFilterCols = 
ScaleFilterCols_Any_LSX; + if (IS_ALIGNED(dst_width, 16)) { + ScaleFilterCols = ScaleFilterCols_LSX; + } + } +#endif if (y > max_y) { y = max_y; } for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint8_t* src = src_ptr + yi * src_stride; + const uint8_t* src = src_ptr + yi * (int64_t)src_stride; if (filtering == kFilterLinear) { ScaleFilterCols(dst_ptr, src, dst_width, x, dx); } else { @@ -1123,7 +1155,7 @@ void ScalePlaneBilinearDown_16(int src_width, #if defined(HAS_INTERPOLATEROW_16_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_16_SSE2; + InterpolateRow = InterpolateRow_16_Any_SSE2; if (IS_ALIGNED(src_width, 16)) { InterpolateRow = InterpolateRow_16_SSE2; } @@ -1131,7 +1163,7 @@ void ScalePlaneBilinearDown_16(int src_width, #endif #if defined(HAS_INTERPOLATEROW_16_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_16_SSSE3; + InterpolateRow = InterpolateRow_16_Any_SSSE3; if (IS_ALIGNED(src_width, 16)) { InterpolateRow = InterpolateRow_16_SSSE3; } @@ -1139,7 +1171,7 @@ void ScalePlaneBilinearDown_16(int src_width, #endif #if defined(HAS_INTERPOLATEROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_16_AVX2; + InterpolateRow = InterpolateRow_16_Any_AVX2; if (IS_ALIGNED(src_width, 32)) { InterpolateRow = InterpolateRow_16_AVX2; } @@ -1147,7 +1179,7 @@ void ScalePlaneBilinearDown_16(int src_width, #endif #if defined(HAS_INTERPOLATEROW_16_NEON) if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_16_NEON; + InterpolateRow = InterpolateRow_16_Any_NEON; if (IS_ALIGNED(src_width, 16)) { InterpolateRow = InterpolateRow_16_NEON; } @@ -1165,7 +1197,7 @@ void ScalePlaneBilinearDown_16(int src_width, for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint16_t* src = src_ptr + yi * src_stride; + const uint16_t* src = src_ptr + yi * (int64_t)src_stride; if (filtering == kFilterLinear) { ScaleFilterCols(dst_ptr, src, dst_width, x, dx); } else { @@ -1258,6 +1290,14 @@ void ScalePlaneBilinearUp(int src_width, } } #endif +#if defined(HAS_SCALEFILTERCOLS_LSX) + if (filtering && TestCpuFlag(kCpuHasLSX) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_LSX; + if (IS_ALIGNED(dst_width, 16)) { + ScaleFilterCols = ScaleFilterCols_LSX; + } + } +#endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleFilterCols = ScaleColsUp2_C; #if defined(HAS_SCALECOLS_SSE2) @@ -1265,11 +1305,6 @@ void ScalePlaneBilinearUp(int src_width, ScaleFilterCols = ScaleColsUp2_SSE2; } #endif -#if defined(HAS_SCALECOLS_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { - ScaleFilterCols = ScaleColsUp2_MMI; - } -#endif } if (y > max_y) { @@ -1277,7 +1312,7 @@ void ScalePlaneBilinearUp(int src_width, } { int yi = y >> 16; - const uint8_t* src = src_ptr + yi * src_stride; + const uint8_t* src = src_ptr + yi * (int64_t)src_stride; // Allocate 2 row buffers. 
const int kRowSize = (dst_width + 31) & ~31; @@ -1292,7 +1327,9 @@ void ScalePlaneBilinearUp(int src_width, src += src_stride; } ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); - src += src_stride; + if (src_height > 2) { + src += src_stride; + } for (j = 0; j < dst_height; ++j) { yi = y >> 16; @@ -1300,14 +1337,16 @@ void ScalePlaneBilinearUp(int src_width, if (y > max_y) { y = max_y; yi = y >> 16; - src = src_ptr + yi * src_stride; + src = src_ptr + yi * (int64_t)src_stride; } if (yi != lasty) { ScaleFilterCols(rowptr, src, dst_width, x, dx); rowptr += rowstride; rowstride = -rowstride; lasty = yi; - src += src_stride; + if ((y + 65536) < max_y) { + src += src_stride; + } } } if (filtering == kFilterLinear) { @@ -1323,6 +1362,327 @@ void ScalePlaneBilinearUp(int src_width, } } +// Scale plane, horizontally up by 2 times. +// Uses linear filter horizontally, nearest vertically. +// This is an optimized version for scaling up a plane to 2 times of +// its original width, using linear interpolation. +// This is used to scale U and V planes of I422 to I444. +void ScalePlaneUp2_Linear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { + void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) = + ScaleRowUp2_Linear_Any_C; + int i; + int y; + int dy; + + // This function can only scale up by 2 times horizontally. + assert(src_width == ((dst_width + 1) / 2)); + +#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_NEON; + } +#endif + + if (dst_height == 1) { + ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr, + dst_width); + } else { + dy = FixedDiv(src_height - 1, dst_height - 1); + y = (1 << 15) - 1; + for (i = 0; i < dst_height; ++i) { + ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width); + dst_ptr += dst_stride; + y += dy; + } + } +} + +// Scale plane, up by 2 times. +// This is an optimized version for scaling up a plane to 2 times of +// its original size, using bilinear interpolation. +// This is used to scale U and V planes of I420 to I444. +void ScalePlaneUp2_Bilinear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { + void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleRowUp2_Bilinear_Any_C; + int x; + + // This function can only scale up by 2 times. 
+ assert(src_width == ((dst_width + 1) / 2)); + assert(src_height == ((dst_height + 1) / 2)); + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON; + } +#endif + + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + // TODO(fbarchard): Test performance of writing one row of destination at a + // time. + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } +} + +// Scale at most 14 bit plane, horizontally up by 2 times. +// This is an optimized version for scaling up a plane to 2 times of +// its original width, using linear interpolation. +// stride is in count of uint16_t. +// This is used to scale U and V planes of I210 to I410 and I212 to I412. +void ScalePlaneUp2_12_Linear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + int i; + int y; + int dy; + + // This function can only scale up by 2 times horizontally. + assert(src_width == ((dst_width + 1) / 2)); + +#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_12_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON; + } +#endif + + if (dst_height == 1) { + ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr, + dst_width); + } else { + dy = FixedDiv(src_height - 1, dst_height - 1); + y = (1 << 15) - 1; + for (i = 0; i < dst_height; ++i) { + ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width); + dst_ptr += dst_stride; + y += dy; + } + } +} + +// Scale at most 12 bit plane, up by 2 times. +// This is an optimized version for scaling up a plane to 2 times of +// its original size, using bilinear interpolation. +// stride is in count of uint16_t. +// This is used to scale U and V planes of I010 to I410 and I012 to I412. +void ScalePlaneUp2_12_Bilinear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleRowUp2_Bilinear_16_Any_C; + int x; + + // This function can only scale up by 2 times. 
+ assert(src_width == ((dst_width + 1) / 2)); + assert(src_height == ((dst_height + 1) / 2)); + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON; + } +#endif + + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } +} + +void ScalePlaneUp2_16_Linear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + int i; + int y; + int dy; + + // This function can only scale up by 2 times horizontally. + assert(src_width == ((dst_width + 1) / 2)); + +#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON; + } +#endif + + if (dst_height == 1) { + ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr, + dst_width); + } else { + dy = FixedDiv(src_height - 1, dst_height - 1); + y = (1 << 15) - 1; + for (i = 0; i < dst_height; ++i) { + ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width); + dst_ptr += dst_stride; + y += dy; + } + } +} + +void ScalePlaneUp2_16_Bilinear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleRowUp2_Bilinear_16_Any_C; + int x; + + // This function can only scale up by 2 times. 
+ assert(src_width == ((dst_width + 1) / 2)); + assert(src_height == ((dst_height + 1) / 2)); + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_NEON; + } +#endif + + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } +} + void ScalePlaneBilinearUp_16(int src_width, int src_height, int dst_width, @@ -1351,7 +1711,7 @@ void ScalePlaneBilinearUp_16(int src_width, #if defined(HAS_INTERPOLATEROW_16_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_16_SSE2; + InterpolateRow = InterpolateRow_16_Any_SSE2; if (IS_ALIGNED(dst_width, 16)) { InterpolateRow = InterpolateRow_16_SSE2; } @@ -1359,7 +1719,7 @@ void ScalePlaneBilinearUp_16(int src_width, #endif #if defined(HAS_INTERPOLATEROW_16_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_16_SSSE3; + InterpolateRow = InterpolateRow_16_Any_SSSE3; if (IS_ALIGNED(dst_width, 16)) { InterpolateRow = InterpolateRow_16_SSSE3; } @@ -1367,7 +1727,7 @@ void ScalePlaneBilinearUp_16(int src_width, #endif #if defined(HAS_INTERPOLATEROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_16_AVX2; + InterpolateRow = InterpolateRow_16_Any_AVX2; if (IS_ALIGNED(dst_width, 32)) { InterpolateRow = InterpolateRow_16_AVX2; } @@ -1375,7 +1735,7 @@ void ScalePlaneBilinearUp_16(int src_width, #endif #if defined(HAS_INTERPOLATEROW_16_NEON) if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_16_NEON; + InterpolateRow = InterpolateRow_16_Any_NEON; if (IS_ALIGNED(dst_width, 16)) { InterpolateRow = InterpolateRow_16_NEON; } @@ -1397,19 +1757,13 @@ void ScalePlaneBilinearUp_16(int src_width, ScaleFilterCols = ScaleColsUp2_16_SSE2; } #endif -#if defined(HAS_SCALECOLS_16_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { - ScaleFilterCols = ScaleColsUp2_16_MMI; - } -#endif } - if (y > max_y) { y = max_y; } { int yi = y >> 16; - const uint16_t* src = src_ptr + yi * src_stride; + const uint16_t* src = src_ptr + yi * (int64_t)src_stride; // Allocate 2 row buffers. 
const int kRowSize = (dst_width + 31) & ~31; @@ -1424,7 +1778,9 @@ void ScalePlaneBilinearUp_16(int src_width, src += src_stride; } ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); - src += src_stride; + if (src_height > 2) { + src += src_stride; + } for (j = 0; j < dst_height; ++j) { yi = y >> 16; @@ -1432,14 +1788,16 @@ void ScalePlaneBilinearUp_16(int src_width, if (y > max_y) { y = max_y; yi = y >> 16; - src = src_ptr + yi * src_stride; + src = src_ptr + yi * (int64_t)src_stride; } if (yi != lasty) { ScaleFilterCols(rowptr, src, dst_width, x, dx); rowptr += rowstride; rowstride = -rowstride; lasty = yi; - src += src_stride; + if ((y + 65536) < max_y) { + src += src_stride; + } } } if (filtering == kFilterLinear) { @@ -1487,15 +1845,11 @@ static void ScalePlaneSimple(int src_width, ScaleCols = ScaleColsUp2_SSE2; } #endif -#if defined(HAS_SCALECOLS_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { - ScaleCols = ScaleColsUp2_MMI; - } -#endif } for (i = 0; i < dst_height; ++i) { - ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx); + ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x, + dx); dst_ptr += dst_stride; y += dy; } @@ -1528,15 +1882,11 @@ static void ScalePlaneSimple_16(int src_width, ScaleCols = ScaleColsUp2_16_SSE2; } #endif -#if defined(HAS_SCALECOLS_16_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { - ScaleCols = ScaleColsUp2_16_MMI; - } -#endif } for (i = 0; i < dst_height; ++i) { - ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx); + ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x, + dx); dst_ptr += dst_stride; y += dy; } @@ -1544,7 +1894,6 @@ static void ScalePlaneSimple_16(int src_width, // Scale a plane. // This function dispatches to a specialized scaler based on scale factor. - LIBYUV_API void ScalePlane(const uint8_t* src, int src_stride, @@ -1562,10 +1911,9 @@ void ScalePlane(const uint8_t* src, // Negative height means invert the image. if (src_height < 0) { src_height = -src_height; - src = src + (src_height - 1) * src_stride; + src = src + (src_height - 1) * (int64_t)src_stride; src_stride = -src_stride; } - // Use specialized scales to improve performance for common resolutions. // For example, all the 1/2 scalings will use ScalePlaneDown2() if (dst_width == src_width && dst_height == src_height) { @@ -1574,10 +1922,19 @@ void ScalePlane(const uint8_t* src, return; } if (dst_width == src_width && filtering != kFilterBox) { - int dy = FixedDiv(src_height, dst_height); + int dy = 0; + int y = 0; + // When scaling down, use the center 2 rows to filter. + // When scaling up, last row of destination uses the last 2 source rows. + if (dst_height <= src_height) { + dy = FixedDiv(src_height, dst_height); + y = CENTERSTART(dy, -32768); // Subtract 0.5 (32768) to center filter. + } else if (src_height > 1 && dst_height > 1) { + dy = FixedDiv1(src_height, dst_height); + } // Arbitrary scale vertically, but unscaled horizontally. 
ScalePlaneVertical(src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, 0, 0, dy, 1, filtering); + dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering); return; } if (dst_width <= Abs(src_width) && dst_height <= src_height) { @@ -1614,6 +1971,17 @@ void ScalePlane(const uint8_t* src, dst_stride, src, dst); return; } + if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { + ScalePlaneUp2_Linear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && + (filtering == kFilterBilinear || filtering == kFilterBox)) { + ScalePlaneUp2_Bilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } if (filtering && dst_height > src_height) { ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); @@ -1645,10 +2013,9 @@ void ScalePlane_16(const uint16_t* src, // Negative height means invert the image. if (src_height < 0) { src_height = -src_height; - src = src + (src_height - 1) * src_stride; + src = src + (src_height - 1) * (int64_t)src_stride; src_stride = -src_stride; } - // Use specialized scales to improve performance for common resolutions. // For example, all the 1/2 scalings will use ScalePlaneDown2() if (dst_width == src_width && dst_height == src_height) { @@ -1657,10 +2024,22 @@ void ScalePlane_16(const uint16_t* src, return; } if (dst_width == src_width && filtering != kFilterBox) { - int dy = FixedDiv(src_height, dst_height); - // Arbitrary scale vertically, but unscaled vertically. + int dy = 0; + int y = 0; + // When scaling down, use the center 2 rows to filter. + // When scaling up, last row of destination uses the last 2 source rows. + if (dst_height <= src_height) { + dy = FixedDiv(src_height, dst_height); + y = CENTERSTART(dy, -32768); // Subtract 0.5 (32768) to center filter. + // When scaling up, ensure the last row of destination uses the last + // source. Avoid divide by zero for dst_height but will do no scaling + // later. + } else if (src_height > 1 && dst_height > 1) { + dy = FixedDiv1(src_height, dst_height); + } + // Arbitrary scale vertically, but unscaled horizontally. 
ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, 0, 0, dy, 1, filtering); + dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering); return; } if (dst_width <= Abs(src_width) && dst_height <= src_height) { @@ -1697,6 +2076,17 @@ void ScalePlane_16(const uint16_t* src, dst_stride, src, dst); return; } + if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { + ScalePlaneUp2_16_Linear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && + (filtering == kFilterBilinear || filtering == kFilterBox)) { + ScalePlaneUp2_16_Bilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } if (filtering && dst_height > src_height) { ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); @@ -1711,6 +2101,43 @@ void ScalePlane_16(const uint16_t* src, dst_stride, src, dst); } +LIBYUV_API +void ScalePlane_12(const uint16_t* src, + int src_stride, + int src_width, + int src_height, + uint16_t* dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering) { + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); + + // Negative height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * (int64_t)src_stride; + src_stride = -src_stride; + } + + if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { + ScalePlaneUp2_12_Linear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && + (filtering == kFilterBilinear || filtering == kFilterBox)) { + ScalePlaneUp2_12_Bilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + + ScalePlane_16(src, src_stride, src_width, src_height, dst, dst_stride, + dst_width, dst_height, filtering); +} + // Scale an I420 image. // This function in turn calls a scaling function for each plane. 
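The vertical-only paths patched into ScalePlane and ScalePlane_16 above pick source rows in 16.16 fixed point: dy = FixedDiv(src_height, dst_height) is the shifted divide (src << 16) / dst, and CENTERSTART(dy, -32768) starts half a step in, minus 0.5, so the two filter taps straddle each destination row's center when scaling down. A worked example, assuming FixedDiv is that plain shifted divide:

  #include <stdint.h>
  #include <stdio.h>

  #define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)

  int main(void) {
    int dy = (int)(((int64_t)720 << 16) / 480);  // 1.5 in 16.16 -> 98304
    int y = CENTERSTART(dy, -32768);             // 98304/2 - 32768 = 16384
    printf("y0 = %d (%.2f rows)\n", y, y / 65536.0);
    // 0.25: destination row 0 blends source rows 0 and 1 with weights 3:1.
    return 0;
  }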
@@ -1736,7 +2163,8 @@ int I420Scale(const uint8_t* src_y, int src_halfheight = SUBSAMPLE(src_height, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); - if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { return -1; @@ -1773,7 +2201,8 @@ int I420Scale_16(const uint16_t* src_y, int src_halfheight = SUBSAMPLE(src_height, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); - if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { return -1; @@ -1788,6 +2217,44 @@ int I420Scale_16(const uint16_t* src_y, return 0; } +LIBYUV_API +int I420Scale_12(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane_12(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, + dst_stride_u, dst_halfwidth, dst_halfheight, filtering); + ScalePlane_12(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, + dst_stride_v, dst_halfwidth, dst_halfheight, filtering); + return 0; +} + // Scale an I444 image. // This function in turn calls a scaling function for each plane. 
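The half-size chroma geometry in the I420 entry points comes from SUBSAMPLE(v, 1, 1), defined near the top of scale.cc, which rounds the halved dimension up so odd sizes keep their last row or column, and rounds to the same magnitude for negative (inverted) heights. For example:

  #include <stdio.h>

  #define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)

  int main(void) {
    printf("%d\n", SUBSAMPLE(1279, 1, 1));  // 640: odd width rounds up
    printf("%d\n", SUBSAMPLE(720, 1, 1));   // 360
    printf("%d\n", SUBSAMPLE(-719, 1, 1));  // -360: same magnitude as +719
    return 0;
  }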
@@ -1809,7 +2276,7 @@ int I444Scale(const uint8_t* src_y, int dst_width, int dst_height, enum FilterMode filtering) { - if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { return -1; @@ -1842,7 +2309,7 @@ int I444Scale_16(const uint16_t* src_y, int dst_width, int dst_height, enum FilterMode filtering) { - if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { return -1; @@ -1857,6 +2324,185 @@ int I444Scale_16(const uint16_t* src_y, return 0; } +LIBYUV_API +int I444Scale_12(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane_12(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u, + dst_width, dst_height, filtering); + ScalePlane_12(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v, + dst_width, dst_height, filtering); + return 0; +} + +// Scale an I422 image. +// This function in turn calls a scaling function for each plane. 
+ +LIBYUV_API +int I422Scale(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane(src_u, src_stride_u, src_halfwidth, src_height, dst_u, + dst_stride_u, dst_halfwidth, dst_height, filtering); + ScalePlane(src_v, src_stride_v, src_halfwidth, src_height, dst_v, + dst_stride_v, dst_halfwidth, dst_height, filtering); + return 0; +} + +LIBYUV_API +int I422Scale_16(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_height, dst_u, + dst_stride_u, dst_halfwidth, dst_height, filtering); + ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_height, dst_v, + dst_stride_v, dst_halfwidth, dst_height, filtering); + return 0; +} + +LIBYUV_API +int I422Scale_12(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane_12(src_u, src_stride_u, src_halfwidth, src_height, dst_u, + dst_stride_u, dst_halfwidth, dst_height, filtering); + ScalePlane_12(src_v, src_stride_v, src_halfwidth, src_height, dst_v, + dst_stride_v, dst_halfwidth, dst_height, filtering); + return 0; +} + +// Scale an NV12 image. +// This function in turn calls a scaling function for each plane. 
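Unlike the planar scalers above, NV12 carries a single interleaved half-resolution UV plane, so the chroma goes through UVScale (hence the new libyuv/scale_uv.h include) and each 2-byte UV pair moves as a unit. A minimal usage sketch with illustrative sizes; buffers are assumed allocated by the caller:

  #include <stdint.h>
  #include "libyuv/scale.h"

  // Halve a 1280x720 NV12 frame to 640x360.
  int HalveNV12(const uint8_t* src_y, const uint8_t* src_uv,
                uint8_t* dst_y, uint8_t* dst_uv) {
    return NV12Scale(src_y, 1280, src_uv, 1280,  // UV stride: 640 pairs = 1280 bytes
                     1280, 720, dst_y, 640, dst_uv, 640, 640, 360,
                     kFilterBilinear);
  }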
+ +LIBYUV_API +int NV12Scale(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + int src_width, + int src_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + + if (!src_y || !src_uv || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_uv || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + UVScale(src_uv, src_stride_uv, src_halfwidth, src_halfheight, dst_uv, + dst_stride_uv, dst_halfwidth, dst_halfheight, filtering); + return 0; +} + // Deprecated api LIBYUV_API int Scale(const uint8_t* src_y, diff --git a/files/source/scale_any.cc b/files/source/scale_any.cc index 17831372..317041f8 100644 --- a/files/source/scale_any.cc +++ b/files/source/scale_any.cc @@ -20,49 +20,6 @@ namespace libyuv { extern "C" { #endif -// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols -#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \ - void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \ - int dx) { \ - int r = dst_width & MASK; \ - int n = dst_width & ~MASK; \ - if (n > 0) { \ - TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ - } \ - TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \ - } - -#ifdef HAS_SCALEFILTERCOLS_NEON -CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) -#endif -#ifdef HAS_SCALEFILTERCOLS_MSA -CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15) -#endif -#ifdef HAS_SCALEARGBCOLS_NEON -CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7) -#endif -#ifdef HAS_SCALEARGBCOLS_MSA -CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3) -#endif -#ifdef HAS_SCALEARGBCOLS_MMI -CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0) -#endif -#ifdef HAS_SCALEARGBFILTERCOLS_NEON -CANY(ScaleARGBFilterCols_Any_NEON, - ScaleARGBFilterCols_NEON, - ScaleARGBFilterCols_C, - 4, - 3) -#endif -#ifdef HAS_SCALEARGBFILTERCOLS_MSA -CANY(ScaleARGBFilterCols_Any_MSA, - ScaleARGBFilterCols_MSA, - ScaleARGBFilterCols_C, - 4, - 7) -#endif -#undef CANY - // Fixed scale down. 
// Mask may be non-power of 2, so use MOD #define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ @@ -113,6 +70,22 @@ SDODD(ScaleRowDown2Box_Odd_SSSE3, 1, 15) #endif +#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 +SDANY(ScaleUVRowDown2Box_Any_SSSE3, + ScaleUVRowDown2Box_SSSE3, + ScaleUVRowDown2Box_C, + 2, + 2, + 3) +#endif +#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 +SDANY(ScaleUVRowDown2Box_Any_AVX2, + ScaleUVRowDown2Box_AVX2, + ScaleUVRowDown2Box_C, + 2, + 2, + 7) +#endif #ifdef HAS_SCALEROWDOWN2_AVX2 SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) SDANY(ScaleRowDown2Linear_Any_AVX2, @@ -155,6 +128,15 @@ SDODD(ScaleRowDown2Box_Odd_NEON, 1, 15) #endif +#ifdef HAS_SCALEUVROWDOWN2BOX_NEON +SDANY(ScaleUVRowDown2Box_Any_NEON, + ScaleUVRowDown2Box_NEON, + ScaleUVRowDown2Box_C, + 2, + 2, + 7) +#endif + #ifdef HAS_SCALEROWDOWN2_MSA SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31) SDANY(ScaleRowDown2Linear_Any_MSA, @@ -170,26 +152,20 @@ SDANY(ScaleRowDown2Box_Any_MSA, 1, 31) #endif -#ifdef HAS_SCALEROWDOWN2_MMI -SDANY(ScaleRowDown2_Any_MMI, ScaleRowDown2_MMI, ScaleRowDown2_C, 2, 1, 7) -SDANY(ScaleRowDown2Linear_Any_MMI, - ScaleRowDown2Linear_MMI, +#ifdef HAS_SCALEROWDOWN2_LSX +SDANY(ScaleRowDown2_Any_LSX, ScaleRowDown2_LSX, ScaleRowDown2_C, 2, 1, 31) +SDANY(ScaleRowDown2Linear_Any_LSX, + ScaleRowDown2Linear_LSX, ScaleRowDown2Linear_C, 2, 1, - 7) -SDANY(ScaleRowDown2Box_Any_MMI, - ScaleRowDown2Box_MMI, + 31) +SDANY(ScaleRowDown2Box_Any_LSX, + ScaleRowDown2Box_LSX, ScaleRowDown2Box_C, 2, 1, - 7) -SDODD(ScaleRowDown2Box_Odd_MMI, - ScaleRowDown2Box_MMI, - ScaleRowDown2Box_Odd_C, - 2, - 1, - 7) + 31) #endif #ifdef HAS_SCALEROWDOWN4_SSSE3 SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7) @@ -227,14 +203,14 @@ SDANY(ScaleRowDown4Box_Any_MSA, 1, 15) #endif -#ifdef HAS_SCALEROWDOWN4_MMI -SDANY(ScaleRowDown4_Any_MMI, ScaleRowDown4_MMI, ScaleRowDown4_C, 4, 1, 7) -SDANY(ScaleRowDown4Box_Any_MMI, - ScaleRowDown4Box_MMI, +#ifdef HAS_SCALEROWDOWN4_LSX +SDANY(ScaleRowDown4_Any_LSX, ScaleRowDown4_LSX, ScaleRowDown4_C, 4, 1, 15) +SDANY(ScaleRowDown4Box_Any_LSX, + ScaleRowDown4Box_LSX, ScaleRowDown4Box_C, 4, 1, - 7) + 15) #endif #ifdef HAS_SCALEROWDOWN34_SSSE3 SDANY(ScaleRowDown34_Any_SSSE3, @@ -296,6 +272,26 @@ SDANY(ScaleRowDown34_1_Box_Any_MSA, 1, 47) #endif +#ifdef HAS_SCALEROWDOWN34_LSX +SDANY(ScaleRowDown34_Any_LSX, + ScaleRowDown34_LSX, + ScaleRowDown34_C, + 4 / 3, + 1, + 47) +SDANY(ScaleRowDown34_0_Box_Any_LSX, + ScaleRowDown34_0_Box_LSX, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 47) +SDANY(ScaleRowDown34_1_Box_Any_LSX, + ScaleRowDown34_1_Box_LSX, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 47) +#endif #ifdef HAS_SCALEROWDOWN38_SSSE3 SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3, @@ -356,6 +352,26 @@ SDANY(ScaleRowDown38_2_Box_Any_MSA, 1, 11) #endif +#ifdef HAS_SCALEROWDOWN38_LSX +SDANY(ScaleRowDown38_Any_LSX, + ScaleRowDown38_LSX, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_LSX, + ScaleRowDown38_3_Box_LSX, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_2_Box_Any_LSX, + ScaleRowDown38_2_Box_LSX, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 11) +#endif #ifdef HAS_SCALEARGBROWDOWN2_SSE2 SDANY(ScaleARGBRowDown2_Any_SSE2, @@ -417,25 +433,25 @@ SDANY(ScaleARGBRowDown2Box_Any_MSA, 4, 3) #endif -#ifdef HAS_SCALEARGBROWDOWN2_MMI -SDANY(ScaleARGBRowDown2_Any_MMI, - ScaleARGBRowDown2_MMI, +#ifdef HAS_SCALEARGBROWDOWN2_LSX +SDANY(ScaleARGBRowDown2_Any_LSX, + ScaleARGBRowDown2_LSX, 
ScaleARGBRowDown2_C, 2, 4, - 1) -SDANY(ScaleARGBRowDown2Linear_Any_MMI, - ScaleARGBRowDown2Linear_MMI, + 3) +SDANY(ScaleARGBRowDown2Linear_Any_LSX, + ScaleARGBRowDown2Linear_LSX, ScaleARGBRowDown2Linear_C, 2, 4, - 1) -SDANY(ScaleARGBRowDown2Box_Any_MMI, - ScaleARGBRowDown2Box_MMI, + 3) +SDANY(ScaleARGBRowDown2Box_Any_LSX, + ScaleARGBRowDown2Box_LSX, ScaleARGBRowDown2Box_C, 2, 4, - 1) + 3) #endif #undef SDANY @@ -488,17 +504,24 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_MSA, 4, 3) #endif -#ifdef HAS_SCALEARGBROWDOWNEVEN_MMI -SDAANY(ScaleARGBRowDownEven_Any_MMI, - ScaleARGBRowDownEven_MMI, +#ifdef HAS_SCALEARGBROWDOWNEVEN_LSX +SDAANY(ScaleARGBRowDownEven_Any_LSX, + ScaleARGBRowDownEven_LSX, ScaleARGBRowDownEven_C, 4, - 1) -SDAANY(ScaleARGBRowDownEvenBox_Any_MMI, - ScaleARGBRowDownEvenBox_MMI, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_LSX, + ScaleARGBRowDownEvenBox_LSX, ScaleARGBRowDownEvenBox_C, 4, - 1) + 3) +#endif +#ifdef HAS_SCALEUVROWDOWNEVEN_NEON +SDAANY(ScaleUVRowDownEven_Any_NEON, + ScaleUVRowDownEven_NEON, + ScaleUVRowDownEven_C, + 2, + 3) #endif #ifdef SASIMDONLY @@ -533,8 +556,8 @@ SAROW(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, 1, 2, 15) #ifdef HAS_SCALEADDROW_MSA SAROW(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, 1, 2, 15) #endif -#ifdef HAS_SCALEADDROW_MMI -SAROW(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, 1, 2, 7) +#ifdef HAS_SCALEADDROW_LSX +SAROW(ScaleAddRow_Any_LSX, ScaleAddRow_LSX, 1, 2, 15) #endif #undef SAANY @@ -562,13 +585,477 @@ SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15) #ifdef HAS_SCALEADDROW_MSA SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15) #endif -#ifdef HAS_SCALEADDROW_MMI -SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7) +#ifdef HAS_SCALEADDROW_LSX +SAANY(ScaleAddRow_Any_LSX, ScaleAddRow_LSX, ScaleAddRow_C, 15) #endif #undef SAANY #endif // SASIMDONLY +// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols +#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \ + int dx) { \ + int r = dst_width & MASK; \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ + } \ + TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \ + } + +#ifdef HAS_SCALEFILTERCOLS_NEON +CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) +#endif +#ifdef HAS_SCALEFILTERCOLS_MSA +CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15) +#endif +#ifdef HAS_SCALEFILTERCOLS_LSX +CANY(ScaleFilterCols_Any_LSX, ScaleFilterCols_LSX, ScaleFilterCols_C, 1, 15) +#endif +#ifdef HAS_SCALEARGBCOLS_NEON +CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7) +#endif +#ifdef HAS_SCALEARGBCOLS_MSA +CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3) +#endif +#ifdef HAS_SCALEARGBCOLS_LSX +CANY(ScaleARGBCols_Any_LSX, ScaleARGBCols_LSX, ScaleARGBCols_C, 4, 3) +#endif +#ifdef HAS_SCALEARGBFILTERCOLS_NEON +CANY(ScaleARGBFilterCols_Any_NEON, + ScaleARGBFilterCols_NEON, + ScaleARGBFilterCols_C, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBFILTERCOLS_MSA +CANY(ScaleARGBFilterCols_Any_MSA, + ScaleARGBFilterCols_MSA, + ScaleARGBFilterCols_C, + 4, + 7) +#endif +#ifdef HAS_SCALEARGBFILTERCOLS_LSX +CANY(ScaleARGBFilterCols_Any_LSX, + ScaleARGBFilterCols_LSX, + ScaleARGBFilterCols_C, + 4, + 7) +#endif +#undef CANY + +// Scale up horizontally 2 times using linear filter. 
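For the 2x horizontal upsample, interior outputs interpolate each source pair with 3:1 and 1:3 weights; only dst[0] and dst[dst_width - 1] lack a second neighbor, which is why even the C kernels get wrapped below. A scalar sketch of the interior step the wrapper delegates, where d corresponds to dst_ptr + 1 and pairs to n / 2 (helper name illustrative):

  #include <stdint.h>

  static void Up2LinearInterior(const uint8_t* s, uint8_t* d, int pairs) {
    for (int i = 0; i < pairs; ++i) {
      d[2 * i + 0] = (uint8_t)((3 * s[i] + s[i + 1] + 2) >> 2);  // nearer s[i]
      d[2 * i + 1] = (uint8_t)((s[i] + 3 * s[i + 1] + 2) >> 2);  // nearer s[i+1]
    }
  }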
+#define SUH2LANY(NAME, SIMD, C, MASK, PTYPE) \ + void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \ + int work_width = (dst_width - 1) & ~1; \ + int r = work_width & MASK; \ + int n = work_width & ~MASK; \ + dst_ptr[0] = src_ptr[0]; \ + if (work_width > 0) { \ + if (n != 0) { \ + SIMD(src_ptr, dst_ptr + 1, n); \ + } \ + C(src_ptr + (n / 2), dst_ptr + n + 1, r); \ + } \ + dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2]; \ + } + +// Even the C versions need to be wrapped, because boundary pixels have to +// be handled differently + +SUH2LANY(ScaleRowUp2_Linear_Any_C, + ScaleRowUp2_Linear_C, + ScaleRowUp2_Linear_C, + 0, + uint8_t) + +SUH2LANY(ScaleRowUp2_Linear_16_Any_C, + ScaleRowUp2_Linear_16_C, + ScaleRowUp2_Linear_16_C, + 0, + uint16_t) + +#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 +SUH2LANY(ScaleRowUp2_Linear_Any_SSE2, + ScaleRowUp2_Linear_SSE2, + ScaleRowUp2_Linear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 +SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3, + ScaleRowUp2_Linear_SSSE3, + ScaleRowUp2_Linear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3 +SUH2LANY(ScaleRowUp2_Linear_12_Any_SSSE3, + ScaleRowUp2_Linear_12_SSSE3, + ScaleRowUp2_Linear_16_C, + 15, + uint16_t) +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2 +SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2, + ScaleRowUp2_Linear_16_SSE2, + ScaleRowUp2_Linear_16_C, + 7, + uint16_t) +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_AVX2 +SUH2LANY(ScaleRowUp2_Linear_Any_AVX2, + ScaleRowUp2_Linear_AVX2, + ScaleRowUp2_Linear_C, + 31, + uint8_t) +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2 +SUH2LANY(ScaleRowUp2_Linear_12_Any_AVX2, + ScaleRowUp2_Linear_12_AVX2, + ScaleRowUp2_Linear_16_C, + 31, + uint16_t) +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2 +SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2, + ScaleRowUp2_Linear_16_AVX2, + ScaleRowUp2_Linear_16_C, + 15, + uint16_t) +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_NEON +SUH2LANY(ScaleRowUp2_Linear_Any_NEON, + ScaleRowUp2_Linear_NEON, + ScaleRowUp2_Linear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_12_NEON +SUH2LANY(ScaleRowUp2_Linear_12_Any_NEON, + ScaleRowUp2_Linear_12_NEON, + ScaleRowUp2_Linear_16_C, + 15, + uint16_t) +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_NEON +SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON, + ScaleRowUp2_Linear_16_NEON, + ScaleRowUp2_Linear_16_C, + 15, + uint16_t) +#endif + +#undef SUH2LANY + +// Scale up 2 times using bilinear filter. +// This function produces 2 rows at a time. 
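The bilinear version is the separable product of that 3:1 filter in both directions, i.e. 9:3:3:1 over a 2x2 source window for interior pixels; the vertical half of the weights shows up in the wrapper's corner handling, (3 * sa[0] + sb[0] + 2) >> 2 and its mirror. A one-pass scalar sketch, assuming a single rounding step of +8 (the real kernels may round per direction):

  #include <stdint.h>

  // tl is the nearest source sample; br is the diagonal one.
  static inline uint8_t Up2Bilinear(uint8_t tl, uint8_t tr,
                                    uint8_t bl, uint8_t br) {
    return (uint8_t)((9 * tl + 3 * tr + 3 * bl + br + 8) >> 4);
  }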
+#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \ + void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \ + ptrdiff_t dst_stride, int dst_width) { \ + int work_width = (dst_width - 1) & ~1; \ + int r = work_width & MASK; \ + int n = work_width & ~MASK; \ + const PTYPE* sa = src_ptr; \ + const PTYPE* sb = src_ptr + src_stride; \ + PTYPE* da = dst_ptr; \ + PTYPE* db = dst_ptr + dst_stride; \ + da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \ + db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \ + if (work_width > 0) { \ + if (n != 0) { \ + SIMD(sa, sb - sa, da + 1, db - da, n); \ + } \ + C(sa + (n / 2), sb - sa, da + n + 1, db - da, r); \ + } \ + da[dst_width - 1] = \ + (3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2] + 2) >> 2; \ + db[dst_width - 1] = \ + (sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2] + 2) >> 2; \ + } + +SU2BLANY(ScaleRowUp2_Bilinear_Any_C, + ScaleRowUp2_Bilinear_C, + ScaleRowUp2_Bilinear_C, + 0, + uint8_t) + +SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C, + ScaleRowUp2_Bilinear_16_C, + ScaleRowUp2_Bilinear_16_C, + 0, + uint16_t) + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 +SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2, + ScaleRowUp2_Bilinear_SSE2, + ScaleRowUp2_Bilinear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3 +SU2BLANY(ScaleRowUp2_Bilinear_12_Any_SSSE3, + ScaleRowUp2_Bilinear_12_SSSE3, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2 +SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2, + ScaleRowUp2_Bilinear_16_SSE2, + ScaleRowUp2_Bilinear_16_C, + 7, + uint16_t) +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 +SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3, + ScaleRowUp2_Bilinear_SSSE3, + ScaleRowUp2_Bilinear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2 +SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2, + ScaleRowUp2_Bilinear_AVX2, + ScaleRowUp2_Bilinear_C, + 31, + uint8_t) +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2 +SU2BLANY(ScaleRowUp2_Bilinear_12_Any_AVX2, + ScaleRowUp2_Bilinear_12_AVX2, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2 +SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2, + ScaleRowUp2_Bilinear_16_AVX2, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_NEON +SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON, + ScaleRowUp2_Bilinear_NEON, + ScaleRowUp2_Bilinear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_NEON +SU2BLANY(ScaleRowUp2_Bilinear_12_Any_NEON, + ScaleRowUp2_Bilinear_12_NEON, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_NEON +SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON, + ScaleRowUp2_Bilinear_16_NEON, + ScaleRowUp2_Bilinear_16_C, + 7, + uint16_t) +#endif + +#undef SU2BLANY + +// Scale bi-planar plane up horizontally 2 times using linear filter. 
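The 9:3:3:1 weights (sum 16, +8 rounding) used by these bilinear kernels are the outer product of the 1D [3, 1]/4 linear kernel with itself, so a fused 2D evaluation and a two-pass separable one agree exactly as long as the vertical pass keeps full precision. A quick check:

#include <stdint.h>
#include <stdio.h>

// Fused 2D expression, as written in the kernels above.
static uint8_t fused(uint8_t s0, uint8_t s1, uint8_t t0, uint8_t t1) {
  return (uint8_t)((9 * s0 + 3 * s1 + 3 * t0 + t1 + 8) >> 4);
}

// Separable evaluation: [3, 1] vertically, then horizontally.
// Algebraically identical because
// 3 * (3*s0 + t0) + (3*s1 + t1) == 9*s0 + 3*s1 + 3*t0 + t1.
static uint8_t separable(uint8_t s0, uint8_t s1, uint8_t t0, uint8_t t1) {
  int v0 = 3 * s0 + t0;  // vertical pass, scaled by 4, unrounded
  int v1 = 3 * s1 + t1;
  return (uint8_t)((3 * v0 + v1 + 8) >> 4);  // horizontal pass + rounding
}

int main(void) {
  for (int v = 0; v < 256; v += 85) {
    printf("%d %d\n", fused(v, 255 - v, v / 2, 200),
           separable(v, 255 - v, v / 2, 200));
  }
  return 0;
}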
+#define SBUH2LANY(NAME, SIMD, C, MASK, PTYPE) \ + void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \ + int work_width = (dst_width - 1) & ~1; \ + int r = work_width & MASK; \ + int n = work_width & ~MASK; \ + dst_ptr[0] = src_ptr[0]; \ + dst_ptr[1] = src_ptr[1]; \ + if (work_width > 0) { \ + if (n != 0) { \ + SIMD(src_ptr, dst_ptr + 2, n); \ + } \ + C(src_ptr + n, dst_ptr + 2 * n + 2, r); \ + } \ + dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2]; \ + dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; \ + } + +SBUH2LANY(ScaleUVRowUp2_Linear_Any_C, + ScaleUVRowUp2_Linear_C, + ScaleUVRowUp2_Linear_C, + 0, + uint8_t) + +SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_C, + ScaleUVRowUp2_Linear_16_C, + ScaleUVRowUp2_Linear_16_C, + 0, + uint16_t) + +#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 +SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3, + ScaleUVRowUp2_Linear_SSSE3, + ScaleUVRowUp2_Linear_C, + 7, + uint8_t) +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2 +SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2, + ScaleUVRowUp2_Linear_AVX2, + ScaleUVRowUp2_Linear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 +SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE41, + ScaleUVRowUp2_Linear_16_SSE41, + ScaleUVRowUp2_Linear_16_C, + 3, + uint16_t) +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 +SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_AVX2, + ScaleUVRowUp2_Linear_16_AVX2, + ScaleUVRowUp2_Linear_16_C, + 7, + uint16_t) +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_NEON +SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON, + ScaleUVRowUp2_Linear_NEON, + ScaleUVRowUp2_Linear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON +SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_NEON, + ScaleUVRowUp2_Linear_16_NEON, + ScaleUVRowUp2_Linear_16_C, + 15, + uint16_t) +#endif + +#undef SBUH2LANY + +// Scale bi-planar plane up 2 times using bilinear filter. +// This function produces 2 rows at a time. 
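SBUH2LANY mirrors SUH2LANY for interleaved UV data: dst_width counts UV pixels of two elements each, so the edge copies touch two elements apiece and the C tail resumes at src_ptr + n rather than src_ptr + n / 2. The interior kernel blends the U and V channels independently; a standalone model of that indexing, boundary handling omitted:

#include <stdint.h>
#include <stdio.h>

// Interior of the interleaved UV 2x linear upsample: destination
// pixel pair 2x, 2x+1 comes from source pixels x and x+1, per
// channel. Element strides double because U and V are interleaved.
static void uv_up2_linear(const uint8_t* src, uint8_t* dst, int dst_pixels) {
  for (int x = 0; x < dst_pixels / 2; ++x) {
    for (int c = 0; c < 2; ++c) {  // c = 0 is U, c = 1 is V
      uint8_t a = src[2 * x + c], b = src[2 * x + 2 + c];
      dst[4 * x + c] = (uint8_t)((3 * a + b + 2) >> 2);
      dst[4 * x + 2 + c] = (uint8_t)((a + 3 * b + 2) >> 2);
    }
  }
}

int main(void) {
  const uint8_t src[6] = {0, 255, 100, 50, 200, 10};  // 3 UV pairs
  uint8_t dst[8];                                     // 4 UV pairs out
  uv_up2_linear(src, dst, 4);
  for (int i = 0; i < 8; ++i) printf("%d ", dst[i]);
  printf("\n");
  return 0;
}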
+#define SBU2BLANY(NAME, SIMD, C, MASK, PTYPE) \ + void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \ + ptrdiff_t dst_stride, int dst_width) { \ + int work_width = (dst_width - 1) & ~1; \ + int r = work_width & MASK; \ + int n = work_width & ~MASK; \ + const PTYPE* sa = src_ptr; \ + const PTYPE* sb = src_ptr + src_stride; \ + PTYPE* da = dst_ptr; \ + PTYPE* db = dst_ptr + dst_stride; \ + da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \ + db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \ + da[1] = (3 * sa[1] + sb[1] + 2) >> 2; \ + db[1] = (sa[1] + 3 * sb[1] + 2) >> 2; \ + if (work_width > 0) { \ + if (n != 0) { \ + SIMD(sa, sb - sa, da + 2, db - da, n); \ + } \ + C(sa + n, sb - sa, da + 2 * n + 2, db - da, r); \ + } \ + da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] + \ + sb[((dst_width + 1) & ~1) - 2] + 2) >> \ + 2; \ + db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] + \ + 3 * sb[((dst_width + 1) & ~1) - 2] + 2) >> \ + 2; \ + da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] + \ + sb[((dst_width + 1) & ~1) - 1] + 2) >> \ + 2; \ + db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] + \ + 3 * sb[((dst_width + 1) & ~1) - 1] + 2) >> \ + 2; \ + } + +SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C, + ScaleUVRowUp2_Bilinear_C, + ScaleUVRowUp2_Bilinear_C, + 0, + uint8_t) + +SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_C, + ScaleUVRowUp2_Bilinear_16_C, + ScaleUVRowUp2_Bilinear_16_C, + 0, + uint16_t) + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3 +SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3, + ScaleUVRowUp2_Bilinear_SSSE3, + ScaleUVRowUp2_Bilinear_C, + 7, + uint8_t) +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2 +SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2, + ScaleUVRowUp2_Bilinear_AVX2, + ScaleUVRowUp2_Bilinear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 +SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE41, + ScaleUVRowUp2_Bilinear_16_SSE41, + ScaleUVRowUp2_Bilinear_16_C, + 7, + uint16_t) +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 +SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_AVX2, + ScaleUVRowUp2_Bilinear_16_AVX2, + ScaleUVRowUp2_Bilinear_16_C, + 7, + uint16_t) +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_NEON +SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON, + ScaleUVRowUp2_Bilinear_NEON, + ScaleUVRowUp2_Bilinear_C, + 7, + uint8_t) +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON +SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_NEON, + ScaleUVRowUp2_Bilinear_16_NEON, + ScaleUVRowUp2_Bilinear_16_C, + 7, + uint16_t) +#endif + +#undef SBU2BLANY + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/scale_argb.cc b/files/source/scale_argb.cc index beef380a..9c3acf7f 100644 --- a/files/source/scale_argb.cc +++ b/files/source/scale_argb.cc @@ -58,9 +58,9 @@ static void ScaleARGBDown2(int src_width, assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. // Advance to odd row, even column. if (filtering == kFilterBilinear) { - src_argb += (y >> 16) * src_stride + (x >> 16) * 4; + src_argb += (y >> 16) * (int64_t)src_stride + (x >> 16) * 4; } else { - src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4; + src_argb += (y >> 16) * (int64_t)src_stride + ((x >> 16) - 1) * 4; } #if defined(HAS_SCALEARGBROWDOWN2_SSE2) @@ -111,19 +111,19 @@ static void ScaleARGBDown2(int src_width, } } #endif -#if defined(HAS_SCALEARGBROWDOWN2_MMI) - if (TestCpuFlag(kCpuHasMMI)) { +#if defined(HAS_SCALEARGBROWDOWN2_LSX) + if (TestCpuFlag(kCpuHasLSX)) { ScaleARGBRowDown2 = filtering == kFilterNone - ? 
ScaleARGBRowDown2_Any_MMI - : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MMI - : ScaleARGBRowDown2Box_Any_MMI); - if (IS_ALIGNED(dst_width, 2)) { + ? ScaleARGBRowDown2_Any_LSX + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_LSX + : ScaleARGBRowDown2Box_Any_LSX); + if (IS_ALIGNED(dst_width, 4)) { ScaleARGBRowDown2 = filtering == kFilterNone - ? ScaleARGBRowDown2_MMI - : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MMI - : ScaleARGBRowDown2Box_MMI); + ? ScaleARGBRowDown2_LSX + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_LSX + : ScaleARGBRowDown2Box_LSX); } } #endif @@ -162,7 +162,7 @@ static void ScaleARGBDown4Box(int src_width, uint8_t* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C; // Advance to odd row, even column. - src_argb += (y >> 16) * src_stride + (x >> 16) * 4; + src_argb += (y >> 16) * (int64_t)src_stride + (x >> 16) * 4; (void)src_width; (void)src_height; (void)dx; @@ -214,7 +214,7 @@ static void ScaleARGBDownEven(int src_width, enum FilterMode filtering) { int j; int col_step = dx >> 16; - int row_stride = (dy >> 16) * src_stride; + int row_stride = (dy >> 16) * (int64_t)src_stride; void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride, int src_step, uint8_t* dst_argb, int dst_width) = filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C; @@ -222,7 +222,7 @@ static void ScaleARGBDownEven(int src_width, (void)src_height; assert(IS_ALIGNED(src_width, 2)); assert(IS_ALIGNED(src_height, 2)); - src_argb += (y >> 16) * src_stride + (x >> 16) * 4; + src_argb += (y >> 16) * (int64_t)src_stride + (x >> 16) * 4; #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 @@ -253,13 +253,13 @@ static void ScaleARGBDownEven(int src_width, } } #endif -#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MMI - : ScaleARGBRowDownEven_Any_MMI; - if (IS_ALIGNED(dst_width, 2)) { +#if defined(HAS_SCALEARGBROWDOWNEVEN_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_LSX + : ScaleARGBRowDownEven_Any_LSX; + if (IS_ALIGNED(dst_width, 4)) { ScaleARGBRowDownEven = - filtering ? ScaleARGBRowDownEvenBox_MMI : ScaleARGBRowDownEven_MMI; + filtering ? ScaleARGBRowDownEvenBox_LSX : ScaleARGBRowDownEven_LSX; } } #endif @@ -340,6 +340,14 @@ static void ScaleARGBBilinearDown(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow = InterpolateRow_Any_LSX; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_LSX; + } + } +#endif #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; @@ -361,6 +369,14 @@ static void ScaleARGBBilinearDown(int src_width, } } #endif +#if defined(HAS_SCALEARGBFILTERCOLS_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_LSX; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_LSX; + } + } +#endif // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. // Allocate a row of ARGB. 
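// Dispatch note on the pattern above: the _Any_ kernels are safe for
// any width because they run the SIMD body on the aligned part and the
// C version on the tail; the unsuffixed kernels require dst_width
// aligned to the SIMD granularity, which for the LSX versions replacing
// MMI here is 4 pixels instead of 2.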
{ @@ -372,7 +388,7 @@ static void ScaleARGBBilinearDown(int src_width, } for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint8_t* src = src_argb + yi * src_stride; + const uint8_t* src = src_argb + yi * (int64_t)src_stride; if (filtering == kFilterLinear) { ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx); } else { @@ -444,11 +460,11 @@ static void ScaleARGBBilinearUp(int src_width, } } #endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(dst_width, 2)) { - InterpolateRow = InterpolateRow_MMI; +#if defined(HAS_INTERPOLATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow = InterpolateRow_Any_LSX; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_LSX; } } #endif @@ -477,6 +493,14 @@ static void ScaleARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_SCALEARGBFILTERCOLS_LSX) + if (filtering && TestCpuFlag(kCpuHasLSX)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_LSX; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_LSX; + } + } +#endif #if defined(HAS_SCALEARGBCOLS_SSE2) if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBCols_SSE2; @@ -498,11 +522,11 @@ static void ScaleARGBBilinearUp(int src_width, } } #endif -#if defined(HAS_SCALEARGBCOLS_MMI) - if (!filtering && TestCpuFlag(kCpuHasMMI)) { - ScaleARGBFilterCols = ScaleARGBCols_Any_MMI; - if (IS_ALIGNED(dst_width, 1)) { - ScaleARGBFilterCols = ScaleARGBCols_MMI; +#if defined(HAS_SCALEARGBCOLS_LSX) + if (!filtering && TestCpuFlag(kCpuHasLSX)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_LSX; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBCols_LSX; } } #endif @@ -513,11 +537,6 @@ static void ScaleARGBBilinearUp(int src_width, ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; } #endif -#if defined(HAS_SCALEARGBCOLSUP2_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { - ScaleARGBFilterCols = ScaleARGBColsUp2_MMI; - } -#endif } if (y > max_y) { @@ -526,7 +545,7 @@ static void ScaleARGBBilinearUp(int src_width, { int yi = y >> 16; - const uint8_t* src = src_argb + yi * src_stride; + const uint8_t* src = src_argb + yi * (int64_t)src_stride; // Allocate 2 rows of ARGB. 
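// The (int64_t)src_stride casts added throughout this file widen the
// row-offset multiply. With 32-bit int operands, yi * src_stride is
// computed in 32 bits and overflows once the product exceeds INT32_MAX
// (about 2.1e9), e.g. row 40000 of an image with an 80000-byte stride,
// even though the widened 64-bit byte offset is perfectly
// representable.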
const int kRowSize = (dst_width * 4 + 31) & ~31; @@ -541,7 +560,9 @@ static void ScaleARGBBilinearUp(int src_width, src += src_stride; } ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx); - src += src_stride; + if (src_height > 2) { + src += src_stride; + } for (j = 0; j < dst_height; ++j) { yi = y >> 16; @@ -549,14 +570,16 @@ static void ScaleARGBBilinearUp(int src_width, if (y > max_y) { y = max_y; yi = y >> 16; - src = src_argb + yi * src_stride; + src = src_argb + yi * (int64_t)src_stride; } if (yi != lasty) { ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); rowptr += rowstride; rowstride = -rowstride; lasty = yi; - src += src_stride; + if ((y + 65536) < max_y) { + src += src_stride; + } } } if (filtering == kFilterLinear) { @@ -611,6 +634,15 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_I422TOARGBROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) == + (kCpuHasAVX512BW | kCpuHasAVX512VL)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX512BW; + if (IS_ALIGNED(src_width, 32)) { + I422ToARGBRow = I422ToARGBRow_AVX512BW; + } + } +#endif #if defined(HAS_I422TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToARGBRow = I422ToARGBRow_Any_NEON; @@ -627,6 +659,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_I422TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToARGBRow = I422ToARGBRow_Any_LASX; + if (IS_ALIGNED(src_width, 32)) { + I422ToARGBRow = I422ToARGBRow_LASX; + } + } +#endif void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, @@ -663,6 +703,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow = InterpolateRow_Any_LSX; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_LSX; + } + } +#endif void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) = @@ -692,6 +740,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_SCALEARGBFILTERCOLS_LSX) + if (filtering && TestCpuFlag(kCpuHasLSX)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_LSX; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_LSX; + } + } +#endif #if defined(HAS_SCALEARGBCOLS_SSE2) if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBCols_SSE2; @@ -713,11 +769,11 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif -#if defined(HAS_SCALEARGBCOLS_MMI) - if (!filtering && TestCpuFlag(kCpuHasMMI)) { - ScaleARGBFilterCols = ScaleARGBCols_Any_MMI; - if (IS_ALIGNED(dst_width, 1)) { - ScaleARGBFilterCols = ScaleARGBCols_MMI; +#if defined(HAS_SCALEARGBCOLS_LSX) + if (!filtering && TestCpuFlag(kCpuHasLSX)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_LSX; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBCols_LSX; } } #endif @@ -728,11 +784,6 @@ static void ScaleYUVToARGBBilinearUp(int src_width, ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; } #endif -#if defined(HAS_SCALEARGBCOLSUP2_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { - ScaleARGBFilterCols = ScaleARGBColsUp2_MMI; - } -#endif } const int max_y = (src_height - 1) << 16; @@ -742,9 +793,9 @@ static void ScaleYUVToARGBBilinearUp(int src_width, const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate. 
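// Row selection below is 16.16 fixed point: yi = y >> 16 is the Y-plane
// source row, and uv_yi = yi >> kYShift maps it to the half-height UV
// planes of 4:2:0 input. The row loops keep two converted rows and
// ping-pong between them via rowptr += rowstride; rowstride =
// -rowstride; the new src_height > 2 and (y + 65536) < max_y guards
// stop src from stepping past the final source row once the clamped y
// coordinate can no longer need a fresh one.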
int yi = y >> 16; int uv_yi = yi >> kYShift; - const uint8_t* src_row_y = src_y + yi * src_stride_y; - const uint8_t* src_row_u = src_u + uv_yi * src_stride_u; - const uint8_t* src_row_v = src_v + uv_yi * src_stride_v; + const uint8_t* src_row_y = src_y + yi * (int64_t)src_stride_y; + const uint8_t* src_row_u = src_u + uv_yi * (int64_t)src_stride_u; + const uint8_t* src_row_v = src_v + uv_yi * (int64_t)src_stride_v; // Allocate 2 rows of ARGB. const int kRowSize = (dst_width * 4 + 31) & ~31; @@ -782,9 +833,9 @@ static void ScaleYUVToARGBBilinearUp(int src_width, y = max_y; yi = y >> 16; uv_yi = yi >> kYShift; - src_row_y = src_y + yi * src_stride_y; - src_row_u = src_u + uv_yi * src_stride_u; - src_row_v = src_v + uv_yi * src_stride_v; + src_row_y = src_y + yi * (int64_t)src_stride_y; + src_row_u = src_u + uv_yi * (int64_t)src_stride_u; + src_row_v = src_v + uv_yi * (int64_t)src_stride_v; } if (yi != lasty) { // TODO(fbarchard): Convert the clipped region of row. @@ -857,11 +908,11 @@ static void ScaleARGBSimple(int src_width, } } #endif -#if defined(HAS_SCALEARGBCOLS_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleARGBCols = ScaleARGBCols_Any_MMI; - if (IS_ALIGNED(dst_width, 1)) { - ScaleARGBCols = ScaleARGBCols_MMI; +#if defined(HAS_SCALEARGBCOLS_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ScaleARGBCols = ScaleARGBCols_Any_LSX; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBCols = ScaleARGBCols_LSX; } } #endif @@ -872,16 +923,11 @@ static void ScaleARGBSimple(int src_width, ScaleARGBCols = ScaleARGBColsUp2_SSE2; } #endif -#if defined(HAS_SCALEARGBCOLSUP2_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { - ScaleARGBCols = ScaleARGBColsUp2_MMI; - } -#endif } for (j = 0; j < dst_height; ++j) { - ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x, - dx); + ScaleARGBCols(dst_argb, src_argb + (y >> 16) * (int64_t)src_stride, + dst_width, x, dx); dst_argb += dst_stride; y += dy; } @@ -916,7 +962,7 @@ static void ScaleARGB(const uint8_t* src, // Negative src_height means invert the image. if (src_height < 0) { src_height = -src_height; - src = src + (src_height - 1) * src_stride; + src = src + (src_height - 1) * (int64_t)src_stride; src_stride = -src_stride; } ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -931,7 +977,7 @@ static void ScaleARGB(const uint8_t* src, if (clip_y) { int64_t clipf = (int64_t)(clip_y)*dy; y += (clipf & 0xffff); - src += (clipf >> 16) * src_stride; + src += (clipf >> 16) * (int64_t)src_stride; dst += clip_y * dst_stride; } @@ -965,17 +1011,17 @@ static void ScaleARGB(const uint8_t* src, filtering = kFilterNone; if (dx == 0x10000 && dy == 0x10000) { // Straight copy. - ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride, - dst, dst_stride, clip_width, clip_height); + ARGBCopy(src + (y >> 16) * (int64_t)src_stride + (x >> 16) * 4, + src_stride, dst, dst_stride, clip_width, clip_height); return; } } } } if (dx == 0x10000 && (x & 0xffff) == 0) { - // Arbitrary scale vertically, but unscaled vertically. + // Arbitrary scale vertically, but unscaled horizontally. 
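// dx == 0x10000 is exactly 1.0 in 16.16 fixed point, and a zero x
// fraction means the columns already line up 1:1, so only the vertical
// pass is needed here.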
ScalePlaneVertical(src_height, clip_width, clip_height, src_stride, - dst_stride, src, dst, x, y, dy, 4, filtering); + dst_stride, src, dst, x, y, dy, /*bpp=*/4, filtering); return; } if (filtering && dy < 65536) { diff --git a/files/source/scale_common.cc b/files/source/scale_common.cc index 63690271..b02bdafd 100644 --- a/files/source/scale_common.cc +++ b/files/source/scale_common.cc @@ -400,6 +400,95 @@ void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr, } } +// Sample position: (O is src sample position, X is dst sample position) +// +// v dst_ptr at here v stop at here +// X O X X O X X O X X O X X O X +// ^ src_ptr at here +void ScaleRowUp2_Linear_C(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2; + dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2; + } +} + +// Sample position: (O is src sample position, X is dst sample position) +// +// src_ptr at here +// X v X X X X X X X X X +// O O O O O +// X X X X X X X X X X +// ^ dst_ptr at here ^ stop at here +// X X X X X X X X X X +// O O O O O +// X X X X X X X X X X +void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + uint8_t* d = dst_ptr; + uint8_t* e = dst_ptr + dst_stride; + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + d[2 * x + 0] = + (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4; + d[2 * x + 1] = + (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4; + e[2 * x + 0] = + (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4; + e[2 * x + 1] = + (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4; + } +} + +// Only suitable for at most 14 bit range. +void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2; + dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2; + } +} + +// Only suitable for at most 12bit range. +void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + uint16_t* d = dst_ptr; + uint16_t* e = dst_ptr + dst_stride; + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + d[2 * x + 0] = + (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4; + d[2 * x + 1] = + (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4; + e[2 * x + 0] = + (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4; + e[2 * x + 1] = + (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4; + } +} + // Scales a single row of pixels using point sampling. 
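The range notes on the 16-bit kernels above, at most 14 bits for linear and 12 bits for bilinear, are consistent with 16-bit accumulators in the SIMD implementations; that rationale is my inference, since the comments do not say why. The linear filter sums weights of 4 plus rounding and the bilinear filter weights of 16 plus rounding, so the worst-case sum must stay below 65536:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  // Worst-case accumulator values, assuming 16-bit SIMD lanes (my
  // reading of the "14 bit" / "12 bit" comments, not stated there).
  uint32_t lin14 = 4u * ((1u << 14) - 1) + 2;   // 65534: fits
  uint32_t bil12 = 16u * ((1u << 12) - 1) + 8;  // 65528: fits
  uint32_t bil13 = 16u * ((1u << 13) - 1) + 8;  // 131064: overflows
  printf("linear@14bit=%u bilinear@12bit=%u bilinear@13bit=%u\n",
         lin14, bil12, bil13);
  return 0;
}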
void ScaleCols_C(uint8_t* dst_ptr, const uint8_t* src_ptr, @@ -677,18 +766,18 @@ void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr, (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * - (65536 / 9) >> + (65536u / 9u) >> 16; dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * - (65536 / 9) >> + (65536u / 9u) >> 16; dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * - (65536 / 6) >> + (65536u / 6u) >> 16; src_ptr += 8; dst_ptr += 3; @@ -731,15 +820,15 @@ void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr, for (i = 0; i < dst_width; i += 3) { dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2]) * - (65536 / 6) >> + (65536u / 6u) >> 16; dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + src_ptr[stride + 4] + src_ptr[stride + 5]) * - (65536 / 6) >> + (65536u / 6u) >> 16; dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * - (65536 / 4) >> + (65536u / 4u) >> 16; src_ptr += 8; dst_ptr += 3; @@ -776,6 +865,8 @@ void ScaleAddRow_16_C(const uint16_t* src_ptr, } } +// ARGB scale row functions + void ScaleARGBRowDown2_C(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, @@ -1018,6 +1109,351 @@ void ScaleARGBFilterCols64_C(uint8_t* dst_argb, #undef BLENDERC #undef BLENDER +// UV scale row functions +// same as ARGB but 2 channels + +void ScaleUVRowDown2_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width) { + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int x; + (void)src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[1]; + dst[1] = src[3]; + src += 2; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[1]; + } +} + +void ScaleUVRowDown2Linear_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width) { + int x; + (void)src_stride; + for (x = 0; x < dst_width; ++x) { + dst_uv[0] = (src_uv[0] + src_uv[2] + 1) >> 1; + dst_uv[1] = (src_uv[1] + src_uv[3] + 1) >> 1; + src_uv += 4; + dst_uv += 2; + } +} + +void ScaleUVRowDown2Box_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] + + src_uv[src_stride + 2] + 2) >> + 2; + dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] + + src_uv[src_stride + 3] + 2) >> + 2; + src_uv += 4; + dst_uv += 2; + } +} + +void ScaleUVRowDownEven_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width) { + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + (void)src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[0]; + dst[1] = src[src_stepx]; + src += src_stepx * 2; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] + + src_uv[src_stride + 2] + 2) >> + 2; + 
dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] + + src_uv[src_stride + 3] + 2) >> + 2; + src_uv += src_stepx * 2; + dst_uv += 2; + } +} + +void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + dst_ptr[4 * x + 0] = + (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2; + dst_ptr[4 * x + 1] = + (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2; + dst_ptr[4 * x + 2] = + (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2; + dst_ptr[4 * x + 3] = + (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2; + } +} + +void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + uint8_t* d = dst_ptr; + uint8_t* e = dst_ptr + dst_stride; + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + + t[2 * x + 2] * 1 + 8) >> + 4; + d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + + t[2 * x + 3] * 1 + 8) >> + 4; + d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 + + t[2 * x + 2] * 3 + 8) >> + 4; + d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 + + t[2 * x + 3] * 3 + 8) >> + 4; + e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 + + t[2 * x + 2] * 3 + 8) >> + 4; + e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 + + t[2 * x + 3] * 3 + 8) >> + 4; + e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + + t[2 * x + 2] * 9 + 8) >> + 4; + e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + + t[2 * x + 3] * 9 + 8) >> + 4; + } +} + +void ScaleUVRowUp2_Linear_16_C(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + dst_ptr[4 * x + 0] = + (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2; + dst_ptr[4 * x + 1] = + (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2; + dst_ptr[4 * x + 2] = + (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2; + dst_ptr[4 * x + 3] = + (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2; + } +} + +void ScaleUVRowUp2_Bilinear_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + uint16_t* d = dst_ptr; + uint16_t* e = dst_ptr + dst_stride; + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + + t[2 * x + 2] * 1 + 8) >> + 4; + d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + + t[2 * x + 3] * 1 + 8) >> + 4; + d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 + + t[2 * x + 2] * 3 + 8) >> + 4; + d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 + + t[2 * x + 3] * 3 + 8) >> + 4; + e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 + + t[2 * x + 2] * 3 + 8) >> + 4; + e[4 * x + 1] = (s[2 * x + 1] 
* 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 + + t[2 * x + 3] * 3 + 8) >> + 4; + e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + + t[2 * x + 2] * 9 + 8) >> + 4; + e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + + t[2 * x + 3] * 9 + 8) >> + 4; + } +} + +// Scales a single row of pixels using point sampling. +void ScaleUVCols_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x, + int dx) { + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +void ScaleUVCols64_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. +void ScaleUVColsUp2_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x, + int dx) { + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int j; + (void)x; + (void)dx; + for (j = 0; j < dst_width - 1; j += 2) { + dst[1] = dst[0] = src[0]; + src += 1; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. +// Mimics SSSE3 blender +#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7 +#define BLENDERC(a, b, f, s) \ + (uint16_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) +#define BLENDER(a, b, f) BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) + +void ScaleUVFilterCols_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x, + int dx) { + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint16_t a = src[xi]; + uint16_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint16_t a = src[xi]; + uint16_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} + +void ScaleUVFilterCols64_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64_t xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint16_t a = src[xi]; + uint16_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int64_t xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint16_t a = src[xi]; + uint16_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} +#undef BLENDER1 +#undef BLENDERC +#undef BLENDER + // Scale plane vertically with bilinear interpolation. 
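The UV BLENDER macros keep only the top 7 bits of the 16-bit fraction, (x >> 9) & 0x7f, to mimic the SSSE3 blender's 7-bit weights. Because (0x7f ^ f) + f sums to 127 while the shift divides by 128, every result is scaled by 127/128; that is the bias behind the TODO about replacing 0x7f ^ f with 128 - f. Blending a value with itself makes it visible:

#include <stdint.h>
#include <stdio.h>

// One-channel model of the BLENDER arithmetic above.
static uint8_t blend7(uint8_t a, uint8_t b, int x) {
  int f = (x >> 9) & 0x7f;  // top 7 bits of the 16.16 fraction
  // Weights sum to 127 but the shift divides by 128, so results are
  // scaled by 127/128: 255 blended halfway with itself gives 253.
  return (uint8_t)((a * (0x7f ^ f) + b * f) >> 7);
}

int main(void) {
  printf("blend(255, 255) = %d\n", blend7(255, 255, 0x8000));  // 253
  return 0;
}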
void ScalePlaneVertical(int src_height, int dst_width, @@ -1029,7 +1465,7 @@ void ScalePlaneVertical(int src_height, int x, int y, int dy, - int bpp, + int bpp, // bytes per pixel. 4 for ARGB. enum FilterMode filtering) { // TODO(fbarchard): Allow higher bpp. int dst_width_bytes = dst_width * bpp; @@ -1075,11 +1511,11 @@ void ScalePlaneVertical(int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(dst_width_bytes, 8)) { - InterpolateRow = InterpolateRow_MMI; +#if defined(HAS_INTERPOLATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow = InterpolateRow_Any_LSX; + if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_LSX; } } #endif @@ -1097,6 +1533,7 @@ void ScalePlaneVertical(int src_height, y += dy; } } + void ScalePlaneVertical_16(int src_height, int dst_width, int dst_height, @@ -1107,7 +1544,7 @@ void ScalePlaneVertical_16(int src_height, int x, int y, int dy, - int wpp, + int wpp, /* words per pixel. normally 1 */ enum FilterMode filtering) { // TODO(fbarchard): Allow higher wpp. int dst_width_words = dst_width * wpp; @@ -1123,32 +1560,32 @@ void ScalePlaneVertical_16(int src_height, src_argb += (x >> 16) * wpp; #if defined(HAS_INTERPOLATEROW_16_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_16_SSE2; - if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_16_Any_SSE2; + if (IS_ALIGNED(dst_width_words, 16)) { InterpolateRow = InterpolateRow_16_SSE2; } } #endif #if defined(HAS_INTERPOLATEROW_16_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_16_SSSE3; - if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_16_Any_SSSE3; + if (IS_ALIGNED(dst_width_words, 16)) { InterpolateRow = InterpolateRow_16_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_16_AVX2; - if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_16_Any_AVX2; + if (IS_ALIGNED(dst_width_words, 32)) { InterpolateRow = InterpolateRow_16_AVX2; } } #endif #if defined(HAS_INTERPOLATEROW_16_NEON) if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_16_NEON; - if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_16_Any_NEON; + if (IS_ALIGNED(dst_width_words, 8)) { InterpolateRow = InterpolateRow_16_NEON; } } @@ -1168,6 +1605,70 @@ void ScalePlaneVertical_16(int src_height, } } +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +// TODO(fbarchard): change scale to bits +void ScalePlaneVertical_16To8(int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_argb, + uint8_t* dst_argb, + int x, + int y, + int dy, + int wpp, /* words per pixel. normally 1 */ + int scale, + enum FilterMode filtering) { + // TODO(fbarchard): Allow higher wpp. + int dst_width_words = dst_width * wpp; + // TODO(https://crbug.com/libyuv/931): Add NEON 32 bit and AVX2 versions. + void (*InterpolateRow_16To8)(uint8_t * dst_argb, const uint16_t* src_argb, + ptrdiff_t src_stride, int scale, int dst_width, + int source_y_fraction) = InterpolateRow_16To8_C; + const int max_y = (src_height > 1) ? 
((src_height - 1) << 16) - 1 : 0; + int j; + assert(wpp >= 1 && wpp <= 2); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + src_argb += (x >> 16) * wpp; + +#if defined(HAS_INTERPOLATEROW_16TO8_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow_16To8 = InterpolateRow_16To8_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow_16To8 = InterpolateRow_16To8_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16TO8_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow_16To8 = InterpolateRow_16To8_Any_AVX2; + if (IS_ALIGNED(dst_width, 32)) { + InterpolateRow_16To8 = InterpolateRow_16To8_AVX2; + } + } +#endif + for (j = 0; j < dst_height; ++j) { + int yi; + int yf; + if (y > max_y) { + y = max_y; + } + yi = y >> 16; + yf = filtering ? ((y >> 8) & 255) : 0; + InterpolateRow_16To8(dst_argb, src_argb + yi * src_stride, src_stride, + scale, dst_width_words, yf); + dst_argb += dst_stride; + y += dy; + } +} + // Simplify the filtering based on scale factors. enum FilterMode ScaleFilterReduce(int src_width, int src_height, @@ -1181,8 +1682,8 @@ enum FilterMode ScaleFilterReduce(int src_width, src_height = -src_height; } if (filtering == kFilterBox) { - // If scaling both axis to 0.5 or larger, switch from Box to Bilinear. - if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) { + // If scaling either axis to 0.5 or larger, switch from Box to Bilinear. + if (dst_width * 2 >= src_width || dst_height * 2 >= src_height) { filtering = kFilterBilinear; } } @@ -1217,7 +1718,7 @@ int FixedDiv_C(int num, int div) { return (int)(((int64_t)(num) << 16) / div); } -// Divide num by div and return as 16.16 fixed point result. +// Divide num - 1 by div - 1 and return as 16.16 fixed point result. int FixedDiv1_C(int num, int div) { return (int)((((int64_t)(num) << 16) - 0x00010001) / (div - 1)); } @@ -1260,14 +1761,14 @@ void ScaleSlope(int src_width, if (dst_width <= Abs(src_width)) { *dx = FixedDiv(Abs(src_width), dst_width); *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. - } else if (dst_width > 1) { + } else if (src_width > 1 && dst_width > 1) { *dx = FixedDiv1(Abs(src_width), dst_width); *x = 0; } if (dst_height <= src_height) { *dy = FixedDiv(src_height, dst_height); *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter. - } else if (dst_height > 1) { + } else if (src_height > 1 && dst_height > 1) { *dy = FixedDiv1(src_height, dst_height); *y = 0; } @@ -1276,7 +1777,7 @@ void ScaleSlope(int src_width, if (dst_width <= Abs(src_width)) { *dx = FixedDiv(Abs(src_width), dst_width); *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. - } else if (dst_width > 1) { + } else if (src_width > 1 && dst_width > 1) { *dx = FixedDiv1(Abs(src_width), dst_width); *x = 0; } diff --git a/files/source/scale_dspr2.cc b/files/source/scale_dspr2.cc deleted file mode 100644 index ddedcbf4..00000000 --- a/files/source/scale_dspr2.cc +++ /dev/null @@ -1,668 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "libyuv/basic_types.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC MIPS DSPR2 -#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \ - (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32) - -void ScaleRowDown2_DSPR2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst, - int dst_width) { - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - - "srl $t9, %[dst_width], 4 \n" // iterations -> by 16 - "beqz $t9, 2f \n" - " nop \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| - "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| - "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| - "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16| - "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20| - "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24| - "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28| - // TODO(fbarchard): Use odd pixels instead of even. - "precrq.qb.ph $t8, $t1, $t0 \n" // |7|5|3|1| - "precrq.qb.ph $t0, $t3, $t2 \n" // |15|13|11|9| - "precrq.qb.ph $t1, $t5, $t4 \n" // |23|21|19|17| - "precrq.qb.ph $t2, $t7, $t6 \n" // |31|29|27|25| - "addiu %[src_ptr], %[src_ptr], 32 \n" - "addiu $t9, $t9, -1 \n" - "sw $t8, 0(%[dst]) \n" - "sw $t0, 4(%[dst]) \n" - "sw $t1, 8(%[dst]) \n" - "sw $t2, 12(%[dst]) \n" - "bgtz $t9, 1b \n" - " addiu %[dst], %[dst], 16 \n" - - "2: \n" - "andi $t9, %[dst_width], 0xf \n" // residue - "beqz $t9, 3f \n" - " nop \n" - - "21: \n" - "lbu $t0, 1(%[src_ptr]) \n" - "addiu %[src_ptr], %[src_ptr], 2 \n" - "addiu $t9, $t9, -1 \n" - "sb $t0, 0(%[dst]) \n" - "bgtz $t9, 21b \n" - " addiu %[dst], %[dst], 1 \n" - - "3: \n" - ".set pop \n" - : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst) - : [dst_width] "r"(dst_width) - : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9"); -} - -void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst, - int dst_width) { - const uint8* t = src_ptr + src_stride; - - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - - "srl $t9, %[dst_width], 3 \n" // iterations -> step 8 - "bltz $t9, 2f \n" - " nop \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| - "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| - "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| - "lw $t4, 0(%[t]) \n" // |19|18|17|16| - "lw $t5, 4(%[t]) \n" // |23|22|21|20| - "lw $t6, 8(%[t]) \n" // |27|26|25|24| - "lw $t7, 12(%[t]) \n" // |31|30|29|28| - "addiu $t9, $t9, -1 \n" - "srl $t8, $t0, 16 \n" // |X|X|3|2| - "ins $t0, $t4, 16, 16 \n" // |17|16|1|0| - "ins $t4, $t8, 0, 16 \n" // |19|18|3|2| - "raddu.w.qb $t0, $t0 \n" // |17+16+1+0| - "raddu.w.qb $t4, $t4 \n" // |19+18+3+2| - "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2 - "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2 - "srl $t8, $t1, 16 \n" // |X|X|7|6| - "ins $t1, $t5, 16, 16 \n" // |21|20|5|4| - "ins $t5, $t8, 0, 16 \n" // |22|23|7|6| - "raddu.w.qb $t1, $t1 \n" // |21+20+5+4| - "raddu.w.qb $t5, $t5 \n" // |23+22+7+6| - "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2 - "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2 - "srl $t8, $t2, 16 \n" // |X|X|11|10| - "ins $t2, $t6, 16, 16 \n" // |25|24|9|8| - "ins $t6, $t8, 0, 16 \n" // |27|26|11|10| - "raddu.w.qb $t2, $t2 \n" // |25+24+9+8| - "raddu.w.qb $t6, $t6 \n" // |27+26+11+10| - "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2 - "shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2 - "srl $t8, $t3, 16 \n" // |X|X|15|14| - "ins $t3, $t7, 16, 16 \n" // |29|28|13|12| - "ins $t7, $t8, 0, 16 \n" // |31|30|15|14| - "raddu.w.qb $t3, $t3 \n" // 
|29+28+13+12| - "raddu.w.qb $t7, $t7 \n" // |31+30+15+14| - "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2 - "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2 - "addiu %[src_ptr], %[src_ptr], 16 \n" - "addiu %[t], %[t], 16 \n" - "sb $t0, 0(%[dst]) \n" - "sb $t4, 1(%[dst]) \n" - "sb $t1, 2(%[dst]) \n" - "sb $t5, 3(%[dst]) \n" - "sb $t2, 4(%[dst]) \n" - "sb $t6, 5(%[dst]) \n" - "sb $t3, 6(%[dst]) \n" - "sb $t7, 7(%[dst]) \n" - "bgtz $t9, 1b \n" - " addiu %[dst], %[dst], 8 \n" - - "2: \n" - "andi $t9, %[dst_width], 0x7 \n" // x = residue - "beqz $t9, 3f \n" - " nop \n" - - "21: \n" - "lwr $t1, 0(%[src_ptr]) \n" - "lwl $t1, 3(%[src_ptr]) \n" - "lwr $t2, 0(%[t]) \n" - "lwl $t2, 3(%[t]) \n" - "srl $t8, $t1, 16 \n" - "ins $t1, $t2, 16, 16 \n" - "ins $t2, $t8, 0, 16 \n" - "raddu.w.qb $t1, $t1 \n" - "raddu.w.qb $t2, $t2 \n" - "shra_r.w $t1, $t1, 2 \n" - "shra_r.w $t2, $t2, 2 \n" - "sb $t1, 0(%[dst]) \n" - "sb $t2, 1(%[dst]) \n" - "addiu %[src_ptr], %[src_ptr], 4 \n" - "addiu $t9, $t9, -2 \n" - "addiu %[t], %[t], 4 \n" - "bgtz $t9, 21b \n" - " addiu %[dst], %[dst], 2 \n" - - "3: \n" - ".set pop \n" - - : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [t] "+r"(t) - : [dst_width] "r"(dst_width) - : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9"); -} - -void ScaleRowDown4_DSPR2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst, - int dst_width) { - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - - "srl $t9, %[dst_width], 3 \n" - "beqz $t9, 2f \n" - " nop \n" - - "1: \n" - "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| - "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| - "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12| - "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16| - "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20| - "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24| - "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28| - "precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0| - "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8| - "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16| - "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24| - "precrq.qb.ph $t1, $t2, $t1 \n" // |14|10|6|2| - "precrq.qb.ph $t5, $t6, $t5 \n" // |30|26|22|18| - "addiu %[src_ptr], %[src_ptr], 32 \n" - "addiu $t9, $t9, -1 \n" - "sw $t1, 0(%[dst]) \n" - "sw $t5, 4(%[dst]) \n" - "bgtz $t9, 1b \n" - " addiu %[dst], %[dst], 8 \n" - - "2: \n" - "andi $t9, %[dst_width], 7 \n" // residue - "beqz $t9, 3f \n" - " nop \n" - - "21: \n" - "lbu $t1, 2(%[src_ptr]) \n" - "addiu %[src_ptr], %[src_ptr], 4 \n" - "addiu $t9, $t9, -1 \n" - "sb $t1, 0(%[dst]) \n" - "bgtz $t9, 21b \n" - " addiu %[dst], %[dst], 1 \n" - - "3: \n" - ".set pop \n" - : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst) - : [dst_width] "r"(dst_width) - : "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9"); -} - -void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst, - int dst_width) { - intptr_t stride = src_stride; - const uint8* s1 = src_ptr + stride; - const uint8* s2 = s1 + stride; - const uint8* s3 = s2 + stride; - - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - - "srl $t9, %[dst_width], 1 \n" - "andi $t8, %[dst_width], 1 \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t1, 0(%[s1]) \n" // |7|6|5|4| - "lw $t2, 0(%[s2]) \n" // |11|10|9|8| - "lw $t3, 0(%[s3]) \n" // |15|14|13|12| - "lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16| - "lw $t5, 4(%[s1]) \n" // |23|22|21|20| - "lw $t6, 4(%[s2]) \n" // |27|26|25|24| - "lw $t7, 4(%[s3]) \n" // |31|30|29|28| - "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0| - "raddu.w.qb $t1, $t1 \n" // 
|7 + 6 + 5 + 4| - "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8| - "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12| - "raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16| - "raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20| - "raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24| - "raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28| - "add $t0, $t0, $t1 \n" - "add $t1, $t2, $t3 \n" - "add $t0, $t0, $t1 \n" - "add $t4, $t4, $t5 \n" - "add $t6, $t6, $t7 \n" - "add $t4, $t4, $t6 \n" - "shra_r.w $t0, $t0, 4 \n" - "shra_r.w $t4, $t4, 4 \n" - "sb $t0, 0(%[dst]) \n" - "sb $t4, 1(%[dst]) \n" - "addiu %[src_ptr], %[src_ptr], 8 \n" - "addiu %[s1], %[s1], 8 \n" - "addiu %[s2], %[s2], 8 \n" - "addiu %[s3], %[s3], 8 \n" - "addiu $t9, $t9, -1 \n" - "bgtz $t9, 1b \n" - " addiu %[dst], %[dst], 2 \n" - "beqz $t8, 2f \n" - " nop \n" - - "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t1, 0(%[s1]) \n" // |7|6|5|4| - "lw $t2, 0(%[s2]) \n" // |11|10|9|8| - "lw $t3, 0(%[s3]) \n" // |15|14|13|12| - "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0| - "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4| - "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8| - "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12| - "add $t0, $t0, $t1 \n" - "add $t1, $t2, $t3 \n" - "add $t0, $t0, $t1 \n" - "shra_r.w $t0, $t0, 4 \n" - "sb $t0, 0(%[dst]) \n" - - "2: \n" - ".set pop \n" - - : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [s1] "+r"(s1), [s2] "+r"(s2), - [s3] "+r"(s3) - : [dst_width] "r"(dst_width) - : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9"); -} - -void ScaleRowDown34_DSPR2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst, - int dst_width) { - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - "1: \n" - "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| - "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| - "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12| - "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16| - "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20| - "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24| - "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28| - "precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13| - "precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30| - "addiu %[dst_width], %[dst_width], -24 \n" - "ins $t1, $t1, 8, 16 \n" // |3|1|0|X| - "ins $t4, $t0, 8, 16 \n" // |X|15|13|12| - "ins $t5, $t5, 8, 16 \n" // |19|17|16|X| - "ins $t8, $t9, 8, 16 \n" // |X|31|29|28| - "addiu %[src_ptr], %[src_ptr], 32 \n" - "packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5| - "packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21| - "prepend $t1, $t2, 8 \n" // |4|3|1|0| - "prepend $t3, $t4, 24 \n" // |15|13|12|11| - "prepend $t5, $t6, 8 \n" // |20|19|17|16| - "prepend $t7, $t8, 24 \n" // |31|29|28|27| - "sw $t1, 0(%[dst]) \n" - "sw $t0, 4(%[dst]) \n" - "sw $t3, 8(%[dst]) \n" - "sw $t5, 12(%[dst]) \n" - "sw $t9, 16(%[dst]) \n" - "sw $t7, 20(%[dst]) \n" - "bnez %[dst_width], 1b \n" - " addiu %[dst], %[dst], 24 \n" - ".set pop \n" - : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width) - : - : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9"); -} - -void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* d, - int dst_width) { - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - "repl.ph $t3, 3 \n" // 0x00030003 - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| - "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| - "rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1| - "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1| - "muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3| - "muleu_s.ph.qbl $t5, $t6, $t3 \n" // 
|T0*3|T3*3| - "andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1| - "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1| - "raddu.w.qb $t0, $t0 \n" - "raddu.w.qb $t1, $t1 \n" - "shra_r.w $t0, $t0, 1 \n" - "shra_r.w $t1, $t1, 1 \n" - "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1| - "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1| - "rotr $t2, $t2, 16 \n" // |0|S1|0|S2| - "rotr $t6, $t6, 16 \n" // |0|T1|0|T2| - "addu.ph $t2, $t2, $t4 \n" - "addu.ph $t6, $t6, $t5 \n" - "sll $t5, $t0, 1 \n" - "add $t0, $t5, $t0 \n" - "shra_r.ph $t2, $t2, 2 \n" - "shra_r.ph $t6, $t6, 2 \n" - "shll.ph $t4, $t2, 1 \n" - "addq.ph $t4, $t4, $t2 \n" - "addu $t0, $t0, $t1 \n" - "addiu %[src_ptr], %[src_ptr], 4 \n" - "shra_r.w $t0, $t0, 2 \n" - "addu.ph $t6, $t6, $t4 \n" - "shra_r.ph $t6, $t6, 2 \n" - "srl $t1, $t6, 16 \n" - "addiu %[dst_width], %[dst_width], -3 \n" - "sb $t1, 0(%[d]) \n" - "sb $t0, 1(%[d]) \n" - "sb $t6, 2(%[d]) \n" - "bgtz %[dst_width], 1b \n" - " addiu %[d], %[d], 3 \n" - "3: \n" - ".set pop \n" - : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d), - [dst_width] "+r"(dst_width) - : - : "t0", "t1", "t2", "t3", "t4", "t5", "t6"); -} - -void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* d, - int dst_width) { - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - "repl.ph $t2, 3 \n" // 0x00030003 - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| - "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| - "rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1| - "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1| - "muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3| - "muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3| - "andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1| - "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1| - "raddu.w.qb $t0, $t0 \n" - "raddu.w.qb $t1, $t1 \n" - "shra_r.w $t0, $t0, 1 \n" - "shra_r.w $t1, $t1, 1 \n" - "preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1| - "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1| - "rotr $t4, $t4, 16 \n" // |0|S1|0|S2| - "rotr $t6, $t6, 16 \n" // |0|T1|0|T2| - "addu.ph $t4, $t4, $t3 \n" - "addu.ph $t6, $t6, $t5 \n" - "shra_r.ph $t6, $t6, 2 \n" - "shra_r.ph $t4, $t4, 2 \n" - "addu.ph $t6, $t6, $t4 \n" - "addiu %[src_ptr], %[src_ptr], 4 \n" - "shra_r.ph $t6, $t6, 1 \n" - "addu $t0, $t0, $t1 \n" - "addiu %[dst_width], %[dst_width], -3 \n" - "shra_r.w $t0, $t0, 1 \n" - "srl $t1, $t6, 16 \n" - "sb $t1, 0(%[d]) \n" - "sb $t0, 1(%[d]) \n" - "sb $t6, 2(%[d]) \n" - "bgtz %[dst_width], 1b \n" - " addiu %[d], %[d], 3 \n" - "3: \n" - ".set pop \n" - : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d), - [dst_width] "+r"(dst_width) - : - : "t0", "t1", "t2", "t3", "t4", "t5", "t6"); -} - -void ScaleRowDown38_DSPR2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst, - int dst_width) { - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| - "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| - "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| - "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16| - "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20| - "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24| - "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28| - "wsbh $t0, $t0 \n" // |2|3|0|1| - "wsbh $t6, $t6 \n" // |26|27|24|25| - "srl $t0, $t0, 8 \n" // |X|2|3|0| - "srl $t3, $t3, 16 \n" // |X|X|15|14| - "srl $t5, $t5, 16 \n" // |X|X|23|22| - "srl $t7, $t7, 16 \n" // |X|X|31|30| - "ins $t1, $t2, 24, 8 \n" // |8|6|5|4| - "ins $t6, $t5, 0, 8 \n" // |26|27|24|22| - "ins $t1, $t0, 0, 16 \n" // |8|6|3|0| - "ins 
$t6, $t7, 24, 8 \n" // |30|27|24|22| - "prepend $t2, $t3, 24 \n" // |X|15|14|11| - "ins $t4, $t4, 16, 8 \n" // |19|16|17|X| - "ins $t4, $t2, 0, 16 \n" // |19|16|14|11| - "addiu %[src_ptr], %[src_ptr], 32 \n" - "addiu %[dst_width], %[dst_width], -12 \n" - "addiu $t8,%[dst_width], -12 \n" - "sw $t1, 0(%[dst]) \n" - "sw $t4, 4(%[dst]) \n" - "sw $t6, 8(%[dst]) \n" - "bgez $t8, 1b \n" - " addiu %[dst], %[dst], 12 \n" - ".set pop \n" - : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width) - : - : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8"); -} - -void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, - int dst_width) { - intptr_t stride = src_stride; - const uint8* t = src_ptr + stride; - const int c = 0x2AAA; - - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| - "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| - "lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0| - "lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4| - "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6| - "packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6| - "packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4| - "raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6 - "raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4 - "precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1| - "precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3| - "srl $t4, $t4, 2 \n" // t4 / 4 - "srl $t6, $t6, 16 \n" // |0|0|S3|T3| - "raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3 - "addu $t6, $t5, $t6 \n" - "mul $t6, $t6, %[c] \n" // t6 * 0x2AAA - "sll $t0, $t0, 8 \n" // |S2|S1|S0|0| - "sll $t2, $t2, 8 \n" // |T2|T1|T0|0| - "raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0 - "raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0 - "addu $t0, $t0, $t2 \n" - "mul $t0, $t0, %[c] \n" // t0 * 0x2AAA - "addiu %[src_ptr], %[src_ptr], 8 \n" - "addiu %[t], %[t], 8 \n" - "addiu %[dst_width], %[dst_width], -3 \n" - "addiu %[dst_ptr], %[dst_ptr], 3 \n" - "srl $t6, $t6, 16 \n" - "srl $t0, $t0, 16 \n" - "sb $t4, -1(%[dst_ptr]) \n" - "sb $t6, -2(%[dst_ptr]) \n" - "bgtz %[dst_width], 1b \n" - " sb $t0, -3(%[dst_ptr]) \n" - ".set pop \n" - : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [t] "+r"(t), - [dst_width] "+r"(dst_width) - : [c] "r"(c) - : "t0", "t1", "t2", "t3", "t4", "t5", "t6"); -} - -void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, - int dst_width) { - intptr_t stride = src_stride; - const uint8* s1 = src_ptr + stride; - stride += stride; - const uint8* s2 = src_ptr + stride; - const int c1 = 0x1C71; - const int c2 = 0x2AAA; - - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| - "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| - "lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0| - "lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4| - "lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0| - "lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4| - "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6| - "packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6| - "raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6 - "packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4| - "raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4 - "sll $t8, $t5, 16 \n" // |R5|R4|0|0| - "raddu.w.qb $t8, $t8 \n" // R5+R4 - "addu $t7, $t7, $t8 \n" - "srl $t8, $t5, 16 \n" // |0|0|R7|R6| - "raddu.w.qb $t8, $t8 \n" // R7 + R6 - "addu $t6, $t6, $t8 \n" - "mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA - "precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1| - "precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1| - "srl $t8, $t8, 8 \n" // |0|S3|T3|R3| - "raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3 - "addu $t7, 
$t7, $t8 \n" - "mul $t7, $t7, %[c1] \n" // t7 * 0x1C71 - "sll $t0, $t0, 8 \n" // |S2|S1|S0|0| - "sll $t2, $t2, 8 \n" // |T2|T1|T0|0| - "sll $t4, $t4, 8 \n" // |R2|R1|R0|0| - "raddu.w.qb $t0, $t0 \n" - "raddu.w.qb $t2, $t2 \n" - "raddu.w.qb $t4, $t4 \n" - "addu $t0, $t0, $t2 \n" - "addu $t0, $t0, $t4 \n" - "mul $t0, $t0, %[c1] \n" // t0 * 0x1C71 - "addiu %[src_ptr], %[src_ptr], 8 \n" - "addiu %[s1], %[s1], 8 \n" - "addiu %[s2], %[s2], 8 \n" - "addiu %[dst_width], %[dst_width], -3 \n" - "addiu %[dst_ptr], %[dst_ptr], 3 \n" - "srl $t6, $t6, 16 \n" - "srl $t7, $t7, 16 \n" - "srl $t0, $t0, 16 \n" - "sb $t6, -1(%[dst_ptr]) \n" - "sb $t7, -2(%[dst_ptr]) \n" - "bgtz %[dst_width], 1b \n" - " sb $t0, -3(%[dst_ptr]) \n" - ".set pop \n" - : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [s1] "+r"(s1), - [s2] "+r"(s2), [dst_width] "+r"(dst_width) - : [c1] "r"(c1), [c2] "r"(c2) - : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8"); -} - -void ScaleAddRow_DSPR2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { - int x; - for (x = 0; x < ((src_width - 1)); x += 8) { - uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4; - uint32 tmp_t5, tmp_t6, tmp_t7, tmp_t8; - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - "lw %[tmp_t5], 0(%[src_ptr]) \n" - "lw %[tmp_t6], 4(%[src_ptr]) \n" - "lw %[tmp_t1], 0(%[dst_ptr]) \n" - "lw %[tmp_t2], 4(%[dst_ptr]) \n" - "lw %[tmp_t3], 8(%[dst_ptr]) \n" - "lw %[tmp_t4], 12(%[dst_ptr]) \n" - "preceu.ph.qbr %[tmp_t7], %[tmp_t5] \n" - "preceu.ph.qbl %[tmp_t8], %[tmp_t5] \n" - "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t7] \n" - "addu.ph %[tmp_t2], %[tmp_t2], %[tmp_t8] \n" - "preceu.ph.qbr %[tmp_t7], %[tmp_t6] \n" - "preceu.ph.qbl %[tmp_t8], %[tmp_t6] \n" - "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t7] \n" - "addu.ph %[tmp_t4], %[tmp_t4], %[tmp_t8] \n" - "sw %[tmp_t1], 0(%[dst_ptr]) \n" - "sw %[tmp_t2], 4(%[dst_ptr]) \n" - "sw %[tmp_t3], 8(%[dst_ptr]) \n" - "sw %[tmp_t4], 12(%[dst_ptr]) \n" - ".set pop \n" - : - [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), [tmp_t3] "=&r"(tmp_t3), - [tmp_t4] "=&r"(tmp_t4), [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6), - [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [src_ptr] "+r"(src_ptr) - : [dst_ptr] "r"(dst_ptr)); - src_ptr += 8; - dst_ptr += 8; - } - - if ((src_width)&7) { - for (x = 0; x < ((src_width - 1) & 7); x += 1) { - dst_ptr[0] += src_ptr[0]; - src_ptr += 1; - dst_ptr += 1; - } - } -} - -#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/files/source/scale_gcc.cc b/files/source/scale_gcc.cc index 90a49f30..edaf2e29 100644 --- a/files/source/scale_gcc.cc +++ b/files/source/scale_gcc.cc @@ -17,8 +17,7 @@ extern "C" { #endif // This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) // Offsets for source bytes 0 to 9 static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, @@ -102,16 +101,16 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, // 16 pixel loop. 
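// Each iteration of the loop below loads 32 source bytes, shifts every
// 16-bit lane right by 8 so only the odd-indexed byte of each pair
// survives, then packs both registers into 16 output bytes: a
// point-sampled 2x horizontal downscale.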
LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -125,25 +124,25 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -156,33 +155,33 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "psrlw $0x1,%%xmm0 \n" - "psrlw $0x1,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "psrlw $0x1,%%xmm0 \n" + "psrlw $0x1,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -196,27 +195,25 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( - - LABELALIGN + asm volatile(LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu 
%%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, @@ -225,26 +222,26 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -258,34 +255,34 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vpsrlw $0x1,%%ymm0,%%ymm0 \n" - "vpsrlw $0x1,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" + "vpsrlw $0x1,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb 
%%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -301,24 +298,24 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrld $0x18,%%xmm5 \n" - "pslld $0x10,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + "pslld $0x10,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -332,46 +329,46 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, int dst_width) { intptr_t stridex3; asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "packuswb %%xmm4,%%xmm4 \n" - "psllw $0x3,%%xmm5 \n" - "lea 0x00(%4,%4,2),%3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "packuswb %%xmm4,%%xmm4 \n" + "psllw $0x3,%%xmm5 \n" + "lea 0x00(%4,%4,2),%3 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "movdqu 0x00(%0,%4,2),%%xmm2 \n" - "movdqu 0x10(%0,%4,2),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm0 \n" - "paddw %%xmm5,%%xmm0 \n" - "psrlw $0x4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%4,2),%%xmm2 \n" + "movdqu 0x10(%0,%4,2),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -387,26 +384,26 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr, int 
dst_width) { (void)src_stride; asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrld $0x18,%%ymm5,%%ymm5 \n" - "vpslld $0x10,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrld $0x18,%%ymm5,%%ymm5 \n" + "vpslld $0x10,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -420,46 +417,46 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpsllw $0x3,%%ymm4,%%ymm5 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpsllw $0x3,%%ymm4,%%ymm5 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpsrlw $0x4,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq 
$0xd8,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpsrlw $0x4,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -476,37 +473,35 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "movdqa %0,%%xmm3 \n" - "movdqa %1,%%xmm4 \n" - "movdqa %2,%%xmm5 \n" + "movdqa %0,%%xmm3 \n" + "movdqa %1,%%xmm4 \n" + "movdqa %2,%%xmm5 \n" : : "m"(kShuf0), // %0 "m"(kShuf1), // %1 "m"(kShuf2) // %2 ); - asm volatile( - - LABELALIGN + asm volatile(LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm2 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "palignr $0x8,%%xmm0,%%xmm1 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x8(%1) \n" - "movq %%xmm2,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm2 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, @@ -514,65 +509,63 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 : : "m"(kShuf01), // %0 "m"(kShuf11), // %1 "m"(kShuf21) // %2 ); asm volatile( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 : : "m"(kMadd01), // %0 "m"(kMadd11), // %1 "m"(kRound34) // %2 ); - asm volatile( - - LABELALIGN + asm volatile(LABELALIGN "1: \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x00(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqu 0x10(%0),%%xmm6 \n" - "movdqu 0x10(%0,%3,1),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", 
"xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); } void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, @@ -580,69 +573,67 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 : : "m"(kShuf01), // %0 "m"(kShuf11), // %1 "m"(kShuf21) // %2 ); asm volatile( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 : : "m"(kMadd01), // %0 "m"(kMadd11), // %1 "m"(kRound34) // %2 ); - asm volatile( - - LABELALIGN + asm volatile(LABELALIGN "1: \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x00(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqu 0x10(%0),%%xmm6 \n" - "movdqu 0x10(%0,%3,1),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw 
%%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); } void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, @@ -651,23 +642,23 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movhlps %%xmm0,%%xmm1 \n" - "movd %%xmm1,0x8(%1) \n" - "lea 0xc(%1),%1 \n" - "sub $0xc,%2 \n" - "jg 1b \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1,0x8(%1) \n" + "lea 0xc(%1),%1 \n" + "sub $0xc,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -681,44 +672,43 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "movdqa %3,%%xmm5 \n" + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "movdqa %3,%%xmm5 \n" : : "m"(kShufAb0), // %0 "m"(kShufAb1), // %1 "m"(kShufAb2), // %2 "m"(kScaleAb2) // %3 ); - asm volatile( - - LABELALIGN + asm volatile(LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%3,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "paddusw %%xmm6,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "paddusw %%xmm0,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,(%1) \n" - "psrlq $0x10,%%xmm1 \n" - "movd %%xmm1,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "paddusw %%xmm6,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "paddusw %%xmm0,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,(%1) \n" + "psrlq $0x10,%%xmm1 \n" + "movd %%xmm1,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6"); } void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, @@ -726,126 
+716,1105 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" : : "m"(kShufAc), // %0 "m"(kShufAc3), // %1 "m"(kScaleAc33) // %2 ); + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm6 \n" + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqu 0x00(%0,%3,2),%%xmm6 \n" + "lea 0x10(%0),%0 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "movdqa %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "pshufb %%xmm3,%%xmm7 \n" + "paddusw %%xmm7,%%xmm6 \n" + "pmulhuw %%xmm4,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movd %%xmm6,(%1) \n" + "psrlq $0x10,%%xmm6 \n" + "movd %%xmm6,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); +} + +static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5, + 10, 11, 8, 9, 14, 15, 12, 13}; + +static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3, + 3, 1, 1, 3, 3, 1, 1, 3}; + +#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 +void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { asm volatile( + "pxor %%xmm0,%%xmm0 \n" // 0 + "pcmpeqw %%xmm6,%%xmm6 \n" + "psrlw $15,%%xmm6 \n" + "psllw $1,%%xmm6 \n" // all 2 LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%3,1),%%xmm6 \n" - "movhlps %%xmm0,%%xmm1 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqu 0x00(%0,%3,2),%%xmm6 \n" - "lea 0x10(%0),%0 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "movdqa %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "pshufb %%xmm3,%%xmm7 \n" - "paddusw %%xmm7,%%xmm6 \n" - "pmulhuw %%xmm4,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movd %%xmm6,(%1) \n" - "psrlq $0x10,%%xmm6 \n" - "movd %%xmm6,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 + "movq (%0),%%xmm1 \n" // 01234567 + "movq 1(%0),%%xmm2 \n" // 12345678 + "movdqa %%xmm1,%%xmm3 \n" + "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 + "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 + "punpcklbw %%xmm2,%%xmm2 \n" // 
1122334455667788 + "movdqa %%xmm1,%%xmm4 \n" + "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) + "movdqa %%xmm2,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) + "paddw %%xmm5,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "paddw %%xmm6,%%xmm4 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) + "paddw %%xmm5,%%xmm5 \n" + "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo) + "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo) + + "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) + "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) + "paddw %%xmm2,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) + "paddw %%xmm6,%%xmm1 \n" + "paddw %%xmm3,%%xmm3 \n" + "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) + "psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) + + "packuswb %%xmm1,%%xmm5 \n" + "movdqu %%xmm5,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 +void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + LABELALIGN + "1: \n" + "pxor %%xmm0,%%xmm0 \n" // 0 + // above line + "movq (%0),%%xmm1 \n" // 01234567 + "movq 1(%0),%%xmm2 \n" // 12345678 + "movdqa %%xmm1,%%xmm3 \n" + "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 + "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 + "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 + + "movdqa %%xmm1,%%xmm4 \n" + "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) + "movdqa %%xmm2,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) + "paddw %%xmm5,%%xmm4 \n" // near+far + "movdqa %%xmm3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) + "paddw %%xmm5,%%xmm5 \n" // 2*near + "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo) + + "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) + "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) + "paddw %%xmm2,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) + "paddw %%xmm3,%%xmm3 \n" // 2*near + "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + // below line + "movq (%0,%3),%%xmm6 \n" // 01234567 + "movq 1(%0,%3),%%xmm2 \n" // 12345678 + "movdqa %%xmm6,%%xmm3 \n" + "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 + "punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677 + "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 + + "movdqa %%xmm6,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16) + "movdqa %%xmm2,%%xmm7 \n" + "punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16) + "paddw %%xmm7,%%xmm5 \n" // near+far + "movdqa %%xmm3,%%xmm7 \n" + "punpcklbw %%xmm0,%%xmm7 \n" // 01122334 (16) + "paddw %%xmm7,%%xmm7 \n" // 2*near + "paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo) + + "punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16) + "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) + "paddw %%xmm6,%%xmm2 \n" // near+far + "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) + "paddw %%xmm3,%%xmm3 \n" // 2*near + "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi) + + // xmm4 xmm1 + // xmm5 xmm2 + "pcmpeqw %%xmm0,%%xmm0 \n" + "psrlw $15,%%xmm0 \n" + "psllw $3,%%xmm0 \n" // all 8 + + "movdqa %%xmm4,%%xmm3 \n" + "movdqa %%xmm5,%%xmm6 \n" + "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (1, lo) + "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo) + "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo) + "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo) + "psrlw $4,%%xmm3 \n" // ^ div by 16 + + "movdqa %%xmm1,%%xmm7 
\n" + "movdqa %%xmm2,%%xmm6 \n" + "paddw %%xmm7,%%xmm7 \n" // 6*near+2*far (1, hi) + "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi) + "paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi) + "paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi) + "psrlw $4,%%xmm7 \n" // ^ div by 16 + + "packuswb %%xmm7,%%xmm3 \n" + "movdqu %%xmm3,(%1) \n" // save above line + + "movdqa %%xmm5,%%xmm3 \n" + "paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo) + "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, lo) + "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo) + "psrlw $4,%%xmm5 \n" // ^ div by 16 + + "movdqa %%xmm2,%%xmm3 \n" + "paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi) + "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, hi) + "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) + "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi) + "psrlw $4,%%xmm2 \n" // ^ div by 16 + + "packuswb %%xmm2,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4) \n" // save below line + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } +#endif -// Reads 16xN bytes and produces 16 shorts at a time. -void ScaleAddRow_SSE2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { +#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3 +void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %3,%%xmm5 \n" + "pcmpeqw %%xmm4,%%xmm4 \n" + "psrlw $15,%%xmm4 \n" + "psllw $1,%%xmm4 \n" // all 2 + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" // 01234567 (16) + "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) + + "movdqa %%xmm0,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) + "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) + + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm5,%%xmm3 \n" // 54657687 (far) + "pshufb %%xmm5,%%xmm1 \n" // 10213243 (far) + + "paddw %%xmm4,%%xmm1 \n" // far+2 + "paddw %%xmm4,%%xmm3 \n" // far+2 + "paddw %%xmm0,%%xmm1 \n" // near+far+2 + "paddw %%xmm2,%%xmm3 \n" // near+far+2 + "paddw %%xmm0,%%xmm0 \n" // 2*near + "paddw %%xmm2,%%xmm2 \n" // 2*near + "paddw %%xmm1,%%xmm0 \n" // 3*near+far+2 (lo) + "paddw %%xmm3,%%xmm2 \n" // 3*near+far+2 (hi) + + "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far + "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm2,16(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearShuffleFar) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3 +void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { asm volatile( + "pcmpeqw %%xmm7,%%xmm7 \n" + "psrlw $15,%%xmm7 \n" + "psllw $3,%%xmm7 \n" // all 8 + "movdqa %5,%%xmm6 \n" - "pxor %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + // above line + "movdqu (%0),%%xmm0 \n" // 01234567 (16) + "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) + "movdqa %%xmm0,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) + "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm6,%%xmm3 \n" // 54657687 
(far) + "pshufb %%xmm6,%%xmm1 \n" // 10213243 (far) + "paddw %%xmm0,%%xmm1 \n" // near+far + "paddw %%xmm2,%%xmm3 \n" // near+far + "paddw %%xmm0,%%xmm0 \n" // 2*near + "paddw %%xmm2,%%xmm2 \n" // 2*near + "paddw %%xmm1,%%xmm0 \n" // 3*near+far (1, lo) + "paddw %%xmm3,%%xmm2 \n" // 3*near+far (1, hi) + + // below line + "movdqu (%0,%3,2),%%xmm1 \n" // 01234567 (16) + "movdqu 2(%0,%3,2),%%xmm4 \n" // 12345678 (16) + "movdqa %%xmm1,%%xmm3 \n" + "punpckhwd %%xmm4,%%xmm3 \n" // 45566778 (16) + "punpcklwd %%xmm4,%%xmm1 \n" // 01122334 (16) + "movdqa %%xmm3,%%xmm5 \n" + "movdqa %%xmm1,%%xmm4 \n" + "pshufb %%xmm6,%%xmm5 \n" // 54657687 (far) + "pshufb %%xmm6,%%xmm4 \n" // 10213243 (far) + "paddw %%xmm1,%%xmm4 \n" // near+far + "paddw %%xmm3,%%xmm5 \n" // near+far + "paddw %%xmm1,%%xmm1 \n" // 2*near + "paddw %%xmm3,%%xmm3 \n" // 2*near + "paddw %%xmm4,%%xmm1 \n" // 3*near+far (2, lo) + "paddw %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) + + // xmm0 xmm2 + // xmm1 xmm3 + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrlw $4,%%xmm4 \n" // ^ div by 16 + "movdqu %%xmm4,(%1) \n" + + "movdqa %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, hi) + "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, hi) + "paddw %%xmm2,%%xmm4 \n" // 9*near+3*far (1, hi) + "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, hi) + "psrlw $4,%%xmm4 \n" // ^ div by 16 + "movdqu %%xmm4,0x10(%1) \n" + + "movdqa %%xmm1,%%xmm4 \n" + "paddw %%xmm7,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, lo) + "paddw %%xmm4,%%xmm1 \n" // 9*near+3*far (2, lo) + "paddw %%xmm0,%%xmm1 \n" // 9 3 3 1 + 8 (2, lo) + "psrlw $4,%%xmm1 \n" // ^ div by 16 + "movdqu %%xmm1,(%1,%4,2) \n" + + "movdqa %%xmm3,%%xmm4 \n" + "paddw %%xmm7,%%xmm2 \n" // 3*near+far+8 (1, hi) + "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, hi) + "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (2, hi) + "paddw %%xmm2,%%xmm3 \n" // 9 3 3 1 + 8 (2, hi) + "psrlw $4,%%xmm3 \n" // ^ div by 16 + "movdqu %%xmm3,0x10(%1,%4,2) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kLinearShuffleFar) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2 +void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqd %%xmm4,%%xmm4 \n" + "psrld $31,%%xmm4 \n" + "pslld $1,%%xmm4 \n" // all 2 - // 16 pixel loop. 
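// 2x horizontal upsample of 16-bit samples. Each pair is widened to
// 32 bits so the 3*near+far sum cannot overflow; every output sample is
// (3*near + far + 2) >> 2, i.e. 3/4 of the nearer source sample plus
// 1/4 of the farther one, with rounding.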
LABELALIGN "1: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" // src_ptr += 16 - "movdqu (%1),%%xmm0 \n" - "movdqu 0x10(%1),%%xmm1 \n" - "movdqa %%xmm3,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "paddusw %%xmm2,%%xmm0 \n" - "paddusw %%xmm3,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "movq (%0),%%xmm0 \n" // 0123 (16b) + "movq 2(%0),%%xmm1 \n" // 1234 (16b) + + "punpcklwd %%xmm5,%%xmm0 \n" // 0123 (32b) + "punpcklwd %%xmm5,%%xmm1 \n" // 1234 (32b) + + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + + "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) + "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) + + "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) + "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) + "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) + "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) + + "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) + "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) + "packssdw %%xmm1,%%xmm0 \n" + "pshufd $0b11011000,%%xmm0,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 + "+r"(dst_width) // %2 : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } +#endif -#ifdef HAS_SCALEADDROW_AVX2 -// Reads 32 bytes and accumulates to 32 shorts at a time. -void ScaleAddRow_AVX2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { +#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2 +void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pxor %%xmm7,%%xmm7 \n" + "pcmpeqd %%xmm6,%%xmm6 \n" + "psrld $31,%%xmm6 \n" + "pslld $3,%%xmm6 \n" // all 8 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) + "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) + "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) + "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) + "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) + "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) + "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + "movq (%0),%%xmm0 \n" // 0123 (16b) + "movq 2(%0),%%xmm1 \n" // 1234 (16b) + "punpcklwd %%xmm7,%%xmm0 \n" // 0123 (32b) + "punpcklwd %%xmm7,%%xmm1 \n" // 1234 (32b) + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) + "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) + "paddd %%xmm0,%%xmm2 \n" // near+far (lo) + "paddd %%xmm1,%%xmm3 \n" // near+far (hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + "movq (%0,%3,2),%%xmm2 \n" + "movq 2(%0,%3,2),%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm2 \n" // 0123 (32b) + "punpcklwd 
%%xmm7,%%xmm3 \n" // 1234 (32b) + "movdqa %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufd $0b10110001,%%xmm4,%%xmm4 \n" // 1032 (even, far) + "pshufd $0b10110001,%%xmm5,%%xmm5 \n" // 2143 (odd, far) + "paddd %%xmm2,%%xmm4 \n" // near+far (lo) + "paddd %%xmm3,%%xmm5 \n" // near+far (hi) + "paddd %%xmm2,%%xmm2 \n" // 2*near (lo) + "paddd %%xmm3,%%xmm3 \n" // 2*near (hi) + "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) + "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) + + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) + "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) + "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) + + "movdqa %%xmm1,%%xmm0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) + "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) + "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) + "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) + + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) + "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) + "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) + "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) + "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) + + "packssdw %%xmm0,%%xmm4 \n" + "pshufd $0b11011000,%%xmm4,%%xmm4 \n" + "movdqu %%xmm4,(%1) \n" // store above + "packssdw %%xmm2,%%xmm5 \n" + "pshufd $0b11011000,%%xmm5,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4,2) \n" // store below + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 +void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "pcmpeqw %%xmm4,%%xmm4 \n" + "psrlw $15,%%xmm4 \n" + "psllw $1,%%xmm4 \n" // all 2 + "movdqa %3,%%xmm3 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 01234567 + "movq 1(%0),%%xmm1 \n" // 12345678 + "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 + "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 + "movdqa %%xmm0,%%xmm2 \n" + "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 + "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 + "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi) + "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo) + "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) + "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) + "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) + "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearMadd31) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 +void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, + 
ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pcmpeqw %%xmm6,%%xmm6 \n" + "psrlw $15,%%xmm6 \n" + "psllw $3,%%xmm6 \n" // all 8 + "movdqa %5,%%xmm7 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 01234567 + "movq 1(%0),%%xmm1 \n" // 12345678 + "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 + "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 + "movdqa %%xmm0,%%xmm2 \n" + "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 + "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 + "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi) + "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo) + + "movq (%0,%3),%%xmm1 \n" + "movq 1(%0,%3),%%xmm4 \n" + "punpcklwd %%xmm1,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm4 \n" + "movdqa %%xmm1,%%xmm3 \n" + "punpckhdq %%xmm4,%%xmm3 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) + "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) + + // xmm0 xmm2 + // xmm1 xmm3 + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) + + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) + "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) + "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) + + "movdqa %%xmm2,%%xmm0 \n" + "movdqa %%xmm3,%%xmm1 \n" + "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) + "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) + "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) + "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) + "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) + + "movdqa %%xmm3,%%xmm1 \n" + "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) + "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) + "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) + "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) + "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) + + "packuswb %%xmm0,%%xmm4 \n" + "movdqu %%xmm4,(%1) \n" // store above + "packuswb %%xmm1,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4) \n" // store below + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kLinearMadd31) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_AVX2 +void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $15,%%ymm4,%%ymm4 \n" + "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 + "vbroadcastf128 %3,%%ymm3 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF + "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" + "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) + "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo) + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 
3*near+far+2 (hi) + "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 16 sample to 32 sample + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearMadd31) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2 +void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrlw $15,%%ymm6,%%ymm6 \n" + "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 + "vbroadcastf128 %5,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF + "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" + "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) + "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo) + + "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF + "vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0 + "vpermq $0b11011000,%%ymm2,%%ymm2 \n" + "vpermq $0b11011000,%%ymm3,%%ymm3 \n" + "vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n" + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" + "vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n" + "vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) + "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo) + + // ymm0 ymm1 + // ymm2 ymm3 + + "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) + "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) + "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) + "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) + + "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) + + "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" + "vmovdqu %%ymm5,(%1,%4) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 16 sample to 32 sample + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kLinearMadd31) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef 
HAS_SCALEROWUP2_LINEAR_12_AVX2 +void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + "vbroadcastf128 %3,%%ymm5 \n" + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $15,%%ymm4,%%ymm4 \n" + "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // 0123456789ABCDEF (16b) + "vmovdqu 2(%0),%%ymm1 \n" // 123456789ABCDEF0 (16b) + + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 012389AB4567CDEF + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 12349ABC5678DEF0 + + "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" // 899AABBCCDDEEFF0 (near) + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) + "vpshufb %%ymm5,%%ymm2,%%ymm3 \n" // 98A9BACBDCEDFE0F (far) + "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) + + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // far+2 + "vpaddw %%ymm4,%%ymm3,%%ymm3 \n" // far+2 + "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far+2 + "vpaddw %%ymm2,%%ymm3,%%ymm3 \n" // near+far+2 + "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near + "vpaddw %%ymm2,%%ymm2,%%ymm2 \n" // 2*near + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 3*near+far+2 + "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 3*near+far+2 + + "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far + "vpsrlw $2,%%ymm2,%%ymm2 \n" // 3/4*near+1/4*far + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm2,32(%1) \n" + + "lea 0x20(%0),%0 \n" + "lea 0x40(%1),%1 \n" // 16 sample to 32 sample + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearShuffleFar) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2 +void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $15,%%ymm4,%%ymm4 \n" + "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8 - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN + "1: \n" + + "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b) + "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b) + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) + "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) + "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far + "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near + "vpaddw %%ymm0,%%ymm1,%%ymm2 \n" // 3*near+far (1) + + "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b) + "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b) + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) + "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) + "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far + "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near + "vpaddw %%ymm0,%%ymm1,%%ymm3 \n" // 3*near+far (2) + + "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1) + "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2) + "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1) + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1) + "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 + "vmovdqu %%ymm0,(%1) \n" // store above + + "vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2) + "vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1) + "vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2) + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 
+ 8 (2) + "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 + "vmovdqu %%ymm0,(%1,%4,2) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kLinearShuffleFar) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2 +void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $31,%%ymm4,%%ymm4 \n" + "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm3 \n" - "lea 0x20(%0),%0 \n" // src_ptr += 32 - "vpermq $0xd8,%%ymm3,%%ymm3 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpaddusw (%1),%%ymm2,%%ymm0 \n" - "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) + "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) + + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) + + "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) + "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) + + "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) + "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) + + "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" + "vpshufd $0b11011000,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel + "sub $0x10,%2 \n" + "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 + "+r"(dst_width) // %2 : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2 +void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrld $31,%%ymm6,%%ymm6 \n" + "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 + + LABELALIGN + "1: \n" + + "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) + "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) + "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) + "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (1, lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (1, hi) + + "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b, 1u1v) + "vmovdqu 2(%0,%3,2),%%xmm3 \n" 
// 12345678 (16b, 1u1v) + "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) + "vpshufd $0b10110001,%%ymm2,%%ymm4 \n" // 10325476 (lo, far) + "vpshufd $0b10110001,%%ymm3,%%ymm5 \n" // 21436587 (hi, far) + "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) + "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) + "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) + "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) + "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (2, lo) + "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (2, hi) + + "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) + "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) + "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) + "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) + + "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) + + "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" + "vpshufd $0b11011000,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" + "vpshufd $0b11011000,%%ymm5,%%ymm5 \n" + "vmovdqu %%ymm5,(%1,%4,2) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +// Reads 16xN bytes and produces 16 shorts at a time. +void ScaleAddRow_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + asm volatile("pxor %%xmm5,%%xmm5 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" // src_ptr += 16 + "movdqu (%1),%%xmm0 \n" + "movdqu 0x10(%1),%%xmm1 \n" + "movdqa %%xmm3,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} + +#ifdef HAS_SCALEADDROW_AVX2 +// Reads 32 bytes and accumulates to 32 shorts at a time. 
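+// Scalar sketch of the same accumulation (the SIMD path zero-extends each
+// byte to 16 bits and saturating-adds it into dst with vpaddusw):
+//   for (int i = 0; i < src_width; ++i) {
+//     uint32_t sum = (uint32_t)dst_ptr[i] + src_ptr[i];
+//     dst_ptr[i] = (uint16_t)(sum > 65535 ? 65535 : sum);
+//   }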
+void ScaleAddRow_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm3 \n" + "lea 0x20(%0),%0 \n" // src_ptr += 32 + "vpermq $0xd8,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpaddusw (%1),%%ymm2,%%ymm0 \n" + "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SCALEADDROW_AVX2 @@ -866,69 +1835,69 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, int dx) { intptr_t x0, x1, temp_pixel; asm volatile( - "movd %6,%%xmm2 \n" - "movd %7,%%xmm3 \n" - "movl $0x04040000,%k2 \n" - "movd %k2,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" // 0x007f007f - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $15,%%xmm7 \n" // 0x00010001 - - "pextrw $0x1,%%xmm2,%k3 \n" - "subl $0x2,%5 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" + "movd %6,%%xmm2 \n" + "movd %7,%%xmm3 \n" + "movl $0x04040000,%k2 \n" + "movd %k2,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" // 0x007f007f + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $15,%%xmm7 \n" // 0x00010001 + + "pextrw $0x1,%%xmm2,%k3 \n" + "subl $0x2,%5 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" LABELALIGN "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - "movzwl 0x00(%1,%3,1),%k2 \n" - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - "movzwl 0x00(%1,%4,1),%k2 \n" - "movd %k2,%%xmm4 \n" - "pshufb %%xmm5,%%xmm1 \n" - "punpcklwd %%xmm4,%%xmm0 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movzwl 0x00(%1,%4,1),%k2 \n" + "movd %k2,%%xmm4 \n" + "pshufb %%xmm5,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm0 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + // 1 - "paddusb %%xmm7,%%xmm1 \n" - "pmaddubsw %%xmm0,%%xmm1 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "paddw %9,%%xmm1 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,%k2 \n" - "mov %w2,(%0) \n" - "lea 0x2(%0),%0 \n" - "subl $0x2,%5 \n" - "jge 2b \n" + "paddusb %%xmm7,%%xmm1 \n" + "pmaddubsw %%xmm0,%%xmm1 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "paddw %9,%%xmm1 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,%k2 \n" + "mov %w2,(%0) \n" + "lea 0x2(%0),%0 \n" + "subl $0x2,%5 \n" + "jge 2b \n" LABELALIGN "29: \n" - "addl $0x1,%5 \n" - "jl 99f \n" - "movzwl 0x00(%1,%3,1),%k2 \n" - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm2 \n" - "pshufb %%xmm5,%%xmm2 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm2 \n" - "paddusb %%xmm7,%%xmm2 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "paddw %9,%%xmm2 \n" // make pixels unsigned. 
- "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movd %%xmm2,%k2 \n" - "mov %b2,(%0) \n" + "addl $0x1,%5 \n" + "jl 99f \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm2 \n" + "pshufb %%xmm5,%%xmm2 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm2 \n" + "paddusb %%xmm7,%%xmm2 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "paddw %9,%%xmm2 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movd %%xmm2,%k2 \n" + "mov %b2,(%0) \n" "99: \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 @@ -962,26 +1931,24 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr, int dx) { (void)x; (void)dx; - asm volatile( - - LABELALIGN + asm volatile(LABELALIGN "1: \n" - "movdqu (%1),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, @@ -989,23 +1956,21 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile( - - LABELALIGN + asm volatile(LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "shufps $0xdd,%%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "shufps $0xdd,%%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, @@ -1013,56 +1978,52 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile( - - LABELALIGN + asm volatile(LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { - asm volatile( - - LABELALIGN + asm volatile(LABELALIGN "1: 
\n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } // Reads 4 pixels at a time. @@ -1076,23 +2037,23 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, intptr_t src_stepx_x12; (void)src_stride; asm volatile( - "lea 0x00(,%1,4),%1 \n" - "lea 0x00(%1,%1,2),%4 \n" + "lea 0x00(,%1,4),%1 \n" + "lea 0x00(%1,%1,2),%4 \n" LABELALIGN "1: \n" - "movd (%0),%%xmm0 \n" - "movd 0x00(%0,%1,1),%%xmm1 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movd 0x00(%0,%1,2),%%xmm2 \n" - "movd 0x00(%0,%4,1),%%xmm3 \n" - "lea 0x00(%0,%1,4),%0 \n" - "punpckldq %%xmm3,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" + "movd (%0),%%xmm0 \n" + "movd 0x00(%0,%1,1),%%xmm1 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd 0x00(%0,%1,2),%%xmm2 \n" + "movd 0x00(%0,%4,1),%%xmm3 \n" + "lea 0x00(%0,%1,4),%0 \n" + "punpckldq %%xmm3,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(src_stepx_x4), // %1 "+r"(dst_argb), // %2 @@ -1113,32 +2074,32 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, intptr_t src_stepx_x12; intptr_t row1 = (intptr_t)(src_stride); asm volatile( - "lea 0x00(,%1,4),%1 \n" - "lea 0x00(%1,%1,2),%4 \n" - "lea 0x00(%0,%5,1),%5 \n" + "lea 0x00(,%1,4),%1 \n" + "lea 0x00(%1,%1,2),%4 \n" + "lea 0x00(%0,%5,1),%5 \n" LABELALIGN "1: \n" - "movq (%0),%%xmm0 \n" - "movhps 0x00(%0,%1,1),%%xmm0 \n" - "movq 0x00(%0,%1,2),%%xmm1 \n" - "movhps 0x00(%0,%4,1),%%xmm1 \n" - "lea 0x00(%0,%1,4),%0 \n" - "movq (%5),%%xmm2 \n" - "movhps 0x00(%5,%1,1),%%xmm2 \n" - "movq 0x00(%5,%1,2),%%xmm3 \n" - "movhps 0x00(%5,%4,1),%%xmm3 \n" - "lea 0x00(%5,%1,4),%5 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" + "movq (%0),%%xmm0 \n" + "movhps 0x00(%0,%1,1),%%xmm0 \n" + "movq 0x00(%0,%1,2),%%xmm1 \n" + "movhps 0x00(%0,%4,1),%%xmm1 \n" + "lea 0x00(%0,%1,4),%0 \n" + "movq (%5),%%xmm2 \n" + "movhps 0x00(%5,%1,1),%%xmm2 \n" + "movq 0x00(%5,%1,2),%%xmm3 \n" + "movhps 0x00(%5,%4,1),%%xmm3 \n" + "lea 0x00(%5,%1,4),%5 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb 
%%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(src_stepx_x4), // %1 "+r"(dst_argb), // %2 @@ -1156,56 +2117,56 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb, int dx) { intptr_t x0, x1; asm volatile( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "pshufd $0x11,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x5,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "cmp $0x0,%4 \n" - "jl 99f \n" - "sub $0x4,%4 \n" - "jl 49f \n" + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "pshufd $0x11,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x5,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "cmp $0x0,%4 \n" + "jl 99f \n" + "sub $0x4,%4 \n" + "jl 49f \n" LABELALIGN "40: \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd 0x00(%3,%1,4),%%xmm1 \n" - "pextrw $0x5,%%xmm2,%k0 \n" - "pextrw $0x7,%%xmm2,%k1 \n" - "paddd %%xmm3,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movd 0x00(%3,%0,4),%%xmm1 \n" - "movd 0x00(%3,%1,4),%%xmm4 \n" - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "punpckldq %%xmm4,%%xmm1 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd 0x00(%3,%1,4),%%xmm1 \n" + "pextrw $0x5,%%xmm2,%k0 \n" + "pextrw $0x7,%%xmm2,%k1 \n" + "paddd %%xmm3,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd 0x00(%3,%0,4),%%xmm1 \n" + "movd 0x00(%3,%1,4),%%xmm4 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" "49: \n" - "test $0x2,%4 \n" - "je 29f \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd 0x00(%3,%1,4),%%xmm1 \n" - "pextrw $0x5,%%xmm2,%k0 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%2) \n" - "lea 0x8(%2),%2 \n" + "test $0x2,%4 \n" + "je 29f \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd 0x00(%3,%1,4),%%xmm1 \n" + "pextrw $0x5,%%xmm2,%k0 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%2) \n" + "lea 0x8(%2),%2 \n" "29: \n" - "test $0x1,%4 \n" - "je 99f \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd %%xmm0,(%2) \n" + "test $0x1,%4 \n" + "je 99f \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd %%xmm0,(%2) \n" "99: \n" : "=&a"(x0), // %0 "=&d"(x1), // %1 @@ -1226,26 +2187,24 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, int dx) { (void)x; (void)dx; - asm volatile( - - LABELALIGN + asm volatile(LABELALIGN "1: \n" - "movdqu (%1),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpckldq %%xmm0,%%xmm0 \n" - "punpckhdq %%xmm1,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpckldq %%xmm0,%%xmm0 \n" + "punpckhdq %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width) // %2 + 
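// no inputs: x and dx are unused on this 2x fast path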
::"memory", + "cc", "xmm0", "xmm1"); } // Shuffle table for arranging 2 pixels into pairs for pmaddubsw @@ -1267,63 +2226,64 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, int dx) { intptr_t x0, x1; asm volatile( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm5 \n" + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm5 \n" : : "m"(kShuffleColARGB), // %0 "m"(kShuffleFractions) // %1 ); asm volatile( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "sub $0x2,%2 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "sub $0x2,%2 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" LABELALIGN "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - "movq 0x00(%1,%3,4),%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - "movhps 0x00(%1,%4,4),%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%0) \n" - "lea 0x8(%0),%0 \n" - "sub $0x2,%2 \n" - "jge 2b \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movq 0x00(%1,%3,4),%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movhps 0x00(%1,%4,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%0) \n" + "lea 0x8(%0),%0 \n" + "sub $0x2,%2 \n" + "jge 2b \n" LABELALIGN "29: \n" - "add $0x1,%2 \n" - "jl 99f \n" - "psrlw $0x9,%%xmm2 \n" - "movq 0x00(%1,%3,4),%%xmm0 \n" - "pshufb %%xmm5,%%xmm2 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm2 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%0) \n" - - LABELALIGN "99: \n" // clang-format error. + "add $0x1,%2 \n" + "jl 99f \n" + "psrlw $0x9,%%xmm2 \n" + "movq 0x00(%1,%3,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm2 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%0) \n" + + LABELALIGN + "99: \n" // clang-format error. 
: "+r"(dst_argb), // %0 "+r"(src_argb), // %1 @@ -1339,10 +2299,10 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, int FixedDiv_X86(int num, int div) { asm volatile( "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "idiv %1 \n" - "mov %0, %%eax \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "idiv %1 \n" + "mov %0, %%eax \n" : "+a"(num) // %0 : "c"(div) // %1 : "memory", "cc", "edx"); @@ -1353,19 +2313,637 @@ int FixedDiv_X86(int num, int div) { int FixedDiv1_X86(int num, int div) { asm volatile( "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "sub $0x10001,%%eax \n" - "sbb $0x0,%%edx \n" - "sub $0x1,%1 \n" - "idiv %1 \n" - "mov %0, %%eax \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "sub $0x10001,%%eax \n" + "sbb $0x0,%%edx \n" + "sub $0x1,%1 \n" + "idiv %1 \n" + "mov %0, %%eax \n" : "+a"(num) // %0 : "c"(div) // %1 : "memory", "cc", "edx"); return num; } +#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) || \ + defined(HAS_SCALEUVROWDOWN2BOX_AVX2) + +// Shuffle table for splitting UV into upper and lower part of register. +static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u, + 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u}; +static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u, + 6u, 14u, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}; +#endif + +#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 + +void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5, %%xmm5 \n" // zero + "movdqa %4,%%xmm1 \n" // split shuffler + "movdqa %5,%%xmm3 \n" // merge shuffler + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" // 8 UV row 0 + "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1 + "lea 0x10(%0),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv + "pshufb %%xmm1,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add + "pmaddubsw %%xmm4,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" // vertical add + "psrlw $0x1,%%xmm0 \n" // round + "pavgw %%xmm5,%%xmm0 \n" + "pshufb %%xmm3,%%xmm0 \n" // merge uv + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" // 4 UV + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kShuffleSplitUV), // %4 + "m"(kShuffleMergeUV) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3 + +#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 +void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero + "vbroadcastf128 %4,%%ymm1 \n" // split shuffler + "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0 + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1 + "lea 0x20(%0),%0 \n" + "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv + "vpshufb %%ymm1,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv + "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 
\n" // 8 UV + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kShuffleSplitUV), // %4 + "m"(kShuffleMergeUV) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_SCALEUVROWDOWN2BOX_AVX2 + +static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3, + 3, 1, 3, 1, 1, 3, 1, 3}; + +#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 +void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "pcmpeqw %%xmm4,%%xmm4 \n" + "psrlw $15,%%xmm4 \n" + "psllw $1,%%xmm4 \n" // all 2 + "movdqa %3,%%xmm3 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 00112233 (1u1v) + "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) + "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) + "movdqa %%xmm0,%%xmm2 \n" + "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) + "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) + "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (1u1v16, hi) + "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (1u1v16, lo) + "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) + "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) + "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) + "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 uv to 8 uv + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kUVLinearMadd31) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3 +void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pcmpeqw %%xmm6,%%xmm6 \n" + "psrlw $15,%%xmm6 \n" + "psllw $3,%%xmm6 \n" // all 8 + "movdqa %5,%%xmm7 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 00112233 (1u1v) + "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) + "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) + "movdqa %%xmm0,%%xmm2 \n" + "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) + "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) + "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1u1v16, hi) + "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1u1v16, lo) + + "movq (%0,%3),%%xmm1 \n" + "movq 2(%0,%3),%%xmm4 \n" + "punpcklbw %%xmm4,%%xmm1 \n" + "movdqa %%xmm1,%%xmm3 \n" + "punpckhdq %%xmm1,%%xmm3 \n" + "punpckldq %%xmm1,%%xmm1 \n" + "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) + "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) + + // xmm0 xmm2 + // xmm1 xmm3 + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) + + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) + "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) + "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) + + "movdqa %%xmm2,%%xmm0 \n" + "movdqa %%xmm3,%%xmm1 \n" + "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) + "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) + "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) + "paddw %%xmm1,%%xmm0 \n" // 9 3 3 
1 + 8 (1, hi) + "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) + + "movdqa %%xmm3,%%xmm1 \n" + "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) + "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) + "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) + "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) + "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) + + "packuswb %%xmm0,%%xmm4 \n" + "movdqu %%xmm4,(%1) \n" // store above + "packuswb %%xmm1,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4) \n" // store below + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 uv to 8 uv + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kUVLinearMadd31) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2 + +void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $15,%%ymm4,%%ymm4 \n" + "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 + "vbroadcastf128 %3,%%ymm3 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" + "vmovdqu 2(%0),%%xmm1 \n" + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" + "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" + "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) + "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo) + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) + "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 uv to 16 uv + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kUVLinearMadd31) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2 +void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrlw $15,%%ymm6,%%ymm6 \n" + "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 + "vbroadcastf128 %5,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" + "vmovdqu 2(%0),%%xmm1 \n" + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" + "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" + "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) + "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo) + + "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF + "vmovdqu 2(%0,%3),%%xmm3 \n" // 123456789ABCDEF0 + "vpermq $0b11011000,%%ymm2,%%ymm2 \n" + "vpermq $0b11011000,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n" + "vpunpckhdq %%ymm2,%%ymm2,%%ymm4 \n" + "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) + "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo) + + // ymm0 ymm1 + // ymm2 ymm3 + + "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) + "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, 
lo) + "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) + "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) + + "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) + + "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" + "vmovdqu %%ymm5,(%1,%4) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 uv to 16 uv + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kUVLinearMadd31) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 +void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqd %%xmm4,%%xmm4 \n" + "psrld $31,%%xmm4 \n" + "pslld $1,%%xmm4 \n" // all 2 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) + "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) + + "punpcklwd %%xmm5,%%xmm0 \n" // 0011 (32b, 1u1v) + "punpcklwd %%xmm5,%%xmm1 \n" // 1122 (32b, 1u1v) + + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + + "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (lo, far) + "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (hi, far) + + "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) + "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) + "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) + "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) + + "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) + "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) + "packusdw %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 2 uv to 4 uv + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 +void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pxor %%xmm7,%%xmm7 \n" + "pcmpeqd %%xmm6,%%xmm6 \n" + "psrld $31,%%xmm6 \n" + "pslld $3,%%xmm6 \n" // all 8 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) + "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) + "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) + "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pshufd 
$0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) + "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) + "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) + "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + "movq (%0,%3,2),%%xmm2 \n" + "movq 4(%0,%3,2),%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "movdqa %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufd $0b01001110,%%xmm4,%%xmm4 \n" // 1100 (far) (2, lo) + "pshufd $0b01001110,%%xmm5,%%xmm5 \n" // 2211 (far) (2, hi) + "paddd %%xmm2,%%xmm4 \n" // near+far (2, lo) + "paddd %%xmm3,%%xmm5 \n" // near+far (2, hi) + "paddd %%xmm2,%%xmm2 \n" // 2*near (2, lo) + "paddd %%xmm3,%%xmm3 \n" // 2*near (2, hi) + "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) + "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) + + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) + "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) + "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) + + "movdqa %%xmm1,%%xmm0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) + "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) + "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) + "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) + + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) + "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) + "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) + "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) + "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) + + "packusdw %%xmm0,%%xmm4 \n" + "movdqu %%xmm4,(%1) \n" // store above + "packusdw %%xmm2,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4,2) \n" // store below + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 2 uv to 4 uv + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 +void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $31,%%ymm4,%%ymm4 \n" + "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) + "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) + + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) + + "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) + "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) + + "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) + "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd 
%%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) + + "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 4 uv to 8 uv + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 +void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrld $31,%%ymm6,%%ymm6 \n" + "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 + + LABELALIGN + "1: \n" + + "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) + "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) + "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) + "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (hi) + + "vmovdqu (%0,%3,2),%%xmm2 \n" // 00112233 (16b, 1u1v) + "vmovdqu 4(%0,%3,2),%%xmm3 \n" // 11223344 (16b, 1u1v) + "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) + "vpshufd $0b01001110,%%ymm2,%%ymm4 \n" // 11003322 (lo, far) + "vpshufd $0b01001110,%%ymm3,%%ymm5 \n" // 22114433 (hi, far) + "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) + "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) + "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) + "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) + "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (lo) + "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (hi) + + "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) + "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) + "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) + "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) + + "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) + + "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" + "vmovdqu 
%%ymm5,(%1,%4,2) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 4 uv to 8 uv + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/files/source/scale_lsx.cc b/files/source/scale_lsx.cc new file mode 100644 index 00000000..bfe5e9fb --- /dev/null +++ b/files/source/scale_lsx.cc @@ -0,0 +1,739 @@ +/* + * Copyright 2022 The LibYuv Project Authors. All rights reserved. + * + * Copyright (c) 2022 Loongson Technology Corporation Limited + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "libyuv/scale_row.h" + +#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) +#include "libyuv/loongson_intrinsics.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define LOAD_DATA(_src, _in, _out) \ + { \ + int _tmp1, _tmp2, _tmp3, _tmp4; \ + DUP4_ARG2(__lsx_vpickve2gr_w, _in, 0, _in, 1, _in, 2, _in, 3, _tmp1, \ + _tmp2, _tmp3, _tmp4); \ + _out = __lsx_vinsgr2vr_w(_out, _src[_tmp1], 0); \ + _out = __lsx_vinsgr2vr_w(_out, _src[_tmp2], 1); \ + _out = __lsx_vinsgr2vr_w(_out, _src[_tmp3], 2); \ + _out = __lsx_vinsgr2vr_w(_out, _src[_tmp4], 3); \ + } + +void ScaleARGBRowDown2_LSX(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + int len = dst_width / 4; + (void)src_stride; + __m128i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + dst0 = __lsx_vpickod_w(src1, src0); + __lsx_vst(dst0, dst_argb, 0); + src_argb += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDown2Linear_LSX(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + int len = dst_width / 4; + (void)src_stride; + __m128i src0, src1, tmp0, tmp1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_w(src1, src0); + tmp1 = __lsx_vpickod_w(src1, src0); + dst0 = __lsx_vavgr_bu(tmp1, tmp0); + __lsx_vst(dst0, dst_argb, 0); + src_argb += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDown2Box_LSX(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + int len = dst_width / 4; + const uint8_t* s = src_argb; + const uint8_t* t = src_argb + src_stride; + __m128i src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3, dst0; + __m128i reg0, reg1, reg2, reg3; + __m128i shuff = {0x0703060205010400, 0x0F0B0E0A0D090C08}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, s, 0, s, 16, src0, src1); + DUP2_ARG2(__lsx_vld, t, 0, t, 16, src2, src3); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff, src1, src1, shuff, src2, src2, + shuff, src3, src3, shuff, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vhaddw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, + tmp3, reg0, reg1, reg2, reg3); + DUP2_ARG2(__lsx_vsadd_hu, reg0, reg2, reg1, reg3, reg0, reg1); + dst0 = __lsx_vsrarni_b_h(reg1, reg0, 2); + __lsx_vst(dst0, dst_argb, 0); + s += 32; + 
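+    // advance the second-row pointer in step with the first (32 bytes = 8 ARGB pixels per row)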
t += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDownEven_LSX(const uint8_t* src_argb, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_argb, + int dst_width) { + int x; + int len = dst_width / 4; + int32_t stepx = src_stepx << 2; + (void)src_stride; + __m128i dst0, dst1, dst2, dst3; + + for (x = 0; x < len; x++) { + dst0 = __lsx_vldrepl_w(src_argb, 0); + src_argb += stepx; + dst1 = __lsx_vldrepl_w(src_argb, 0); + src_argb += stepx; + dst2 = __lsx_vldrepl_w(src_argb, 0); + src_argb += stepx; + dst3 = __lsx_vldrepl_w(src_argb, 0); + src_argb += stepx; + __lsx_vstelm_w(dst0, dst_argb, 0, 0); + __lsx_vstelm_w(dst1, dst_argb, 4, 0); + __lsx_vstelm_w(dst2, dst_argb, 8, 0); + __lsx_vstelm_w(dst3, dst_argb, 12, 0); + dst_argb += 16; + } +} + +void ScaleARGBRowDownEvenBox_LSX(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + int x; + int len = dst_width / 4; + int32_t stepx = src_stepx * 4; + const uint8_t* next_argb = src_argb + src_stride; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i reg0, reg1, dst0; + + for (x = 0; x < len; x++) { + tmp0 = __lsx_vldrepl_d(src_argb, 0); + src_argb += stepx; + tmp1 = __lsx_vldrepl_d(src_argb, 0); + src_argb += stepx; + tmp2 = __lsx_vldrepl_d(src_argb, 0); + src_argb += stepx; + tmp3 = __lsx_vldrepl_d(src_argb, 0); + src_argb += stepx; + tmp4 = __lsx_vldrepl_d(next_argb, 0); + next_argb += stepx; + tmp5 = __lsx_vldrepl_d(next_argb, 0); + next_argb += stepx; + tmp6 = __lsx_vldrepl_d(next_argb, 0); + next_argb += stepx; + tmp7 = __lsx_vldrepl_d(next_argb, 0); + next_argb += stepx; + DUP4_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, + src0, src1, src2, src3); + DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2); + DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3); + DUP2_ARG2(__lsx_vpackev_w, tmp1, tmp0, tmp3, tmp2, reg0, reg1); + DUP2_ARG2(__lsx_vpackod_w, tmp1, tmp0, tmp3, tmp2, tmp4, tmp5); + DUP2_ARG2(__lsx_vadd_h, reg0, tmp4, reg1, tmp5, reg0, reg1); + dst0 = __lsx_vsrarni_b_h(reg1, reg0, 2); + dst0 = __lsx_vshuf4i_b(dst0, 0xD8); + __lsx_vst(dst0, dst_argb, 0); + dst_argb += 16; + } +} + +void ScaleRowDown2_LSX(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + int len = dst_width / 32; + __m128i src0, src1, src2, src3, dst0, dst1; + (void)src_stride; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, dst0, dst1); + __lsx_vst(dst0, dst, 0); + __lsx_vst(dst1, dst, 16); + src_ptr += 64; + dst += 32; + } +} + +void ScaleRowDown2Linear_LSX(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + int len = dst_width / 32; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3, dst0, dst1; + (void)src_stride; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2); + DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp1, tmp2, tmp3, dst0, dst1); + __lsx_vst(dst0, dst, 0); + __lsx_vst(dst1, dst, 16); + src_ptr += 64; + dst += 32; + } +} + +void ScaleRowDown2Box_LSX(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + int len = dst_width 
/ 32; + const uint8_t* src_nex = src_ptr + src_stride; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i dst0, dst1; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48, + src4, src5, src6, src7); + DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3, + src7, tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3, + src7, tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vsrarni_b_h, tmp1, tmp0, 2, tmp3, tmp2, 2, dst0, dst1); + __lsx_vst(dst0, dst, 0); + __lsx_vst(dst1, dst, 16); + src_ptr += 64; + src_nex += 64; + dst += 32; + } +} + +void ScaleRowDown4_LSX(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + int len = dst_width / 16; + __m128i src0, src1, src2, src3, tmp0, tmp1, dst0; + (void)src_stride; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp1); + dst0 = __lsx_vpickod_b(tmp1, tmp0); + __lsx_vst(dst0, dst, 0); + src_ptr += 64; + dst += 16; + } +} + +void ScaleRowDown4Box_LSX(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + int len = dst_width / 16; + const uint8_t* ptr1 = src_ptr + src_stride; + const uint8_t* ptr2 = ptr1 + src_stride; + const uint8_t* ptr3 = ptr2 + src_stride; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, dst0; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, ptr1, 0, ptr1, 16, ptr1, 32, ptr1, 48, src4, src5, + src6, src7); + DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3, + src7, tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3, + src7, tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, ptr2, 0, ptr2, 16, ptr2, 32, ptr2, 48, src0, src1, + src2, src3); + DUP4_ARG2(__lsx_vld, ptr3, 0, ptr3, 16, ptr3, 32, ptr3, 48, src4, src5, + src6, src7); + DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3, + src7, tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3, + src7, tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, + reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vhaddw_wu_hu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, + reg3, reg0, reg1, reg2, reg3); + DUP2_ARG3(__lsx_vsrarni_h_w, reg1, reg0, 4, reg3, reg2, 4, tmp0, tmp1); + dst0 = __lsx_vpickev_b(tmp1, tmp0); + __lsx_vst(dst0, dst, 0); + src_ptr += 64; + ptr1 += 64; + ptr2 += 64; + ptr3 += 64; + dst += 16; + } +} + +void ScaleRowDown38_LSX(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x, len; + __m128i src0, src1, tmp0; + __m128i shuff = {0x13100E0B08060300, 
0x000000001E1B1816}; + + assert(dst_width % 3 == 0); + len = dst_width / 12; + (void)src_stride; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + tmp0 = __lsx_vshuf_b(src1, src0, shuff); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst, 8, 2); + src_ptr += 32; + dst += 12; + } +} + +void ScaleRowDown38_2_Box_LSX(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + int x, len; + const uint8_t* src_nex = src_ptr + src_stride; + __m128i src0, src1, src2, src3, dst0; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i reg0, reg1, reg2, reg3; + __m128i shuff = {0x0A08160604120200, 0x000000001E0E0C1A}; + __m128i const_0x2AAA = __lsx_vreplgr2vr_h(0x2AAA); + __m128i const_0x4000 = __lsx_vreplgr2vr_w(0x4000); + + assert((dst_width % 3 == 0) && (dst_width > 0)); + len = dst_width / 12; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_nex, 0, src_nex, 16, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2); + DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3); + DUP2_ARG2(__lsx_vpickev_h, tmp2, tmp0, tmp3, tmp1, reg0, reg1); + DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, reg2, reg3); + tmp4 = __lsx_vpickev_w(reg3, reg2); + tmp5 = __lsx_vadd_h(reg0, reg1); + tmp6 = __lsx_vadd_h(tmp5, tmp4); + tmp7 = __lsx_vmuh_h(tmp6, const_0x2AAA); + tmp0 = __lsx_vpickod_w(reg3, reg2); + tmp1 = __lsx_vhaddw_wu_hu(tmp0, tmp0); + tmp2 = __lsx_vmul_w(tmp1, const_0x4000); + dst0 = __lsx_vshuf_b(tmp2, tmp7, shuff); + __lsx_vstelm_d(dst0, dst_ptr, 0, 0); + __lsx_vstelm_w(dst0, dst_ptr, 8, 2); + src_ptr += 32; + src_nex += 32; + dst_ptr += 12; + } +} + +void ScaleRowDown38_3_Box_LSX(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + int x, len; + const uint8_t* ptr1 = src_ptr + src_stride; + const uint8_t* ptr2 = ptr1 + src_stride; + __m128i src0, src1, src2, src3, src4, src5; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i reg0, reg1, reg2, reg3, dst0; + __m128i zero = __lsx_vldi(0); + __m128i shuff = {0x0A08160604120200, 0x000000001E0E0C1A}; + __m128i const_0x1C71 = __lsx_vreplgr2vr_h(0x1C71); + __m128i const_0x2AAA = __lsx_vreplgr2vr_w(0x2AAA); + + assert((dst_width % 3 == 0) && (dst_width > 0)); + len = dst_width / 12; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, ptr1, 0, ptr1, 16, src0, src1, + src2, src3); + DUP2_ARG2(__lsx_vld, ptr2, 0, ptr2, 16, src4, src5); + DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2); + DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3); + DUP2_ARG2(__lsx_vpackev_b, zero, src4, zero, src5, tmp4, tmp6); + DUP2_ARG2(__lsx_vpackod_b, zero, src4, zero, src5, tmp5, tmp7); + DUP4_ARG2(__lsx_vadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vpickev_h, tmp2, tmp0, tmp3, tmp1, reg0, reg1); + DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, reg2, reg3); + tmp4 = __lsx_vpickev_w(reg3, reg2); + tmp5 = __lsx_vadd_h(reg0, reg1); + tmp6 = __lsx_vadd_h(tmp5, tmp4); + tmp7 = __lsx_vmuh_h(tmp6, const_0x1C71); + tmp0 = __lsx_vpickod_w(reg3, reg2); + tmp1 = __lsx_vhaddw_wu_hu(tmp0, tmp0); + tmp2 = __lsx_vmul_w(tmp1, const_0x2AAA); + dst0 = __lsx_vshuf_b(tmp2, tmp7, shuff); + __lsx_vstelm_d(dst0, dst_ptr, 0, 0); + __lsx_vstelm_w(dst0, dst_ptr, 8, 2); + src_ptr += 32; + ptr1 += 32; + ptr2 += 32; + dst_ptr += 12; + } +} + +void 
ScaleAddRow_LSX(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { + int x; + int len = src_width / 16; + __m128i src0, tmp0, tmp1, dst0, dst1; + __m128i zero = __lsx_vldi(0); + + assert(src_width > 0); + + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_ptr, 0); + DUP2_ARG2(__lsx_vld, dst_ptr, 0, dst_ptr, 16, dst0, dst1); + tmp0 = __lsx_vilvl_b(zero, src0); + tmp1 = __lsx_vilvh_b(zero, src0); + DUP2_ARG2(__lsx_vadd_h, dst0, tmp0, dst1, tmp1, dst0, dst1); + __lsx_vst(dst0, dst_ptr, 0); + __lsx_vst(dst1, dst_ptr, 16); + src_ptr += 16; + dst_ptr += 16; + } +} + +void ScaleFilterCols_LSX(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + int len = dst_width / 16; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i vec0, vec1, dst0; + __m128i vec_x = __lsx_vreplgr2vr_w(x); + __m128i vec_dx = __lsx_vreplgr2vr_w(dx); + __m128i const1 = __lsx_vreplgr2vr_w(0xFFFF); + __m128i const2 = __lsx_vreplgr2vr_w(0x40); + __m128i const_tmp = {0x0000000100000000, 0x0000000300000002}; + + vec0 = __lsx_vmul_w(vec_dx, const_tmp); + vec1 = __lsx_vslli_w(vec_dx, 2); + vec_x = __lsx_vadd_w(vec_x, vec0); + + for (j = 0; j < len; j++) { + tmp0 = __lsx_vsrai_w(vec_x, 16); + tmp4 = __lsx_vand_v(vec_x, const1); + vec_x = __lsx_vadd_w(vec_x, vec1); + tmp1 = __lsx_vsrai_w(vec_x, 16); + tmp5 = __lsx_vand_v(vec_x, const1); + vec_x = __lsx_vadd_w(vec_x, vec1); + tmp2 = __lsx_vsrai_w(vec_x, 16); + tmp6 = __lsx_vand_v(vec_x, const1); + vec_x = __lsx_vadd_w(vec_x, vec1); + tmp3 = __lsx_vsrai_w(vec_x, 16); + tmp7 = __lsx_vand_v(vec_x, const1); + vec_x = __lsx_vadd_w(vec_x, vec1); + DUP4_ARG2(__lsx_vsrai_w, tmp4, 9, tmp5, 9, tmp6, 9, tmp7, 9, tmp4, tmp5, + tmp6, tmp7); + LOAD_DATA(src_ptr, tmp0, reg0); + LOAD_DATA(src_ptr, tmp1, reg1); + LOAD_DATA(src_ptr, tmp2, reg2); + LOAD_DATA(src_ptr, tmp3, reg3); + DUP4_ARG2(__lsx_vaddi_wu, tmp0, 1, tmp1, 1, tmp2, 1, tmp3, 1, tmp0, tmp1, + tmp2, tmp3); + LOAD_DATA(src_ptr, tmp0, reg4); + LOAD_DATA(src_ptr, tmp1, reg5); + LOAD_DATA(src_ptr, tmp2, reg6); + LOAD_DATA(src_ptr, tmp3, reg7); + DUP4_ARG2(__lsx_vsub_w, reg4, reg0, reg5, reg1, reg6, reg2, reg7, reg3, + reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vmul_w, reg4, tmp4, reg5, tmp5, reg6, tmp6, reg7, tmp7, + reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vadd_w, reg4, const2, reg5, const2, reg6, const2, reg7, + const2, reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vsrai_w, reg4, 7, reg5, 7, reg6, 7, reg7, 7, reg4, reg5, + reg6, reg7); + DUP4_ARG2(__lsx_vadd_w, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, + reg0, reg1, reg2, reg3); + DUP2_ARG2(__lsx_vpickev_h, reg1, reg0, reg3, reg2, tmp0, tmp1); + dst0 = __lsx_vpickev_b(tmp1, tmp0); + __lsx_vst(dst0, dst_ptr, 0); + dst_ptr += 16; + } +} + +void ScaleARGBCols_LSX(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)src_argb; + uint32_t* dst = (uint32_t*)dst_argb; + int j; + int len = dst_width / 4; + __m128i tmp0, tmp1, tmp2, dst0; + __m128i vec_x = __lsx_vreplgr2vr_w(x); + __m128i vec_dx = __lsx_vreplgr2vr_w(dx); + __m128i const_tmp = {0x0000000100000000, 0x0000000300000002}; + + tmp0 = __lsx_vmul_w(vec_dx, const_tmp); + tmp1 = __lsx_vslli_w(vec_dx, 2); + vec_x = __lsx_vadd_w(vec_x, tmp0); + + for (j = 0; j < len; j++) { + tmp2 = __lsx_vsrai_w(vec_x, 16); + vec_x = __lsx_vadd_w(vec_x, tmp1); + LOAD_DATA(src, tmp2, dst0); + __lsx_vst(dst0, dst, 0); + dst += 4; + } +} + +void 
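ScaleFilterCols_LSX and ScaleARGBCols_LSX above walk the source with a 16.16 fixed-point coordinate: the high bits pick the source pixel and, in the filtering variant, seven fraction bits (the >> 9 of the masked low half) drive a rounded blend with the right neighbour via the +0x40, >> 7 pair. A scalar model of the filtering walk (hypothetical helper name, not part of this change):

#include <stdint.h>

static void ScaleFilterCols_Ref(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;           // integer source index
    int f = (x & 0xFFFF) >> 9;  // 7-bit fraction, 0..127
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8_t)(a + (((b - a) * f + 0x40) >> 7));  // rounded blend
    x += dx;  // advance the fixed-point coordinate
  }
}

ScaleARGBCols_LSX is the unfiltered form of the same walk: it keeps only the x >> 16 gather and copies whole 32-bit pixels.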
ScaleARGBFilterCols_LSX(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)src_argb; + int j; + int len = dst_width / 8; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i vec0, vec1, dst0, dst1; + __m128i vec_x = __lsx_vreplgr2vr_w(x); + __m128i vec_dx = __lsx_vreplgr2vr_w(dx); + __m128i const_tmp = {0x0000000100000000, 0x0000000300000002}; + __m128i const_7f = __lsx_vldi(0x7F); + + vec0 = __lsx_vmul_w(vec_dx, const_tmp); + vec1 = __lsx_vslli_w(vec_dx, 2); + vec_x = __lsx_vadd_w(vec_x, vec0); + + for (j = 0; j < len; j++) { + tmp0 = __lsx_vsrai_w(vec_x, 16); + reg0 = __lsx_vsrai_w(vec_x, 9); + vec_x = __lsx_vadd_w(vec_x, vec1); + tmp1 = __lsx_vsrai_w(vec_x, 16); + reg1 = __lsx_vsrai_w(vec_x, 9); + vec_x = __lsx_vadd_w(vec_x, vec1); + DUP2_ARG2(__lsx_vand_v, reg0, const_7f, reg1, const_7f, reg0, reg1); + DUP2_ARG2(__lsx_vshuf4i_b, reg0, 0, reg1, 0, reg0, reg1); + DUP2_ARG2(__lsx_vxor_v, reg0, const_7f, reg1, const_7f, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, reg0, reg2, reg1, reg3, reg4, reg6); + DUP2_ARG2(__lsx_vilvh_b, reg0, reg2, reg1, reg3, reg5, reg7); + LOAD_DATA(src, tmp0, src0); + LOAD_DATA(src, tmp1, src1); + DUP2_ARG2(__lsx_vaddi_wu, tmp0, 1, tmp1, 1, tmp0, tmp1); + LOAD_DATA(src, tmp0, src2); + LOAD_DATA(src, tmp1, src3); + DUP2_ARG2(__lsx_vilvl_b, src2, src0, src3, src1, tmp4, tmp6); + DUP2_ARG2(__lsx_vilvh_b, src2, src0, src3, src1, tmp5, tmp7); + DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, reg4, tmp5, reg5, tmp6, reg6, tmp7, reg7, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vsrani_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst0, dst1); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + dst_argb += 32; + } +} + +void ScaleRowDown34_LSX(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + (void)src_stride; + __m128i src0, src1, src2, src3; + __m128i dst0, dst1, dst2; + __m128i shuff0 = {0x0908070504030100, 0x141311100F0D0C0B}; + __m128i shuff1 = {0x0F0D0C0B09080705, 0x1918171514131110}; + __m128i shuff2 = {0x141311100F0D0C0B, 0x1F1D1C1B19181715}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0, src2, src1, shuff1, dst0, + dst1); + dst2 = __lsx_vshuf_b(src3, src2, shuff2); + __lsx_vst(dst0, dst, 0); + __lsx_vst(dst1, dst, 16); + __lsx_vst(dst2, dst, 32); + src_ptr += 64; + dst += 48; + } +} + +void ScaleRowDown34_0_Box_LSX(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* src_nex = src_ptr + src_stride; + int x; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9; + __m128i tmp10, tmp11, dst0, dst1, dst2; + __m128i const0 = {0x0103030101010103, 0x0101010303010101}; + __m128i const1 = {0x0301010101030301, 0x0103030101010103}; + __m128i const2 = {0x0101010303010101, 0x0301010101030301}; + __m128i shuff0 = {0x0504030202010100, 0x0A09090807060605}; + __m128i shuff1 = {0x0F0E0E0D0D0C0B0A, 0x1514131212111110}; + __m128i shuff2 = {0x0A09090807060605, 0x0F0E0E0D0D0C0B0A}; + __m128i shift0 = {0x0002000200010002, 0x0001000200020001}; + __m128i shift1 = {0x0002000100020002, 0x0002000200010002}; + __m128i shift2 = {0x0001000200020001, 0x0002000100020002}; + + 
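The interleaved 1/3 dot-product constants in the box variants below implement the horizontal 3/4 kernel: every 4 source pixels become 3 outputs with taps (3,1)/4, (1,1)/2 and (1,3)/4, after the two source rows are first combined 3:1 in the _0_Box case and 1:1 in the _1_Box case. A scalar sketch of the horizontal pass only (hypothetical helper name, not part of this change):

#include <stdint.h>

static void ScaleRowDown34_Filter_Ref(const uint8_t* s, uint8_t* d,
                                      int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    d[0] = (uint8_t)((s[0] * 3 + s[1] * 1 + 2) >> 2);  // taps 3:1
    d[1] = (uint8_t)((s[1] * 1 + s[2] * 1 + 1) >> 1);  // taps 1:1
    d[2] = (uint8_t)((s[2] * 1 + s[3] * 3 + 2) >> 2);  // taps 1:3
    s += 4;
    d += 3;
  }
}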
assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48, + src4, src5, src6, src7); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff0, src1, src0, shuff1, src1, src1, + shuff2, src2, src2, shuff0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vshuf_b, src3, src2, shuff1, src3, src3, shuff2, src4, src4, + shuff0, src5, src4, shuff1, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG3(__lsx_vshuf_b, src5, src5, shuff2, src6, src6, shuff0, src7, src6, + shuff1, src7, src7, shuff2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG2(__lsx_vdp2_h_bu, tmp0, const0, tmp1, const1, tmp2, const2, tmp3, + const0, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, const1, tmp5, const2, tmp6, const0, tmp7, + const1, src4, src5, src6, src7); + DUP4_ARG2(__lsx_vdp2_h_bu, tmp8, const2, tmp9, const0, tmp10, const1, tmp11, + const2, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vsrar_h, src0, shift0, src1, shift1, src2, shift2, src3, + shift0, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vsrar_h, src4, shift1, src5, shift2, src6, shift0, src7, + shift1, src4, src5, src6, src7); + DUP4_ARG2(__lsx_vsrar_h, tmp0, shift2, tmp1, shift0, tmp2, shift1, tmp3, + shift2, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vslli_h, src0, 1, src1, 1, src2, 1, src3, 1, tmp5, tmp6, + tmp7, tmp8); + DUP2_ARG2(__lsx_vslli_h, src4, 1, src5, 1, tmp9, tmp10); + DUP4_ARG2(__lsx_vadd_h, src0, tmp5, src1, tmp6, src2, tmp7, src3, tmp8, + src0, src1, src2, src3); + DUP2_ARG2(__lsx_vadd_h, src4, tmp9, src5, tmp10, src4, src5); + DUP4_ARG2(__lsx_vadd_h, src0, src6, src1, src7, src2, tmp0, src3, tmp1, + src0, src1, src2, src3); + DUP2_ARG2(__lsx_vadd_h, src4, tmp2, src5, tmp3, src4, src5); + DUP2_ARG3(__lsx_vsrarni_b_h, src1, src0, 2, src3, src2, 2, dst0, dst1); + dst2 = __lsx_vsrarni_b_h(src5, src4, 2); + __lsx_vst(dst0, d, 0); + __lsx_vst(dst1, d, 16); + __lsx_vst(dst2, d, 32); + src_ptr += 64; + src_nex += 64; + d += 48; + } +} + +void ScaleRowDown34_1_Box_LSX(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* src_nex = src_ptr + src_stride; + int x; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9; + __m128i tmp10, tmp11, dst0, dst1, dst2; + __m128i const0 = {0x0103030101010103, 0x0101010303010101}; + __m128i const1 = {0x0301010101030301, 0x0103030101010103}; + __m128i const2 = {0x0101010303010101, 0x0301010101030301}; + __m128i shuff0 = {0x0504030202010100, 0x0A09090807060605}; + __m128i shuff1 = {0x0F0E0E0D0D0C0B0A, 0x1514131212111110}; + __m128i shuff2 = {0x0A09090807060605, 0x0F0E0E0D0D0C0B0A}; + __m128i shift0 = {0x0002000200010002, 0x0001000200020001}; + __m128i shift1 = {0x0002000100020002, 0x0002000200010002}; + __m128i shift2 = {0x0001000200020001, 0x0002000100020002}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48, + src4, src5, src6, src7); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff0, src1, src0, shuff1, src1, src1, + shuff2, src2, src2, shuff0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vshuf_b, src3, src2, shuff1, src3, src3, shuff2, src4, src4, + shuff0, src5, src4, shuff1, tmp4, tmp5, tmp6, tmp7); + 
DUP4_ARG3(__lsx_vshuf_b, src5, src5, shuff2, src6, src6, shuff0, src7, src6, + shuff1, src7, src7, shuff2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG2(__lsx_vdp2_h_bu, tmp0, const0, tmp1, const1, tmp2, const2, tmp3, + const0, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, const1, tmp5, const2, tmp6, const0, tmp7, + const1, src4, src5, src6, src7); + DUP4_ARG2(__lsx_vdp2_h_bu, tmp8, const2, tmp9, const0, tmp10, const1, tmp11, + const2, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vsrar_h, src0, shift0, src1, shift1, src2, shift2, src3, + shift0, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vsrar_h, src4, shift1, src5, shift2, src6, shift0, src7, + shift1, src4, src5, src6, src7); + DUP4_ARG2(__lsx_vsrar_h, tmp0, shift2, tmp1, shift0, tmp2, shift1, tmp3, + shift2, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vadd_h, src0, src6, src1, src7, src2, tmp0, src3, tmp1, + src0, src1, src2, src3); + DUP2_ARG2(__lsx_vadd_h, src4, tmp2, src5, tmp3, src4, src5); + DUP2_ARG3(__lsx_vsrarni_b_h, src1, src0, 1, src3, src2, 1, dst0, dst1); + dst2 = __lsx_vsrarni_b_h(src5, src4, 1); + __lsx_vst(dst0, d, 0); + __lsx_vst(dst1, d, 16); + __lsx_vst(dst2, d, 32); + src_ptr += 64; + src_nex += 64; + d += 48; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) diff --git a/files/source/scale_mmi.cc b/files/source/scale_mmi.cc index 990463c2..1226ef3e 100644 --- a/files/source/scale_mmi.cc +++ b/files/source/scale_mmi.cc @@ -1103,6 +1103,61 @@ void ScaleRowUp2_16_MMI(const uint16_t* src_ptr, : "memory"); } +void ScaleRowDown34_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + assert((dst_width % 3 == 0) && (dst_width > 0)); + uint64_t src[2]; + uint64_t tmp[2]; + __asm__ volatile ( + "1: \n\t" + "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" + "and %[tmp1], %[src0], %[mask1] \n\t" + "psrlw %[tmp0], %[src0], %[rmov] \n\t" + "psllw %[tmp0], %[tmp0], %[lmov1] \n\t" + "or %[src0], %[tmp0], %[tmp1] \n\t" + "punpckhwd %[tmp0], %[src0], %[src0] \n\t" + "psllw %[tmp1], %[tmp0], %[rmov] \n\t" + "or %[src0], %[src0], %[tmp1] \n\t" + "psrlw %[tmp0], %[tmp0], %[rmov8] \n\t" + "pextrh %[tmp0], %[tmp0], %[zero] \n\t" + "pinsrh_2 %[src0], %[src0], %[tmp0] \n\t" + "pextrh %[tmp0], %[src1], %[zero] \n\t" + "pinsrh_3 %[src0], %[src0], %[tmp0] \n\t" + + "punpckhwd %[tmp0], %[src1], %[src1] \n\t" + "pextrh %[tmp1], %[tmp0], %[zero] \n\t" + "psrlw %[src1], %[src1], %[rmov] \n\t" + "psllw %[tmp1], %[tmp1], %[rmov8] \n\t" + "or %[src1], %[src1], %[tmp1] \n\t" + "and %[tmp0], %[tmp0], %[mask2] \n\t" + "or %[src1], %[src1], %[tmp0] \n\t" + + "gssdlc1 %[src0], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[src0], 0x00(%[dst_ptr]) \n\t" + "gsswlc1 %[src1], 0x0b(%[dst_ptr]) \n\t" + "gsswrc1 %[src1], 0x08(%[dst_ptr]) \n\t" + + "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x0c \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t" + "bnez %[width], 1b \n\t" + + : [src0]"=&f"(src[0]), [src1]"=&f"(src[1]), + [tmp0]"=&f"(tmp[0]), [tmp1]"=&f"(tmp[1]) + : [src_ptr]"r"(src_ptr), [dst_ptr]"r"(dst), + [lmov]"f"(0xc), [rmov]"f"(0x18), + [mask1]"f"(0xffff0000ffff), [rmov8]"f"(0x8), + [zero]"f"(0x0), [mask2]"f"(0xff000000), + [width]"r"(dst_width), [lmov1]"f"(0x10) + : "memory" + ); +} // clang-format on #endif // !defined(LIBYUV_DISABLE_MMI) && 
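ScaleRowDown34_MMI below is the unfiltered 3/4 path. Judging from the NEON variant later in this diff, which spells the selection out with vld4 followed by vmov d2, d3, the kernel keeps pixels 0, 1 and 3 of every group of 4; the MMI shifts and inserts assemble the same 12 output bytes from 16 input bytes. A scalar sketch under that reading (hypothetical helper name):

#include <stdint.h>

static void ScaleRowDown34_Ref(const uint8_t* src_ptr, uint8_t* dst,
                               int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst[0] = src_ptr[0];
    dst[1] = src_ptr[1];
    dst[2] = src_ptr[3];  // pixel 2 of each group of 4 is dropped
    src_ptr += 4;
    dst += 3;
  }
}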
defined(_MIPS_ARCH_LOONGSON3A) diff --git a/files/source/scale_neon.cc b/files/source/scale_neon.cc index 366b155b..6a0d6e1b 100644 --- a/files/source/scale_neon.cc +++ b/files/source/scale_neon.cc @@ -31,10 +31,10 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, asm volatile( "1: \n" // load even pixels into q0, odd into q1 - "vld2.8 {q0, q1}, [%0]! \n" - "subs %2, %2, #16 \n" // 16 processed per loop - "vst1.8 {q1}, [%1]! \n" // store odd pixels - "bgt 1b \n" + "vld2.8 {q0, q1}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q1}, [%1]! \n" // store odd pixels + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 @@ -51,11 +51,11 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, (void)src_stride; asm volatile( "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "vrhadd.u8 q0, q0, q1 \n" // rounding half add - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 @@ -71,21 +71,21 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, int dst_width) { asm volatile( // change the stride to row 2 pointer - "add %1, %0 \n" - "1: \n" - "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc - "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc - "subs %3, %3, #16 \n" // 16 processed per loop - "vpaddl.u8 q0, q0 \n" // row 1 add adjacent - "vpaddl.u8 q1, q1 \n" - "vpadal.u8 q0, q2 \n" // row 2 add adjacent + + "add %1, %0 \n" + "1: \n" + "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc + "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc + "subs %3, %3, #16 \n" // 16 processed per loop + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent + // row1 - "vpadal.u8 q1, q3 \n" - "vrshrn.u16 d0, q0, #2 \n" // downshift, round and + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and // pack - "vrshrn.u16 d1, q1, #2 \n" - "vst1.8 {q0}, [%2]! \n" - "bgt 1b \n" + "vrshrn.u16 d1, q1, #2 \n" + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 @@ -102,10 +102,10 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr, (void)src_stride; asm volatile( "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #8 \n" // 8 processed per loop - "vst1.8 {d2}, [%1]! \n" - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #8 \n" // 8 processed per loop + "vst1.8 {d2}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -122,20 +122,20 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, const uint8_t* src_ptr3 = src_ptr + src_stride * 3; asm volatile( "1: \n" - "vld1.8 {q0}, [%0]! \n" // load up 16x4 - "vld1.8 {q1}, [%3]! \n" - "vld1.8 {q2}, [%4]! \n" - "vld1.8 {q3}, [%5]! \n" - "subs %2, %2, #4 \n" - "vpaddl.u8 q0, q0 \n" - "vpadal.u8 q0, q1 \n" - "vpadal.u8 q0, q2 \n" - "vpadal.u8 q0, q3 \n" - "vpaddl.u16 q0, q0 \n" - "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding - "vmovn.u16 d0, q0 \n" - "vst1.32 {d0[0]}, [%1]! \n" - "bgt 1b \n" + "vld1.8 {q0}, [%0]! \n" // load up 16x4 + "vld1.8 {q1}, [%3]! \n" + "vld1.8 {q2}, [%4]! \n" + "vld1.8 {q3}, [%5]! 
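The three 2x paths realigned above differ only in their kernel: plain ScaleRowDown2 keeps the odd pixels, the Linear variant is a rounded horizontal average (vrhadd), and the Box variant a rounded 2x2 average. Scalar sketches (hypothetical helper names, not part of this change):

#include <stddef.h>
#include <stdint.h>

static void ScaleRowDown2_Ref(const uint8_t* s, uint8_t* d, int w) {
  int i;
  for (i = 0; i < w; ++i) d[i] = s[2 * i + 1];  // odd pixels only
}

static void ScaleRowDown2Linear_Ref(const uint8_t* s, uint8_t* d, int w) {
  int i;
  for (i = 0; i < w; ++i)
    d[i] = (uint8_t)((s[2 * i] + s[2 * i + 1] + 1) >> 1);  // rounding half add
}

static void ScaleRowDown2Box_Ref(const uint8_t* s, ptrdiff_t stride,
                                 uint8_t* d, int w) {
  const uint8_t* t = s + stride;  // second source row
  int i;
  for (i = 0; i < w; ++i)
    d[i] = (uint8_t)((s[2 * i] + s[2 * i + 1] + t[2 * i] + t[2 * i + 1] + 2) >> 2);
}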
\n" + "subs %2, %2, #4 \n" + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" + "vpaddl.u16 q0, q0 \n" + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding + "vmovn.u16 d0, q0 \n" + "vst1.32 {d0[0]}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -156,11 +156,11 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr, (void)src_stride; asm volatile( "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #24 \n" - "vmov d2, d3 \n" // order d0, d1, d2 - "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #24 \n" + "vmov d2, d3 \n" // order d0, d1, d2 + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -173,49 +173,49 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - "subs %2, %2, #24 \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" // filter src line 0 with src line 1 // expand chars to shorts to allow for room // when adding lines together - "vmovl.u8 q8, d4 \n" - "vmovl.u8 q9, d5 \n" - "vmovl.u8 q10, d6 \n" - "vmovl.u8 q11, d7 \n" + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" // 3 * line_0 + line_1 - "vmlal.u8 q8, d0, d24 \n" - "vmlal.u8 q9, d1, d24 \n" - "vmlal.u8 q10, d2, d24 \n" - "vmlal.u8 q11, d3, d24 \n" + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" - // (3 * line_0 + line_1) >> 2 - "vqrshrn.u16 d0, q8, #2 \n" - "vqrshrn.u16 d1, q9, #2 \n" - "vqrshrn.u16 d2, q10, #2 \n" - "vqrshrn.u16 d3, q11, #2 \n" + // (3 * line_0 + line_1 + 2) >> 2 + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "vmovl.u8 q8, d1 \n" - "vmlal.u8 q8, d0, d24 \n" - "vqrshrn.u16 d0, q8, #2 \n" + // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" + // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "vmovl.u8 q8, d2 \n" - "vmlal.u8 q8, d3, d24 \n" - "vqrshrn.u16 d2, q8, #2 \n" + // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" - "vst3.8 {d0, d1, d2}, [%1]! \n" + "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -230,31 +230,31 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - "subs %2, %2, #24 \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! 
\n" // src line 1 + "subs %2, %2, #24 \n" // average src line 0 with src line 1 - "vrhadd.u8 q0, q0, q2 \n" - "vrhadd.u8 q1, q1, q3 \n" + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "vmovl.u8 q3, d1 \n" - "vmlal.u8 q3, d0, d24 \n" - "vqrshrn.u16 d0, q3, #2 \n" + // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" + // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "vmovl.u8 q3, d2 \n" - "vmlal.u8 q3, d3, d24 \n" - "vqrshrn.u16 d2, q3, #2 \n" + // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" - "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -282,15 +282,15 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "vld1.8 {q3}, [%3] \n" - "1: \n" - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" - "subs %2, %2, #12 \n" - "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" - "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" - "vst1.8 {d4}, [%1]! \n" - "vst1.32 {d5[0]}, [%1]! \n" - "bgt 1b \n" + "vld1.8 {q3}, [%3] \n" + "1: \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" + "subs %2, %2, #12 \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + "vst1.8 {d4}, [%1]! \n" + "vst1.32 {d5[0]}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -306,57 +306,57 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, const uint8_t* src_ptr1 = src_ptr + src_stride * 2; asm volatile( - "vld1.16 {q13}, [%5] \n" - "vld1.8 {q14}, [%6] \n" - "vld1.8 {q15}, [%7] \n" - "add %3, %0 \n" + "vld1.16 {q13}, [%5] \n" + "vld1.8 {q14}, [%6] \n" + "vld1.8 {q15}, [%7] \n" + "add %3, %0 \n" "1: \n" // d0 = 00 40 01 41 02 42 03 43 // d1 = 10 50 11 51 12 52 13 53 // d2 = 20 60 21 61 22 62 23 63 // d3 = 30 70 31 71 32 72 33 73 - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - "vld4.8 {d16, d17, d18, d19}, [%4]! \n" - "subs %2, %2, #12 \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "vld4.8 {d16, d17, d18, d19}, [%4]! \n" + "subs %2, %2, #12 \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 // d0 = 00 10 01 11 02 12 03 13 // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" - "vtrn.u8 d16, d17 \n" + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + "vtrn.u8 d16, d17 \n" // d2 = 20 30 21 31 22 32 23 33 // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" - "vtrn.u8 d18, d19 \n" + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + "vtrn.u8 d18, d19 \n" // d0 = 00+10 01+11 02+12 03+13 // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" - "vpaddl.u8 q8, q8 \n" + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q8, q8 \n" // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" - "vpaddl.u8 d19, d19 \n" + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d19, d19 \n" // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 q0, q8 \n" - "vadd.u16 d4, d3, d7 \n" - "vadd.u16 d4, d19 \n" + "vadd.u16 q0, q2 \n" + "vadd.u16 q0, q8 \n" + "vadd.u16 d4, d3, d7 \n" + "vadd.u16 d4, d19 \n" // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] // + s[6 + st * 1] + s[7 + st * 1] // + s[6 + st * 2] + s[7 + st * 2]) / 6 "vqrdmulh.s16 q2, q2, q13 \n" - "vmovn.u16 d4, q2 \n" + "vmovn.u16 d4, q2 \n" // Shuffle 2,3 reg around so that 2 can be added to the // 0,1 reg and 3 can be added to the 4,5 reg. This @@ -364,24 +364,24 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, // registers are already expanded. Then do transposes // to get aligned. // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" - "vmovl.u8 q9, d18 \n" + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + "vmovl.u8 q9, d18 \n" // combine source lines - "vadd.u16 q1, q3 \n" - "vadd.u16 q1, q9 \n" + "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q9 \n" // d4 = xx 20 xx 30 xx 22 xx 32 // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" + "vtrn.u32 d2, d3 \n" // d4 = xx 20 xx 21 xx 22 xx 23 // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" + "vtrn.u16 d2, d3 \n" // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" + "vadd.u16 q0, q1 \n" // Need to divide, but can't downshift as the the value // isn't a power of 2. So multiply by 65536 / n @@ -390,14 +390,14 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, // Align for table lookup, vtbl requires registers to // be adjacent - "vmov.u8 d2, d4 \n" + "vmov.u8 d2, d4 \n" - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - "vst1.8 {d3}, [%1]! \n" - "vst1.32 {d4[0]}, [%1]! \n" - "bgt 1b \n" + "vst1.8 {d3}, [%1]! \n" + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -416,46 +416,46 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "vld1.16 {q13}, [%4] \n" - "vld1.8 {q14}, [%5] \n" - "add %3, %0 \n" + "vld1.16 {q13}, [%4] \n" + "vld1.8 {q14}, [%5] \n" + "add %3, %0 \n" "1: \n" // d0 = 00 40 01 41 02 42 03 43 // d1 = 10 50 11 51 12 52 13 53 // d2 = 20 60 21 61 22 62 23 63 // d3 = 30 70 31 71 32 72 33 73 - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - "subs %2, %2, #12 \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "subs %2, %2, #12 \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 // d0 = 00 10 01 11 02 12 03 13 // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" // d2 = 20 30 21 31 22 32 23 33 // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" // d0 = 00+10 01+11 02+12 03+13 // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 d4, d3, d7 \n" + "vadd.u16 q0, q2 \n" + "vadd.u16 d4, d3, d7 \n" // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "vqrshrn.u16 d4, q2, #2 \n" + "vqrshrn.u16 d4, q2, #2 \n" // Shuffle 2,3 reg around so that 2 can be added to the // 0,1 reg and 3 can be added to the 4,5 reg. This @@ -463,22 +463,22 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, // registers are already expanded. Then do transposes // to get aligned. // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" // combine source lines - "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q3 \n" // d4 = xx 20 xx 30 xx 22 xx 32 // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" + "vtrn.u32 d2, d3 \n" // d4 = xx 20 xx 21 xx 22 xx 23 // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" + "vtrn.u16 d2, d3 \n" // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" + "vadd.u16 q0, q1 \n" // Need to divide, but can't downshift as the the value // isn't a power of 2. So multiply by 65536 / n @@ -487,14 +487,14 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, // Align for table lookup, vtbl requires registers to // be adjacent - "vmov.u8 d2, d4 \n" + "vmov.u8 d2, d4 \n" - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - "vst1.8 {d3}, [%1]! \n" - "vst1.32 {d4[0]}, [%1]! \n" - "bgt 1b \n" + "vst1.8 {d3}, [%1]! \n" + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -504,6 +504,484 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"); } +void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_temp = src_ptr + 1; + asm volatile( + "vmov.u8 d30, #3 \n" + + "1: \n" + "vld1.8 {d4}, [%0]! \n" // 01234567 + "vld1.8 {d5}, [%3]! \n" // 12345678 + + "vmovl.u8 q0, d4 \n" // 01234567 (16b) + "vmovl.u8 q1, d5 \n" // 12345678 (16b) + "vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd) + "vmlal.u8 q1, d4, d30 \n" // 3*near+far (even) + + "vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd) + "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even) + + "vst2.8 {d0, d1}, [%1]! 
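ScaleRowUp2_Linear_NEON above produces two outputs per source sample: each output mixes its nearest sample with the neighbour at weights 3:1 with rounding, the "3*near+far" the comments describe. A scalar model (hypothetical helper name, not part of this change):

#include <stdint.h>

static void ScaleRowUp2_Linear_Ref(const uint8_t* s, uint8_t* d,
                                   int dst_width) {
  int i;
  for (i = 0; i < dst_width / 2; ++i) {
    d[2 * i + 0] = (uint8_t)((3 * s[i] + s[i + 1] + 2) >> 2);  // near = s[i]
    d[2 * i + 1] = (uint8_t)((s[i] + 3 * s[i + 1] + 2) >> 2);  // near = s[i+1]
  }
}

Reading one sample past the last "near" position mirrors the src_temp = src_ptr + 1 stagger in the vector code.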
\n" // store + "subs %2, %2, #16 \n" // 8 sample -> 16 sample + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_temp) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List + ); +} + +void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + uint8_t* dst_ptr1 = dst_ptr + dst_stride; + const uint8_t* src_temp = src_ptr + 1; + const uint8_t* src_temp1 = src_ptr1 + 1; + + asm volatile( + "vmov.u16 q15, #3 \n" + "vmov.u8 d28, #3 \n" + + "1: \n" + "vld1.8 {d4}, [%0]! \n" // 01234567 + "vld1.8 {d5}, [%5]! \n" // 12345678 + + "vmovl.u8 q0, d4 \n" // 01234567 (16b) + "vmovl.u8 q1, d5 \n" // 12345678 (16b) + "vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd) + "vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even) + + "vld1.8 {d8}, [%1]! \n" + "vld1.8 {d9}, [%6]! \n" + + "vmovl.u8 q2, d8 \n" + "vmovl.u8 q3, d9 \n" + "vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd) + "vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even) + + // e o + // q1 q0 + // q3 q2 + + "vmovq q4, q2 \n" + "vmovq q5, q3 \n" + "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd) + "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even) + "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd) + "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even) + + // e o + // q5 q4 + // q1 q0 + + "vrshrn.u16 d2, q1, #4 \n" // 2, even + "vrshrn.u16 d3, q0, #4 \n" // 2, odd + "vrshrn.u16 d0, q5, #4 \n" // 1, even + "vrshrn.u16 d1, q4, #4 \n" // 1, odd + + "vst2.8 {d0, d1}, [%2]! \n" // store + "vst2.8 {d2, d3}, [%3]! \n" // store + "subs %4, %4, #16 \n" // 8 sample -> 16 sample + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_ptr1), // %3 + "+r"(dst_width), // %4 + "+r"(src_temp), // %5 + "+r"(src_temp1) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28", + "q15" // Clobber List + ); +} + +void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + const uint16_t* src_temp = src_ptr + 1; + asm volatile( + "vmov.u16 q15, #3 \n" + + "1: \n" + "vld1.16 {q1}, [%0]! \n" // 01234567 (16b) + "vld1.16 {q0}, [%3]! \n" // 12345678 (16b) + + "vmovq q2, q0 \n" + "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd) + "vmla.u16 q1, q2, q15 \n" // 3*near+far (even) + + "vrshr.u16 q0, q0, #2 \n" // 3/4*near+1/4*far (odd) + "vrshr.u16 q1, q1, #2 \n" // 3/4*near+1/4*far (even) + + "vst2.16 {d0, d1, d2, d3}, [%1]! \n" // store + "subs %2, %2, #16 \n" // 8 sample -> 16 sample + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_temp) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List + ); +} + +void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* src_ptr1 = src_ptr + src_stride; + uint16_t* dst_ptr1 = dst_ptr + dst_stride; + const uint16_t* src_temp = src_ptr + 1; + const uint16_t* src_temp1 = src_ptr1 + 1; + + asm volatile( + "vmov.u16 q15, #3 \n" + + "1: \n" + "vld1.16 {q0}, [%0]! \n" // 01234567 (16b) + "vld1.16 {q1}, [%5]! \n" // 12345678 (16b) + + "vmovq q2, q0 \n" + "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd) + "vmla.u16 q1, q2, q15 \n" // 3*near+far (even) + + "vld1.16 {q2}, [%1]! \n" // 01234567 (16b) + "vld1.16 {q3}, [%6]! 
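Applying that same 3:1 mix vertically and then horizontally, as ScaleRowUp2_Bilinear_NEON above does, composes into the 9-3-3-1 kernel the comments name, normalized by (sum + 8) >> 4. A scalar model for one pair of output rows (hypothetical helper name, not part of this change):

#include <stdint.h>

static void ScaleRowUp2_Bilinear_Ref(const uint8_t* s0, const uint8_t* s1,
                                     uint8_t* d0, uint8_t* d1, int dst_width) {
  int i;
  for (i = 0; i < dst_width / 2; ++i) {
    int a = s0[i], b = s0[i + 1];  // source row 0: near, far
    int c = s1[i], e = s1[i + 1];  // source row 1: near, far
    d0[2 * i + 0] = (uint8_t)((9 * a + 3 * b + 3 * c + 1 * e + 8) >> 4);
    d0[2 * i + 1] = (uint8_t)((3 * a + 9 * b + 1 * c + 3 * e + 8) >> 4);
    d1[2 * i + 0] = (uint8_t)((3 * a + 1 * b + 9 * c + 3 * e + 8) >> 4);
    d1[2 * i + 1] = (uint8_t)((1 * a + 3 * b + 3 * c + 9 * e + 8) >> 4);
  }
}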
\n" // 12345678 (16b) + + "vmovq q4, q2 \n" + "vmla.u16 q2, q3, q15 \n" // 3*near+far (odd) + "vmla.u16 q3, q4, q15 \n" // 3*near+far (even) + + "vmovq q4, q2 \n" + "vmovq q5, q3 \n" + "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd) + "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even) + "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd) + "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even) + + "vrshr.u16 q2, q1, #4 \n" // 2, even + "vrshr.u16 q3, q0, #4 \n" // 2, odd + "vrshr.u16 q0, q5, #4 \n" // 1, even + "vrshr.u16 q1, q4, #4 \n" // 1, odd + + "vst2.16 {d0, d1, d2, d3}, [%2]! \n" // store + "vst2.16 {d4, d5, d6, d7}, [%3]! \n" // store + "subs %4, %4, #16 \n" // 8 sample -> 16 sample + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_ptr1), // %3 + "+r"(dst_width), // %4 + "+r"(src_temp), // %5 + "+r"(src_temp1) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", + "q15" // Clobber List + ); +} + +void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + const uint16_t* src_temp = src_ptr + 1; + asm volatile( + "vmov.u16 d31, #3 \n" + + "1: \n" + "vld1.16 {q0}, [%0]! \n" // 01234567 (16b) + "vld1.16 {q1}, [%3]! \n" // 12345678 (16b) + + "vmovl.u16 q2, d0 \n" // 0123 (32b) + "vmovl.u16 q3, d1 \n" // 4567 (32b) + "vmovl.u16 q4, d2 \n" // 1234 (32b) + "vmovl.u16 q5, d3 \n" // 5678 (32b) + + "vmlal.u16 q2, d2, d31 \n" + "vmlal.u16 q3, d3, d31 \n" + "vmlal.u16 q4, d0, d31 \n" + "vmlal.u16 q5, d1, d31 \n" + + "vrshrn.u32 d0, q4, #2 \n" + "vrshrn.u32 d1, q5, #2 \n" + "vrshrn.u32 d2, q2, #2 \n" + "vrshrn.u32 d3, q3, #2 \n" + + "vst2.16 {q0, q1}, [%1]! \n" // store + "subs %2, %2, #16 \n" // 8 sample -> 16 sample + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_temp) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List + ); +} + +void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* src_ptr1 = src_ptr + src_stride; + uint16_t* dst_ptr1 = dst_ptr + dst_stride; + const uint16_t* src_temp = src_ptr + 1; + const uint16_t* src_temp1 = src_ptr1 + 1; + + asm volatile( + "vmov.u16 d31, #3 \n" + "vmov.u32 q14, #3 \n" + + "1: \n" + "vld1.16 {d0}, [%0]! \n" // 0123 (16b) + "vld1.16 {d1}, [%5]! \n" // 1234 (16b) + "vmovl.u16 q2, d0 \n" // 0123 (32b) + "vmovl.u16 q3, d1 \n" // 1234 (32b) + "vmlal.u16 q2, d1, d31 \n" + "vmlal.u16 q3, d0, d31 \n" + + "vld1.16 {d0}, [%1]! \n" // 0123 (16b) + "vld1.16 {d1}, [%6]! \n" // 1234 (16b) + "vmovl.u16 q4, d0 \n" // 0123 (32b) + "vmovl.u16 q5, d1 \n" // 1234 (32b) + "vmlal.u16 q4, d1, d31 \n" + "vmlal.u16 q5, d0, d31 \n" + + "vmovq q0, q4 \n" + "vmovq q1, q5 \n" + "vmla.u32 q4, q2, q14 \n" + "vmla.u32 q5, q3, q14 \n" + "vmla.u32 q2, q0, q14 \n" + "vmla.u32 q3, q1, q14 \n" + + "vrshrn.u32 d1, q4, #4 \n" + "vrshrn.u32 d0, q5, #4 \n" + "vrshrn.u32 d3, q2, #4 \n" + "vrshrn.u32 d2, q3, #4 \n" + + "vst2.16 {d0, d1}, [%2]! \n" // store + "vst2.16 {d2, d3}, [%3]! 
\n" // store + "subs %4, %4, #8 \n" // 4 sample -> 8 sample + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_ptr1), // %3 + "+r"(dst_width), // %4 + "+r"(src_temp), // %5 + "+r"(src_temp1) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14", + "d31" // Clobber List + ); +} + +void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_temp = src_ptr + 2; + asm volatile( + "vmov.u8 d30, #3 \n" + + "1: \n" + "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v) + "vld1.8 {d5}, [%3]! \n" // 11223344 (1u1v) + + "vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b) + "vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b) + "vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd) + "vmlal.u8 q1, d4, d30 \n" // 3*near+far (even) + + "vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd) + "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even) + + "vst2.16 {d0, d1}, [%1]! \n" // store + "subs %2, %2, #8 \n" // 4 uv -> 8 uv + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_temp) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "d30" // Clobber List + ); +} + +void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + uint8_t* dst_ptr1 = dst_ptr + dst_stride; + const uint8_t* src_temp = src_ptr + 2; + const uint8_t* src_temp1 = src_ptr1 + 2; + + asm volatile( + "vmov.u16 q15, #3 \n" + "vmov.u8 d28, #3 \n" + + "1: \n" + "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v) + "vld1.8 {d5}, [%5]! \n" // 11223344 (1u1v) + + "vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b) + "vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b) + "vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd) + "vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even) + + "vld1.8 {d8}, [%1]! \n" // 00112233 (1u1v) + "vld1.8 {d9}, [%6]! \n" // 11223344 (1u1v) + + "vmovl.u8 q2, d8 \n" // 00112233 (1u1v, 16b) + "vmovl.u8 q3, d9 \n" // 11223344 (1u1v, 16b) + "vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd) + "vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even) + + // e o + // q1 q0 + // q3 q2 + + "vmovq q4, q2 \n" + "vmovq q5, q3 \n" + "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd) + "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even) + "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd) + "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even) + + // e o + // q5 q4 + // q1 q0 + + "vrshrn.u16 d2, q1, #4 \n" // 2, even + "vrshrn.u16 d3, q0, #4 \n" // 2, odd + "vrshrn.u16 d0, q5, #4 \n" // 1, even + "vrshrn.u16 d1, q4, #4 \n" // 1, odd + + "vst2.16 {d0, d1}, [%2]! \n" // store + "vst2.16 {d2, d3}, [%3]! \n" // store + "subs %4, %4, #8 \n" // 4 uv -> 8 uv + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_ptr1), // %3 + "+r"(dst_width), // %4 + "+r"(src_temp), // %5 + "+r"(src_temp1) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28", + "q15" // Clobber List + ); +} + +void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + const uint16_t* src_temp = src_ptr + 2; + asm volatile( + "vmov.u16 d30, #3 \n" + + "1: \n" + "vld1.16 {q0}, [%0]! \n" // 00112233 (1u1v, 16) + "vld1.16 {q1}, [%3]! 
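The ScaleUVRowUp2 variants above reuse the linear and bilinear kernels with an interleaved UV pair as the unit, which is why the staggered pointer is src_ptr + 2 rather than + 1. A scalar model of the UV linear case, with dst_width counted in UV pairs as in the vector code (hypothetical helper name, not part of this change):

#include <stdint.h>

static void ScaleUVRowUp2_Linear_Ref(const uint8_t* s, uint8_t* d,
                                     int dst_pairs) {
  int i, c;
  for (i = 0; i < dst_pairs / 2; ++i) {
    for (c = 0; c < 2; ++c) {  // c = 0: U plane byte, c = 1: V plane byte
      d[4 * i + c]     = (uint8_t)((3 * s[2 * i + c] + s[2 * i + 2 + c] + 2) >> 2);
      d[4 * i + 2 + c] = (uint8_t)((s[2 * i + c] + 3 * s[2 * i + 2 + c] + 2) >> 2);
    }
  }
}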
\n" // 11223344 (1u1v, 16) + + "vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b) + "vmovl.u16 q3, d2 \n" // 1122 (1u1v, 32b) + "vmovl.u16 q4, d1 \n" // 2233 (1u1v, 32b) + "vmovl.u16 q5, d3 \n" // 3344 (1u1v, 32b) + "vmlal.u16 q2, d2, d30 \n" // 3*near+far (odd) + "vmlal.u16 q3, d0, d30 \n" // 3*near+far (even) + "vmlal.u16 q4, d3, d30 \n" // 3*near+far (odd) + "vmlal.u16 q5, d1, d30 \n" // 3*near+far (even) + + "vrshrn.u32 d1, q2, #2 \n" // 3/4*near+1/4*far (odd) + "vrshrn.u32 d0, q3, #2 \n" // 3/4*near+1/4*far (even) + "vrshrn.u32 d3, q4, #2 \n" // 3/4*near+1/4*far (odd) + "vrshrn.u32 d2, q5, #2 \n" // 3/4*near+1/4*far (even) + + "vst2.32 {d0, d1}, [%1]! \n" // store + "vst2.32 {d2, d3}, [%1]! \n" // store + "subs %2, %2, #8 \n" // 4 uv -> 8 uv + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_temp) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", + "d30" // Clobber List + ); +} + +void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* src_ptr1 = src_ptr + src_stride; + uint16_t* dst_ptr1 = dst_ptr + dst_stride; + const uint16_t* src_temp = src_ptr + 2; + const uint16_t* src_temp1 = src_ptr1 + 2; + + asm volatile( + "vmov.u16 d30, #3 \n" + "vmov.u32 q14, #3 \n" + + "1: \n" + "vld1.8 {d0}, [%0]! \n" // 0011 (1u1v) + "vld1.8 {d1}, [%5]! \n" // 1122 (1u1v) + "vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b) + "vmovl.u16 q3, d1 \n" // 1122 (1u1v, 32b) + "vmlal.u16 q2, d1, d30 \n" // 3*near+far (1, odd) + "vmlal.u16 q3, d0, d30 \n" // 3*near+far (1, even) + + "vld1.8 {d0}, [%1]! \n" // 0011 (1u1v) + "vld1.8 {d1}, [%6]! \n" // 1122 (1u1v) + "vmovl.u16 q4, d0 \n" // 0011 (1u1v, 32b) + "vmovl.u16 q5, d1 \n" // 1122 (1u1v, 32b) + "vmlal.u16 q4, d1, d30 \n" // 3*near+far (2, odd) + "vmlal.u16 q5, d0, d30 \n" // 3*near+far (2, even) + + "vmovq q0, q4 \n" + "vmovq q1, q5 \n" + "vmla.u32 q4, q2, q14 \n" // 9 3 3 1 (1, odd) + "vmla.u32 q5, q3, q14 \n" // 9 3 3 1 (1, even) + "vmla.u32 q2, q0, q14 \n" // 9 3 3 1 (2, odd) + "vmla.u32 q3, q1, q14 \n" // 9 3 3 1 (2, even) + + "vrshrn.u32 d1, q4, #4 \n" // 1, odd + "vrshrn.u32 d0, q5, #4 \n" // 1, even + "vrshrn.u32 d3, q2, #4 \n" // 2, odd + "vrshrn.u32 d2, q3, #4 \n" // 2, even + + "vst2.32 {d0, d1}, [%2]! \n" // store + "vst2.32 {d2, d3}, [%3]! \n" // store + "subs %4, %4, #4 \n" // 2 uv -> 4 uv + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_ptr1), // %3 + "+r"(dst_width), // %4 + "+r"(src_temp), // %5 + "+r"(src_temp1) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14", + "d30" // Clobber List + ); +} + // Add a row of bytes to a row of shorts. Used for box filter. // Reads 16 bytes and accumulates to 16 shorts at a time. void ScaleAddRow_NEON(const uint8_t* src_ptr, @@ -511,13 +989,13 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr, int src_width) { asm volatile( "1: \n" - "vld1.16 {q1, q2}, [%1] \n" // load accumulator - "vld1.8 {q0}, [%0]! \n" // load 16 bytes - "vaddw.u8 q2, q2, d1 \n" // add - "vaddw.u8 q1, q1, d0 \n" - "vst1.16 {q1, q2}, [%1]! \n" // store accumulator - "subs %2, %2, #16 \n" // 16 processed per loop - "bgt 1b \n" + "vld1.16 {q1, q2}, [%1] \n" // load accumulator + "vld1.8 {q0}, [%0]! \n" // load 16 bytes + "vaddw.u8 q2, q2, d1 \n" // add + "vaddw.u8 q1, q1, d0 \n" + "vst1.16 {q1, q2}, [%1]! 
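ScaleAddRow above is the accumulation step of the box filter: each 8-bit source pixel is widened and added into a 16-bit running sum so several rows can be accumulated before the final scale. A scalar model (hypothetical helper name, not part of this change):

#include <stdint.h>

static void ScaleAddRow_Ref(const uint8_t* src_ptr, uint16_t* dst_ptr,
                            int src_width) {
  int i;
  for (i = 0; i < src_width; ++i) {
    dst_ptr[i] = (uint16_t)(dst_ptr[i] + src_ptr[i]);  // widen and accumulate
  }
}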
\n" // store accumulator + "subs %2, %2, #16 \n" // 16 processed per loop + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(src_width) // %2 @@ -547,17 +1025,17 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr, int* tmp = dx_offset; const uint8_t* src_tmp = src_ptr; asm volatile ( - "vdup.32 q0, %3 \n" // x - "vdup.32 q1, %4 \n" // dx - "vld1.32 {q2}, [%5] \n" // 0 1 2 3 - "vshl.i32 q3, q1, #2 \n" // 4 * dx - "vmul.s32 q1, q1, q2 \n" + "vdup.32 q0, %3 \n" // x + "vdup.32 q1, %4 \n" // dx + "vld1.32 {q2}, [%5] \n" // 0 1 2 3 + "vshl.i32 q3, q1, #2 \n" // 4 * dx + "vmul.s32 q1, q1, q2 \n" // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "vadd.s32 q1, q1, q0 \n" + "vadd.s32 q1, q1, q0 \n" // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx - "vadd.s32 q2, q1, q3 \n" - "vshl.i32 q0, q3, #1 \n" // 8 * dx - "1: \n" + "vadd.s32 q2, q1, q3 \n" + "vshl.i32 q0, q3, #1 \n" // 8 * dx + "1: \n" LOAD2_DATA8_LANE(0) LOAD2_DATA8_LANE(1) LOAD2_DATA8_LANE(2) @@ -566,27 +1044,27 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr, LOAD2_DATA8_LANE(5) LOAD2_DATA8_LANE(6) LOAD2_DATA8_LANE(7) - "vmov q10, q1 \n" - "vmov q11, q2 \n" - "vuzp.16 q10, q11 \n" - "vmovl.u8 q8, d6 \n" - "vmovl.u8 q9, d7 \n" - "vsubl.s16 q11, d18, d16 \n" - "vsubl.s16 q12, d19, d17 \n" - "vmovl.u16 q13, d20 \n" - "vmovl.u16 q10, d21 \n" - "vmul.s32 q11, q11, q13 \n" - "vmul.s32 q12, q12, q10 \n" - "vrshrn.s32 d18, q11, #16 \n" - "vrshrn.s32 d19, q12, #16 \n" - "vadd.s16 q8, q8, q9 \n" - "vmovn.s16 d6, q8 \n" - - "vst1.8 {d6}, [%0]! \n" // store pixels - "vadd.s32 q1, q1, q0 \n" - "vadd.s32 q2, q2, q0 \n" - "subs %2, %2, #8 \n" // 8 processed per loop - "bgt 1b \n" + "vmov q10, q1 \n" + "vmov q11, q2 \n" + "vuzp.16 q10, q11 \n" + "vmovl.u8 q8, d6 \n" + "vmovl.u8 q9, d7 \n" + "vsubl.s16 q11, d18, d16 \n" + "vsubl.s16 q12, d19, d17 \n" + "vmovl.u16 q13, d20 \n" + "vmovl.u16 q10, d21 \n" + "vmul.s32 q11, q11, q13 \n" + "vmul.s32 q12, q12, q10 \n" + "vrshrn.s32 d18, q11, #16 \n" + "vrshrn.s32 d19, q12, #16 \n" + "vadd.s16 q8, q8, q9 \n" + "vmovn.s16 d6, q8 \n" + + "vst1.8 {d6}, [%0]! \n" // store pixels + "vadd.s32 q1, q1, q0 \n" + "vadd.s32 q2, q2, q0 \n" + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(dst_width), // %2 @@ -609,75 +1087,75 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr, int dst_width, int source_y_fraction) { asm volatile( - "cmp %4, #0 \n" - "beq 100f \n" - "add %2, %1 \n" - "cmp %4, #64 \n" - "beq 75f \n" - "cmp %4, #128 \n" - "beq 50f \n" - "cmp %4, #192 \n" - "beq 25f \n" - - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" // General purpose row blend. "1: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 1b \n" - "b 99f \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" // Blend 25 / 75. "25: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! 
\n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 25b \n" - "b 99f \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 25b \n" + "b 99f \n" // Blend 50 / 50. "50: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 50b \n" - "b 99f \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" // Blend 75 / 25. "75: \n" - "vld1.8 {q1}, [%1]! \n" - "vld1.8 {q0}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 75b \n" - "b 99f \n" + "vld1.8 {q1}, [%1]! \n" + "vld1.8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" - "vld1.8 {q0}, [%1]! \n" - "subs %3, %3, #16 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 100b \n" + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" "99: \n" - "vst1.8 {d1[7]}, [%0] \n" + "vst1.8 {d1[7]}, [%0] \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(src_stride), // %2 @@ -694,12 +1172,12 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, (void)src_stride; asm volatile( "1: \n" - "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB - "subs %2, %2, #8 \n" // 8 processed per loop - "vmov q2, q1 \n" // load next 8 ARGB - "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels - "bgt 1b \n" + "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %2, %2, #8 \n" // 8 processed per loop + "vmov q2, q1 \n" // load next 8 ARGB + "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 @@ -722,13 +1200,13 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, (void)src_stride; asm volatile( "1: \n" - "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB - "subs %2, %2, #8 \n" // 8 processed per loop - "vrhadd.u8 q0, q0, q1 \n" // rounding half add - "vrhadd.u8 q1, q2, q3 \n" // rounding half add - "vst2.32 {q0, q1}, [%1]! \n" - "bgt 1b \n" + "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %2, %2, #8 \n" // 8 processed per loop + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vrhadd.u8 q1, q2, q3 \n" // rounding half add + "vst2.32 {q0, q1}, [%1]! \n" + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 @@ -743,27 +1221,27 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, int dst_width) { asm volatile( // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. - "vld4.8 {d16, d18, d20, d22}, [%1]! 
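ScaleFilterRows above dispatches on the y fraction: 0 copies the row, 64, 128 and 192 take one or two rounded halving adds, and every other fraction uses the general weighted path. A scalar model of the general branch only (hypothetical helper name; the vrhadd shortcuts round slightly differently):

#include <stdint.h>

static void ScaleFilterRows_Ref(uint8_t* dst_ptr, const uint8_t* s0,
                                const uint8_t* s1, int dst_width,
                                int source_y_fraction) {  // 0..255
  int f = source_y_fraction;
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst_ptr[i] = (uint8_t)((s0[i] * (256 - f) + s1[i] * f + 128) >> 8);
  }
}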
\n" // load 8 more ARGB - "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB - "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. - "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. - "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes - "vrshrn.u16 d1, q1, #2 \n" - "vrshrn.u16 d2, q2, #2 \n" - "vrshrn.u16 d3, q3, #2 \n" - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" - "bgt 1b \n" + "add %1, %1, %0 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB + "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB + "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes + "vrshrn.u16 d1, q1, #2 \n" + "vrshrn.u16 d2, q2, #2 \n" + "vrshrn.u16 d3, q3, #2 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 @@ -781,15 +1259,15 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, int dst_width) { (void)src_stride; asm volatile( - "mov r12, %3, lsl #2 \n" - "1: \n" - "vld1.32 {d0[0]}, [%0], r12 \n" - "vld1.32 {d0[1]}, [%0], r12 \n" - "vld1.32 {d1[0]}, [%0], r12 \n" - "vld1.32 {d1[1]}, [%0], r12 \n" - "subs %2, %2, #4 \n" // 4 pixels per loop. - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" + "mov r12, %3, lsl #2 \n" + "1: \n" + "vld1.32 {d0[0]}, [%0], r12 \n" + "vld1.32 {d0[1]}, [%0], r12 \n" + "vld1.32 {d1[0]}, [%0], r12 \n" + "vld1.32 {d1[1]}, [%0], r12 \n" + "subs %2, %2, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 @@ -805,30 +1283,30 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { asm volatile( - "mov r12, %4, lsl #2 \n" - "add %1, %1, %0 \n" - "1: \n" - "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1 - "vld1.8 {d1}, [%1], r12 \n" - "vld1.8 {d2}, [%0], r12 \n" - "vld1.8 {d3}, [%1], r12 \n" - "vld1.8 {d4}, [%0], r12 \n" - "vld1.8 {d5}, [%1], r12 \n" - "vld1.8 {d6}, [%0], r12 \n" - "vld1.8 {d7}, [%1], r12 \n" - "vaddl.u8 q0, d0, d1 \n" - "vaddl.u8 q1, d2, d3 \n" - "vaddl.u8 q2, d4, d5 \n" - "vaddl.u8 q3, d6, d7 \n" - "vswp.8 d1, d2 \n" // ab_cd -> ac_bd - "vswp.8 d5, d6 \n" // ef_gh -> eg_fh - "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) - "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) - "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. - "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. - "subs %3, %3, #4 \n" // 4 pixels per loop. - "vst1.8 {q0}, [%2]! 
\n" - "bgt 1b \n" + "mov r12, %4, lsl #2 \n" + "add %1, %1, %0 \n" + "1: \n" + "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1 + "vld1.8 {d1}, [%1], r12 \n" + "vld1.8 {d2}, [%0], r12 \n" + "vld1.8 {d3}, [%1], r12 \n" + "vld1.8 {d4}, [%0], r12 \n" + "vld1.8 {d5}, [%1], r12 \n" + "vld1.8 {d6}, [%0], r12 \n" + "vld1.8 {d7}, [%1], r12 \n" + "vaddl.u8 q0, d0, d1 \n" + "vaddl.u8 q1, d2, d3 \n" + "vaddl.u8 q2, d4, d5 \n" + "vaddl.u8 q3, d6, d7 \n" + "vswp.8 d1, d2 \n" // ab_cd -> ac_bd + "vswp.8 d5, d6 \n" // ef_gh -> eg_fh + "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) + "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) + "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. + "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. + "subs %3, %3, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(src_stride), // %1 "+r"(dst_argb), // %2 @@ -865,8 +1343,8 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb, LOAD1_DATA32_LANE(d3, 1) // clang-format on "vst1.32 {q0, q1}, [%0]! \n" // store pixels - "subs %2, %2, #8 \n" // 8 processed per loop - "bgt 1b \n" + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 "+r"(dst_width), // %2 @@ -897,16 +1375,16 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, int* tmp = dx_offset; const uint8_t* src_tmp = src_argb; asm volatile ( - "vdup.32 q0, %3 \n" // x - "vdup.32 q1, %4 \n" // dx - "vld1.32 {q2}, [%5] \n" // 0 1 2 3 - "vshl.i32 q9, q1, #2 \n" // 4 * dx - "vmul.s32 q1, q1, q2 \n" - "vmov.i8 q3, #0x7f \n" // 0x7F - "vmov.i16 q15, #0x7f \n" // 0x7F + "vdup.32 q0, %3 \n" // x + "vdup.32 q1, %4 \n" // dx + "vld1.32 {q2}, [%5] \n" // 0 1 2 3 + "vshl.i32 q9, q1, #2 \n" // 4 * dx + "vmul.s32 q1, q1, q2 \n" + "vmov.i8 q3, #0x7f \n" // 0x7F + "vmov.i16 q15, #0x7f \n" // 0x7F // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "vadd.s32 q8, q1, q0 \n" - "1: \n" + "vadd.s32 q8, q1, q0 \n" + "1: \n" // d0, d1: a // d2, d3: b LOAD2_DATA32_LANE(d0, d2, 0) @@ -950,6 +1428,64 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, #undef LOAD2_DATA32_LANE +void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 8 UV pixels. + "vld2.8 {d1, d3}, [%0]! \n" // load next 8 UV + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // U 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // V 16 bytes -> 8 shorts. + "vld2.8 {d16, d18}, [%1]! \n" // load 8 more UV + "vld2.8 {d17, d19}, [%1]! \n" // load last 8 UV + "vpadal.u8 q0, q8 \n" // U 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // V 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes + "vrshrn.u16 d1, q1, #2 \n" + "vst2.8 {d0, d1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "q0", "q1", "q8", "q9"); +} + +// Reads 4 pixels at a time. +void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, // pixel step + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src1_ptr = src_ptr + src_stepx * 2; + const uint8_t* src2_ptr = src_ptr + src_stepx * 4; + const uint8_t* src3_ptr = src_ptr + src_stepx * 6; + (void)src_stride; + asm volatile( + "1: \n" + "vld1.16 {d0[0]}, [%0], %6 \n" + "vld1.16 {d0[1]}, [%1], %6 \n" + "vld1.16 {d0[2]}, [%2], %6 \n" + "vld1.16 {d0[3]}, [%3], %6 \n" + "subs %5, %5, #4 \n" // 4 pixels per loop. 
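The new UV paths above treat a 2-byte UV pair as the pixel: ScaleUVRowDown2Box averages 2x2 pairs per plane, and ScaleUVRowDownEven subsamples by a caller-chosen step; the four staggered pointers merely let the vector loop gather four pairs per iteration. One pair per iteration in scalar form (hypothetical helper name, not part of this change):

#include <stdint.h>

static void ScaleUVRowDownEven_Ref(const uint8_t* src_ptr, int src_stepx,
                                   uint8_t* dst_ptr, int dst_width) {
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst_ptr[2 * i + 0] = src_ptr[0];  // U
    dst_ptr[2 * i + 1] = src_ptr[1];  // V
    src_ptr += src_stepx * 2;         // advance src_stepx UV pairs
  }
}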
+ "vst1.8 {d0}, [%4]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src1_ptr), // %1 + "+r"(src2_ptr), // %2 + "+r"(src3_ptr), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_width) // %5 + : "r"(src_stepx * 8) // %6 + : "memory", "cc", "d0"); +} + #endif // defined(__ARM_NEON__) && !defined(__aarch64__) #ifdef __cplusplus diff --git a/files/source/scale_neon64.cc b/files/source/scale_neon64.cc index 0a7b80ce..9f9636e6 100644 --- a/files/source/scale_neon64.cc +++ b/files/source/scale_neon64.cc @@ -29,10 +29,11 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, asm volatile( "1: \n" // load even pixels into v0, odd into v1 - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop - "st1 {v1.16b}, [%1], #16 \n" // store odd pixels - "b.gt 1b \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v1.16b}, [%1], #16 \n" // store odd pixels + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 @@ -50,11 +51,12 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, asm volatile( "1: \n" // load even pixels into v0, odd into v1 - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop - "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add - "st1 {v0.16b}, [%1], #16 \n" - "b.gt 1b \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 @@ -70,19 +72,21 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, int dst_width) { asm volatile( // change the stride to row 2 pointer - "add %1, %1, %0 \n" + "add %1, %1, %0 \n" "1: \n" - "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc - "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc - "subs %w3, %w3, #16 \n" // 16 processed per loop - "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent - "uaddlp v1.8h, v1.16b \n" - "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent - "uadalp v1.8h, v3.16b \n" - "rshrn v0.8b, v0.8h, #2 \n" // round and pack - "rshrn2 v0.16b, v1.8h, #2 \n" - "st1 {v0.16b}, [%2], #16 \n" - "b.gt 1b \n" + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc + "subs %w3, %w3, #16 \n" // 16 processed per loop + "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uaddlp v1.8h, v1.16b \n" + "prfm pldl1keep, [%1, 448] \n" + "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent + "uadalp v1.8h, v3.16b \n" + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "rshrn2 v0.16b, v1.8h, #2 \n" + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 @@ -99,10 +103,11 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr, (void)src_stride; asm volatile( "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "subs %w2, %w2, #8 \n" // 8 processed per loop - "st1 {v2.8b}, [%1], #8 \n" - "b.gt 1b \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #8 \n" // 8 processed per loop + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v2.8b}, [%1], #8 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -119,19 +124,23 @@ 
void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, const uint8_t* src_ptr3 = src_ptr + src_stride * 3; asm volatile( "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 - "ld1 {v1.16b}, [%2], #16 \n" - "ld1 {v2.16b}, [%3], #16 \n" - "ld1 {v3.16b}, [%4], #16 \n" - "subs %w5, %w5, #4 \n" - "uaddlp v0.8h, v0.16b \n" - "uadalp v0.8h, v1.16b \n" - "uadalp v0.8h, v2.16b \n" - "uadalp v0.8h, v3.16b \n" - "addp v0.8h, v0.8h, v0.8h \n" - "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding - "st1 {v0.s}[0], [%1], #4 \n" - "b.gt 1b \n" + "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 + "ld1 {v1.16b}, [%2], #16 \n" + "ld1 {v2.16b}, [%3], #16 \n" + "ld1 {v3.16b}, [%4], #16 \n" + "subs %w5, %w5, #4 \n" + "uaddlp v0.8h, v0.16b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uadalp v0.8h, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "uadalp v0.8h, v2.16b \n" + "prfm pldl1keep, [%3, 448] \n" + "uadalp v0.8h, v3.16b \n" + "prfm pldl1keep, [%4, 448] \n" + "addp v0.8h, v0.8h, v0.8h \n" + "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding + "st1 {v0.s}[0], [%1], #4 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(src_ptr1), // %2 @@ -151,12 +160,13 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "subs %w2, %w2, #24 \n" - "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2 - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #24 \n" + "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2 + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -169,49 +179,51 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "movi v20.8b, #3 \n" - "add %3, %3, %0 \n" - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 - "subs %w2, %w2, #24 \n" + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" // filter src line 0 with src line 1 // expand chars to shorts to allow for room // when adding lines together - "ushll v16.8h, v4.8b, #0 \n" - "ushll v17.8h, v5.8b, #0 \n" - "ushll v18.8h, v6.8b, #0 \n" - "ushll v19.8h, v7.8b, #0 \n" + "ushll v16.8h, v4.8b, #0 \n" + "ushll v17.8h, v5.8b, #0 \n" + "ushll v18.8h, v6.8b, #0 \n" + "ushll v19.8h, v7.8b, #0 \n" // 3 * line_0 + line_1 - "umlal v16.8h, v0.8b, v20.8b \n" - "umlal v17.8h, v1.8b, v20.8b \n" - "umlal v18.8h, v2.8b, v20.8b \n" - "umlal v19.8h, v3.8b, v20.8b \n" - - // (3 * line_0 + line_1) >> 2 - "uqrshrn v0.8b, v16.8h, #2 \n" - "uqrshrn v1.8b, v17.8h, #2 \n" - "uqrshrn v2.8b, v18.8h, #2 \n" - "uqrshrn v3.8b, v19.8h, #2 \n" - - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "ushll v16.8h, v1.8b, #0 \n" - "umlal v16.8h, v0.8b, v20.8b \n" - "uqrshrn v0.8b, v16.8h, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "urhadd v1.8b, v1.8b, v2.8b \n" - - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "ushll v16.8h, v2.8b, #0 \n" - "umlal v16.8h, v3.8b, v20.8b \n" - "uqrshrn v2.8b, v16.8h, #2 \n" - - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - - "b.gt 1b \n" + "umlal v16.8h, v0.8b, v20.8b \n" + "umlal v17.8h, v1.8b, v20.8b \n" + "umlal 
v18.8h, v2.8b, v20.8b \n" + "umlal v19.8h, v3.8b, v20.8b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + // (3 * line_0 + line_1 + 2) >> 2 + "uqrshrn v0.8b, v16.8h, #2 \n" + "uqrshrn v1.8b, v17.8h, #2 \n" + "uqrshrn v2.8b, v18.8h, #2 \n" + "uqrshrn v3.8b, v19.8h, #2 \n" + "prfm pldl1keep, [%3, 448] \n" + + // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 + "ushll v16.8h, v1.8b, #0 \n" + "umlal v16.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v16.8h, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" + + // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 + "ushll v16.8h, v2.8b, #0 \n" + "umlal v16.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v16.8h, #2 \n" + + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -226,33 +238,35 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( - "movi v20.8b, #3 \n" - "add %3, %3, %0 \n" - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 - "subs %w2, %w2, #24 \n" + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" // average src line 0 with src line 1 - "urhadd v0.8b, v0.8b, v4.8b \n" - "urhadd v1.8b, v1.8b, v5.8b \n" - "urhadd v2.8b, v2.8b, v6.8b \n" - "urhadd v3.8b, v3.8b, v7.8b \n" - - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "ushll v4.8h, v1.8b, #0 \n" - "umlal v4.8h, v0.8b, v20.8b \n" - "uqrshrn v0.8b, v4.8h, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "urhadd v1.8b, v1.8b, v2.8b \n" - - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "ushll v4.8h, v2.8b, #0 \n" - "umlal v4.8h, v3.8b, v20.8b \n" - "uqrshrn v2.8b, v4.8h, #2 \n" - - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" + "urhadd v0.8b, v0.8b, v4.8b \n" + "urhadd v1.8b, v1.8b, v5.8b \n" + "urhadd v2.8b, v2.8b, v6.8b \n" + "urhadd v3.8b, v3.8b, v7.8b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 + "ushll v4.8h, v1.8b, #0 \n" + "umlal v4.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v4.8h, #2 \n" + "prfm pldl1keep, [%3, 448] \n" + + // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" + + // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 + "ushll v4.8h, v2.8b, #0 \n" + "umlal v4.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v4.8h, #2 \n" + + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -279,14 +293,15 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "ld1 {v3.16b}, [%3] \n" - "1: \n" - "ld1 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #12 \n" - "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" - "st1 {v2.8b}, [%1], #8 \n" - "st1 {v2.s}[2], [%1], #4 \n" - "b.gt 1b \n" + "ld1 {v3.16b}, [%3] \n" + "1: \n" + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #12 \n" + "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v2.8b}, [%1], #8 \n" + "st1 {v2.s}[2], [%1], #4 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -303,68 +318,68 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, ptrdiff_t tmp_src_stride = src_stride; asm volatile( - "ld1 {v29.8h}, [%5] \n" - "ld1 {v30.16b}, [%6] \n" - "ld1 {v31.8h}, [%7] \n" - "add 
%2, %2, %0 \n" - "1: \n" + "ld1 {v29.8h}, [%5] \n" + "ld1 {v30.16b}, [%6] \n" + "ld1 {v31.8h}, [%7] \n" + "add %2, %2, %0 \n" + "1: \n" // 00 40 01 41 02 42 03 43 // 10 50 11 51 12 52 13 53 // 20 60 21 61 22 62 23 63 // 30 70 31 71 32 72 33 73 - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" - "subs %w4, %w4, #12 \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" + "subs %w4, %w4, #12 \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 // 00 10 01 11 02 12 03 13 // 40 50 41 51 42 52 43 53 - "trn1 v20.8b, v0.8b, v1.8b \n" - "trn2 v21.8b, v0.8b, v1.8b \n" - "trn1 v22.8b, v4.8b, v5.8b \n" - "trn2 v23.8b, v4.8b, v5.8b \n" - "trn1 v24.8b, v16.8b, v17.8b \n" - "trn2 v25.8b, v16.8b, v17.8b \n" + "trn1 v20.8b, v0.8b, v1.8b \n" + "trn2 v21.8b, v0.8b, v1.8b \n" + "trn1 v22.8b, v4.8b, v5.8b \n" + "trn2 v23.8b, v4.8b, v5.8b \n" + "trn1 v24.8b, v16.8b, v17.8b \n" + "trn2 v25.8b, v16.8b, v17.8b \n" // 20 30 21 31 22 32 23 33 // 60 70 61 71 62 72 63 73 - "trn1 v0.8b, v2.8b, v3.8b \n" - "trn2 v1.8b, v2.8b, v3.8b \n" - "trn1 v4.8b, v6.8b, v7.8b \n" - "trn2 v5.8b, v6.8b, v7.8b \n" - "trn1 v16.8b, v18.8b, v19.8b \n" - "trn2 v17.8b, v18.8b, v19.8b \n" + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + "trn1 v16.8b, v18.8b, v19.8b \n" + "trn2 v17.8b, v18.8b, v19.8b \n" // 00+10 01+11 02+12 03+13 // 40+50 41+51 42+52 43+53 - "uaddlp v20.4h, v20.8b \n" - "uaddlp v21.4h, v21.8b \n" - "uaddlp v22.4h, v22.8b \n" - "uaddlp v23.4h, v23.8b \n" - "uaddlp v24.4h, v24.8b \n" - "uaddlp v25.4h, v25.8b \n" + "uaddlp v20.4h, v20.8b \n" + "uaddlp v21.4h, v21.8b \n" + "uaddlp v22.4h, v22.8b \n" + "uaddlp v23.4h, v23.8b \n" + "uaddlp v24.4h, v24.8b \n" + "uaddlp v25.4h, v25.8b \n" // 60+70 61+71 62+72 63+73 - "uaddlp v1.4h, v1.8b \n" - "uaddlp v5.4h, v5.8b \n" - "uaddlp v17.4h, v17.8b \n" + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + "uaddlp v17.4h, v17.8b \n" // combine source lines - "add v20.4h, v20.4h, v22.4h \n" - "add v21.4h, v21.4h, v23.4h \n" - "add v20.4h, v20.4h, v24.4h \n" - "add v21.4h, v21.4h, v25.4h \n" - "add v2.4h, v1.4h, v5.4h \n" - "add v2.4h, v2.4h, v17.4h \n" + "add v20.4h, v20.4h, v22.4h \n" + "add v21.4h, v21.4h, v23.4h \n" + "add v20.4h, v20.4h, v24.4h \n" + "add v21.4h, v21.4h, v25.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + "add v2.4h, v2.4h, v17.4h \n" // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] // + s[6 + st * 1] + s[7 + st * 1] // + s[6 + st * 2] + s[7 + st * 2]) / 6 - "sqrdmulh v2.8h, v2.8h, v29.8h \n" - "xtn v2.8b, v2.8h \n" + "sqrdmulh v2.8h, v2.8h, v29.8h \n" + "xtn v2.8b, v2.8h \n" // Shuffle 2,3 reg around so that 2 can be added to the // 0,1 reg and 3 can be added to the 4,5 reg. This @@ -372,35 +387,38 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, // registers are already expanded. Then do transposes // to get aligned. 
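All of this register shuffling implements a 3/8 box reduction: every 8 source columns across 3 rows become 3 output pixels, two averaged over 3x3 windows and one over a 2x3 window. A scalar sketch of the same arithmetic, assuming dst_width is a multiple of 3 (the 65536/n scaling is covered by the comment block below):

  #include <stddef.h>
  #include <stdint.h>

  static void ScaleRowDown38_3_Box_Ref(const uint8_t* src_ptr,
                                       ptrdiff_t stride,
                                       uint8_t* dst_ptr,
                                       int dst_width) {
    int i;
    for (i = 0; i < dst_width; i += 3) {
      const uint8_t* s0 = src_ptr;
      const uint8_t* s1 = src_ptr + stride;
      const uint8_t* s2 = src_ptr + stride * 2;
      // Two outputs from 3x3 windows, divided by 9:
      dst_ptr[0] = (uint8_t)((s0[0] + s0[1] + s0[2] + s1[0] + s1[1] + s1[2] +
                              s2[0] + s2[1] + s2[2]) * (65536 / 9) >> 16);
      dst_ptr[1] = (uint8_t)((s0[3] + s0[4] + s0[5] + s1[3] + s1[4] + s1[5] +
                              s2[3] + s2[4] + s2[5]) * (65536 / 9) >> 16);
      // The last output covers only 2 columns, divided by 6:
      dst_ptr[2] = (uint8_t)((s0[6] + s0[7] + s1[6] + s1[7] + s2[6] + s2[7]) *
                             (65536 / 6) >> 16);
      src_ptr += 8;
      dst_ptr += 3;
    }
  }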
// xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "ushll v16.8h, v16.8b, #0 \n" - "uaddl v0.8h, v0.8b, v4.8b \n" + "ushll v16.8h, v16.8b, #0 \n" + "uaddl v0.8h, v0.8b, v4.8b \n" // combine source lines - "add v0.8h, v0.8h, v16.8h \n" + "add v0.8h, v0.8h, v16.8h \n" // xx 20 xx 21 xx 22 xx 23 // xx 30 xx 31 xx 32 xx 33 - "trn1 v1.8h, v0.8h, v0.8h \n" - "trn2 v4.8h, v0.8h, v0.8h \n" - "xtn v0.4h, v1.4s \n" - "xtn v4.4h, v4.4s \n" + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead // 0+1+2, 3+4+5 - "add v20.8h, v20.8h, v0.8h \n" - "add v21.8h, v21.8h, v4.8h \n" + "add v20.8h, v20.8h, v0.8h \n" + "add v21.8h, v21.8h, v4.8h \n" + "prfm pldl1keep, [%2, 448] \n" // Need to divide, but can't downshift as the the value // isn't a power of 2. So multiply by 65536 / n // and take the upper 16 bits. - "sqrdmulh v0.8h, v20.8h, v31.8h \n" - "sqrdmulh v1.8h, v21.8h, v31.8h \n" + "sqrdmulh v0.8h, v20.8h, v31.8h \n" + "sqrdmulh v1.8h, v21.8h, v31.8h \n" + "prfm pldl1keep, [%3, 448] \n" // Align for table lookup, vtbl requires registers to be adjacent - "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" - "st1 {v3.8b}, [%1], #8 \n" - "st1 {v3.s}[2], [%1], #4 \n" - "b.gt 1b \n" + "st1 {v3.8b}, [%1], #8 \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(tmp_src_stride), // %2 @@ -422,53 +440,53 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, // TODO(fbarchard): use src_stride directly for clang 3.5+. ptrdiff_t tmp_src_stride = src_stride; asm volatile( - "ld1 {v30.8h}, [%4] \n" - "ld1 {v31.16b}, [%5] \n" - "add %2, %2, %0 \n" - "1: \n" + "ld1 {v30.8h}, [%4] \n" + "ld1 {v31.16b}, [%5] \n" + "add %2, %2, %0 \n" + "1: \n" // 00 40 01 41 02 42 03 43 // 10 50 11 51 12 52 13 53 // 20 60 21 61 22 62 23 63 // 30 70 31 71 32 72 33 73 - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - "subs %w3, %w3, #12 \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "subs %w3, %w3, #12 \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 // 00 10 01 11 02 12 03 13 // 40 50 41 51 42 52 43 53 - "trn1 v16.8b, v0.8b, v1.8b \n" - "trn2 v17.8b, v0.8b, v1.8b \n" - "trn1 v18.8b, v4.8b, v5.8b \n" - "trn2 v19.8b, v4.8b, v5.8b \n" + "trn1 v16.8b, v0.8b, v1.8b \n" + "trn2 v17.8b, v0.8b, v1.8b \n" + "trn1 v18.8b, v4.8b, v5.8b \n" + "trn2 v19.8b, v4.8b, v5.8b \n" // 20 30 21 31 22 32 23 33 // 60 70 61 71 62 72 63 73 - "trn1 v0.8b, v2.8b, v3.8b \n" - "trn2 v1.8b, v2.8b, v3.8b \n" - "trn1 v4.8b, v6.8b, v7.8b \n" - "trn2 v5.8b, v6.8b, v7.8b \n" + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" // 00+10 01+11 02+12 03+13 // 40+50 41+51 42+52 43+53 - "uaddlp v16.4h, v16.8b \n" - "uaddlp v17.4h, v17.8b \n" - "uaddlp v18.4h, v18.8b \n" - "uaddlp v19.4h, v19.8b \n" + "uaddlp v16.4h, v16.8b \n" + "uaddlp v17.4h, v17.8b \n" + "uaddlp v18.4h, v18.8b \n" + "uaddlp v19.4h, v19.8b \n" // 60+70 61+71 62+72 63+73 - "uaddlp v1.4h, v1.8b \n" - "uaddlp v5.4h, v5.8b \n" + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" // combine source lines - "add v16.4h, v16.4h, v18.4h \n" - "add v17.4h, v17.4h, v19.4h \n" - "add v2.4h, v1.4h, v5.4h \n" + "add v16.4h, v16.4h, v18.4h \n" + "add v17.4h, v17.4h, v19.4h \n" + "add v2.4h, v1.4h, v5.4h \n" // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "uqrshrn v2.8b, v2.8h, #2 \n" + "uqrshrn v2.8b, v2.8h, #2 \n" // Shuffle 2,3 reg around so that 2 can be added to the // 0,1 reg and 3 can be added to the 4,5 reg. This @@ -478,33 +496,35 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 // combine source lines - "uaddl v0.8h, v0.8b, v4.8b \n" + "uaddl v0.8h, v0.8b, v4.8b \n" // xx 20 xx 21 xx 22 xx 23 // xx 30 xx 31 xx 32 xx 33 - "trn1 v1.8h, v0.8h, v0.8h \n" - "trn2 v4.8h, v0.8h, v0.8h \n" - "xtn v0.4h, v1.4s \n" - "xtn v4.4h, v4.4s \n" + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead // 0+1+2, 3+4+5 - "add v16.8h, v16.8h, v0.8h \n" - "add v17.8h, v17.8h, v4.8h \n" + "add v16.8h, v16.8h, v0.8h \n" + "add v17.8h, v17.8h, v4.8h \n" + "prfm pldl1keep, [%2, 448] \n" // Need to divide, but can't downshift as the the value // isn't a power of 2. So multiply by 65536 / n // and take the upper 16 bits. 
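Concretely, 65536/6 truncates to 10922, so (sum * 10922) >> 16 approximates sum / 6 with at most one LSB of downward bias; sqrdmulh itself is a doubling, rounding high-half multiply, so the per-lane constants it is fed would differ from these plain ratios by a factor of two and pick up rounding. A quick scalar check of the bias:

  #include <assert.h>
  #include <stdint.h>

  int main(void) {
    const uint32_t kDiv6 = 65536 / 6;            // 10922, truncated
    assert(((6u * 200u) * kDiv6) >> 16 == 199);  // exact answer is 200
    assert(((6u * 5u) * kDiv6) >> 16 == 4);      // exact answer is 5
    return 0;
  }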
- "sqrdmulh v0.8h, v16.8h, v30.8h \n" - "sqrdmulh v1.8h, v17.8h, v30.8h \n" + "sqrdmulh v0.8h, v16.8h, v30.8h \n" + "sqrdmulh v1.8h, v17.8h, v30.8h \n" // Align for table lookup, vtbl requires registers to // be adjacent - "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" - "st1 {v3.8b}, [%1], #8 \n" - "st1 {v3.s}[2], [%1], #4 \n" - "b.gt 1b \n" + "st1 {v3.8b}, [%1], #8 \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(tmp_src_stride), // %2 @@ -515,6 +535,488 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, "v19", "v30", "v31", "memory", "cc"); } +void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_temp = src_ptr + 1; + asm volatile( + "movi v31.8b, #3 \n" + + "1: \n" + "ldr d0, [%0], #8 \n" // 01234567 + "ldr d1, [%1], #8 \n" // 12345678 + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b) + "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b) + + "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd) + "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even) + + "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd) + "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even) + + "st2 {v1.8b, v2.8b}, [%2], #16 \n" // store + "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_temp), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List + ); +} + +void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + uint8_t* dst_ptr1 = dst_ptr + dst_stride; + const uint8_t* src_temp = src_ptr + 1; + const uint8_t* src_temp1 = src_ptr1 + 1; + + asm volatile( + "movi v31.8b, #3 \n" + "movi v30.8h, #3 \n" + + "1: \n" + "ldr d0, [%0], #8 \n" // 01234567 + "ldr d1, [%2], #8 \n" // 12345678 + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b) + "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b) + "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd) + "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even) + + "ldr d0, [%1], #8 \n" + "ldr d1, [%3], #8 \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + + "ushll v4.8h, v0.8b, #0 \n" // 01234567 (16b) + "ushll v5.8h, v1.8b, #0 \n" // 12345678 (16b) + "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd) + "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even) + + "mov v0.16b, v4.16b \n" + "mov v1.16b, v5.16b \n" + "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd) + "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even) + "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd) + "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even) + + "rshrn v2.8b, v2.8h, #4 \n" // 2, odd + "rshrn v1.8b, v3.8h, #4 \n" // 2, even + "rshrn v4.8b, v4.8h, #4 \n" // 1, odd + "rshrn v3.8b, v5.8h, #4 \n" // 1, even + + "st2 {v1.8b, v2.8b}, [%5], #16 \n" // store 1 + "st2 {v3.8b, v4.8b}, [%4], #16 \n" // store 2 + "subs %w6, %w6, #16 \n" // 8 sample -> 16 sample + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(src_temp), // %2 + "+r"(src_temp1), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_ptr1), // %5 + "+r"(dst_width) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", + "v31" // Clobber List + ); +} + +void 
ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + const uint16_t* src_temp = src_ptr + 1; + asm volatile( + "movi v31.8h, #3 \n" + + "1: \n" + "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) + "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + "mov v2.16b, v0.16b \n" + "mla v0.8h, v1.8h, v31.8h \n" // 3*near+far (odd) + "mla v1.8h, v2.8h, v31.8h \n" // 3*near+far (even) + + "urshr v2.8h, v0.8h, #2 \n" // 3/4*near+1/4*far (odd) + "urshr v1.8h, v1.8h, #2 \n" // 3/4*near+1/4*far (even) + + "st2 {v1.8h, v2.8h}, [%2], #32 \n" // store + "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_temp), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List + ); +} + +void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* src_ptr1 = src_ptr + src_stride; + uint16_t* dst_ptr1 = dst_ptr + dst_stride; + const uint16_t* src_temp = src_ptr + 1; + const uint16_t* src_temp1 = src_ptr1 + 1; + + asm volatile( + "movi v31.8h, #3 \n" + + "1: \n" + "ld1 {v2.8h}, [%0], #16 \n" // 01234567 (16b) + "ld1 {v3.8h}, [%2], #16 \n" // 12345678 (16b) + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + "mov v0.16b, v2.16b \n" + "mla v2.8h, v3.8h, v31.8h \n" // 3*near+far (odd) + "mla v3.8h, v0.8h, v31.8h \n" // 3*near+far (even) + + "ld1 {v4.8h}, [%1], #16 \n" // 01234567 (16b) + "ld1 {v5.8h}, [%3], #16 \n" // 12345678 (16b) + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + + "mov v0.16b, v4.16b \n" + "mla v4.8h, v5.8h, v31.8h \n" // 3*near+far (odd) + "mla v5.8h, v0.8h, v31.8h \n" // 3*near+far (even) + + "mov v0.16b, v4.16b \n" + "mov v1.16b, v5.16b \n" + "mla v4.8h, v2.8h, v31.8h \n" // 9 3 3 1 (1, odd) + "mla v5.8h, v3.8h, v31.8h \n" // 9 3 3 1 (1, even) + "mla v2.8h, v0.8h, v31.8h \n" // 9 3 3 1 (2, odd) + "mla v3.8h, v1.8h, v31.8h \n" // 9 3 3 1 (2, even) + + "urshr v2.8h, v2.8h, #4 \n" // 2, odd + "urshr v1.8h, v3.8h, #4 \n" // 2, even + "urshr v4.8h, v4.8h, #4 \n" // 1, odd + "urshr v3.8h, v5.8h, #4 \n" // 1, even + + "st2 {v3.8h, v4.8h}, [%4], #32 \n" // store 1 + "st2 {v1.8h, v2.8h}, [%5], #32 \n" // store 2 + + "subs %w6, %w6, #16 \n" // 8 sample -> 16 sample + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(src_temp), // %2 + "+r"(src_temp1), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_ptr1), // %5 + "+r"(dst_width) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v31" // Clobber List + ); +} + +void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + const uint16_t* src_temp = src_ptr + 1; + asm volatile( + "movi v31.8h, #3 \n" + + "1: \n" + "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) + "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b) + "ushll2 v3.4s, v0.8h, #0 \n" // 4567 (32b) + "ushll v4.4s, v1.4h, #0 \n" // 1234 (32b) + "ushll2 v5.4s, v1.8h, #0 \n" // 5678 (32b) + + "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) + "umlal2 v3.4s, v1.8h, v31.8h \n" // 3*near+far (2, odd) + "umlal v4.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) + "umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (2, even) + + "rshrn v0.4h, v4.4s, #2 \n" // 3/4*near+1/4*far + "rshrn2 v0.8h, v5.4s, #2 \n" // 3/4*near+1/4*far 
(even) + "rshrn v1.4h, v2.4s, #2 \n" // 3/4*near+1/4*far + "rshrn2 v1.8h, v3.4s, #2 \n" // 3/4*near+1/4*far (odd) + + "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store + "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_temp), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List + ); +} + +void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* src_ptr1 = src_ptr + src_stride; + uint16_t* dst_ptr1 = dst_ptr + dst_stride; + const uint16_t* src_temp = src_ptr + 1; + const uint16_t* src_temp1 = src_ptr1 + 1; + + asm volatile( + "movi v31.4h, #3 \n" + "movi v30.4s, #3 \n" + + "1: \n" + "ldr d0, [%0], #8 \n" // 0123 (16b) + "ldr d1, [%2], #8 \n" // 1234 (16b) + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b) + "ushll v3.4s, v1.4h, #0 \n" // 1234 (32b) + "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) + "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) + + "ldr d0, [%1], #8 \n" // 0123 (16b) + "ldr d1, [%3], #8 \n" // 1234 (16b) + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "ushll v4.4s, v0.4h, #0 \n" // 0123 (32b) + "ushll v5.4s, v1.4h, #0 \n" // 1234 (32b) + "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd) + "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even) + + "mov v0.16b, v4.16b \n" + "mov v1.16b, v5.16b \n" + "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd) + "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even) + "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd) + "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even) + + "rshrn v1.4h, v4.4s, #4 \n" // 3/4*near+1/4*far + "rshrn v0.4h, v5.4s, #4 \n" // 3/4*near+1/4*far + "rshrn v5.4h, v2.4s, #4 \n" // 3/4*near+1/4*far + "rshrn v4.4h, v3.4s, #4 \n" // 3/4*near+1/4*far + + "st2 {v0.4h, v1.4h}, [%4], #16 \n" // store 1 + "st2 {v4.4h, v5.4h}, [%5], #16 \n" // store 2 + + "subs %w6, %w6, #8 \n" // 4 sample -> 8 sample + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(src_temp), // %2 + "+r"(src_temp1), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_ptr1), // %5 + "+r"(dst_width) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", + "v31" // Clobber List + ); +} + +void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_temp = src_ptr + 2; + asm volatile( + "movi v31.8b, #3 \n" + + "1: \n" + "ldr d0, [%0], #8 \n" // 00112233 (1u1v) + "ldr d1, [%1], #8 \n" // 11223344 (1u1v) + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + "ushll v2.8h, v0.8b, #0 \n" // 00112233 (1u1v, 16b) + "ushll v3.8h, v1.8b, #0 \n" // 11223344 (1u1v, 16b) + + "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd) + "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even) + + "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd) + "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even) + + "st2 {v1.4h, v2.4h}, [%2], #16 \n" // store + "subs %w3, %w3, #8 \n" // 4 uv -> 8 uv + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_temp), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List + ); +} + +void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + uint8_t* dst_ptr1 = dst_ptr 
+ dst_stride; + const uint8_t* src_temp = src_ptr + 2; + const uint8_t* src_temp1 = src_ptr1 + 2; + + asm volatile( + "movi v31.8b, #3 \n" + "movi v30.8h, #3 \n" + + "1: \n" + "ldr d0, [%0], #8 \n" + "ldr d1, [%2], #8 \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + "ushll v2.8h, v0.8b, #0 \n" + "ushll v3.8h, v1.8b, #0 \n" + "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd) + "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even) + + "ldr d0, [%1], #8 \n" + "ldr d1, [%3], #8 \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + + "ushll v4.8h, v0.8b, #0 \n" + "ushll v5.8h, v1.8b, #0 \n" + "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd) + "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even) + + "mov v0.16b, v4.16b \n" + "mov v1.16b, v5.16b \n" + "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd) + "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even) + "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd) + "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even) + + "rshrn v2.8b, v2.8h, #4 \n" // 2, odd + "rshrn v1.8b, v3.8h, #4 \n" // 2, even + "rshrn v4.8b, v4.8h, #4 \n" // 1, odd + "rshrn v3.8b, v5.8h, #4 \n" // 1, even + + "st2 {v1.4h, v2.4h}, [%5], #16 \n" // store 2 + "st2 {v3.4h, v4.4h}, [%4], #16 \n" // store 1 + "subs %w6, %w6, #8 \n" // 4 uv -> 8 uv + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(src_temp), // %2 + "+r"(src_temp1), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_ptr1), // %5 + "+r"(dst_width) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", + "v31" // Clobber List + ); +} + +void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + const uint16_t* src_temp = src_ptr + 2; + asm volatile( + "movi v31.8h, #3 \n" + + "1: \n" + "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) + "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + "ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b) + "ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b) + "ushll2 v4.4s, v0.8h, #0 \n" // 2233 (1u1v, 32b) + "ushll2 v5.4s, v1.8h, #0 \n" // 3344 (1u1v, 32b) + + "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (odd) + "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (even) + "umlal2 v4.4s, v1.8h, v31.8h \n" // 3*near+far (odd) + "umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (even) + + "rshrn v2.4h, v2.4s, #2 \n" // 3/4*near+1/4*far (odd) + "rshrn v1.4h, v3.4s, #2 \n" // 3/4*near+1/4*far (even) + "rshrn v4.4h, v4.4s, #2 \n" // 3/4*near+1/4*far (odd) + "rshrn v3.4h, v5.4s, #2 \n" // 3/4*near+1/4*far (even) + + "st2 {v1.2s, v2.2s}, [%2], #16 \n" // store + "st2 {v3.2s, v4.2s}, [%2], #16 \n" // store + "subs %w3, %w3, #8 \n" // 4 uv -> 8 uv + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_temp), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v31" // Clobber List + ); +} + +void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* src_ptr1 = src_ptr + src_stride; + uint16_t* dst_ptr1 = dst_ptr + dst_stride; + const uint16_t* src_temp = src_ptr + 2; + const uint16_t* src_temp1 = src_ptr1 + 2; + + asm volatile( + "movi v31.4h, #3 \n" + "movi v30.4s, #3 \n" + + "1: \n" + "ldr d0, [%0], #8 \n" + "ldr d1, [%2], #8 \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b) + "ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b) + 
"umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) + "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) + + "ldr d0, [%1], #8 \n" + "ldr d1, [%3], #8 \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "ushll v4.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b) + "ushll v5.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b) + "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd) + "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even) + + "mov v0.16b, v4.16b \n" + "mov v1.16b, v5.16b \n" + "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd) + "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even) + "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd) + "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even) + + "rshrn v1.4h, v2.4s, #4 \n" // 2, odd + "rshrn v0.4h, v3.4s, #4 \n" // 2, even + "rshrn v3.4h, v4.4s, #4 \n" // 1, odd + "rshrn v2.4h, v5.4s, #4 \n" // 1, even + + "st2 {v0.2s, v1.2s}, [%5], #16 \n" // store 2 + "st2 {v2.2s, v3.2s}, [%4], #16 \n" // store 1 + "subs %w6, %w6, #4 \n" // 2 uv -> 4 uv + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(src_temp), // %2 + "+r"(src_temp1), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_ptr1), // %5 + "+r"(dst_width) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", + "v31" // Clobber List + ); +} + // Add a row of bytes to a row of shorts. Used for box filter. // Reads 16 bytes and accumulates to 16 shorts at a time. void ScaleAddRow_NEON(const uint8_t* src_ptr, @@ -522,13 +1024,14 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr, int src_width) { asm volatile( "1: \n" - "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator - "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes - "uaddw2 v2.8h, v2.8h, v0.16b \n" // add - "uaddw v1.8h, v1.8h, v0.8b \n" - "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator - "subs %w2, %w2, #16 \n" // 16 processed per loop - "b.gt 1b \n" + "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator + "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes + "uaddw2 v2.8h, v2.8h, v0.16b \n" // add + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uaddw v1.8h, v1.8h, v0.8b \n" + "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator + "subs %w2, %w2, #16 \n" // 16 processed per loop + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(src_width) // %2 @@ -560,17 +1063,17 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr, int64_t x64 = (int64_t)x; // NOLINT int64_t dx64 = (int64_t)dx; // NOLINT asm volatile ( - "dup v0.4s, %w3 \n" // x - "dup v1.4s, %w4 \n" // dx - "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 - "shl v3.4s, v1.4s, #2 \n" // 4 * dx - "mul v1.4s, v1.4s, v2.4s \n" + "dup v0.4s, %w3 \n" // x + "dup v1.4s, %w4 \n" // dx + "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 + "shl v3.4s, v1.4s, #2 \n" // 4 * dx + "mul v1.4s, v1.4s, v2.4s \n" // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "add v1.4s, v1.4s, v0.4s \n" + "add v1.4s, v1.4s, v0.4s \n" // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx - "add v2.4s, v1.4s, v3.4s \n" - "shl v0.4s, v3.4s, #1 \n" // 8 * dx - "1: \n" + "add v2.4s, v1.4s, v3.4s \n" + "shl v0.4s, v3.4s, #1 \n" // 8 * dx + "1: \n" LOAD2_DATA8_LANE(0) LOAD2_DATA8_LANE(1) LOAD2_DATA8_LANE(2) @@ -579,27 +1082,27 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr, LOAD2_DATA8_LANE(5) LOAD2_DATA8_LANE(6) LOAD2_DATA8_LANE(7) - "mov v6.16b, v1.16b \n" - "mov v7.16b, v2.16b \n" - "uzp1 v6.8h, v6.8h, v7.8h \n" - "ushll v4.8h, v4.8b, #0 \n" - "ushll v5.8h, v5.8b, #0 \n" - "ssubl v16.4s, v5.4h, v4.4h \n" - "ssubl2 v17.4s, v5.8h, v4.8h \n" - "ushll v7.4s, v6.4h, #0 \n" - "ushll2 v6.4s, v6.8h, #0 \n" - "mul v16.4s, 
v16.4s, v7.4s \n" - "mul v17.4s, v17.4s, v6.4s \n" - "rshrn v6.4h, v16.4s, #16 \n" - "rshrn2 v6.8h, v17.4s, #16 \n" - "add v4.8h, v4.8h, v6.8h \n" - "xtn v4.8b, v4.8h \n" - - "st1 {v4.8b}, [%0], #8 \n" // store pixels - "add v1.4s, v1.4s, v0.4s \n" - "add v2.4s, v2.4s, v0.4s \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - "b.gt 1b \n" + "mov v6.16b, v1.16b \n" + "mov v7.16b, v2.16b \n" + "uzp1 v6.8h, v6.8h, v7.8h \n" + "ushll v4.8h, v4.8b, #0 \n" + "ushll v5.8h, v5.8b, #0 \n" + "ssubl v16.4s, v5.4h, v4.4h \n" + "ssubl2 v17.4s, v5.8h, v4.8h \n" + "ushll v7.4s, v6.4h, #0 \n" + "ushll2 v6.4s, v6.8h, #0 \n" + "mul v16.4s, v16.4s, v7.4s \n" + "mul v17.4s, v17.4s, v6.4s \n" + "rshrn v6.4h, v16.4s, #16 \n" + "rshrn2 v6.8h, v17.4s, #16 \n" + "add v4.8h, v4.8h, v6.8h \n" + "xtn v4.8b, v4.8h \n" + + "st1 {v4.8b}, [%0], #8 \n" // store pixels + "add v1.4s, v1.4s, v0.4s \n" + "add v2.4s, v2.4s, v0.4s \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "b.gt 1b \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(dst_width), // %2 @@ -623,74 +1126,83 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr, int source_y_fraction) { int y_fraction = 256 - source_y_fraction; asm volatile( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "add %2, %2, %1 \n" - "cmp %w4, #64 \n" - "b.eq 75f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" - "cmp %w4, #192 \n" - "b.eq 25f \n" - - "dup v5.8b, %w4 \n" - "dup v4.8b, %w5 \n" + "cmp %w4, #0 \n" + "b.eq 100f \n" + "add %2, %2, %1 \n" + "cmp %w4, #64 \n" + "b.eq 75f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + "cmp %w4, #192 \n" + "b.eq 25f \n" + + "dup v5.8b, %w4 \n" + "dup v4.8b, %w5 \n" // General purpose row blend. "1: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "umull v6.8h, v0.8b, v4.8b \n" - "umull2 v7.8h, v0.16b, v4.16b \n" - "umlal v6.8h, v1.8b, v5.8b \n" - "umlal2 v7.8h, v1.16b, v5.16b \n" - "rshrn v0.8b, v6.8h, #8 \n" - "rshrn2 v0.16b, v7.8h, #8 \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v6.8h, v0.8b, v4.8b \n" + "umull2 v7.8h, v0.16b, v4.16b \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "umlal v6.8h, v1.8b, v5.8b \n" + "umlal2 v7.8h, v1.16b, v5.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "rshrn v0.8b, v6.8h, #8 \n" + "rshrn2 v0.16b, v7.8h, #8 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" // Blend 25 / 75. "25: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 25b \n" - "b 99f \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 25b \n" + "b 99f \n" // Blend 50 / 50. "50: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" // Blend 75 / 25. 
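ScaleFilterRows special-cases y fractions 0, 64, 128 and 192 out of 256: a 50/50 blend is a single rounding half-add (urhadd), and the 25/75 and 75/25 blends chain two of them, avoiding the general multiply path. The chained form, in scalar terms:

  #include <stdint.h>

  static inline uint8_t RoundHalfAdd(uint8_t a, uint8_t b) {
    return (uint8_t)((a + b + 1) >> 1);  // what urhadd computes per lane
  }

  // Fraction 192/256: roughly 1/4 * row0 + 3/4 * row1.
  static inline uint8_t Blend25_75(uint8_t row0, uint8_t row1) {
    return RoundHalfAdd(RoundHalfAdd(row0, row1), row1);
  }

  // Fraction 64/256: operands swapped, roughly 3/4 * row0 + 1/4 * row1.
  static inline uint8_t Blend75_25(uint8_t row0, uint8_t row1) {
    return RoundHalfAdd(RoundHalfAdd(row1, row0), row0);
  }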
"75: \n" - "ld1 {v1.16b}, [%1], #16 \n" - "ld1 {v0.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 75b \n" - "b 99f \n" + "ld1 {v1.16b}, [%1], #16 \n" + "ld1 {v0.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 75b \n" + "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "subs %w3, %w3, #16 \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 100b \n" + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" "99: \n" - "st1 {v0.b}[15], [%0] \n" + "st1 {v0.b}[15], [%0] \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(src_stride), // %2 @@ -709,11 +1221,12 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, asm volatile( "1: \n" // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 - "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - "mov v2.16b, v3.16b \n" - "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels - "b.gt 1b \n" + "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "mov v2.16b, v3.16b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 @@ -730,13 +1243,14 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, asm volatile( "1: \n" // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 - "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - - "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add - "urhadd v1.16b, v2.16b, v3.16b \n" - "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels - "b.gt 1b \n" + "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "urhadd v1.16b, v2.16b, v3.16b \n" + "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 @@ -751,25 +1265,27 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, int dst_width) { asm volatile( // change the stride to row 2 pointer - "add %1, %1, %0 \n" + "add %1, %1, %0 \n" "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. - "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 - "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. - "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. 
- "rshrn v0.8b, v0.8h, #2 \n" // round and pack - "rshrn v1.8b, v1.8h, #2 \n" - "rshrn v2.8b, v2.8h, #2 \n" - "rshrn v3.8b, v3.8h, #2 \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" - "b.gt 1b \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. + "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 + "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. + "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "rshrn v1.8b, v1.8h, #2 \n" + "rshrn v2.8b, v2.8h, #2 \n" + "rshrn v3.8b, v3.8h, #2 \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 @@ -788,13 +1304,14 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, (void)src_stride; asm volatile( "1: \n" - "ld1 {v0.s}[0], [%0], %3 \n" - "ld1 {v0.s}[1], [%0], %3 \n" - "ld1 {v0.s}[2], [%0], %3 \n" - "ld1 {v0.s}[3], [%0], %3 \n" - "subs %w2, %w2, #4 \n" // 4 pixels per loop. - "st1 {v0.16b}, [%1], #16 \n" - "b.gt 1b \n" + "ld1 {v0.s}[0], [%0], %3 \n" + "ld1 {v0.s}[1], [%0], %3 \n" + "ld1 {v0.s}[2], [%0], %3 \n" + "ld1 {v0.s}[3], [%0], %3 \n" + "subs %w2, %w2, #4 \n" // 4 pixels per loop. + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 @@ -812,33 +1329,35 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { asm volatile( - "add %1, %1, %0 \n" + "add %1, %1, %0 \n" "1: \n" - "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1 - "ld1 {v1.8b}, [%1], %4 \n" - "ld1 {v2.8b}, [%0], %4 \n" - "ld1 {v3.8b}, [%1], %4 \n" - "ld1 {v4.8b}, [%0], %4 \n" - "ld1 {v5.8b}, [%1], %4 \n" - "ld1 {v6.8b}, [%0], %4 \n" - "ld1 {v7.8b}, [%1], %4 \n" - "uaddl v0.8h, v0.8b, v1.8b \n" - "uaddl v2.8h, v2.8b, v3.8b \n" - "uaddl v4.8h, v4.8b, v5.8b \n" - "uaddl v6.8h, v6.8b, v7.8b \n" - "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd - "mov v0.d[1], v2.d[0] \n" - "mov v2.d[0], v16.d[1] \n" - "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh - "mov v4.d[1], v6.d[0] \n" - "mov v6.d[0], v16.d[1] \n" - "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) - "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) - "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. - "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. - "subs %w3, %w3, #4 \n" // 4 pixels per loop. 
- "st1 {v0.16b}, [%2], #16 \n" - "b.gt 1b \n" + "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1 + "ld1 {v1.8b}, [%1], %4 \n" + "ld1 {v2.8b}, [%0], %4 \n" + "ld1 {v3.8b}, [%1], %4 \n" + "ld1 {v4.8b}, [%0], %4 \n" + "ld1 {v5.8b}, [%1], %4 \n" + "ld1 {v6.8b}, [%0], %4 \n" + "ld1 {v7.8b}, [%1], %4 \n" + "uaddl v0.8h, v0.8b, v1.8b \n" + "uaddl v2.8h, v2.8b, v3.8b \n" + "uaddl v4.8h, v4.8b, v5.8b \n" + "uaddl v6.8h, v6.8b, v7.8b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd + "mov v0.d[1], v2.d[0] \n" + "mov v2.d[0], v16.d[1] \n" + "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh + "mov v4.d[1], v6.d[0] \n" + "mov v6.d[0], v16.d[1] \n" + "prfm pldl1keep, [%1, 448] \n" + "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) + "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) + "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. + "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. + "subs %w3, %w3, #4 \n" // 4 pixels per loop. + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(src_stride), // %1 "+r"(dst_argb), // %2 @@ -875,10 +1394,11 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb, LOAD1_DATA32_LANE(v1, 1) LOAD1_DATA32_LANE(v1, 2) LOAD1_DATA32_LANE(v1, 3) + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead // clang-format on - "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop - "b.gt 1b \n" + "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop + "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 "+r"(dst_width), // %2 @@ -911,16 +1431,16 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, int64_t x64 = (int64_t)x; // NOLINT int64_t dx64 = (int64_t)dx; // NOLINT asm volatile ( - "dup v0.4s, %w3 \n" // x - "dup v1.4s, %w4 \n" // dx - "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 - "shl v6.4s, v1.4s, #2 \n" // 4 * dx - "mul v1.4s, v1.4s, v2.4s \n" - "movi v3.16b, #0x7f \n" // 0x7F - "movi v4.8h, #0x7f \n" // 0x7F + "dup v0.4s, %w3 \n" // x + "dup v1.4s, %w4 \n" // dx + "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 + "shl v6.4s, v1.4s, #2 \n" // 4 * dx + "mul v1.4s, v1.4s, v2.4s \n" + "movi v3.16b, #0x7f \n" // 0x7F + "movi v4.8h, #0x7f \n" // 0x7F // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "add v5.4s, v1.4s, v0.4s \n" - "1: \n" + "add v5.4s, v1.4s, v0.4s \n" + "1: \n" // d0, d1: a // d2, d3: b LOAD2_DATA32_LANE(v0, v1, 0) @@ -941,15 +1461,15 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, "umull2 v17.8h, v0.16b, v7.16b \n" "umull v18.8h, v1.8b, v2.8b \n" "umull2 v19.8h, v1.16b, v2.16b \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead "add v16.8h, v16.8h, v18.8h \n" "add v17.8h, v17.8h, v19.8h \n" "shrn v0.8b, v16.8h, #7 \n" "shrn2 v0.16b, v17.8h, #7 \n" - "st1 {v0.4s}, [%0], #16 \n" // store pixels "add v5.4s, v5.4s, v6.4s \n" "subs %w2, %w2, #4 \n" // 4 processed per loop - "b.gt 1b \n" + "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 "+r"(dst_width), // %2 @@ -972,19 +1492,21 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, int dst_width) { asm volatile( // change the stride to row 2 pointer - "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 + "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 "1: \n" - "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc - "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc - "subs %w3, %w3, #8 \n" // 8 processed per loop - "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent - "uaddlp v1.4s, v1.8h \n" - "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent - "uadalp v1.4s, 
v3.8h \n" - "rshrn v0.4h, v0.4s, #2 \n" // round and pack - "rshrn2 v0.8h, v1.4s, #2 \n" - "st1 {v0.8h}, [%2], #16 \n" - "b.gt 1b \n" + "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc + "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc + "subs %w3, %w3, #8 \n" // 8 processed per loop + "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent + "uaddlp v1.4s, v1.8h \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent + "uadalp v1.4s, v3.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "rshrn v0.4h, v0.4s, #2 \n" // round and pack + "rshrn2 v0.8h, v1.4s, #2 \n" + "st1 {v0.8h}, [%2], #16 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 @@ -1001,38 +1523,40 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, uint16_t* dst, int dst_width) { asm volatile( - "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 - "movi v0.8h, #9 \n" // constants - "movi v1.4s, #3 \n" + "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 + "movi v0.8h, #9 \n" // constants + "movi v1.4s, #3 \n" "1: \n" - "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8 - "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1 - "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row - "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1 - "subs %w3, %w3, #16 \n" // 16 dst pixels per loop - "umull v16.4s, v3.4h, v0.4h \n" - "umull2 v7.4s, v3.8h, v0.8h \n" - "umull v18.4s, v4.4h, v0.4h \n" - "umull2 v17.4s, v4.8h, v0.8h \n" - "uaddw v16.4s, v16.4s, v6.4h \n" - "uaddl2 v19.4s, v6.8h, v3.8h \n" - "uaddl v3.4s, v6.4h, v3.4h \n" - "uaddw2 v6.4s, v7.4s, v6.8h \n" - "uaddl2 v7.4s, v5.8h, v4.8h \n" - "uaddl v4.4s, v5.4h, v4.4h \n" - "uaddw v18.4s, v18.4s, v5.4h \n" - "mla v16.4s, v4.4s, v1.4s \n" - "mla v18.4s, v3.4s, v1.4s \n" - "mla v6.4s, v7.4s, v1.4s \n" - "uaddw2 v4.4s, v17.4s, v5.8h \n" - "uqrshrn v16.4h, v16.4s, #4 \n" - "mla v4.4s, v19.4s, v1.4s \n" - "uqrshrn2 v16.8h, v6.4s, #4 \n" - "uqrshrn v17.4h, v18.4s, #4 \n" - "uqrshrn2 v17.8h, v4.4s, #4 \n" - "st2 {v16.8h-v17.8h}, [%2], #32 \n" - "b.gt 1b \n" + "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8 + "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1 + "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row + "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1 + "subs %w3, %w3, #16 \n" // 16 dst pixels per loop + "umull v16.4s, v3.4h, v0.4h \n" + "umull2 v7.4s, v3.8h, v0.8h \n" + "umull v18.4s, v4.4h, v0.4h \n" + "umull2 v17.4s, v4.8h, v0.8h \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "uaddw v16.4s, v16.4s, v6.4h \n" + "uaddl2 v19.4s, v6.8h, v3.8h \n" + "uaddl v3.4s, v6.4h, v3.4h \n" + "uaddw2 v6.4s, v7.4s, v6.8h \n" + "uaddl2 v7.4s, v5.8h, v4.8h \n" + "uaddl v4.4s, v5.4h, v4.4h \n" + "uaddw v18.4s, v18.4s, v5.4h \n" + "prfm pldl1keep, [%1, 448] \n" + "mla v16.4s, v4.4s, v1.4s \n" + "mla v18.4s, v3.4s, v1.4s \n" + "mla v6.4s, v7.4s, v1.4s \n" + "uaddw2 v4.4s, v17.4s, v5.8h \n" + "uqrshrn v16.4h, v16.4s, #4 \n" + "mla v4.4s, v19.4s, v1.4s \n" + "uqrshrn2 v16.8h, v6.4s, #4 \n" + "uqrshrn v17.4h, v18.4s, #4 \n" + "uqrshrn2 v17.8h, v4.4s, #4 \n" + "st2 {v16.8h-v17.8h}, [%2], #32 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 @@ -1044,6 +1568,64 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, ); } +void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV + "subs %w3, 
%w3, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // V 16 bytes -> 8 shorts. + "ld2 {v16.16b,v17.16b}, [%1], #32 \n" // load 16 + "uadalp v0.8h, v16.16b \n" // U 16 bytes -> 8 shorts. + "uadalp v1.8h, v17.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "prfm pldl1keep, [%1, 448] \n" + "rshrn v1.8b, v1.8h, #2 \n" + "st2 {v0.8b,v1.8b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v16", "v17"); +} + +// Reads 4 pixels at a time. +void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, // pixel step + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src1_ptr = src_ptr + src_stepx * 2; + const uint8_t* src2_ptr = src_ptr + src_stepx * 4; + const uint8_t* src3_ptr = src_ptr + src_stepx * 6; + (void)src_stride; + asm volatile( + "1: \n" + "ld1 {v0.h}[0], [%0], %6 \n" + "ld1 {v1.h}[0], [%1], %6 \n" + "ld1 {v2.h}[0], [%2], %6 \n" + "ld1 {v3.h}[0], [%3], %6 \n" + "subs %w5, %w5, #4 \n" // 4 pixels per loop. + "st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src1_ptr), // %1 + "+r"(src2_ptr), // %2 + "+r"(src3_ptr), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_width) // %5 + : "r"((int64_t)(src_stepx * 8)) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/files/source/scale_rgb.cc b/files/source/scale_rgb.cc new file mode 100644 index 00000000..8db59b56 --- /dev/null +++ b/files/source/scale_rgb.cc @@ -0,0 +1,66 @@ +/* + * Copyright 2022 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" /* For FilterMode */ + +#include <assert.h> +#include <string.h> + +#include "libyuv/convert_argb.h" +#include "libyuv/convert_from_argb.h" +#include "libyuv/row.h" +#include "libyuv/scale_argb.h" +#include "libyuv/scale_rgb.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Scale a 24 bit image. 
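The new scale_rgb.cc adds a 24-bit entry point that round-trips through ARGB: a single allocation holds both intermediate images, and a nonzero return signals failure. A hypothetical call halving a tightly packed RGB24 frame (the wrapper name and the even-dimension assumption are illustrative):

  #include <stdint.h>
  #include "libyuv/scale.h"      // FilterMode
  #include "libyuv/scale_rgb.h"  // RGBScale

  int HalveRGB24(const uint8_t* src, int src_width, int src_height,
                 uint8_t* dst) {
    return RGBScale(src, src_width * 3, src_width, src_height,
                    dst, (src_width / 2) * 3, src_width / 2, src_height / 2,
                    kFilterBilinear);
  }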
+// Converts to ARGB as intermediate step + +LIBYUV_API +int RGBScale(const uint8_t* src_rgb, + int src_stride_rgb, + int src_width, + int src_height, + uint8_t* dst_rgb, + int dst_stride_rgb, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int r; + uint8_t* src_argb = + (uint8_t*)malloc(src_width * src_height * 4 + dst_width * dst_height * 4); + uint8_t* dst_argb = src_argb + src_width * src_height * 4; + + if (!src_argb) { + return 1; + } + + r = RGB24ToARGB(src_rgb, src_stride_rgb, src_argb, src_width * 4, src_width, + src_height); + if (!r) { + r = ARGBScale(src_argb, src_width * 4, src_width, src_height, dst_argb, + dst_width * 4, dst_width, dst_height, filtering); + if (!r) { + r = ARGBToRGB24(dst_argb, dst_width * 4, dst_rgb, dst_stride_rgb, + dst_width, dst_height); + } + } + free(src_argb); + return r; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/files/source/scale_uv.cc b/files/source/scale_uv.cc new file mode 100644 index 00000000..3b3d7b8e --- /dev/null +++ b/files/source/scale_uv.cc @@ -0,0 +1,1161 @@ +/* + * Copyright 2020 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" + +#include <assert.h> +#include <string.h> + +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" // For CopyUV +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Macros to enable specialized scalers + +#ifndef HAS_SCALEUVDOWN2 +#define HAS_SCALEUVDOWN2 1 +#endif +#ifndef HAS_SCALEUVDOWN4BOX +#define HAS_SCALEUVDOWN4BOX 1 +#endif +#ifndef HAS_SCALEUVDOWNEVEN +#define HAS_SCALEUVDOWNEVEN 1 +#endif +#ifndef HAS_SCALEUVBILINEARDOWN +#define HAS_SCALEUVBILINEARDOWN 1 +#endif +#ifndef HAS_SCALEUVBILINEARUP +#define HAS_SCALEUVBILINEARUP 1 +#endif +#ifndef HAS_UVCOPY +#define HAS_UVCOPY 1 +#endif +#ifndef HAS_SCALEPLANEVERTICAL +#define HAS_SCALEPLANEVERTICAL 1 +#endif + +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// ScaleUV, 1/2 +// This is an optimized version for scaling down a UV to 1/2 of +// its original size. +#if HAS_SCALEUVDOWN2 +static void ScaleUVDown2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + int row_stride = src_stride * (dy >> 16); + void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride, + uint8_t* dst_uv, int dst_width) = + filtering == kFilterNone + ? ScaleUVRowDown2_C + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_C + : ScaleUVRowDown2Box_C); + (void)src_width; + (void)src_height; + (void)dx; + assert(dx == 65536 * 2); // Test scale factor of 2. + assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. + // Advance to odd row, even column. 
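The advance below turns the 16.16 start coordinates into a byte offset at 2 bytes per UV pixel; the non-bilinear branch steps back one extra UV pixel, presumably because those row kernels pick the second sample of each pair, mirroring the SIMD versions. The core arithmetic, as an illustrative helper (not a libyuv API):

  #include <stdint.h>

  // 16.16 x/y to a starting byte address in an interleaved UV plane.
  static const uint8_t* UVOrigin(const uint8_t* base, int64_t stride,
                                 int x_q16, int y_q16) {
    return base + (y_q16 >> 16) * stride + (int64_t)(x_q16 >> 16) * 2;
  }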
+ if (filtering == kFilterBilinear) { + src_uv += (y >> 16) * (int64_t)src_stride + (x >> 16) * 2; + } else { + src_uv += (y >> 16) * (int64_t)src_stride + ((x >> 16) - 1) * 2; + } + +#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && filtering) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && filtering) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWN2BOX_NEON) + if (TestCpuFlag(kCpuHasNEON) && filtering) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON; + } + } +#endif + +// This code is not enabled. Only box filter is available at this time. +#if defined(HAS_SCALEUVROWDOWN2_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_Any_SSSE3 + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSSE3 + : ScaleUVRowDown2Box_Any_SSSE3); + if (IS_ALIGNED(dst_width, 2)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_SSSE3 + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSSE3 + : ScaleUVRowDown2Box_SSSE3); + } + } +#endif +// This code is not enabled. Only box filter is available at this time. +#if defined(HAS_SCALEUVROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_Any_NEON + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON + : ScaleUVRowDown2Box_Any_NEON); + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_NEON + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON + : ScaleUVRowDown2Box_NEON); + } + } +#endif +#if defined(HAS_SCALEUVROWDOWN2_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_Any_MSA + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MSA + : ScaleUVRowDown2Box_Any_MSA); + if (IS_ALIGNED(dst_width, 2)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_MSA + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MSA + : ScaleUVRowDown2Box_MSA); + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (j = 0; j < dst_height; ++j) { + ScaleUVRowDown2(src_uv, src_stride, dst_uv, dst_width); + src_uv += row_stride; + dst_uv += dst_stride; + } +} +#endif // HAS_SCALEUVDOWN2 + +// ScaleUV, 1/4 +// This is an optimized version for scaling down a UV to 1/4 of +// its original size. +#if HAS_SCALEUVDOWN4BOX +static void ScaleUVDown4Box(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy) { + int j; + // Allocate 2 rows of UV. + const int kRowSize = (dst_width * 2 * 2 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); + int row_stride = src_stride * (dy >> 16); + void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride, + uint8_t* dst_uv, int dst_width) = + ScaleUVRowDown2Box_C; + // Advance to odd row, even column. + src_uv += (y >> 16) * (int64_t)src_stride + (x >> 16) * 2; + (void)src_width; + (void)src_height; + (void)dx; + assert(dx == 65536 * 4); // Test scale factor of 4. 
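+  // 65536 is 1.0 in 16.16 fixed point, so this requires a step of exactly
+  // 4 source pixels per destination pixel, matching the two cascaded 2x box
+  // passes performed below.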
+ assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. + +#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWN2BOX_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON; + } + } +#endif + + for (j = 0; j < dst_height; ++j) { + ScaleUVRowDown2(src_uv, src_stride, row, dst_width * 2); + ScaleUVRowDown2(src_uv + src_stride * 2, src_stride, row + kRowSize, + dst_width * 2); + ScaleUVRowDown2(row, kRowSize, dst_uv, dst_width); + src_uv += row_stride; + dst_uv += dst_stride; + } + free_aligned_buffer_64(row); +} +#endif // HAS_SCALEUVDOWN4BOX + +// ScaleUV Even +// This is an optimized version for scaling down a UV to even +// multiple of its original size. +#if HAS_SCALEUVDOWNEVEN +static void ScaleUVDownEven(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + int col_step = dx >> 16; + int row_stride = (dy >> 16) * (int64_t)src_stride; + void (*ScaleUVRowDownEven)(const uint8_t* src_uv, ptrdiff_t src_stride, + int src_step, uint8_t* dst_uv, int dst_width) = + filtering ? ScaleUVRowDownEvenBox_C : ScaleUVRowDownEven_C; + (void)src_width; + (void)src_height; + assert(IS_ALIGNED(src_width, 2)); + assert(IS_ALIGNED(src_height, 2)); + src_uv += (y >> 16) * (int64_t)src_stride + (x >> 16) * 2; +#if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3 + : ScaleUVRowDownEven_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDownEven = + filtering ? ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSSE3; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWNEVEN_NEON) + if (TestCpuFlag(kCpuHasNEON) && !filtering) { + ScaleUVRowDownEven = ScaleUVRowDownEven_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDownEven = ScaleUVRowDownEven_NEON; + } + } +#endif // TODO(fbarchard): Enable Box filter +#if defined(HAS_SCALEUVROWDOWNEVENBOX_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_NEON + : ScaleUVRowDownEven_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDownEven = + filtering ? ScaleUVRowDownEvenBox_NEON : ScaleUVRowDownEven_NEON; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWNEVEN_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleUVRowDownEven = + filtering ? ScaleUVRowDownEvenBox_Any_MSA : ScaleUVRowDownEven_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDownEven = + filtering ? ScaleUVRowDownEvenBox_MSA : ScaleUVRowDownEven_MSA; + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (j = 0; j < dst_height; ++j) { + ScaleUVRowDownEven(src_uv, src_stride, col_step, dst_uv, dst_width); + src_uv += row_stride; + dst_uv += dst_stride; + } +} +#endif + +// Scale UV down with bilinear interpolation. 
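+// Each output row is a vertical blend of the two nearest source rows
+// (InterpolateRow) followed by a horizontal 2-tap filter
+// (ScaleUVFilterCols); kFilterLinear skips the vertical blend. The source
+// is first clipped to the x range actually referenced.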
+#if HAS_SCALEUVBILINEARDOWN +static void ScaleUVBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv, + int dst_width, int x, int dx) = + (src_width >= 32768) ? ScaleUVFilterCols64_C : ScaleUVFilterCols_C; + int64_t xlast = x + (int64_t)(dst_width - 1) * dx; + int64_t xl = (dx >= 0) ? x : xlast; + int64_t xr = (dx >= 0) ? xlast : x; + int clip_src_width; + xl = (xl >> 16) & ~3; // Left edge aligned. + xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. + xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. + if (xr > src_width) { + xr = src_width; + } + clip_src_width = (int)(xr - xl) * 2; // Width aligned to 2. + src_uv += xl * 2; + x -= (int)(xl << 16); +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow = InterpolateRow_Any_LSX; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_LSX; + } + } +#endif +#if defined(HAS_SCALEUVFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleUVFilterCols = ScaleUVFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEUVFILTERCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVFilterCols = ScaleUVFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEUVFILTERCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVFilterCols = ScaleUVFilterCols_MSA; + } + } +#endif + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row of UV. + { + align_buffer_64(row, clip_src_width * 2); + + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; + } + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint8_t* src = src_uv + yi * (int64_t)src_stride; + if (filtering == kFilterLinear) { + ScaleUVFilterCols(dst_uv, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, clip_src_width, yf); + ScaleUVFilterCols(dst_uv, row, dst_width, x, dx); + } + dst_uv += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); + } +} +#endif + +// Scale UV up with bilinear interpolation. 
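+// Two horizontally scaled rows are kept in a pair of ping-pong buffers and
+// each output row blends them vertically, using the top 8 bits of the y
+// fraction as the mixing weight (kFilterLinear skips the vertical blend).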
+#if HAS_SCALEUVBILINEARUP +static void ScaleUVBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv, + int dst_width, int x, int dx) = + filtering ? ScaleUVFilterCols_C : ScaleUVCols_C; + const int max_y = (src_height - 1) << 16; +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + InterpolateRow = InterpolateRow_Any_LSX; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_LSX; + } + } +#endif + if (src_width >= 32768) { + ScaleUVFilterCols = filtering ? ScaleUVFilterCols64_C : ScaleUVCols64_C; + } +#if defined(HAS_SCALEUVFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleUVFilterCols = ScaleUVFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEUVFILTERCOLS_NEON) + if (filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVFilterCols = ScaleUVFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEUVFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleUVFilterCols = ScaleUVFilterCols_MSA; + } + } +#endif +#if defined(HAS_SCALEUVCOLS_SSSE3) + if (!filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleUVFilterCols = ScaleUVCols_SSSE3; + } +#endif +#if defined(HAS_SCALEUVCOLS_NEON) + if (!filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleUVFilterCols = ScaleUVCols_Any_NEON; + if (IS_ALIGNED(dst_width, 16)) { + ScaleUVFilterCols = ScaleUVCols_NEON; + } + } +#endif +#if defined(HAS_SCALEUVCOLS_MSA) + if (!filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleUVFilterCols = ScaleUVCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVFilterCols = ScaleUVCols_MSA; + } + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleUVFilterCols = ScaleUVColsUp2_C; +#if defined(HAS_SCALEUVCOLSUP2_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) { + ScaleUVFilterCols = ScaleUVColsUp2_SSSE3; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + + { + int yi = y >> 16; + const uint8_t* src = src_uv + yi * (int64_t)src_stride; + + // Allocate 2 rows of UV. 
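+    // dst_width UV pixels * 2 bytes, rounded up to a multiple of 16 so the
+    // second row stays 16-byte aligned within the 64-byte aligned buffer.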
+ const int kRowSize = (dst_width * 2 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); + + uint8_t* rowptr = row; + int rowstride = kRowSize; + int lasty = yi; + + ScaleUVFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleUVFilterCols(rowptr + rowstride, src, dst_width, x, dx); + if (src_height > 2) { + src += src_stride; + } + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_uv + yi * (int64_t)src_stride; + } + if (yi != lasty) { + ScaleUVFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + if ((y + 65536) < max_y) { + src += src_stride; + } + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_uv, rowptr, 0, dst_width * 2, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_uv, rowptr, rowstride, dst_width * 2, yf); + } + dst_uv += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} +#endif // HAS_SCALEUVBILINEARUP + +// Scale UV, horizontally up by 2 times. +// Uses linear filter horizontally, nearest vertically. +// This is an optimized version for scaling up a plane to 2 times of +// its original width, using linear interpolation. +// This is used to scale U and V planes of NV16 to NV24. +void ScaleUVLinearUp2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv) { + void (*ScaleRowUp)(const uint8_t* src_uv, uint8_t* dst_uv, int dst_width) = + ScaleUVRowUp2_Linear_Any_C; + int i; + int y; + int dy; + + // This function can only scale up by 2 times horizontally. + assert(src_width == ((dst_width + 1) / 2)); + +#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp = ScaleUVRowUp2_Linear_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleUVRowUp2_Linear_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEUVROWUP2LINEAR_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleUVRowUp2_Linear_Any_NEON; + } +#endif + + if (dst_height == 1) { + ScaleRowUp(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride, dst_uv, + dst_width); + } else { + dy = FixedDiv(src_height - 1, dst_height - 1); + y = (1 << 15) - 1; + for (i = 0; i < dst_height; ++i) { + ScaleRowUp(src_uv + (y >> 16) * (int64_t)src_stride, dst_uv, dst_width); + dst_uv += dst_stride; + y += dy; + } + } +} + +// Scale plane, up by 2 times. +// This is an optimized version for scaling up a plane to 2 times of +// its original size, using bilinear interpolation. +// This is used to scale U and V planes of NV12 to NV24. +void ScaleUVBilinearUp2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { + void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleUVRowUp2_Bilinear_Any_C; + int x; + + // This function can only scale up by 2 times. 
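+  // The kernel writes two destination rows per call: the first call (both
+  // strides 0) produces the top edge row, the loop fills interior row
+  // pairs, and a final call handles the bottom edge when dst_height is even.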
+ assert(src_width == ((dst_width + 1) / 2)); + assert(src_height == ((dst_height + 1) / 2)); + +#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_NEON; + } +#endif + + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + // TODO(fbarchard): Test performance of writing one row of destination at a + // time. + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } +} + +// Scale 16 bit UV, horizontally up by 2 times. +// Uses linear filter horizontally, nearest vertically. +// This is an optimized version for scaling up a plane to 2 times of +// its original width, using linear interpolation. +// This is used to scale U and V planes of P210 to P410. +void ScaleUVLinearUp2_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_uv, + uint16_t* dst_uv) { + void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) = + ScaleUVRowUp2_Linear_16_Any_C; + int i; + int y; + int dy; + + // This function can only scale up by 2 times horizontally. + assert(src_width == ((dst_width + 1) / 2)); + +#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE41 + if (TestCpuFlag(kCpuHasSSE41)) { + ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41; + } +#endif + +#ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEUVROWUP2LINEAR_16_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON; + } +#endif + + if (dst_height == 1) { + ScaleRowUp(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride, dst_uv, + dst_width); + } else { + dy = FixedDiv(src_height - 1, dst_height - 1); + y = (1 << 15) - 1; + for (i = 0; i < dst_height; ++i) { + ScaleRowUp(src_uv + (y >> 16) * (int64_t)src_stride, dst_uv, dst_width); + dst_uv += dst_stride; + y += dy; + } + } +} + +// Scale 16 bit UV, up by 2 times. +// This is an optimized version for scaling up a plane to 2 times of +// its original size, using bilinear interpolation. +// This is used to scale U and V planes of P010 to P410. +void ScaleUVBilinearUp2_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleUVRowUp2_Bilinear_16_Any_C; + int x; + + // This function can only scale up by 2 times. 
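+  // Same row structure as the 8 bit ScaleUVBilinearUp2 above, but operating
+  // on uint16_t samples for high bit depth formats such as P010.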
+ assert(src_width == ((dst_width + 1) / 2)); + assert(src_height == ((dst_height + 1) / 2)); + +#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE41 + if (TestCpuFlag(kCpuHasSSE41)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41; + } +#endif + +#ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEUVROWUP2BILINEAR_16_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON; + } +#endif + + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + // TODO(fbarchard): Test performance of writing one row of destination at a + // time. + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } +} + +// Scale UV to/from any dimensions, without interpolation. +// Fixed point math is used for performance: The upper 16 bits +// of x and dx is the integer part of the source position and +// the lower 16 bits are the fixed decimal part. + +static void ScaleUVSimple(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy) { + int j; + void (*ScaleUVCols)(uint8_t * dst_uv, const uint8_t* src_uv, int dst_width, + int x, int dx) = + (src_width >= 32768) ? ScaleUVCols64_C : ScaleUVCols_C; + (void)src_height; +#if defined(HAS_SCALEUVCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleUVCols = ScaleUVCols_SSSE3; + } +#endif +#if defined(HAS_SCALEUVCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVCols = ScaleUVCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVCols = ScaleUVCols_NEON; + } + } +#endif +#if defined(HAS_SCALEUVCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleUVCols = ScaleUVCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVCols = ScaleUVCols_MSA; + } + } +#endif + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleUVCols = ScaleUVColsUp2_C; +#if defined(HAS_SCALEUVCOLSUP2_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) { + ScaleUVCols = ScaleUVColsUp2_SSSE3; + } +#endif + } + + for (j = 0; j < dst_height; ++j) { + ScaleUVCols(dst_uv, src_uv + (y >> 16) * (int64_t)src_stride, dst_width, x, + dx); + dst_uv += dst_stride; + y += dy; + } +} + +// Copy UV with optional flipping +#if HAS_UVCOPY +static int UVCopy(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (!src_uv || !dst_uv || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uv = src_uv + (height - 1) * (int64_t)src_stride_uv; + src_stride_uv = -src_stride_uv; + } + + CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, width * 2, height); + return 0; +} + +static int UVCopy_16(const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (!src_uv || !dst_uv || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
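+  // e.g. height = -4 copies 4 rows starting from the last row, stepping
+  // backwards through the source via the negated stride.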
+ if (height < 0) { + height = -height; + src_uv = src_uv + (height - 1) * (int64_t)src_stride_uv; + src_stride_uv = -src_stride_uv; + } + + CopyPlane_16(src_uv, src_stride_uv, dst_uv, dst_stride_uv, width * 2, height); + return 0; +} +#endif // HAS_UVCOPY + +// Scale a UV plane (from NV12) +// This function in turn calls a scaling function +// suitable for handling the desired resolutions. +static void ScaleUV(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // UV does not support box filter yet, but allow the user to pass it. + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); + + // Negative src_height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * (int64_t)src_stride; + src_stride = -src_stride; + } + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + if (clip_x) { + int64_t clipf = (int64_t)(clip_x)*dx; + x += (clipf & 0xffff); + src += (clipf >> 16) * 2; + dst += clip_x * 2; + } + if (clip_y) { + int64_t clipf = (int64_t)(clip_y)*dy; + y += (clipf & 0xffff); + src += (clipf >> 16) * (int64_t)src_stride; + dst += clip_y * dst_stride; + } + + // Special case for integer step values. + if (((dx | dy) & 0xffff) == 0) { + if (!dx || !dy) { // 1 pixel wide and/or tall. + filtering = kFilterNone; + } else { + // Optimized even scale down. ie 2, 4, 6, 8, 10x. + if (!(dx & 0x10000) && !(dy & 0x10000)) { +#if HAS_SCALEUVDOWN2 + if (dx == 0x20000) { + // Optimized 1/2 downsample. + ScaleUVDown2(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); + return; + } +#endif +#if HAS_SCALEUVDOWN4BOX + if (dx == 0x40000 && filtering == kFilterBox) { + // Optimized 1/4 box downsample. + ScaleUVDown4Box(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy); + return; + } +#endif +#if HAS_SCALEUVDOWNEVEN + ScaleUVDownEven(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); + return; +#endif + } + // Optimized odd scale down. ie 3, 5, 7, 9x. + if ((dx & 0x10000) && (dy & 0x10000)) { + filtering = kFilterNone; +#ifdef HAS_UVCOPY + if (dx == 0x10000 && dy == 0x10000) { + // Straight copy. + UVCopy(src + (y >> 16) * (int64_t)src_stride + (x >> 16) * 2, + src_stride, dst, dst_stride, clip_width, clip_height); + return; + } +#endif + } + } + } + // HAS_SCALEPLANEVERTICAL + if (dx == 0x10000 && (x & 0xffff) == 0) { + // Arbitrary scale vertically, but unscaled horizontally. 
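+    // dx of 0x10000 is 1.0 in 16.16 fixed point and the x fraction is zero,
+    // so columns map 1:1 and only rows need interpolation.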
+    ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
+                       dst_stride, src, dst, x, y, dy, /*bpp=*/2, filtering);
+    return;
+  }
+  if (filtering && (dst_width + 1) / 2 == src_width) {
+    ScaleUVLinearUp2(src_width, src_height, clip_width, clip_height, src_stride,
+                     dst_stride, src, dst);
+    return;
+  }
+  if ((clip_height + 1) / 2 == src_height &&
+      (clip_width + 1) / 2 == src_width &&
+      (filtering == kFilterBilinear || filtering == kFilterBox)) {
+    ScaleUVBilinearUp2(src_width, src_height, clip_width, clip_height,
+                       src_stride, dst_stride, src, dst);
+    return;
+  }
+#if HAS_SCALEUVBILINEARUP
+  if (filtering && dy < 65536) {
+    ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height,
+                      src_stride, dst_stride, src, dst, x, dx, y, dy,
+                      filtering);
+    return;
+  }
+#endif
+#if HAS_SCALEUVBILINEARDOWN
+  if (filtering) {
+    ScaleUVBilinearDown(src_width, src_height, clip_width, clip_height,
+                        src_stride, dst_stride, src, dst, x, dx, y, dy,
+                        filtering);
+    return;
+  }
+#endif
+  ScaleUVSimple(src_width, src_height, clip_width, clip_height, src_stride,
+                dst_stride, src, dst, x, dx, y, dy);
+}
+
+// Scale a UV image.
+LIBYUV_API
+int UVScale(const uint8_t* src_uv,
+            int src_stride_uv,
+            int src_width,
+            int src_height,
+            uint8_t* dst_uv,
+            int dst_stride_uv,
+            int dst_width,
+            int dst_height,
+            enum FilterMode filtering) {
+  if (!src_uv || src_width <= 0 || src_height == 0 || src_width > 32768 ||
+      src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+  ScaleUV(src_uv, src_stride_uv, src_width, src_height, dst_uv, dst_stride_uv,
+          dst_width, dst_height, 0, 0, dst_width, dst_height, filtering);
+  return 0;
+}
+
+// Scale a 16 bit UV image.
+// This function is currently incomplete; it can't handle all cases.
+LIBYUV_API
+int UVScale_16(const uint16_t* src_uv,
+               int src_stride_uv,
+               int src_width,
+               int src_height,
+               uint16_t* dst_uv,
+               int dst_stride_uv,
+               int dst_width,
+               int dst_height,
+               enum FilterMode filtering) {
+  int dy = 0;
+
+  if (!src_uv || src_width <= 0 || src_height == 0 || src_width > 32768 ||
+      src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+
+  // UV does not support box filter yet, but allow the user to pass it.
+  // Simplify filtering when possible.
+  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+                                filtering);
+
+  // Negative src_height means invert the image.
+ if (src_height < 0) { + src_height = -src_height; + src_uv = src_uv + (src_height - 1) * (int64_t)src_stride_uv; + src_stride_uv = -src_stride_uv; + } + src_width = Abs(src_width); + +#ifdef HAS_UVCOPY + if (!filtering && src_width == dst_width && (src_height % dst_height == 0)) { + if (dst_height == 1) { + UVCopy_16(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride_uv, + src_stride_uv, dst_uv, dst_stride_uv, dst_width, dst_height); + } else { + dy = src_height / dst_height; + UVCopy_16(src_uv + ((dy - 1) / 2) * (int64_t)src_stride_uv, + dy * (int64_t)src_stride_uv, dst_uv, dst_stride_uv, dst_width, + dst_height); + } + + return 0; + } +#endif + + if (filtering && (dst_width + 1) / 2 == src_width) { + ScaleUVLinearUp2_16(src_width, src_height, dst_width, dst_height, + src_stride_uv, dst_stride_uv, src_uv, dst_uv); + return 0; + } + + if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && + (filtering == kFilterBilinear || filtering == kFilterBox)) { + ScaleUVBilinearUp2_16(src_width, src_height, dst_width, dst_height, + src_stride_uv, dst_stride_uv, src_uv, dst_uv); + return 0; + } + + return -1; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/files/source/scale_win.cc b/files/source/scale_win.cc index c5fc86f3..ea1f95c6 100644 --- a/files/source/scale_win.cc +++ b/files/source/scale_win.cc @@ -16,8 +16,9 @@ namespace libyuv { extern "C" { #endif -// This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +// This module is for 32 bit Visual C x86 +#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ + !defined(__clang__) && defined(_M_IX86) // Offsets for source bytes 0 to 9 static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, diff --git a/files/source/test.sh b/files/source/test.sh new file mode 100755 index 00000000..7f12c3c1 --- /dev/null +++ b/files/source/test.sh @@ -0,0 +1,35 @@ +#!/bin/bash +set -x + +function runbenchmark1 { + perf record /google/src/cloud/fbarchard/clean/google3/blaze-bin/third_party/libyuv/libyuv_test --gunit_filter=*$1 --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1 + perf report | grep AVX +} + +runbenchmark1 ABGRToI420 +runbenchmark1 Android420ToI420 +runbenchmark1 ARGBToI420 +runbenchmark1 Convert16To8Plane +runbenchmark1 ConvertToARGB +runbenchmark1 ConvertToI420 +runbenchmark1 CopyPlane +runbenchmark1 H010ToAB30 +runbenchmark1 H010ToAR30 +runbenchmark1 HalfFloatPlane +runbenchmark1 I010ToAB30 +runbenchmark1 I010ToAR30 +runbenchmark1 I420Copy +runbenchmark1 I420Psnr +runbenchmark1 I420Scale +runbenchmark1 I420Ssim +runbenchmark1 I420ToARGB +runbenchmark1 I420ToNV12 +runbenchmark1 I420ToUYVY +runbenchmark1 I422ToI420 +runbenchmark1 InitCpuFlags +runbenchmark1 J420ToARGB +runbenchmark1 NV12ToARGB +runbenchmark1 NV12ToI420 +runbenchmark1 NV12ToI420Rotate +runbenchmark1 SetCpuFlags +runbenchmark1 YUY2ToI420 |
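For reference, the new UVScale entry point added in scale_uv.cc can be called
directly. The sketch below is illustrative only: the header path
libyuv/scale_uv.h and the HalveChroma wrapper are assumptions for the example,
not part of this patch.

/* Downscale an NV12 chroma (UV) plane to half size using the new API. */
#include "libyuv/scale_uv.h"  // UVScale; FilterMode comes via libyuv/scale.h

// src_width/src_height are the dimensions of the UV plane itself (for NV12,
// half the luma width and height). Returns 0 on success and -1 on bad
// arguments, matching UVScale above.
int HalveChroma(const uint8_t* src_uv, int src_stride_uv,
                int src_width, int src_height,
                uint8_t* dst_uv, int dst_stride_uv) {
  return UVScale(src_uv, src_stride_uv, src_width, src_height,
                 dst_uv, dst_stride_uv,
                 (src_width + 1) / 2, (src_height + 1) / 2,
                 kFilterBilinear);
}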